[libvirt] [PATCH] qemu: Fix race between device rebind and kvm cleanup
Daniel Veillard
veillard at redhat.com
Fri Jan 22 17:01:11 UTC 2010
On Fri, Jan 22, 2010 at 11:40:38AM -0500, Chris Lalancette wrote:
> Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
> the host when doing device passthrough. This can lead to a race
> condition where the hypervisor is still cleaning up the device while
> libvirt is trying to re-attach it to the host device driver. To avoid
> this situation, we look through /proc/iomem, and if the hypervisor is
> still holding onto the bar (denoted by the string in the matcher variable),
> then we can wait around a bit for that to clear up.
>
> v2: Thanks to review by DV, make sure we wait the full timeout per-device
>
> Signed-off-by: Chris Lalancette <clalance at redhat.com>
> ---
> src/libvirt_private.syms | 1 +
> src/qemu/qemu_driver.c | 19 ++++++---
> src/util/pci.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++
> src/util/pci.h | 1 +
> 4 files changed, 112 insertions(+), 6 deletions(-)
>
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index c912d81..e42c090 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -439,6 +439,7 @@ pciGetDevice;
> pciFreeDevice;
> pciDettachDevice;
> pciReAttachDevice;
> +pciWaitForDeviceCleanup;
> pciResetDevice;
> pciDeviceSetManaged;
> pciDeviceGetManaged;
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index e14ed8f..55550ef 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -2277,12 +2277,19 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
>
> for (i = 0; i < pciDeviceListCount(pcidevs); i++) {
> pciDevice *dev = pciDeviceListGet(pcidevs, i);
> - if (pciDeviceGetManaged(dev) &&
> - pciReAttachDevice(NULL, dev) < 0) {
> - virErrorPtr err = virGetLastError();
> - VIR_ERROR(_("Failed to re-attach PCI device: %s"),
> - err ? err->message : "");
> - virResetError(err);
> + int retries = 100;
> + if (pciDeviceGetManaged(dev)) {
> + while (pciWaitForDeviceCleanup(dev, "kvm_assigned_device")
> + && retries) {
> + usleep(100*1000);
> + retries--;
> + }
> + if (pciReAttachDevice(NULL, dev) < 0) {
> + virErrorPtr err = virGetLastError();
> + VIR_ERROR(_("Failed to re-attach PCI device: %s"),
> + err ? err->message : "");
> + virResetError(err);
> + }
> }
> }
>
> diff --git a/src/util/pci.c b/src/util/pci.c
> index 0defbfe..09535bd 100644
> --- a/src/util/pci.c
> +++ b/src/util/pci.c
> @@ -883,6 +883,103 @@ pciReAttachDevice(virConnectPtr conn, pciDevice *dev)
> return pciUnBindDeviceFromStub(conn, dev, driver);
> }
>
> +/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
> + * the host when doing device passthrough. This can lead to a race
> + * condition where the hypervisor is still cleaning up the device while
> + * libvirt is trying to re-attach it to the host device driver. To avoid
> + * this situation, we look through /proc/iomem, and if the hypervisor is
> + * still holding onto the bar (denoted by the string in the matcher variable),
> + * then we can wait around a bit for that to clear up.
> + *
> + * A typical /proc/iomem looks like this (snipped for brevity):
> + * 00010000-0008efff : System RAM
> + * 0008f000-0008ffff : reserved
> + * ...
> + * 00100000-cc9fcfff : System RAM
> + * 00200000-00483d3b : Kernel code
> + * 00483d3c-005c88df : Kernel data
> + * cc9fd000-ccc71fff : ACPI Non-volatile Storage
> + * ...
> + * d0200000-d02fffff : PCI Bus #05
> + * d0200000-d021ffff : 0000:05:00.0
> + * d0200000-d021ffff : e1000e
> + * d0220000-d023ffff : 0000:05:00.0
> + * d0220000-d023ffff : e1000e
> + * ...
> + * f0000000-f0003fff : 0000:00:1b.0
> + * f0000000-f0003fff : kvm_assigned_device
> + *
> + * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
> + * holding onto the resource.
> + */
> +int
> +pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher)
> +{
> + FILE *fp;
> + char line[160];
> + unsigned long long start, end;
> + int consumed;
> + char *rest;
> + unsigned long long domain;
> + int bus, slot, function;
> + int in_matching_device;
> + int ret;
> + size_t match_depth;
> +
> + fp = fopen("/proc/iomem", "r");
> + if (!fp) {
> + /* If we failed to open iomem, we just basically ignore the error. The
> + * unbind might succeed anyway, and besides, it's very likely we have
> + * no way to report the error
> + */
> + VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway");
> + return 0;
> + }
> +
> + ret = 0;
> + in_matching_device = 0;
> + match_depth = 0;
> + while (fgets(line, sizeof(line), fp) != 0) {
> + /* the logic here is a bit confusing. For each line, we look to
> + * see if it matches the domain:bus:slot.function we were given.
> + * If this line matches the DBSF, then any subsequent lines indented
> + * by 2 spaces are the PCI regions for this device. It's also
> + * possible that none of the PCI regions are currently mapped, in
> + * which case we have no indented regions. This code handles all
> + * of these situations
> + */
> + if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
> + if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
> + continue;
> +
> + rest = line + consumed;
> + if (STRPREFIX(rest, matcher)) {
> + ret = 1;
> + break;
> + }
> + }
> + else {
> + in_matching_device = 0;
> + if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
> + continue;
> +
> + rest = line + consumed;
> + if (sscanf(rest, "%Lx:%x:%x.%x", &domain, &bus, &slot, &function) != 4)
> + continue;
> +
> + if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
> + function != dev->function)
> + continue;
> + in_matching_device = 1;
> + match_depth = strspn(line, " ");
> + }
> + }
> +
> + fclose(fp);
> +
> + return ret;
> +}
> +
> static char *
> pciReadDeviceID(pciDevice *dev, const char *id_name)
> {
> diff --git a/src/util/pci.h b/src/util/pci.h
> index a1a19e7..e6ab137 100644
> --- a/src/util/pci.h
> +++ b/src/util/pci.h
> @@ -81,5 +81,6 @@ int pciDeviceFileIterate(virConnectPtr conn,
> int pciDeviceIsAssignable(virConnectPtr conn,
> pciDevice *dev,
> int strict_acs_check);
> +int pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher);
>
> #endif /* __VIR_PCI_H__ */
ACK,
Daniel
--
Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/
daniel at veillard.com | Rpmfind RPM search engine http://rpmfind.net/
http://veillard.com/ | virtualization library http://libvirt.org/
More information about the libvir-list
mailing list