[libvirt] [PATCH] qemu: Fix race between device rebind and kvm cleanup

Daniel Veillard veillard at redhat.com
Fri Jan 22 17:01:11 UTC 2010


On Fri, Jan 22, 2010 at 11:40:38AM -0500, Chris Lalancette wrote:
> Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
> the host when doing device passthrough.  This can lead to a race
> condition where the hypervisor is still cleaning up the device while
> libvirt is trying to re-attach it to the host device driver.  To avoid
> this situation, we look through /proc/iomem, and if the hypervisor is
> still holding onto the bar (denoted by the string in the matcher variable),
> then we can wait around a bit for that to clear up.
> 
> v2: Thanks to review by DV, make sure we wait the full timeout per-device
> 
> Signed-off-by: Chris Lalancette <clalance at redhat.com>
> ---
>  src/libvirt_private.syms |    1 +
>  src/qemu/qemu_driver.c   |   19 ++++++---
>  src/util/pci.c           |   97 ++++++++++++++++++++++++++++++++++++++++++++++
>  src/util/pci.h           |    1 +
>  4 files changed, 112 insertions(+), 6 deletions(-)
> 
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index c912d81..e42c090 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -439,6 +439,7 @@ pciGetDevice;
>  pciFreeDevice;
>  pciDettachDevice;
>  pciReAttachDevice;
> +pciWaitForDeviceCleanup;
>  pciResetDevice;
>  pciDeviceSetManaged;
>  pciDeviceGetManaged;
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index e14ed8f..55550ef 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -2277,12 +2277,19 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
>  
>      for (i = 0; i < pciDeviceListCount(pcidevs); i++) {
>          pciDevice *dev = pciDeviceListGet(pcidevs, i);
> -        if (pciDeviceGetManaged(dev) &&
> -            pciReAttachDevice(NULL, dev) < 0) {
> -            virErrorPtr err = virGetLastError();
> -            VIR_ERROR(_("Failed to re-attach PCI device: %s"),
> -                      err ? err->message : "");
> -            virResetError(err);
> +        int retries = 100;
> +        if (pciDeviceGetManaged(dev)) {
> +            while (pciWaitForDeviceCleanup(dev, "kvm_assigned_device")
> +                   && retries) {
> +                usleep(100*1000);
> +                retries--;
> +            }
> +            if (pciReAttachDevice(NULL, dev) < 0) {
> +                virErrorPtr err = virGetLastError();
> +                VIR_ERROR(_("Failed to re-attach PCI device: %s"),
> +                          err ? err->message : "");
> +                virResetError(err);
> +            }
>          }
>      }
>  
> diff --git a/src/util/pci.c b/src/util/pci.c
> index 0defbfe..09535bd 100644
> --- a/src/util/pci.c
> +++ b/src/util/pci.c
> @@ -883,6 +883,103 @@ pciReAttachDevice(virConnectPtr conn, pciDevice *dev)
>      return pciUnBindDeviceFromStub(conn, dev, driver);
>  }
>  
> +/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
> + * the host when doing device passthrough.  This can lead to a race
> + * condition where the hypervisor is still cleaning up the device while
> + * libvirt is trying to re-attach it to the host device driver.  To avoid
> + * this situation, we look through /proc/iomem, and if the hypervisor is
> + * still holding onto the bar (denoted by the string in the matcher variable),
> + * then we can wait around a bit for that to clear up.
> + *
> + * A typical /proc/iomem looks like this (snipped for brevity):
> + * 00010000-0008efff : System RAM
> + * 0008f000-0008ffff : reserved
> + * ...
> + * 00100000-cc9fcfff : System RAM
> + *   00200000-00483d3b : Kernel code
> + *   00483d3c-005c88df : Kernel data
> + * cc9fd000-ccc71fff : ACPI Non-volatile Storage
> + * ...
> + * d0200000-d02fffff : PCI Bus #05
> + *   d0200000-d021ffff : 0000:05:00.0
> + *     d0200000-d021ffff : e1000e
> + *   d0220000-d023ffff : 0000:05:00.0
> + *     d0220000-d023ffff : e1000e
> + * ...
> + * f0000000-f0003fff : 0000:00:1b.0
> + *   f0000000-f0003fff : kvm_assigned_device
> + *
> + * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
> + * holding onto the resource.
> + */
> +int
> +pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher)
> +{
> +    FILE *fp;
> +    char line[160];
> +    unsigned long long start, end;
> +    int consumed;
> +    char *rest;
> +    unsigned long long domain;
> +    int bus, slot, function;
> +    int in_matching_device;
> +    int ret;
> +    size_t match_depth;
> +
> +    fp = fopen("/proc/iomem", "r");
> +    if (!fp) {
> +        /* If we failed to open iomem, we just basically ignore the error.  The
> +         * unbind might succeed anyway, and besides, it's very likely we have
> +         * no way to report the error
> +         */
> +        VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway");
> +        return 0;
> +    }
> +
> +    ret = 0;
> +    in_matching_device = 0;
> +    match_depth = 0;
> +    while (fgets(line, sizeof(line), fp) != 0) {
> +        /* the logic here is a bit confusing.  For each line, we look to
> +         * see if it matches the domain:bus:slot.function we were given.
> +         * If this line matches the DBSF, then any subsequent lines indented
> +         * by 2 spaces are the PCI regions for this device.  It's also
> +         * possible that none of the PCI regions are currently mapped, in
> +         * which case we have no indented regions.  This code handles all
> +         * of these situations
> +         */
> +        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
> +            if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
> +                continue;
> +
> +            rest = line + consumed;
> +            if (STRPREFIX(rest, matcher)) {
> +                ret = 1;
> +                break;
> +            }
> +        }
> +        else {
> +            in_matching_device = 0;
> +            if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
> +                continue;
> +
> +            rest = line + consumed;
> +            if (sscanf(rest, "%Lx:%x:%x.%x", &domain, &bus, &slot, &function) != 4)
> +                continue;
> +
> +            if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
> +                function != dev->function)
> +                continue;
> +            in_matching_device = 1;
> +            match_depth = strspn(line, " ");
> +        }
> +    }
> +
> +    fclose(fp);
> +
> +    return ret;
> +}
> +
>  static char *
>  pciReadDeviceID(pciDevice *dev, const char *id_name)
>  {
> diff --git a/src/util/pci.h b/src/util/pci.h
> index a1a19e7..e6ab137 100644
> --- a/src/util/pci.h
> +++ b/src/util/pci.h
> @@ -81,5 +81,6 @@ int pciDeviceFileIterate(virConnectPtr conn,
>  int pciDeviceIsAssignable(virConnectPtr conn,
>                            pciDevice *dev,
>                            int strict_acs_check);
> +int pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher);
>  
>  #endif /* __VIR_PCI_H__ */

  ACK,

Daniel

-- 
Daniel Veillard      | libxml Gnome XML XSLT toolkit  http://xmlsoft.org/
daniel at veillard.com  | Rpmfind RPM search engine http://rpmfind.net/
http://veillard.com/ | virtualization library  http://libvirt.org/




More information about the libvir-list mailing list