[libvirt] [PATCH] qemu: Fix race between device rebind and kvm cleanup

Daniel Veillard veillard at redhat.com
Fri Jan 22 09:24:25 UTC 2010


On Thu, Jan 21, 2010 at 11:33:22AM -0500, Chris Lalancette wrote:
> Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
> the host when doing device passthrough.  This can lead to a race
> condition where the hypervisor is still cleaning up the device while
> libvirt is trying to re-attach it to the host device driver.  To avoid
> this situation, we look through /proc/iomem, and if the hypervisor is
> still holding onto the bar (denoted by the string in the matcher variable),
> then we can wait around a bit for that to clear up.
> 
> Signed-off-by: Chris Lalancette <clalance at redhat.com>
> ---
>  src/libvirt_private.syms |    1 +
>  src/qemu/qemu_driver.c   |   19 ++++++---
>  src/util/pci.c           |   97 ++++++++++++++++++++++++++++++++++++++++++++++
>  src/util/pci.h           |    1 +
>  4 files changed, 112 insertions(+), 6 deletions(-)
> 
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index c912d81..e42c090 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -439,6 +439,7 @@ pciGetDevice;
>  pciFreeDevice;
>  pciDettachDevice;
>  pciReAttachDevice;
> +pciWaitForDeviceCleanup;
>  pciResetDevice;
>  pciDeviceSetManaged;
>  pciDeviceGetManaged;
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index e14ed8f..cde0034 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -2244,6 +2244,7 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
>  {
>      pciDeviceList *pcidevs;
>      int i;
> +    int retries = 100;
>  
>      if (!def->nhostdevs)
>          return;
> @@ -2277,12 +2278,18 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
>  
>      for (i = 0; i < pciDeviceListCount(pcidevs); i++) {
>          pciDevice *dev = pciDeviceListGet(pcidevs, i);
> -        if (pciDeviceGetManaged(dev) &&
> -            pciReAttachDevice(NULL, dev) < 0) {
> -            virErrorPtr err = virGetLastError();
> -            VIR_ERROR(_("Failed to re-attach PCI device: %s"),
> -                      err ? err->message : "");
> -            virResetError(err);
> +        if (pciDeviceGetManaged(dev)) {
> +            while (pciWaitForDeviceCleanup(dev, "kvm_assigned_device")
> +                   && retries) {
> +                usleep(100*1000);
> +                retries--;
> +            }
> +            if (pciReAttachDevice(NULL, dev) < 0) {
> +                virErrorPtr err = virGetLastError();
> +                VIR_ERROR(_("Failed to re-attach PCI device: %s"),
> +                          err ? err->message : "");
> +                virResetError(err);
> +            }
>          }
>      }

  Okay so the maximum retries number for the whole set of managed
devices is 100, not 100 per device (as retries is not incremented in the
pci device loop) so the longuest we may wait is 10ms as a result
whatever the number of devices, that's the intent, right ?

> diff --git a/src/util/pci.c b/src/util/pci.c
> index 0defbfe..09535bd 100644
> --- a/src/util/pci.c
> +++ b/src/util/pci.c
> @@ -883,6 +883,103 @@ pciReAttachDevice(virConnectPtr conn, pciDevice *dev)
>      return pciUnBindDeviceFromStub(conn, dev, driver);
>  }
>  
> +/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
> + * the host when doing device passthrough.  This can lead to a race
> + * condition where the hypervisor is still cleaning up the device while
> + * libvirt is trying to re-attach it to the host device driver.  To avoid
> + * this situation, we look through /proc/iomem, and if the hypervisor is
> + * still holding onto the bar (denoted by the string in the matcher variable),
> + * then we can wait around a bit for that to clear up.
> + *
> + * A typical /proc/iomem looks like this (snipped for brevity):
> + * 00010000-0008efff : System RAM
> + * 0008f000-0008ffff : reserved
> + * ...
> + * 00100000-cc9fcfff : System RAM
> + *   00200000-00483d3b : Kernel code
> + *   00483d3c-005c88df : Kernel data
> + * cc9fd000-ccc71fff : ACPI Non-volatile Storage
> + * ...
> + * d0200000-d02fffff : PCI Bus #05
> + *   d0200000-d021ffff : 0000:05:00.0
> + *     d0200000-d021ffff : e1000e
> + *   d0220000-d023ffff : 0000:05:00.0
> + *     d0220000-d023ffff : e1000e
> + * ...
> + * f0000000-f0003fff : 0000:00:1b.0
> + *   f0000000-f0003fff : kvm_assigned_device
> + *
> + * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
> + * holding onto the resource.
> + */
> +int
> +pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher)
> +{
> +    FILE *fp;
> +    char line[160];
> +    unsigned long long start, end;
> +    int consumed;
> +    char *rest;
> +    unsigned long long domain;
> +    int bus, slot, function;
> +    int in_matching_device;
> +    int ret;
> +    size_t match_depth;
> +
> +    fp = fopen("/proc/iomem", "r");
> +    if (!fp) {
> +        /* If we failed to open iomem, we just basically ignore the error.  The
> +         * unbind might succeed anyway, and besides, it's very likely we have
> +         * no way to report the error
> +         */
> +        VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway");
> +        return 0;
> +    }
> +
> +    ret = 0;
> +    in_matching_device = 0;
> +    match_depth = 0;
> +    while (fgets(line, sizeof(line), fp) != 0) {
> +        /* the logic here is a bit confusing.  For each line, we look to
> +         * see if it matches the domain:bus:slot.function we were given.
> +         * If this line matches the DBSF, then any subsequent lines indented
> +         * by 2 spaces are the PCI regions for this device.  It's also
> +         * possible that none of the PCI regions are currently mapped, in
> +         * which case we have no indented regions.  This code handles all
> +         * of these situations
> +         */
> +        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
> +            if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
> +                continue;
> +
> +            rest = line + consumed;
> +            if (STRPREFIX(rest, matcher)) {
> +                ret = 1;
> +                break;
> +            }
> +        }
> +        else {
> +            in_matching_device = 0;
> +            if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
> +                continue;
> +
> +            rest = line + consumed;
> +            if (sscanf(rest, "%Lx:%x:%x.%x", &domain, &bus, &slot, &function) != 4)
> +                continue;
> +
> +            if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
> +                function != dev->function)
> +                continue;
> +            in_matching_device = 1;
> +            match_depth = strspn(line, " ");
> +        }
> +    }
> +
> +    fclose(fp);
> +
> +    return ret;
> +}
> +
>  static char *
>  pciReadDeviceID(pciDevice *dev, const char *id_name)
>  {
> diff --git a/src/util/pci.h b/src/util/pci.h
> index a1a19e7..e6ab137 100644
> --- a/src/util/pci.h
> +++ b/src/util/pci.h
> @@ -81,5 +81,6 @@ int pciDeviceFileIterate(virConnectPtr conn,
>  int pciDeviceIsAssignable(virConnectPtr conn,
>                            pciDevice *dev,
>                            int strict_acs_check);
> +int pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher);
>  
>  #endif /* __VIR_PCI_H__ */

  Looks fine, ACK,

Daniel

-- 
Daniel Veillard      | libxml Gnome XML XSLT toolkit  http://xmlsoft.org/
daniel at veillard.com  | Rpmfind RPM search engine http://rpmfind.net/
http://veillard.com/ | virtualization library  http://libvirt.org/




More information about the libvir-list mailing list