[libvirt] [PATCH] expose SR IOV physical/virtual function relationships

Daniel P. Berrange berrange at redhat.com
Thu Dec 10 20:16:57 UTC 2009


On Fri, Dec 04, 2009 at 10:30:46AM -0500, Dave Allan wrote:
> Daniel P. Berrange wrote:
> >
> >I don't much like including the raw BDF format, because it is effectively
> >adding a 3rd way of specifying PCI addresses in libvirt XML. We already
> >have 2 different ways, which is 1 too many
> >
> >Node dev XML
> >
> >  <capability type='pci'>
> >    <domain>0</domain>
> >    <bus>0</bus>
> >    <slot>31</slot>
> >    <function>1</function>
> >  </capability>
> >
> >
> >Domain XML for host devices & guest addresses
> >
> >  <address domain='0x0000' bus='0x00' slot='0x1f' function='0x5'/>
> >
> >Yes, my bad for screwing up the nodedev  XML format when I designed
> >it, stupidly choosing decimal instead of hex :-(
> >
> >I think we should make the virtual/physical function follow the domain
> >XML style, with an <address/> element. Perhaps also use nested capabilities
> >in the way we did for NPIV host/vport stuff
> >
> >  <capability type='physical_function'>
> >    <address domain='0x0000' bus='0x00' slot='0x1f' function='0x1'/>
> >  </capability>
> >
> >  <capability type='virtual_functions'>
> >    <address domain='0x0000' bus='0x00' slot='0x1f' function='0x2'/>
> >    <address domain='0x0000' bus='0x00' slot='0x1f' function='0x3'/>
> >    <address domain='0x0000' bus='0x00' slot='0x1f' function='0x4'/>
> >  </capability>
> >
> >
> >I'd even be inclined to retro-fit the existing <capability type='pci'>
> >XML to duplicate the address info in this saner format eg
> >
> >  <capability type='pci'>
> >    <address domain='0x0000' bus='0x00' slot='0x1f' function='0x1' />
> >    <domain>0</domain>
> >    <bus>0</bus>
> >    <slot>31</slot>
> >    <function>1</function>
> >  </capability>
> >
> >
> >So, a PCI card with SR-IOV would end up looking like
> >
> >  <capability type='pci'>
> >    <domain>0</domain>
> >    <bus>0</bus>
> >    <slot>31</slot>
> >    <function>1</function>
> >    <address domain='0x0000' bus='0x00' slot='0x1f' function='0x1' />
> >
> >    <capability type='virtual_functions'>
> >      <address domain='0x0000' bus='0x00' slot='0x1f' function='0x2'/>
> >      <address domain='0x0000' bus='0x00' slot='0x1f' function='0x3'/>
> >      <address domain='0x0000' bus='0x00' slot='0x1f' function='0x4'/>
> >    </capability>
> >  </capability>
> >
> >The nice thing about this kind of nesting is that it would let us show
> >that a card is capable of exposing virtual functions, even if it does
> >not currently have any enabled
> >
> >
> >Regards,
> >Daniel
> 
> Here's an updated patch reflecting the above suggestions.
> 
> Dave
> 

> From d11d00d93c4bfbfd1125560ce80abfdbf88de94b Mon Sep 17 00:00:00 2001
> From: David Allan <dallan at redhat.com>
> Date: Mon, 30 Nov 2009 15:58:47 -0500
> Subject: [PATCH 1/1] Add SR IOV physical and virtual function relationships
> 
> ---
>  src/conf/node_device_conf.c               |   30 +++++
>  src/conf/node_device_conf.h               |   16 +++
>  src/node_device/node_device_driver.h      |   14 ++
>  src/node_device/node_device_hal.c         |    6 +
>  src/node_device/node_device_linux_sysfs.c |  200 ++++++++++++++++++++++++++++-
>  5 files changed, 265 insertions(+), 1 deletions(-)
> 
> diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c
> index 6003ab1..a8ecfb0 100644
> --- a/src/conf/node_device_conf.c
> +++ b/src/conf/node_device_conf.c
> @@ -246,6 +246,7 @@ char *virNodeDeviceDefFormat(virConnectPtr conn,
>  {
>      virBuffer buf = VIR_BUFFER_INITIALIZER;
>      virNodeDevCapsDefPtr caps;
> +    unsigned int i = 0;
>      char *tmp;
> 
>      virBufferAddLit(&buf, "<device>\n");
> @@ -318,6 +319,30 @@ char *virNodeDeviceDefFormat(virConnectPtr conn,
>                                        data->pci_dev.vendor_name);
>              else
>                  virBufferAddLit(&buf, " />\n");
> +            if (data->pci_dev.flags & VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION) {
> +                virBufferAddLit(&buf, "    <capability type='physical function'>\n");

Can we change that to 'phys_function'

> +                virBufferVSprintf(&buf,
> +                                  "      <address domain='0x%.4x' bus='0x%.2x' "
> +                                  "slot='0x%.2x' function='0x%.1x'/>\n",
> +                                  data->pci_dev.physical_function->domain,
> +                                  data->pci_dev.physical_function->bus,
> +                                  data->pci_dev.physical_function->slot,
> +                                  data->pci_dev.physical_function->function);
> +                virBufferAddLit(&buf, "    </capability>\n");
> +            }
> +            if (data->pci_dev.flags & VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION) {
> +                virBufferAddLit(&buf, "    <capability type='virtual functions'>\n");

And  'virt_function', because its nice to not have whitespace in the values

> +                for (i = 0 ; i < data->pci_dev.num_virtual_functions ; i++) {
> +                    virBufferVSprintf(&buf,
> +                                      "      <address domain='0x%.4x' bus='0x%.2x' "
> +                                      "slot='0x%.2x' function='0x%.1x'/>\n",
> +                                      data->pci_dev.virtual_functions[i]->domain,
> +                                      data->pci_dev.virtual_functions[i]->bus,
> +                                      data->pci_dev.virtual_functions[i]->slot,
> +                                      data->pci_dev.virtual_functions[i]->function);
> +                }
> +                virBufferAddLit(&buf, "    </capability>\n");
> +            }
>              break;
>          case VIR_NODE_DEV_CAP_USB_DEV:
>              virBufferVSprintf(&buf, "    <bus>%d</bus>\n", data->usb_dev.bus);
> @@ -1387,6 +1412,7 @@ out:
> 
>  void virNodeDevCapsDefFree(virNodeDevCapsDefPtr caps)
>  {
> +    int i = 0;
>      union _virNodeDevCapData *data = &caps->data;
> 
>      switch (caps->type) {
> @@ -1402,6 +1428,10 @@ void virNodeDevCapsDefFree(virNodeDevCapsDefPtr caps)
>      case VIR_NODE_DEV_CAP_PCI_DEV:
>          VIR_FREE(data->pci_dev.product_name);
>          VIR_FREE(data->pci_dev.vendor_name);
> +        VIR_FREE(data->pci_dev.physical_function);
> +        for (i = 0 ; i < data->pci_dev.num_virtual_functions ; i++) {
> +            VIR_FREE(data->pci_dev.virtual_functions[i]);
> +        }
>          break;
>      case VIR_NODE_DEV_CAP_USB_DEV:
>          VIR_FREE(data->usb_dev.product_name);
> diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h
> index 7a20bd6..4bfac90 100644
> --- a/src/conf/node_device_conf.h
> +++ b/src/conf/node_device_conf.h
> @@ -76,6 +76,18 @@ enum virNodeDevScsiHostCapFlags {
>      VIR_NODE_DEV_CAP_FLAG_HBA_VPORT_OPS			= (1 << 1),
>  };
> 
> +enum virNodeDevPCICapFlags {
> +    VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION		= (1 << 0),
> +    VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION		= (1 << 1),
> +};
> +
> +struct pci_config_address {
> +    unsigned domain;
> +    unsigned bus;
> +    unsigned slot;
> +    unsigned function;
> +};
> +
>  typedef struct _virNodeDevCapsDef virNodeDevCapsDef;
>  typedef virNodeDevCapsDef *virNodeDevCapsDefPtr;
>  struct _virNodeDevCapsDef {
> @@ -105,6 +117,10 @@ struct _virNodeDevCapsDef {
>              unsigned class;
>              char *product_name;
>              char *vendor_name;
> +            struct pci_config_address *physical_function;
> +            struct pci_config_address **virtual_functions;
> +            unsigned num_virtual_functions;
> +            unsigned flags;
>          } pci_dev;
>          struct {
>              unsigned bus;
> diff --git a/src/node_device/node_device_driver.h b/src/node_device/node_device_driver.h
> index 4f0822c..7b68f41 100644
> --- a/src/node_device/node_device_driver.h
> +++ b/src/node_device/node_device_driver.h
> @@ -37,6 +37,10 @@
>  #define LINUX_SYSFS_VPORT_CREATE_POSTFIX "/vport_create"
>  #define LINUX_SYSFS_VPORT_DELETE_POSTFIX "/vport_delete"
> 
> +#define SRIOV_FOUND 0
> +#define SRIOV_NOT_FOUND 1
> +#define SRIOV_ERROR -1
> +
>  #define LINUX_NEW_DEVICE_WAIT_TIME 60
> 
>  #ifdef HAVE_HAL
> @@ -61,6 +65,14 @@ int check_fc_host_linux(union _virNodeDevCapData *d);
>  #define check_vport_capable(d) check_vport_capable_linux(d)
>  int check_vport_capable_linux(union _virNodeDevCapData *d);
> 
> +#define get_physical_function(s,d) get_physical_function_linux(s,d)
> +int get_physical_function_linux(const char *sysfs_path,
> +                                union _virNodeDevCapData *d);
> +
> +#define get_virtual_functions(s,d) get_virtual_functions_linux(s,d)
> +int get_virtual_functions_linux(const char *sysfs_path,
> +                                union _virNodeDevCapData *d);
> +
>  #define read_wwn(host, file, wwn) read_wwn_linux(host, file, wwn)
>  int read_wwn_linux(int host, const char *file, char **wwn);
> 
> @@ -68,6 +80,8 @@ int read_wwn_linux(int host, const char *file, char **wwn);
> 
>  #define check_fc_host(d)
>  #define check_vport_capable(d)
> +#define get_physical_function(sysfs_path, d)
> +#define get_virtual_functions(sysfs_path, d)
>  #define read_wwn(host, file, wwn)
> 
>  #endif /* __linux__ */
> diff --git a/src/node_device/node_device_hal.c b/src/node_device/node_device_hal.c
> index 6de7e3d..4496301 100644
> --- a/src/node_device/node_device_hal.c
> +++ b/src/node_device/node_device_hal.c
> @@ -145,14 +145,20 @@ static int gather_pci_cap(LibHalContext *ctx, const char *udi,
>              (void)virStrToLong_ui(p+1, &p, 16, &d->pci_dev.slot);
>              (void)virStrToLong_ui(p+1, &p, 16, &d->pci_dev.function);
>          }
> +
> +        get_physical_function(sysfs_path, d);
> +        get_virtual_functions(sysfs_path, d);
> +
>          VIR_FREE(sysfs_path);
>      }
> +
>      (void)get_int_prop(ctx, udi, "pci.vendor_id", (int *)&d->pci_dev.vendor);
>      if (get_str_prop(ctx, udi, "pci.vendor", &d->pci_dev.vendor_name) != 0)
>          (void)get_str_prop(ctx, udi, "info.vendor", &d->pci_dev.vendor_name);
>      (void)get_int_prop(ctx, udi, "pci.product_id", (int *)&d->pci_dev.product);
>      if (get_str_prop(ctx, udi, "pci.product", &d->pci_dev.product_name) != 0)
>          (void)get_str_prop(ctx, udi, "info.product", &d->pci_dev.product_name);
> +
>      return 0;
>  }
> 
> diff --git a/src/node_device/node_device_linux_sysfs.c b/src/node_device/node_device_linux_sysfs.c
> index b7cf782..d2b10dd 100644
> --- a/src/node_device/node_device_linux_sysfs.c
> +++ b/src/node_device/node_device_linux_sysfs.c
> @@ -29,6 +29,7 @@
>  #include "virterror_internal.h"
>  #include "memory.h"
>  #include "logging.h"
> +#include <dirent.h>
> 
>  #define VIR_FROM_THIS VIR_FROM_NODEDEV
> 
> @@ -70,7 +71,7 @@ int read_wwn_linux(int host, const char *file, char **wwn)
>      char buf[64];
> 
>      if (open_wwn_file(LINUX_SYSFS_FC_HOST_PREFIX, host, file, &fd) < 0) {
> -            goto out;
> +        goto out;
>      }
> 
>      memset(buf, 0, sizeof(buf));
> @@ -184,4 +185,201 @@ out:
>      return retval;
>  }
> 
> +
> +static int logStrToLong_ui(char const *s,
> +                           char **end_ptr,
> +                           int base,
> +                           unsigned int *result)
> +{
> +    int ret = 0;
> +
> +    ret = virStrToLong_ui(s, end_ptr, base, result);
> +    if (ret != 0) {
> +        VIR_ERROR("Failed to convert '%s' to unsigned int\n", s);
> +    } else {
> +        VIR_DEBUG("Converted '%s' to unsigned int %u\n", s, *result);
> +    }

There's no need for '\n' in log messages.

> +
> +    return ret;
> +}
> +
> +
> +static int parse_pci_config_address(char *address, struct pci_config_address *bdf)
> +{
> +    char *p = NULL;
> +    int ret = -1;
> +
> +    if ((address == NULL) || (logStrToLong_ui(address,
> +                                              &p,
> +                                              16,
> +                                              &bdf->domain) == -1)) {
> +        goto out;
> +    }
> +
> +    if ((p == NULL) || (logStrToLong_ui(p+1,
> +                                        &p,
> +                                        16,
> +                                        &bdf->bus) == -1)) {
> +        goto out;
> +    }
> +
> +    if ((p == NULL) || (logStrToLong_ui(p+1,
> +                                        &p,
> +                                        16,
> +                                        &bdf->slot) == -1)) {
> +        goto out;
> +    }
> +
> +    if ((p == NULL) || (logStrToLong_ui(p+1,
> +                                        &p,
> +                                        16,
> +                                        &bdf->function) == -1)) {
> +        goto out;
> +    }
> +
> +    ret = 0;
> +
> +out:
> +    return ret;
> +}
> +
> +
> +
> +
> +static int get_sriov_function(const char *device_link,
> +                              struct pci_config_address **bdf)
> +{
> +    char *device_path = NULL, *config_address = NULL;
> +    char errbuf[64];
> +    int ret = SRIOV_ERROR;
> +
> +    VIR_DEBUG("Attempting to resolve device path from device link '%s'\n",
> +              device_link);
> +
> +    if (!virFileExists(device_link)) {
> +
> +        VIR_DEBUG("SR IOV function link '%s' does not exist\n", device_link);
> +        /* Not an SR IOV device, not an error, either. */
> +        ret = SRIOV_NOT_FOUND;
> +
> +        goto out;
> +
> +    }
> +
> +    device_path = realpath(device_link, device_path);
> +    if (device_path == NULL) {
> +        memset(errbuf, '\0', sizeof(errbuf));
> +        VIR_ERROR("Failed to resolve device link '%s': '%s'\n", device_link,
> +                  strerror_r(errno, errbuf, sizeof(errbuf)));

Can you replace that with virStrerror() because strerror_r() has an
annoying API difference on Linux compared to POSIX (returns char *
instead of int, or vica-verca).

> +        goto out;
> +    }
> +
> +    VIR_DEBUG("SR IOV device path is '%s'\n", device_path);
> +    config_address = basename(device_path);
> +    if (VIR_ALLOC(*bdf) != 0) {
> +        VIR_ERROR0("Failed to allocate memory for PCI device name\n");
> +        goto out;
> +    }
> +
> +    if (parse_pci_config_address(config_address, *bdf) != 0) {
> +        VIR_ERROR("Failed to parse PCI config address '%s'\n", config_address);
> +        goto out;
> +    }
> +
> +    VIR_DEBUG("SR IOV function %.4x:%.2x:%.2x.%.1x/>\n",
> +              (*bdf)->domain,
> +              (*bdf)->bus,
> +              (*bdf)->slot,
> +              (*bdf)->function);
> +
> +    ret = SRIOV_FOUND;
> +
> +out:
> +    VIR_FREE(device_path);
> +    return ret;
> +}
> +
> +
> +int get_physical_function_linux(const char *sysfs_path,
> +                                union _virNodeDevCapData *d ATTRIBUTE_UNUSED)
> +{
> +    int ret = -1;
> +    char *device_link = NULL;
> +
> +    VIR_DEBUG("Attempting to get SR IOV physical function for device "
> +              "with sysfs path '%s'\n", sysfs_path);
> +
> +    if (virBuildPath(&device_link, sysfs_path, "physfn") == -1) {
> +        virReportOOMError(NULL);
> +    } else {
> +        ret = get_sriov_function(device_link, &d->pci_dev.physical_function);
> +        if (ret == SRIOV_FOUND) {
> +            d->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION;
> +        }
> +    }
> +
> +    VIR_FREE(device_link);
> +    return ret;
> +}
> +
> +
> +int get_virtual_functions_linux(const char *sysfs_path,
> +                                union _virNodeDevCapData *d)
> +{
> +    int ret = -1;
> +    DIR *dir = NULL;
> +    struct dirent *entry = NULL;
> +    char *device_link = NULL;
> +
> +    VIR_DEBUG("Attempting to get SR IOV virtual functions for device"
> +              "with sysfs path '%s'\n", sysfs_path);
> +
> +    dir = opendir(sysfs_path);
> +    if (dir == NULL) {
> +        goto out;
> +    }
> +
> +    while ((entry = readdir(dir))) {
> +        if (STRPREFIX(entry->d_name, "virtfn")) {
> +            /* This local is just to avoid lines of code much > 80 col. */
> +            unsigned int *num_funcs = &d->pci_dev.num_virtual_functions;
> +
> +            if (virBuildPath(&device_link, sysfs_path, entry->d_name) == -1) {
> +                virReportOOMError(NULL);
> +                goto out;
> +            }
> +
> +            VIR_DEBUG("Number of virtual functions: %d\n", *num_funcs);
> +            if (VIR_REALLOC_N(d->pci_dev.virtual_functions, (*num_funcs) + 1) != 0) {
> +                virReportOOMError(NULL);
> +                goto out;
> +            }
> +
> +            if (get_sriov_function(device_link,
> +                                   &d->pci_dev.virtual_functions[*num_funcs]) !=
> +                SRIOV_FOUND) {
> +
> +                /* We should not get back SRIOV_NOT_FOUND in this
> +                 * case, so if we do, it's an error. */
> +                VIR_ERROR("Failed to get SR IOV function from device link '%s'\n",
> +                          device_link);
> +                goto out;
> +            } else {
> +                (*num_funcs)++;
> +                d->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION;
> +            }
> +
> +            VIR_FREE(device_link);
> +        }
> +    }
> +
> +    closedir(dir);
> +
> +    ret = 0;
> +
> +out:
> +    VIR_FREE(device_link);
> +    return 0;
> +}
> +
>  #endif /* __linux__ */
> -- 

ACK aside from the minor logging fixes & attribute whitespace 

Daniel
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|




More information about the libvir-list mailing list