[libvirt] [PATCH v2 3/7] virnuma: Introduce pages helpers

Daniel P. Berrange berrange at redhat.com
Thu Jun 19 11:06:44 UTC 2014


On Mon, Jun 16, 2014 at 05:08:26PM +0200, Michal Privoznik wrote:
> For future work we need two functions: one that fetches the total
> number of pages and the number of free pages for a given NUMA node
> and page size (virNumaGetPageInfo()).
> 
> Then we need to learn which page sizes are supported on a given
> node (virNumaGetPages()).
> 
> Signed-off-by: Michal Privoznik <mprivozn at redhat.com>
> ---
>  src/libvirt_private.syms |   2 +
>  src/util/virnuma.c       | 325 +++++++++++++++++++++++++++++++++++++++++++++++
>  src/util/virnuma.h       |  10 ++
>  3 files changed, 337 insertions(+)
> 
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index 18fde54..a7834ed 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -1662,6 +1662,8 @@ virNumaGetAutoPlacementAdvice;
>  virNumaGetDistances;
>  virNumaGetMaxNode;
>  virNumaGetNodeMemory;
> +virNumaGetPageInfo;
> +virNumaGetPages;
>  virNumaIsAvailable;
>  virNumaNodeIsAvailable;
>  virNumaSetupMemoryPolicy;
> diff --git a/src/util/virnuma.c b/src/util/virnuma.c
> index 5814cba..a59feca 100644
> --- a/src/util/virnuma.c
> +++ b/src/util/virnuma.c
> @@ -34,12 +34,18 @@
>  
>  #endif /* WITH_NUMACTL */
>  
> +#include <sys/types.h>
> +#include <dirent.h>
> +
>  #include "virnuma.h"
>  #include "vircommand.h"
>  #include "virerror.h"
>  #include "virlog.h"
>  #include "viralloc.h"
>  #include "virbitmap.h"
> +#include "virstring.h"
> +#include "virfile.h"
> +#include "nodeinfo.h"
>  
>  #define VIR_FROM_THIS VIR_FROM_NONE
>  
> @@ -504,3 +510,322 @@ virNumaGetDistances(int node ATTRIBUTE_UNUSED,
>      return 0;
>  }
>  #endif
> +
> +
> +#define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/"
> +#define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/"
> +#define HUGEPAGES_PREFIX "hugepages-"
> +
> +static int
> +virNumaGetHugePageInfoPath(char **path,
> +                           int node,
> +                           unsigned int page_size,
> +                           const char *suffix)
> +{
> +
> +    int ret = -1;
> +
> +    if (node == -1) {
> +        /* We are aiming at overall system info */
> +        if (page_size) {
> +            /* And even at a specific huge page size */
> +            if (virAsprintf(path,
> +                            HUGEPAGES_SYSTEM_PREFIX HUGEPAGES_PREFIX "%ukB/%s",
> +                            page_size, suffix ? suffix : "") < 0)
> +                goto cleanup;
> +        } else {
> +            if (VIR_STRDUP(*path, HUGEPAGES_SYSTEM_PREFIX) < 0)
> +                goto cleanup;
> +        }
> +
> +    } else {
> +        /* We are aiming at a specific NUMA node */
> +        if (page_size) {
> +            /* And even at a specific huge page size */
> +            if (virAsprintf(path,
> +                            HUGEPAGES_NUMA_PREFIX "node%d/hugepages/"
> +                            HUGEPAGES_PREFIX "%ukB/%s",
> +                            node, page_size, suffix ? suffix : "") < 0)
> +                goto cleanup;
> +        } else {
> +            if (virAsprintf(path,
> +                            HUGEPAGES_NUMA_PREFIX "node%d/hugepages/",
> +                            node) < 0)
> +                goto cleanup;
> +        }
> +    }
> +
> +    ret = 0;
> + cleanup:
> +    return ret;
> +}
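
Just to spell out what this builds (paths derived directly from the
#defines and format strings above):

  virNumaGetHugePageInfoPath(&path, -1, 2048, "nr_hugepages")
    -> /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages

  virNumaGetHugePageInfoPath(&path, 1, 2048, "free_hugepages")
    -> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages

  virNumaGetHugePageInfoPath(&path, 1, 0, NULL)
    -> /sys/devices/system/node/node1/hugepages/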
> +
> +
> +/**
> + * virNumaGetHugePageInfo:
> + * @node: NUMA node id
> + * @page_size: which huge page size we are interested in (in KiB)
> + * @page_avail: total number of huge pages in the pool
> + * @page_free: the number of free huge pages in the pool
> + *
> + * For a given NUMA node and huge page size fetch information
> + * on the total number of huge pages in the pool (both free and
> + * taken) and the count of free huge pages in the pool.
> + *
> + * If you're interested in just one of the values, pass NULL for the other.
> + *
> + * As a special case, if @node == -1, overall info is fetched
> + * from the system.
> + *
> + * Returns 0 on success, -1 otherwise (with error reported).
> + */
> +static int
> +virNumaGetHugePageInfo(int node,
> +                       unsigned int page_size,
> +                       unsigned int *page_avail,
> +                       unsigned int *page_free)
> +{
> +    int ret = -1;
> +    char *path = NULL;
> +    char *buf = NULL;
> +    char *end;
> +
> +    if (page_avail) {
> +        if (virNumaGetHugePageInfoPath(&path, node,
> +                                       page_size, "nr_hugepages") < 0)
> +            goto cleanup;
> +
> +        if (virFileReadAll(path, 1024, &buf) < 0)
> +            goto cleanup;
> +
> +        if (virStrToLong_ui(buf, &end, 10, page_avail) < 0 ||
> +            *end != '\n') {
> +            virReportError(VIR_ERR_INTERNAL_ERROR,
> +                           _("unable to parse: %s"),
> +                           buf);
> +            goto cleanup;
> +        }

It would probably be worth our while to introduce a helper
API virFileReadLong_ui() and likewise for the other
virStrToLong variants - we have this need in quite a few
places that interact with sysfs.

No requirement to do it in this patch unless you want to,
though - it's a long-standing issue we can fix separately.
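
Roughly what I have in mind - an untested sketch, and the name and
signature are just my suggestion, not an existing API; it only reuses
the calls already appearing in the hunk above:

  /* Read a sysfs-style file containing a single unsigned int
   * followed by a newline. */
  static int
  virFileReadLong_ui(const char *path,
                     unsigned int *value)
  {
      char *buf = NULL;
      char *end;
      int ret = -1;

      if (virFileReadAll(path, 1024, &buf) < 0)
          goto cleanup;

      if (virStrToLong_ui(buf, &end, 10, value) < 0 ||
          *end != '\n') {
          virReportError(VIR_ERR_INTERNAL_ERROR,
                         _("unable to parse: %s"), buf);
          goto cleanup;
      }

      ret = 0;
   cleanup:
      VIR_FREE(buf);
      return ret;
  }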

> +/**
> + * virNumaGetPageInfo:
> + * @node: NUMA node id
> + * @page_size: which page size we are interested in (in KiB)
> + * @page_avail: total number of pages in the pool
> + * @page_free: the number of free pages in the pool
> + *
> + * For a given NUMA node and page size fetch information on
> + * the total number of pages in the pool (both free and taken)
> + * and the count of free pages in the pool.
> + *
> + * If you're interested in just one of the values, pass NULL for the other.
> + *
> + * As a special case, if @node == -1, overall info is fetched
> + * from the system.
> + *
> + * Returns 0 on success, -1 otherwise (with error reported).
> + */
> +int
> +virNumaGetPageInfo(int node,
> +                   unsigned int page_size,
> +                   unsigned int *page_avail,
> +                   unsigned int *page_free)
> +{
> +    int ret = -1;
> +    long system_page_size = sysconf(_SC_PAGESIZE);
> +
> +    /* sysconf() returns the page size in bytes,
> +     * while @page_size is in kibibytes */
> +    if (page_size == system_page_size / 1024) {
> +        unsigned long long memsize, memfree;
> +
> +        /* TODO: come up with better algorithm that takes huge pages into
> +         * account. The problem is huge pages cut off regular memory. */

Hmm, so this code is returning a normal page count that ignores the
fact that some pages are not in fact usable because they've been
stolen for huge pages? I was thinking that the total memory reported
by the kernel was reduced when you allocated huge pages, but testing
now, it seems I was mistaken in that belief. So this is a bit of a
nasty gotcha, because a user of this API would probably expect that
the sum of page size * page count for all page sizes would equal
total physical RAM (give or take).

I still like the idea of including the default page size in this
info, but perhaps we should disable the default system page size for
now & revisit later if we can figure out a way to accurately report
it, rather than reporting misleading info.
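
To put illustrative numbers on it: a host with 8 GiB of RAM and
512 x 2 MiB huge pages reserved would report ~2 million 4 KiB pages
(8 GiB worth, since MemTotal is not reduced) plus the 512 x 2 MiB
pages (another 1 GiB), so summing page size * page count yields
~9 GiB against only 8 GiB of physical RAM.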

> +        if (node == -1) {
> +            if (nodeGetMemory(&memsize, &memfree) < 0)
> +                goto cleanup;
> +        } else {
> +            if (virNumaGetNodeMemory(node, &memsize, &memfree) < 0)
> +                goto cleanup;
> +        }
> +
> +        if (page_avail)
> +            *page_avail = memsize / system_page_size;
> +
> +        if (page_free)
> +            *page_free = memfree / system_page_size;
> +    } else {
> +        if (virNumaGetHugePageInfo(node, page_size, page_avail, page_free) < 0)
> +            goto cleanup;
> +    }
> +
> +    ret = 0;
> + cleanup:
> +    return ret;
> +}
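
For reference, I'd expect callers to use this along these lines
(hypothetical usage, just illustrating the contract documented
above):

  unsigned int page_avail = 0, page_free = 0;

  /* 2 MiB huge page pool on node 0 */
  if (virNumaGetPageInfo(0, 2048, &page_avail, &page_free) < 0)
      return -1;  /* error already reported */

  /* System-wide info, only the free count wanted */
  if (virNumaGetPageInfo(-1, 2048, NULL, &page_free) < 0)
      return -1;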
> +
> +
> +/**
> + * virNumaGetPages:
> + * @node: NUMA node id
> + * @pages_size: list of page sizes supported on @node
> + * @pages_avail: list of the pool sizes on @node
> + * @pages_free: list of free page counts on @node
> + * @npages: the size of the lists
> + *
> + * For a given NUMA node fetch info on pages. The sizes of pages
> + * (e.g. 4K, 2M, 1G) are stored into @pages_size, the size of each
> + * pool is then stored into @pages_avail and the number of free
> + * pages in each pool is stored into @pages_free.
> + *
> + * If you're interested only in some of the lists, pass NULL for
> + * the others.
> + *
> + * As a special case, if @node == -1, overall info is fetched
> + * from the system.
> + *
> + * Returns 0 on success, -1 otherwise.
> + */
> +int
> +virNumaGetPages(int node,
> +                unsigned int **pages_size,
> +                unsigned int **pages_avail,
> +                unsigned int **pages_free,
> +                size_t *npages)
> +{
> +    int ret = -1;
> +    char *path = NULL;
> +    DIR *dir = NULL;
> +    struct dirent *entry;
> +    unsigned int *tmp_size = NULL, *tmp_avail = NULL, *tmp_free = NULL;
> +    unsigned int ntmp = 0;
> +    size_t i;
> +    bool exchange;
> +    long system_page_size;
> +
> +    /* sysconf() returns page size in bytes,
> +     * but we are storing the page size in kibibytes. */
> +    system_page_size = sysconf(_SC_PAGESIZE) / 1024;
> +
> +    /* We know that ordinary system pages are supported
> +     * if nothing else is. */
> +    if (VIR_REALLOC_N(tmp_size, 1) < 0 ||
> +        VIR_REALLOC_N(tmp_avail, 1) < 0 ||
> +        VIR_REALLOC_N(tmp_free, 1) < 0)
> +        goto cleanup;
> +
> +    if (virNumaGetPageInfo(node, system_page_size,
> +                           &tmp_avail[ntmp], &tmp_free[ntmp]) < 0)
> +        goto cleanup;
> +    tmp_size[ntmp] = system_page_size;
> +    ntmp++;
> +
> +    /* Now that we've got ordinary system pages, let's get info on huge pages */
> +    if (virNumaGetHugePageInfoPath(&path, node, 0, NULL) < 0)
> +        goto cleanup;
> +
> +    if (!(dir = opendir(path))) {
> +        virReportSystemError(errno,
> +                             _("unable to open path: %s"),
> +                             path);
> +        goto cleanup;
> +    }
> +
> +    while (virDirRead(dir, &entry, path) > 0) {
> +        const char *page_name = entry->d_name;
> +        unsigned int page_size, page_avail = 0, page_free = 0;
> +        char *end;
> +
> +        /* Just to give you a hint, we're dealing with this:
> +         * hugepages-2048kB/  or   hugepages-1048576kB/ */
> +        if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX))
> +            continue;
> +
> +        page_name += strlen(HUGEPAGES_PREFIX);
> +
> +        if (virStrToLong_ui(page_name, &end, 10, &page_size) < 0 ||
> +            STRCASENEQ(end, "kB")) {
> +            virReportError(VIR_ERR_INTERNAL_ERROR,
> +                           _("unable to parse %s"),
> +                           entry->d_name);
> +            goto cleanup;
> +        }
> +
> +        /* Querying more detailed info makes sense only sometimes */
> +        if ((pages_avail || pages_free) &&
> +            virNumaGetHugePageInfo(node, page_size,
> +                                   &page_avail, &page_free) < 0)
> +            goto cleanup;
> +
> +        if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 ||
> +            VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 ||
> +            VIR_REALLOC_N(tmp_free, ntmp + 1) < 0)
> +            goto cleanup;
> +
> +        tmp_size[ntmp] = page_size;
> +        tmp_avail[ntmp] = page_avail;
> +        tmp_free[ntmp] = page_free;
> +        ntmp++;
> +    }
> +
> +    /* Just to produce nice output, sort the arrays by increasing page size */
> +    do {
> +        exchange = false;
> +        for (i = 0; i < ntmp - 1; i++) {
> +            if (tmp_size[i] > tmp_size[i + 1]) {
> +                exchange = true;
> +                SWAP(tmp_size[i], tmp_size[i + 1]);
> +                SWAP(tmp_avail[i], tmp_avail[i + 1]);
> +                SWAP(tmp_free[i], tmp_free[i + 1]);
> +            }
> +        }
> +    } while (exchange);
> +
> +    if (pages_size) {
> +        *pages_size = tmp_size;
> +        tmp_size = NULL;
> +    }
> +    if (pages_avail) {
> +        *pages_avail = tmp_avail;
> +        tmp_avail = NULL;
> +    }
> +    if (pages_free) {
> +        *pages_free = tmp_free;
> +        tmp_free = NULL;
> +    }
> +    *npages = ntmp;
> +    ret = 0;
> + cleanup:
> +    VIR_FREE(tmp_free);
> +    VIR_FREE(tmp_avail);
> +    VIR_FREE(tmp_size);
> +    if (dir)
> +        closedir(dir);
> +    VIR_FREE(path);
> +    return ret;
> +}
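
FWIW the calling convention then looks like this (hypothetical
caller based on the signature above; the caller owns and must
VIR_FREE the returned arrays):

  unsigned int *sizes = NULL, *avail = NULL, *free_pages = NULL;
  size_t npages = 0;
  size_t i;

  if (virNumaGetPages(0, &sizes, &avail, &free_pages, &npages) < 0)
      return -1;

  for (i = 0; i < npages; i++)
      VIR_DEBUG("page size %u KiB: %u in pool, %u free",
                sizes[i], avail[i], free_pages[i]);

  VIR_FREE(sizes);
  VIR_FREE(avail);
  VIR_FREE(free_pages);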


Regards,
Daniel
-- 
|: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org              -o-             http://virt-manager.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|