[libvirt] [PATCH v2 3/7] virnuma: Introduce pages helpers
Daniel P. Berrange
berrange at redhat.com
Thu Jun 19 11:06:44 UTC 2014
On Mon, Jun 16, 2014 at 05:08:26PM +0200, Michal Privoznik wrote:
> For future work we need two functions that fetch the total number of
> pages and the number of free pages for a given NUMA node and page size
> (virNumaGetPageInfo()).
>
> Then we need to learn pages of what sizes are supported on given node
> (virNumaGetPages()).
>
> Signed-off-by: Michal Privoznik <mprivozn at redhat.com>
> ---
> src/libvirt_private.syms | 2 +
> src/util/virnuma.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++
> src/util/virnuma.h | 10 ++
> 3 files changed, 337 insertions(+)
>
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index 18fde54..a7834ed 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -1662,6 +1662,8 @@ virNumaGetAutoPlacementAdvice;
> virNumaGetDistances;
> virNumaGetMaxNode;
> virNumaGetNodeMemory;
> +virNumaGetPageInfo;
> +virNumaGetPages;
> virNumaIsAvailable;
> virNumaNodeIsAvailable;
> virNumaSetupMemoryPolicy;
> diff --git a/src/util/virnuma.c b/src/util/virnuma.c
> index 5814cba..a59feca 100644
> --- a/src/util/virnuma.c
> +++ b/src/util/virnuma.c
> @@ -34,12 +34,18 @@
>
> #endif /* WITH_NUMACTL */
>
> +#include <sys/types.h>
> +#include <dirent.h>
> +
> #include "virnuma.h"
> #include "vircommand.h"
> #include "virerror.h"
> #include "virlog.h"
> #include "viralloc.h"
> #include "virbitmap.h"
> +#include "virstring.h"
> +#include "virfile.h"
> +#include "nodeinfo.h"
>
> #define VIR_FROM_THIS VIR_FROM_NONE
>
> @@ -504,3 +510,322 @@ virNumaGetDistances(int node ATTRIBUTE_UNUSED,
> return 0;
> }
> #endif
> +
> +
> +#define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/"
> +#define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/"
> +#define HUGEPAGES_PREFIX "hugepages-"
> +
> +static int
> +virNumaGetHugePageInfoPath(char **path,
> + int node,
> + unsigned int page_size,
> + const char *suffix)
> +{
> +
> + int ret = -1;
> +
> + if (node == -1) {
> + /* We are aiming at overall system info */
> + if (page_size) {
> + /* And even on specific huge page size */
> + if (virAsprintf(path,
> + HUGEPAGES_SYSTEM_PREFIX HUGEPAGES_PREFIX "%ukB/%s",
> + page_size, suffix ? suffix : "") < 0)
> + goto cleanup;
> + } else {
> + if (VIR_STRDUP(*path, HUGEPAGES_SYSTEM_PREFIX) < 0)
> + goto cleanup;
> + }
> +
> + } else {
> + /* We are aiming on specific NUMA node */
> + if (page_size) {
> + /* And even on specific huge page size */
> + if (virAsprintf(path,
> + HUGEPAGES_NUMA_PREFIX "node%d/hugepages/"
> + HUGEPAGES_PREFIX "%ukB/%s",
> + node, page_size, suffix ? suffix : "") < 0)
> + goto cleanup;
> + } else {
> + if (virAsprintf(path,
> + HUGEPAGES_NUMA_PREFIX "node%d/hugepages/",
> + node) < 0)
> + goto cleanup;
> + }
> + }
> +
> + ret = 0;
> + cleanup:
> + return ret;
> +}
> +
> +
> +/**
> + * virNumaGetHugePageInfo:
> + * @node: NUMA node id
> + * @page_size: which huge page are we interested in
> + * @page_avail: total number of huge pages in the pool
> + * @page_free: the number of free huge pages in the pool
> + *
> + * For given NUMA node and huge page size fetch information on
> + * total number of huge pages in the pool (both free and taken)
> + * and count for free huge pages in the pool.
> + *
> + * If you're interested in just one bit, pass NULL to the other one.
> + *
> + * As a special case, if @node == -1, overall info is fetched
> + * from the system.
> + *
> + * Returns 0 on success, -1 otherwise (with error reported).
> + */
> +static int
> +virNumaGetHugePageInfo(int node,
> + unsigned int page_size,
> + unsigned int *page_avail,
> + unsigned int *page_free)
> +{
> + int ret = -1;
> + char *path = NULL;
> + char *buf = NULL;
> + char *end;
> +
> + if (page_avail) {
> + if (virNumaGetHugePageInfoPath(&path, node,
> + page_size, "nr_hugepages") < 0)
> + goto cleanup;
> +
> + if (virFileReadAll(path, 1024, &buf) < 0)
> + goto cleanup;
> +
> + if (virStrToLong_ui(buf, &end, 10, page_avail) < 0 ||
> + *end != '\n') {
> + virReportError(VIR_ERR_INTERNAL_ERROR,
> + _("unable to parse: %s"),
> + buf);
> + goto cleanup;
> + }
It would probably be worth our while to introduce a helper
API virFileReadLong_ui() and likewise for the other
virStrToLong variants - we have this need in quite a few
places that interact with sysfs.
No requirement to do it in this patch unless you want to
though - it's a long-standing issue we can fix separately.
> +/**
> + * virNumaGetPageInfo:
> + * @node: NUMA node id
> + * @page_size: which huge page are we interested in (in KiB)
> + * @page_avail: total number of huge pages in the pool
> + * @page_free: the number of free huge pages in the pool
> + *
> + * For given NUMA node and page size fetch information on
> + * total number of pages in the pool (both free and taken)
> + * and count for free pages in the pool.
> + *
> + * If you're interested in just one bit, pass NULL to the other one.
> + *
> + * As a special case, if @node == -1, overall info is fetched
> + * from the system.
> + *
> + * Returns 0 on success, -1 otherwise (with error reported).
> + */
> +int
> +virNumaGetPageInfo(int node,
> + unsigned int page_size,
> + unsigned int *page_avail,
> + unsigned int *page_free)
> +{
> + int ret = -1;
> + long system_page_size = sysconf(_SC_PAGESIZE);
> +
> + /* sysconf() returns page size in bytes,
> + * the @page_size is however in kibibytes */
> + if (page_size == system_page_size / 1024) {
> + unsigned long long memsize, memfree;
> +
> + /* TODO: come up with better algorithm that takes huge pages into
> + * account. The problem is huge pages cut off regular memory. */
Hmm, so this code is returning normal page count that ignores the fact
that some pages are not in fact usable because they've been stolen for
huge pages? I was thinking that the total memory reported by the kernel
was reduced when you allocated huge pages, but testing now, it seems I
was mistaken in that belief. So this is a bit of a nasty gotcha because
a user of this API would probably expect that the sum of page size *
page count for all page sizes would equal total physical RAM (give or
take).
I still like the idea of including the default page size in this info,
but perhaps we should disable the default system page size for now &
revisit later if we can figure out a way to accurately report it,
rather than reporting misleading info.
> + if (node == -1) {
> + if (nodeGetMemory(&memsize, &memfree) < 0)
> + goto cleanup;
> + } else {
> + if (virNumaGetNodeMemory(node, &memsize, &memfree) < 0)
> + goto cleanup;
> + }
> +
> + if (page_avail)
> + *page_avail = memsize / system_page_size;
> +
> + if (page_free)
> + *page_free = memfree / system_page_size;
> + } else {
> + if (virNumaGetHugePageInfo(node, page_size, page_avail, page_free) < 0)
> + goto cleanup;
> + }
> +
> + ret = 0;
> + cleanup:
> + return ret;
> +}
> +
> +
> +/**
> + * virNumaGetPages:
> + * @node: NUMA node id
> + * @pages_size: list of pages supported on @node
> + * @pages_avail: list of the pool sizes on @node
> + * @pages_free: list of free pages on @node
> + * @npages: the lists size
> + *
> + * For given NUMA node fetch info on pages. The size of pages
> + * (e.g. 4K, 2M, 1G) is stored into @pages_size, the size of the
> + * pool is then stored into @pages_avail and the number of free
> + * pages in the pool is stored into @pages_free.
> + *
> + * If you're interested only in some lists, pass NULL to the
> + * other ones.
> + *
> + * As a special case, if @node == -1, overall info is fetched
> + * from the system.
> + *
> + * Returns 0 on success, -1 otherwise.
> + */
> +int
> +virNumaGetPages(int node,
> + unsigned int **pages_size,
> + unsigned int **pages_avail,
> + unsigned int **pages_free,
> + size_t *npages)
> +{
> + int ret = -1;
> + char *path = NULL;
> + DIR *dir = NULL;
> + struct dirent *entry;
> + unsigned int *tmp_size = NULL, *tmp_avail = NULL, *tmp_free = NULL;
> + unsigned int ntmp = 0;
> + size_t i;
> + bool exchange;
> + long system_page_size;
> +
> + /* sysconf() returns page size in bytes,
> + * but we are storing the page size in kibibytes. */
> + system_page_size = sysconf(_SC_PAGESIZE) / 1024;
> +
> + /* We know that ordinary system pages are supported
> + * if nothing else is. */
> + if (VIR_REALLOC_N(tmp_size, 1) < 0 ||
> + VIR_REALLOC_N(tmp_avail, 1) < 0 ||
> + VIR_REALLOC_N(tmp_free, 1) < 0)
> + goto cleanup;
> +
> + if (virNumaGetPageInfo(node, system_page_size,
> + &tmp_avail[ntmp], &tmp_free[ntmp]) < 0)
> + goto cleanup;
> + tmp_size[ntmp] = system_page_size;
> + ntmp++;
> +
> + /* Now that we got ordinary system pages, lets get info on huge pages */
> + if (virNumaGetHugePageInfoPath(&path, node, 0, NULL) < 0)
> + goto cleanup;
> +
> + if (!(dir = opendir(path))) {
> + virReportSystemError(errno,
> + _("unable to open path: %s"),
> + path);
> + goto cleanup;
> + }
> +
> + while (virDirRead(dir, &entry, path) > 0) {
> + const char *page_name = entry->d_name;
> + unsigned int page_size, page_avail = 0, page_free = 0;
> + char *end;
> +
> + /* Just to give you a hint, we're dealing with this:
> + * hugepages-2048kB/ or hugepages-1048576kB/ */
> + if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX))
> + continue;
> +
> + page_name += strlen(HUGEPAGES_PREFIX);
> +
> + if (virStrToLong_ui(page_name, &end, 10, &page_size) < 0 ||
> + STRCASENEQ(end, "kB")) {
> + virReportError(VIR_ERR_INTERNAL_ERROR,
> + _("unable to parse %s"),
> + entry->d_name);
> + goto cleanup;
> + }
> +
> + /* Querying more detailed info makes sense only sometimes */
> + if ((pages_avail || pages_free) &&
> + virNumaGetHugePageInfo(node, page_size,
> + &page_avail, &page_free) < 0)
> + goto cleanup;
> +
> + if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 ||
> + VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 ||
> + VIR_REALLOC_N(tmp_free, ntmp + 1) < 0)
> + goto cleanup;
> +
> + tmp_size[ntmp] = page_size;
> + tmp_avail[ntmp] = page_avail;
> + tmp_free[ntmp] = page_free;
> + ntmp++;
> + }
> +
> + /* Just to produce nice output, sort the arrays by increasing page size */
> + do {
> + exchange = false;
> + for (i = 0; i < ntmp -1; i++) {
> + if (tmp_size[i] > tmp_size[i + 1]) {
> + exchange = true;
> + SWAP(tmp_size[i], tmp_size[i + 1]);
> + SWAP(tmp_avail[i], tmp_avail[i + 1]);
> + SWAP(tmp_free[i], tmp_free[i + 1]);
> + }
> + }
> + } while (exchange);
> +
> + if (pages_size) {
> + *pages_size = tmp_size;
> + tmp_size = NULL;
> + }
> + if (pages_avail) {
> + *pages_avail = tmp_avail;
> + tmp_avail = NULL;
> + }
> + if (pages_free) {
> + *pages_free = tmp_free;
> + tmp_free = NULL;
> + }
> + *npages = ntmp;
> + ret = 0;
> + cleanup:
> + VIR_FREE(tmp_free);
> + VIR_FREE(tmp_avail);
> + VIR_FREE(tmp_size);
> + closedir(dir);
> + VIR_FREE(path);
> + return ret;
> +}
Regards,
Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
More information about the libvir-list
mailing list