[libvirt] [RFC PATCH v1 1/4] XML definitions for guest vNUMA and parsing routines

Wim Ten Have wim.ten.have at oracle.com
Mon Oct 21 19:21:05 UTC 2019


From: Wim ten Have <wim.ten.have at oracle.com>

This patch adds XML definitions for describing a guest vNUMA layout,
together with the routines to parse them.  The guest vNUMA
specification looks like:

  <vnuma mode='host|node'
         distribution='contiguous|siblings|round-robin|interleave'>
    <memory unit='#unitsize'>size</memory>
    <partition nodeset='#nodes' cells='#cells'/>
  </vnuma>

With mode='host' the guest XML is rendered to match the host's NUMA
topology.

With mode='node' the guest XML is rendered according to the "nodeset"
and "cells" attributes of the <partition> element.
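
For illustration: with mode='host' on a (hypothetical) two-node host,
a guest configured with 8 vcpus and 4 GiB of memory would be rendered
with a <numa> topology along the lines of:

  <numa>
    <cell id='0' cpus='0-3' memory='2097152' unit='KiB'/>
    <cell id='1' cpus='4-7' memory='2097152' unit='KiB'/>
  </numa>

where each cell's memory is bound to the matching host node and the
vcpus are laid out according to the 'distribution' attribute (the
default, contiguous, is shown here).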

Signed-off-by: Wim ten Have <wim.ten.have at oracle.com>
---
 docs/formatdomain.html.in     |  94 +++++++
 docs/schemas/domaincommon.rng |  65 +++++
 src/conf/domain_conf.c        | 482 +++++++++++++++++++++++++++++++++-
 src/conf/domain_conf.h        |   2 +
 src/conf/numa_conf.c          | 241 ++++++++++++++++-
 src/conf/numa_conf.h          |  58 +++-
 src/libvirt_private.syms      |   8 +
 7 files changed, 932 insertions(+), 18 deletions(-)

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 962766b792d3..80165f9bd896 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -1294,6 +1294,98 @@
     </dl>
 
 
+    <h3><a id="elementsvNUMAPartitioning">NUMA Host or Node Partitioning</a></h3>
+
+    <p>
+      With the help of the <code>vnuma</code> element, libvirt can
+      dynamically partition a guest domain for vNUMA by rendering its
+      XML into a <a href="#elementsNUMAtopology"><code>NUMA
+      topology</code></a> matching either the host or a requested
+      node model.
+    </p>
+
+<pre>
+&lt;domain&gt;
+  ...
+  &lt;vnuma mode='host|node' distribution='contiguous|siblings|round-robin|interleave'&gt;
+    &lt;memory unit='KiB'&gt;524288&lt;/memory&gt;
+    &lt;partition nodeset='1-4,^3' cells='8'/&gt;
+  &lt;/vnuma&gt;
+  ...
+&lt;/domain&gt;
+</pre>
+
+    <dl>
+      <dt><code>vnuma</code></dt>
+      <dd>
+        The attribute <code>mode</code> selects a specific rendering
+        method. Its value is either "host" or "node". If <code>mode</code>
+        is set to "host" the guest domain is automatically partitioned
+        to match the host NUMA topology. If <code>mode</code> is set
+        to "node", the guest domain is partitioned according to the
+        <code>nodeset</code> and <code>cells</code> attributes of the
+        <code>vnuma</code> <code>partition</code> subelement.
+        <span class="since">Since 5.9</span>
+
+        The optional attribute <code>distribution</code> selects the
+        method by which the guest
+        <a href="#elementsNUMAtopology"><code>numa</code></a>
+        <code>cell</code> <code>cpus</code> are distributed. It allows
+        for the following values. <span class="since">Since 5.9</span>
+        <dl>
+          <dt><code>contiguous</code></dt>
+          <dd> The cpus are enumerated sequentially over the
+            <a href="#elementsNUMAtopology"><code>numa</code></a> defined
+            cells.
+          </dd>
+          <dt><code>siblings</code></dt>
+          <dd> The cpus are distributed over the
+            <a href="#elementsNUMAtopology"><code>numa</code></a>
+            cells matching the host CPU SMT model.
+          </dd>
+          <dt><code>round-robin</code></dt>
+          <dd> The cpus are distributed over the
+            <a href="#elementsNUMAtopology"><code>numa</code></a>
+            cells matching the host CPU topology.
+          </dd>
+          <dt><code>interleave</code></dt>
+          <dd> The cpus are interleaved one at a time over the
+            <a href="#elementsNUMAtopology"><code>numa</code></a> cells.
+          </dd>
+         </dl>
+      </dd>
+
+      <dt><code>memory</code></dt>
+      <dd>
+        The optional subelement <code>memory</code> specifies the
+        memory size reserved for the guest's assigned
+        <a href="#elementsNUMAtopology"><code>numa</code></a> cells.
+        An additional
+        <a href="#elementsMemoryAllocation"><code>unit</code></a>
+        attribute may be used to select the units in which this
+        <code>memory</code> size is quantified. If no
+        <code>memory</code> is specified, the value is taken from the
+        domain's <a href="#elementsMemoryAllocation">memory</a>
+        setting. <span class="since">Since 5.9</span>
+      </dd>
+
+      <dt><code>partition</code></dt>
+      <dd>
+        The optional subelement <code>partition</code> is only honored
+        when <a href="#elementsvNUMAPartitioning"><code>vnuma</code></a>
+        <code>mode</code> "node" is selected. It defines the
+        <code>nodeset</code> and <code>cells</code> to target under the
+        guest domain. For example, the specified <code>nodeset</code>
+        can limit the <a href="#elementsNUMATuning"><code>numatune</code></a>
+        assigned host NUMA nodes in effect under the guest. Alternatively,
+        the provided <code>cells</code> attribute can define the number
+        of <a href="#elementsNUMAtopology"><code>numa</code></a> cells
+        to render.
+
+        <span class="since">Since 5.9</span>
+      </dd>
+    </dl>
+
+
     <h3><a id="elementsNUMATuning">NUMA Node Tuning</a></h3>
 
 <pre>
@@ -1755,6 +1847,8 @@
       </dd>
     </dl>
 
+    <h3><a id="elementsNUMAtopology">NUMA topology</a></h3>
+
     <p>
       Guest NUMA topology can be specified using the <code>numa</code> element.
       <span class="since">Since 0.9.8</span>
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index e06f892da393..227c856a362c 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -786,6 +786,10 @@
         <ref name="cputune"/>
       </optional>
 
+      <optional>
+        <ref name="vnuma"/>
+      </optional>
+
       <optional>
         <ref name="numatune"/>
       </optional>
@@ -1062,6 +1066,67 @@
     </choice>
   </define>
 
+  <!-- All the "host vnuma" related tunables would go in the vnuma -->
+  <define name="vnuma">
+    <element name="vnuma">
+      <optional>
+        <ref name="vnumaMode"/>
+      </optional>
+      <optional>
+        <ref name="vnumaDistribution"/>
+      </optional>
+      <interleave>
+        <optional>
+          <element name="memory">
+            <ref name="scaledInteger"/>
+          </element>
+        </optional>
+        <optional>
+          <element name="partition">
+            <optional>
+              <ref name="vnumaNodeset"/>
+            </optional>
+            <optional>
+              <ref name="vnumaCells"/>
+            </optional>
+          </element>
+        </optional>
+      </interleave>
+    </element>
+  </define>
+
+  <define name="vnumaMode">
+    <attribute name="mode">
+      <choice>
+        <value>host</value>
+        <value>node</value>
+      </choice>
+    </attribute>
+  </define>
+
+  <define name="vnumaDistribution">
+    <attribute name="distribution">
+      <choice>
+        <value>contiguous</value>
+        <value>siblings</value>
+        <value>round-robin</value>
+        <value>interleave</value>
+      </choice>
+    </attribute>
+  </define>
+
+  <define name="vnumaNodeset">
+    <attribute name='nodeset'>
+      <ref name='cpuset'/>
+    </attribute>
+  </define>
+
+  <define name="vnumaCells">
+    <attribute name='cells'>
+      <ref name="positiveInteger"/>
+    </attribute>
+  </define>
+
   <!-- All the NUMA related tunables would go in the numatune -->
   <define name="numatune">
     <element name="numatune">
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 317e7846ceb0..32b29740bffd 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -1824,6 +1824,18 @@ virDomainDefSetVcpusMax(virDomainDefPtr def,
     if (def->maxvcpus == maxvcpus)
         return 0;
 
+    if (virDomainVnumaIsEnabled(def->numa)) {
+        size_t nnumaCell = virDomainNumaGetNodeCount(def->numa);
+
+        if (maxvcpus % nnumaCell) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: the maximum vCPU count %u is not a "
+                             "multiple of the configured vNUMA node count %zu"),
+                           maxvcpus, nnumaCell);
+            return -1;
+        }
+    }
+
     if (def->maxvcpus < maxvcpus) {
         if (VIR_EXPAND_N(def->vcpus, def->maxvcpus, maxvcpus - def->maxvcpus) < 0)
             return -1;
@@ -2067,6 +2079,394 @@ virDomainDefGetVcpusTopology(const virDomainDef *def,
 }
 
 
+void
+virDomainDefSetVcpusVnuma(virDomainDefPtr def,
+                          size_t nvcpus)
+{
+    int vcpuscnt = nvcpus;
+    size_t cell, i;
+    size_t vcpu_node;
+    size_t nnumaCell = virDomainNumaGetNodeCount(def->numa);
+
+    if (!nnumaCell)
+        return;
+
+    /* vcpu_node represents the maximum vcpus per vNUMA
+     * node that theoretically could be within a set.
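+     * E.g. (hypothetical) maxvcpus = 8 over 3 cells gives vcpu_node = 3;
+     * the cells then hold vcpu ids 0:{0,1,2} 1:{3,4,5} 2:{6,7}, and the
+     * online marking below walks the ids round-robin across the cells
+     * (0,3,6,1,4,7,...) so online vcpus spread evenly over the nodes.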
+     */
+    vcpu_node = (def->maxvcpus / nnumaCell) + ((def->maxvcpus % nnumaCell) ? 1 : 0);
+
+    for (i = 0; i < vcpu_node; i++) {
+        for (cell = 0; cell < nnumaCell; cell++) {
+            virDomainVcpuDefPtr vcpu;
+            size_t cid = cell * vcpu_node + i;
+
+            if (cid >= def->maxvcpus)
+                break;
+
+            vcpu = def->vcpus[cid];
+
+            if (vcpuscnt-- > 0)
+                vcpu->online = true;
+            else
+                vcpu->online = false;
+
+            /* vCPU0 cannot be hotplugged */
+            if (cid)
+                vcpu->hotpluggable = true;
+        }
+    }
+    def->individualvcpus = true;
+
+    return;
+}
+
+
+/**
+ * virDomainNumaAutoconfig: vNUMA automatic host partition processing
+ * @def: domain definition
+ * @caps: host capabilities
+ *
+ * vNUMA automatic host partitioning is requested by adding the <vnuma
+ * mode=...> element to the guest XML. See virDomainVnumaParseXML() for
+ * parsing the related XML and filling the virDomainAutoPartition structure.
+ *
+ * If the virDomainAutoPartition structure is valid, libvirt takes into
+ * account the host hardware configuration (including maxvcpus, online
+ * vcpus, and memory) and creates the guest such that vcpus and memory
+ * are spread evenly across the host.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int
+virDomainNumaAutoconfig(virDomainDefPtr def,
+                        virCapsPtr caps)
+{
+    int ret = -1;
+    virBitmapPtr nodeset = NULL;
+    virDomainNumaPtr numa = def->numa;
+    virDomainAutoPartitionPtr avnuma;
+
+    if (!numa)
+        goto error;
+
+    if (caps &&
+        (avnuma = virDomainVnumaParseXML(numa, NULL))) {
+
+        size_t i, j, cell;
+        size_t nvcpus = 0;
+        size_t nnumaCell = 0;
+        size_t vcpu_node;
+        unsigned long long memsizeCell = 0;
+        virCapsHostPtr host = &caps->host;
+        unsigned int threads = host->cpu->threads;
+
+        if (!def->cpu) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: unable to render <vnuma> partitioning for "
+                             "domain %s because of undefined <cpu ... /> topology."),
+                           def->name);
+            goto error;
+        }
+
+        if (!avnuma->nodeset) {
+            if (!(avnuma->nodeset = virBitmapNew(host->nnumaCell)))
+                goto error;
+
+            for (i = 0; i < host->nnumaCell; i++)
+                if (virBitmapSetBit(avnuma->nodeset, i) < 0)
+                    goto error;
+        }
+
+        /* Set the vNUMA cell count */
+        nnumaCell = avnuma->vcell ? avnuma->vcell : virBitmapCountBits(avnuma->nodeset);
+
+        if (!nnumaCell)
+            goto cleanup;
+
+        /* Compute the online vcpus */
+        for (i = 0; i < def->maxvcpus; i++)
+            if (def->vcpus[i]->online)
+                nvcpus++;
+
+        /* vcpu_node represents the maximum vcpus per numanode that
+         * theoretically could be within a set.
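+         * E.g. (hypothetical) maxvcpus = 10 over 4 nodes gives vcpu_node = 3.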
+         */
+        vcpu_node = (def->maxvcpus / nnumaCell) + ((def->maxvcpus % nnumaCell) ? 1 : 0);
+
+        /* Check whether the host-provided "CPU topology" threads fit */
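+        /* e.g. (hypothetical) 4 cells with 2-thread host cores keep
+         * threads = 2, while 3 cells would fall back to threads = 1. */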
+        threads = (nnumaCell % threads) ? 1 : threads;
+
+        /* Is it possible to render the guest for vNUMA auto partition? */
+        if ((def->maxvcpus % nnumaCell) ||
+            (def->maxvcpus < (nnumaCell * threads))) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: %u vcpus are insufficient to "
+                             "arrange a vNUMA topology for %zu nodes."),
+                           def->maxvcpus, nnumaCell);
+            goto error;
+        }
+
+        /* Compute the memory size (memsizeCell) per arranged nnumaCell.
+         * If no memory for vNUMA auto partitioning was specified then
+         * compute its value from the total_memory configuration.
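+         * E.g. (hypothetical) 4 GiB total memory with a 1 GiB hotplug
+         * DIMM over 2 cells gives memsizeCell = 1572864 KiB per cell.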
+         */
+        if ((memsizeCell = avnuma->mem / nnumaCell) == 0) {
+            unsigned long long hotplugMemory = 0;
+
+            /* Calculate the size of hotplug memory */
+            for (i = 0; i < def->nmems; i++)
+                hotplugMemory += def->mems[i]->size;
+
+            memsizeCell = (def->mem.total_memory - hotplugMemory) / nnumaCell;
+        }
+
+        /* Under vNUMA automatic host partitioning the 'memballoon' controlled
+         * cur_balloon value should reflect the guest's total_memory setting.
+         */
+        def->mem.cur_balloon = def->mem.total_memory;
+
+        /* Correct vNUMA can only be accomplished if the number of maxvcpus
+         * is a multiple of the number of physical nodes.  If this is not
+         * possible we set sockets, cores and threads to 0 so libvirt creates
+         * a default topology where all vcpus appear as sockets and cores and
+         * threads are set to 1.
+         */
+        if (def->maxvcpus % (nnumaCell * threads)) {
+            VIR_WARN("Disabling guest %s auto vNUMA topology because the "
+                     "configured %u vCPUs cannot be evenly distributed over "
+                     "the host's %zu NUMA nodes.",
+                     def->name, def->maxvcpus, nnumaCell);
+            def->cpu->sockets = def->cpu->cores = def->cpu->threads = 0;
+        } else {
+            /* Below computed topology aims to align the guest's sockets,
+             * cores and threads with the host's topology.
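+             * E.g. (hypothetical) maxvcpus = 16 on 2 nodes with 2-thread
+             * host cores yields sockets=2, cores=4, threads=2.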
+             */
+            def->cpu->cores = def->maxvcpus / (nnumaCell * threads);
+            def->cpu->threads = threads;
+            def->cpu->sockets = nnumaCell;
+        }
+
+        /* Build the vNUMA topology. The previous configuration may
+         * have changed entirely, so free the current NUMA allocation
+         * and start over from scratch.
+         */
+        virDomainNumaFree(numa);
+        if (!(numa = virDomainNumaNew()))
+            goto error;
+
+        /* We're clean and good to rebuild the entire guest domain
+         * respecting the requested vNUMA topology held in the parsed
+         * <vnuma> avnuma data.
+         */
+        avnuma->mem = memsizeCell * nnumaCell;
+
+        if (!virDomainNumaSetNodeCount(numa, nnumaCell))
+            goto error;
+
+        if (!(nodeset = virBitmapNewCopy(avnuma->nodeset)))
+            goto error;
+
+        for (cell = 0; cell < nnumaCell; cell++) {
+            size_t ndistances;
+            size_t vcell = cell % host->nnumaCell;
+            size_t vcpu_strt, vcpu_last, vcpu_left;
+            ssize_t node = 0;
+            unsigned int cores = def->cpu->cores;
+            virBitmapPtr cpumask = NULL;
+            virBitmapPtr vnumask = NULL;
+            virCapsHostNUMACell *numaCell = NULL;
+
+            /* per NUMA cell memory size */
+            virDomainNumaSetNodeMemorySize(numa, cell, memsizeCell);
+
+            /* per NUMA cell bind memory (mode='strict') */
+            if ((node = virBitmapNextSetBit(nodeset, (vcell-1))) < 0)
+                node = vcell - 1;
+
+            if (node >= host->nnumaCell) {
+                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                               _("vNUMA: domain %s defined nodeset node %zd "
+                                 "is out of range. Valid range is 0-%zu"),
+                               def->name, node, (host->nnumaCell-1));
+                goto error;
+            }
+
+            if (virDomainNumatuneSetmemset(numa, cell, node,
+                                           VIR_DOMAIN_NUMATUNE_MEM_STRICT) < 0)
+                goto error;
+
+            /* per NUMA cell vcpu range to mask */
+            if (!(cpumask = virBitmapNew(def->maxvcpus)))
+                goto error;
+
+            switch (avnuma->distribution) {
+                case VIR_DOMAIN_VNUMA_DISTRIBUTION_CONTIGUOUS:
+                /* vcpus are equally balanced from 0 to highest vcpu id
+                 * available, keeping ranges contiguous where the maximum vcpu
+                 * sets run from lowest vNUMA cells to highest available.
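+                 * E.g. (hypothetical) 8 vcpus over 2 cells: cell 0
+                 * gets vcpus 0-3, cell 1 gets vcpus 4-7.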
+                 */
+                vcpu_strt = cell * vcpu_node;
+                vcpu_last = MIN(vcpu_strt + vcpu_node, def->maxvcpus);
+
+                for (i = vcpu_strt; i < vcpu_last; i++) {
+                    if (virBitmapSetBitExpand(cpumask, i) < 0) {
+                        virBitmapFree(cpumask);
+                        goto error;
+                    }
+                }
+                break;
+
+                case VIR_DOMAIN_VNUMA_DISTRIBUTION_SIBLINGS:
+                /* Create vNUMA node vcpu ranges that represent a clean
+                 * processor sockets/core/threads model, placing one
+                 * socket per NUMA node.
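+                 * E.g. (hypothetical) 12 vcpus over 2 cells with 2-thread
+                 * cores (cores = 3): cell 0 gets vcpus {0-2,6-8},
+                 * cell 1 gets {3-5,9-11}.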
+                 */
+                vcpu_strt = cell * cores;
+                vcpu_last = def->maxvcpus;
+                vcpu_left = def->maxvcpus / threads;
+
+                for (i = vcpu_strt; i < vcpu_last; i += vcpu_left) {
+                    for (j = 0; j < cores; j++) {
+                        unsigned int id = i + j;
+
+                        if (id < def->maxvcpus &&
+                            virBitmapSetBitExpand(cpumask, id) < 0) {
+                            virBitmapFree(cpumask);
+                            goto error;
+                        }
+                    }
+                }
+                break;
+
+                case VIR_DOMAIN_VNUMA_DISTRIBUTION_ROUNDROBIN:
+                /* Create vNUMA node vcpu ranges that round-robin
+                 * interleave one core per node over the available nodes.
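+                 * E.g. (hypothetical) 12 vcpus over 2 cells with 2-thread
+                 * cores: cell 0 gets vcpus {0,1,4,5,8,9},
+                 * cell 1 gets {2,3,6,7,10,11}.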
+                 */
+                vcpu_strt = cell * threads;
+                vcpu_last = def->maxvcpus;
+                vcpu_left = threads * nnumaCell;
+
+                for (i = vcpu_strt; i < vcpu_last; i += vcpu_left) {
+                    for (j = 0; j < threads; j++) {
+                        unsigned int id = i + j;
+
+                        if (id < def->maxvcpus &&
+                            virBitmapSetBitExpand(cpumask, id) < 0) {
+                            virBitmapFree(cpumask);
+                            goto error;
+                        }
+                    }
+                }
+                break;
+
+                case VIR_DOMAIN_VNUMA_DISTRIBUTION_INTERLEAVE:
+                /* Distribute vCPUs over the NUMA nodes in a round-robin,
+                 * interleaved fashion, with one vCPU (thread) per node.
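+                 * E.g. (hypothetical) 8 vcpus over 2 cells: cell 0 gets
+                 * vcpus {0,2,4,6}, cell 1 gets {1,3,5,7}.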
+                 */
+                def->cpu->sockets = def->cpu->cores = def->cpu->threads = 0;
+                for (i = cell; i < def->maxvcpus; i += nnumaCell) {
+                    if (virBitmapSetBitExpand(cpumask, i) < 0) {
+                        virBitmapFree(cpumask);
+                        goto error;
+                    }
+                }
+                break;
+
+                default:
+                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                               _("vNUMA: domain %s: non-existent vCPU distribution requested."),
+                               def->name);
+                goto error;
+            }
+
+            if (virDomainNumaSetNodeCpumask(numa, cell, cpumask) == NULL)
+                goto error;
+
+            /* per NUMA cpus sibling vNUMA pinning */
+            numaCell = host->numaCell[node];
+            if (!(vnumask = virBitmapNew(nnumaCell * numaCell->ncpus)))
+                goto error;
+
+            for (i = 0; i < numaCell->ncpus; i++) {
+                unsigned int id = numaCell->cpus[i].id;
+
+                if (virBitmapSetBitExpand(vnumask, id) < 0) {
+                    virBitmapFree(vnumask);
+                    goto error;
+                }
+            }
+
+            for (i = 0; i < def->maxvcpus; i++) {
+                if (virBitmapIsBitSet(cpumask, i)) {
+                    if (!(def->vcpus[i]->cpumask = virBitmapNewCopy(vnumask)))
+                        goto error;
+                }
+            }
+            virBitmapFree(vnumask);
+
+            /* per NUMA cell sibling distances */
+            numaCell = host->numaCell[node];
+            switch (avnuma->mode) {
+                case VIR_DOMAIN_VNUMA_MODE_HOST:
+                    ndistances = numaCell->nsiblings;
+                    break;
+
+                case VIR_DOMAIN_VNUMA_MODE_NODE:
+                    ndistances = 1;
+                    if (avnuma->vcell)
+                        vcell = cell;
+                    else
+                        if (virBitmapClearBit(nodeset, node) < 0)
+                            goto error;
+
+                    break;
+
+                default:
+                    goto error;
+            }
+
+            /* Set vNUMA distances */
+            if (ndistances > 1) {
+                if (virDomainNumaSetNodeDistanceCount(numa,
+                                                      vcell,
+                                                      ndistances) < 0) {
+                    virReportError(VIR_ERR_INTERNAL_ERROR,
+                                   _("vNUMA: domain %s failed to render a "
+                                     "matching vNUMA node distance set: %zu "
+                                     "vNUMA nodes built on %zu host nodes."),
+                                   def->name, nnumaCell, ndistances);
+                    goto error;
+                }
+
+                for (i = 0; i < ndistances; i++) {
+                    unsigned int distance = numaCell->siblings[i].distance;
+
+                    if (virDomainNumaSetNodeDistance(numa, cell, i, distance) != distance)
+                        goto error;
+                }
+            }
+        }
+
+        /* We're done - enable the vNUMA marker */
+        virDomainVnumaSetEnabled(numa, avnuma);
+
+        /* Install the newly created vNUMA description */
+        def->numa = numa;
+
+        /* Apply the per-cell vcpu online and hotplug directives */
+        virDomainDefSetVcpusVnuma(def, virDomainDefGetVcpus(def));
+    }
+ cleanup:
+
+    ret = 0;
+
+ error:
+    virBitmapFree(nodeset);
+    return ret;
+}
+
+
 virDomainDiskDefPtr
 virDomainDiskDefNew(virDomainXMLOptionPtr xmlopt)
 {
@@ -10510,6 +10910,38 @@ virDomainDefSetMemoryTotal(virDomainDefPtr def,
 }
 
 
+/**
+ * virDomainDefSetNUMAMemoryTotal:
+ * @def: domain definition
+ * @size: size to set
+ * @caps: host capabilities
+ *
+ * A frontend to set the total memory size in @def. If the guest's
+ * configured "total_memory" setting and the requested @size differ,
+ * virDomainNumaAutoconfig() is called to evenly distribute the new
+ * total across all vNUMA nodes.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+int
+virDomainDefSetNUMAMemoryTotal(virDomainDefPtr def,
+                               unsigned long long size,
+                               virCapsPtr caps)
+{
+    if (def->mem.total_memory != size) {
+        if (virDomainVnumaSetMemory(def->numa, size) < 0)
+            return -1;
+
+        if (virDomainNumaAutoconfig(def, caps))
+            return -1;
+
+        if (virDomainDefPostParseMemory(def, VIR_DOMAIN_DEF_PARSE_ABI_UPDATE) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+
 /**
  * virDomainDefGetMemoryTotal:
  * @def: domain definition
@@ -18809,7 +19241,8 @@ virDomainIOThreadSchedParse(xmlNodePtr node,
 static int
 virDomainVcpuParse(virDomainDefPtr def,
                    xmlXPathContextPtr ctxt,
-                   virDomainXMLOptionPtr xmlopt)
+                   virDomainXMLOptionPtr xmlopt,
+                   bool isAvNuma)
 {
     int n;
     xmlNodePtr vcpuNode;
@@ -18876,6 +19309,15 @@ virDomainVcpuParse(virDomainDefPtr def,
     if (virDomainDefSetVcpusMax(def, maxvcpus, xmlopt) < 0)
         return -1;
 
+    /* If vNUMA applies, def->numa is reinitialized later */
+    if (isAvNuma) {
+        if (virDomainDefSetVcpus(def, vcpus) < 0)
+            return -1;
+
+        return 0;
+    }
+
     if ((n = virXPathNodeSet("./vcpus/vcpu", ctxt, &nodes)) < 0)
         return -1;
 
@@ -19746,6 +20188,7 @@ virDomainDefParseXML(xmlDocPtr xml,
     char *netprefix = NULL;
     g_autofree xmlNodePtr *nodes = NULL;
     g_autofree char *tmp = NULL;
+    bool isAvNuma;
 
     if (flags & VIR_DOMAIN_DEF_PARSE_VALIDATE_SCHEMA) {
         g_autofree char *schema = NULL;
@@ -19871,6 +20314,8 @@ virDomainDefParseXML(xmlDocPtr xml,
     }
     VIR_FREE(tmp);
 
+    isAvNuma = virDomainVnumaParseXML(def->numa, ctxt) != NULL;
+
     tmp = virXPathString("string(./memoryBacking/source/@type)", ctxt);
     if (tmp) {
         if ((def->mem.source = virDomainMemorySourceTypeFromString(tmp)) <= 0) {
@@ -19986,7 +20431,7 @@ virDomainDefParseXML(xmlDocPtr xml,
                                   &def->mem.swap_hard_limit) < 0)
         goto error;
 
-    if (virDomainVcpuParse(def, ctxt, xmlopt) < 0)
+    if (virDomainVcpuParse(def, ctxt, xmlopt, isAvNuma) < 0)
         goto error;
 
     if (virDomainDefParseIOThreads(def, ctxt) < 0)
@@ -20059,14 +20504,16 @@ virDomainDefParseXML(xmlDocPtr xml,
         goto error;
     }
 
-    if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0)
-        goto error;
-
-    for (i = 0; i < n; i++) {
-        if (virDomainVcpuPinDefParseXML(def, nodes[i]))
+    if (!isAvNuma) {
+        if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0)
             goto error;
+
+        for (i = 0; i < n; i++) {
+            if (virDomainVcpuPinDefParseXML(def, nodes[i]))
+                goto error;
+        }
+        VIR_FREE(nodes);
     }
-    VIR_FREE(nodes);
 
     if ((n = virXPathNodeSet("./cputune/emulatorpin", ctxt, &nodes)) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
@@ -20173,6 +20620,10 @@ virDomainDefParseXML(xmlDocPtr xml,
     if (virDomainNumaDefCPUParseXML(def->numa, ctxt) < 0)
         goto error;
 
+    /* Check and update the guest's XML vNUMA topology if needed */
+    if (virDomainNumaAutoconfig(def, caps))
+        goto error;
+
     if (virDomainNumaGetCPUCountTotal(def->numa) > virDomainDefGetVcpusMax(def)) {
         virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                        _("Number of CPUs in <numa> exceeds the"
@@ -20186,10 +20637,11 @@ virDomainDefParseXML(xmlDocPtr xml,
         goto error;
     }
 
-    if (virDomainNumatuneParseXML(def->numa,
-                                  def->placement_mode ==
-                                  VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC,
-                                  ctxt) < 0)
+    if (!virDomainVnumaIsEnabled(def->numa) &&
+        (virDomainNumatuneParseXML(def->numa,
+                                   def->placement_mode ==
+                                   VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC,
+                                   ctxt) < 0))
         goto error;
 
     if (virDomainNumatuneHasPlacementAuto(def->numa) &&
@@ -28496,6 +28948,9 @@ virDomainDefFormatInternalSetRootName(virDomainDefPtr def,
     if (virDomainMemtuneFormat(buf, &def->mem) < 0)
         goto error;
 
+    if (virDomainVnumaFormatXML(buf, def->numa) < 0)
+        goto error;
+
     if (virDomainCpuDefFormat(buf, def) < 0)
         goto error;
 
@@ -29148,6 +29603,9 @@ virDomainSaveConfig(const char *configDir,
 {
     g_autofree char *xml = NULL;
 
+    if (virDomainNumaAutoconfig(def, caps) < 0)
+        return -1;
+
     if (!(xml = virDomainDefFormat(def, caps, VIR_DOMAIN_DEF_FORMAT_SECURE)))
         return -1;
 
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 5a17acedf299..0db77d9247a1 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2535,6 +2535,7 @@ struct _virDomainDef {
 
 unsigned long long virDomainDefGetMemoryInitial(const virDomainDef *def);
 void virDomainDefSetMemoryTotal(virDomainDefPtr def, unsigned long long size);
+int virDomainDefSetNUMAMemoryTotal(virDomainDefPtr def, unsigned long long size, virCapsPtr caps);
 unsigned long long virDomainDefGetMemoryTotal(const virDomainDef *def);
 bool virDomainDefHasMemoryHotplug(const virDomainDef *def);
 
@@ -2816,6 +2817,7 @@ int virDomainDefSetVcpusMax(virDomainDefPtr def,
 bool virDomainDefHasVcpusOffline(const virDomainDef *def);
 unsigned int virDomainDefGetVcpusMax(const virDomainDef *def);
 int virDomainDefSetVcpus(virDomainDefPtr def, unsigned int vcpus);
+void virDomainDefSetVcpusVnuma(virDomainDefPtr def, size_t vcpus);
 unsigned int virDomainDefGetVcpus(const virDomainDef *def);
 virBitmapPtr virDomainDefGetOnlineVcpumap(const virDomainDef *def);
 virDomainVcpuDefPtr virDomainDefGetVcpu(virDomainDefPtr def, unsigned int vcpu)
diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c
index 6720d5620d1d..8e6ef4008b8d 100644
--- a/src/conf/numa_conf.c
+++ b/src/conf/numa_conf.c
@@ -45,6 +45,20 @@ VIR_ENUM_IMPL(virDomainNumatuneMemMode,
               "interleave",
 );
 
+VIR_ENUM_IMPL(virDomainVnumaMode,
+              VIR_DOMAIN_VNUMA_MODE_LAST,
+              "host",
+              "node",
+);
+
+VIR_ENUM_IMPL(virDomainVnumaDistribution,
+              VIR_DOMAIN_VNUMA_DISTRIBUTION_LAST,
+              "contiguous",
+              "siblings",
+              "round-robin",
+              "interleave",
+);
+
 VIR_ENUM_IMPL(virDomainNumatunePlacement,
               VIR_DOMAIN_NUMATUNE_PLACEMENT_LAST,
               "default",
@@ -90,6 +104,7 @@ struct _virDomainNuma {
     size_t nmem_nodes;
 
     /* Future NUMA tuning related stuff should go here. */
+    virDomainAutoPartitionPtr avnuma;
 };
 
 
@@ -353,6 +368,156 @@ virDomainNumatuneFormatXML(virBufferPtr buf,
     return 0;
 }
 
+int
+virDomainVnumaFormatXML(virBufferPtr buf,
+                        virDomainNumaPtr numa)
+{
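+    /* Emit the guest <vnuma> element; for mode='node' the output
+     * looks along the lines of (hypothetical values):
+     *
+     *   <vnuma mode='node' distribution='contiguous'>
+     *     <memory unit='KiB'>524288</memory>
+     *     <partition nodeset='0-1' cells='2'/>
+     *   </vnuma>
+     */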
+    char *nodeset = NULL;
+    if (numa && virDomainVnumaIsEnabled(numa)) {
+
+        virBufferAddLit(buf, "<vnuma");
+        virBufferAsprintf(buf, " mode='%s'",
+                          virDomainVnumaModeTypeToString(numa->avnuma->mode));
+        virBufferAsprintf(buf, " distribution='%s'",
+                          virDomainVnumaDistributionTypeToString(numa->avnuma->distribution));
+        virBufferAddLit(buf, ">\n");
+
+        virBufferAdjustIndent(buf, 2);
+        virBufferAsprintf(buf, "<memory unit='KiB'>%llu</memory>\n",
+                          numa->avnuma->mem);
+
+        if (numa->avnuma->mode == VIR_DOMAIN_VNUMA_MODE_NODE) {
+            if ((nodeset = virBitmapFormat(numa->avnuma->nodeset))) {
+                virBufferAsprintf(buf, "<partition nodeset='%s'", nodeset);
+                VIR_FREE(nodeset);
+            }
+
+            if (numa->avnuma->vcell)
+                virBufferAsprintf(buf, " cells='%u'", numa->avnuma->vcell);
+            virBufferAddLit(buf, "/>\n");
+        }
+        virBufferAdjustIndent(buf, -2);
+
+        virBufferAddLit(buf, "</vnuma>\n");
+    }
+
+    return 0;
+}
+
+virDomainAutoPartitionPtr
+virDomainVnumaParseXML(virDomainNumaPtr numa,
+                       xmlXPathContextPtr ctxt)
+{
+    int ret = -1;
+    char *tmp = NULL;
+    xmlNodePtr node, oldnode;
+    virDomainAutoPartitionPtr avnuma = NULL;
+
+    if (!numa)
+        return NULL;
+
+    if (!ctxt)
+        return numa->avnuma;
+
+    oldnode = ctxt->node;
+    node = virXPathNode("./vnuma[1]", ctxt);
+    if (node) {
+        int mode = -1;
+        int distribution = VIR_DOMAIN_VNUMA_DISTRIBUTION_CONTIGUOUS;
+        unsigned int maxvcell = 0;
+        unsigned long long mem = 0L;
+        virBitmapPtr nodeset = NULL;
+
+        if (!virXMLNodeNameEqual(node, "vnuma")) {
+            virReportError(VIR_ERR_XML_ERROR, "%s",
+                           _("domain definition does not contain expected 'vnuma' element"));
+            goto cleanup;
+        }
+
+        if (VIR_ALLOC(avnuma) < 0)
+            goto cleanup;
+
+        /* There has to be a valid vnuma mode setting */
+        if (!(tmp = virXMLPropString(node, "mode"))) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("No vNUMA 'mode' specified for automatic host partitioning"));
+            goto cleanup;
+        }
+
+        if ((mode = virDomainVnumaModeTypeFromString(tmp)) < 0) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("Unsupported automatic vNUMA partitioning mode '%s'"), tmp);
+            goto cleanup;
+        }
+        VIR_FREE(tmp);
+
+        /* If specified get the vcpu 'distribution' type */
+        if ((tmp = virXMLPropString(node, "distribution")) &&
+            (distribution = virDomainVnumaDistributionTypeFromString(tmp)) < 0) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("Unsupported automatic vNUMA partitioning distribution '%s'"), tmp);
+            goto cleanup;
+        }
+        VIR_FREE(tmp);
+
+        /* Obtain the designated <vnuma mode='node' attributes */
+        ctxt->node = node;
+        switch (mode) {
+            case VIR_DOMAIN_VNUMA_MODE_NODE:
+                if ((node = virXPathNode("./partition[1]", ctxt))) {
+
+                    /* Get the host <partition> nodeset='#nodeset' for <numatune> */
+                    if ((tmp = virXMLPropString(node, "nodeset"))) {
+                        if (virBitmapParse(tmp, &nodeset, VIR_DOMAIN_CPUMASK_LEN) < 0)
+                            goto cleanup;
+                        VIR_FREE(tmp);
+                    }
+
+                    /* Get the fictitious <partition> cells='#count' attribute */
+                    if ((tmp = virXMLPropString(node, "cells"))) {
+                        if (virStrToLong_ui(tmp, NULL, 10, &maxvcell) < 0) {
+                            virReportError(VIR_ERR_XML_ERROR, "%s",
+                                    _("vNUMA cells count must be an unsigned integer"));
+                            goto cleanup;
+                        }
+                        VIR_FREE(tmp);
+                    }
+                }
+                break;
+
+            case VIR_DOMAIN_VNUMA_MODE_HOST:
+            default:
+                break;
+        }
+
+        /* Get the <memory> size to render the <numa> nodes with */
+        if (virDomainParseMemory("./memory[1]", NULL, ctxt,
+                                 &mem, false, true) < 0)
+            goto cleanup;
+
+        /* We're set and good to go */
+        avnuma->mode = mode;
+        avnuma->distribution = distribution;
+        avnuma->nodeset = nodeset;
+        avnuma->mem = mem;
+        avnuma->vcell = maxvcell;
+
+        numa->avnuma = avnuma;
+    }
+    ret = 0;
+
+ cleanup:
+    if (ret) {
+        VIR_FREE(tmp);
+        VIR_FREE(avnuma);
+        avnuma = NULL;
+    }
+    ctxt->node = oldnode;
+
+    return avnuma;
+}
+
 void
 virDomainNumaFree(virDomainNumaPtr numa)
 {
@@ -572,6 +737,76 @@ virDomainNumatuneSet(virDomainNumaPtr numa,
     return ret;
 }
 
+int
+virDomainNumatuneSetmemset(virDomainNumaPtr numa,
+                           size_t cell,
+                           size_t node,
+                           int mode)
+{
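+    /* Force a numatune memnode binding of guest cell @cell to host
+     * node @node using mode @mode. Returns 0 on success and -1 on
+     * failure (including when the nodeset is under numad control). */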
+    int ret = -1;
+    virDomainNumaNodePtr mem_node = &numa->mem_nodes[cell];
+
+    /* Get out if this is under control of numad! */
+    if (numa->memory.specified)
+        goto cleanup;
+
+    /* Get out if numa does not apply */
+    if (cell >= numa->nmem_nodes)
+        goto cleanup;
+
+    /* Get out if mode is out of range */
+    if (mode < 0 || mode >= VIR_DOMAIN_NUMATUNE_MEM_LAST) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Unsupported numatune mode '%d'"),
+                       mode);
+        goto cleanup;
+    }
+
+    /* Force the numatune/memset setting */
+    if (!(mem_node->nodeset = virBitmapNew(numa->nmem_nodes)) ||
+        virBitmapSetBitExpand(mem_node->nodeset, node) < 0) {
+        virBitmapFree(mem_node->nodeset);
+        goto cleanup;
+    }
+    mem_node->mode = mode;
+
+    ret = 0;
+
+ cleanup:
+    return ret;
+}
+
+bool
+virDomainVnumaIsEnabled(virDomainNumaPtr numa)
+{
+    if (numa && numa->avnuma)
+        return numa->avnuma->specified;
+
+    return false;
+}
+
+void
+virDomainVnumaSetEnabled(virDomainNumaPtr numa,
+                         virDomainAutoPartitionPtr avnuma)
+{
+    if (numa && avnuma) {
+        numa->avnuma = avnuma;
+        numa->avnuma->specified = true;
+    }
+}
+
+int
+virDomainVnumaSetMemory(virDomainNumaPtr numa,
+                        unsigned long long size)
+{
+    if (!numa || !numa->avnuma)
+        return -1;
+
+    numa->avnuma->mem = size;
+
+    return 0;
+}
+
 static bool
 virDomainNumaNodesEqual(virDomainNumaPtr n1,
                         virDomainNumaPtr n2)
@@ -1273,7 +1508,7 @@ virDomainNumaSetNodeDistance(virDomainNumaPtr numa,
 }
 
 
-size_t
+int
 virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
                                   size_t node,
                                   size_t ndistances)
@@ -1285,11 +1520,11 @@ virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
         virReportError(VIR_ERR_INTERNAL_ERROR,
                        _("Cannot alter an existing nmem_nodes distances set for node: %zu"),
                        node);
-        return 0;
+        return -1;
     }
 
     if (VIR_ALLOC_N(distances, ndistances) < 0)
-        return 0;
+        return -1;
 
     numa->mem_nodes[node].distances = distances;
     numa->mem_nodes[node].ndistances = ndistances;
diff --git a/src/conf/numa_conf.h b/src/conf/numa_conf.h
index e76a09c20cdc..bdc1deb6e143 100644
--- a/src/conf/numa_conf.h
+++ b/src/conf/numa_conf.h
@@ -32,6 +32,9 @@
 typedef struct _virDomainNuma virDomainNuma;
 typedef virDomainNuma *virDomainNumaPtr;
 
+typedef struct _virDomainAutoPartition virDomainAutoPartition;
+typedef virDomainAutoPartition *virDomainAutoPartitionPtr;
+
 typedef enum {
     VIR_DOMAIN_NUMATUNE_PLACEMENT_DEFAULT = 0,
     VIR_DOMAIN_NUMATUNE_PLACEMENT_STATIC,
@@ -43,6 +46,24 @@ typedef enum {
 VIR_ENUM_DECL(virDomainNumatunePlacement);
 VIR_ENUM_DECL(virDomainNumatuneMemMode);
 
+typedef enum {
+    VIR_DOMAIN_VNUMA_MODE_HOST = 0,
+    VIR_DOMAIN_VNUMA_MODE_NODE,
+
+    VIR_DOMAIN_VNUMA_MODE_LAST
+} virDomainVnumaMode;
+VIR_ENUM_DECL(virDomainVnumaMode);
+
+typedef enum {
+    VIR_DOMAIN_VNUMA_DISTRIBUTION_CONTIGUOUS = 0,
+    VIR_DOMAIN_VNUMA_DISTRIBUTION_SIBLINGS,
+    VIR_DOMAIN_VNUMA_DISTRIBUTION_ROUNDROBIN,
+    VIR_DOMAIN_VNUMA_DISTRIBUTION_INTERLEAVE,
+
+    VIR_DOMAIN_VNUMA_DISTRIBUTION_LAST
+} virDomainVnumaDistribution;
+VIR_ENUM_DECL(virDomainVnumaDistribution);
+
 typedef enum {
     VIR_DOMAIN_MEMORY_ACCESS_DEFAULT = 0,  /*  No memory access defined */
     VIR_DOMAIN_MEMORY_ACCESS_SHARED,    /* Memory access is set as shared */
@@ -52,6 +73,14 @@ typedef enum {
 } virDomainMemoryAccess;
 VIR_ENUM_DECL(virDomainMemoryAccess);
 
+struct _virDomainAutoPartition {
+    bool specified; /* Auto vNUMA active */
+    int mode;       /* Auto vNUMA mode */
+    int distribution; /* Auto vNUMA distribution */
+    unsigned long long mem; /* Auto vNUMA total memory */
+    unsigned int vcell;     /* Auto vNUMA cells count */
+    virBitmapPtr nodeset;   /* Auto vNUMA host nodes where this guest node resides */
+};
 
 virDomainNumaPtr virDomainNumaNew(void);
 void virDomainNumaFree(virDomainNumaPtr numa);
@@ -67,9 +96,19 @@ int virDomainNumatuneParseXML(virDomainNumaPtr numa,
 int virDomainNumatuneFormatXML(virBufferPtr buf, virDomainNumaPtr numatune)
     ATTRIBUTE_NONNULL(1);
 
+virDomainAutoPartitionPtr virDomainVnumaParseXML(virDomainNumaPtr numa,
+                                                 xmlXPathContextPtr ctxt)
+    ATTRIBUTE_NONNULL(1);
+
+int virDomainVnumaFormatXML(virBufferPtr buf, virDomainNumaPtr numa)
+    ATTRIBUTE_NONNULL(1);
+
 /*
  * Getters
  */
+bool virDomainVnumaIsEnabled(virDomainNumaPtr numa)
+    ATTRIBUTE_NONNULL(1);
+
 int virDomainNumatuneGetMode(virDomainNumaPtr numatune,
                              int cellid,
                              virDomainNumatuneMemMode *mode);
@@ -134,6 +173,19 @@ int virDomainNumatuneSet(virDomainNumaPtr numa,
                          virBitmapPtr nodeset)
     ATTRIBUTE_NONNULL(1);
 
+void virDomainVnumaSetEnabled(virDomainNumaPtr numa,
+                              virDomainAutoPartitionPtr avnuma)
+    ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2);
+int virDomainVnumaSetMemory(virDomainNumaPtr numa,
+                            unsigned long long size)
+    ATTRIBUTE_NONNULL(1);
+
+int virDomainNumatuneSetmemset(virDomainNumaPtr numa,
+                               size_t cell,
+                               size_t node,
+                               int mode)
+    ATTRIBUTE_NONNULL(1);
+
 size_t virDomainNumaSetNodeCount(virDomainNumaPtr numa,
                                  size_t nmem_nodes)
     ATTRIBUTE_NONNULL(1);
@@ -149,9 +201,9 @@ int virDomainNumaSetNodeDistance(virDomainNumaPtr numa,
                                  unsigned int value)
     ATTRIBUTE_NONNULL(1);
 
-size_t virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
-                                         size_t node,
-                                         size_t ndistances)
+int virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
+                                      size_t node,
+                                      size_t ndistances)
     ATTRIBUTE_NONNULL(1);
 
 virBitmapPtr virDomainNumaSetNodeCpumask(virDomainNumaPtr numa,
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 17977229d18f..7f7c3fdeafaa 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -311,8 +311,10 @@ virDomainDefParseNode;
 virDomainDefParseString;
 virDomainDefPostParse;
 virDomainDefSetMemoryTotal;
+virDomainDefSetNUMAMemoryTotal;
 virDomainDefSetVcpus;
 virDomainDefSetVcpusMax;
+virDomainDefSetVcpusVnuma;
 virDomainDefValidate;
 virDomainDefVcpuOrderClear;
 virDomainDeleteConfig;
@@ -828,7 +830,13 @@ virDomainNumatuneParseXML;
 virDomainNumatunePlacementTypeFromString;
 virDomainNumatunePlacementTypeToString;
 virDomainNumatuneSet;
+virDomainNumatuneSetmemset;
 virDomainNumatuneSpecifiedMaxNode;
+virDomainVnumaFormatXML;
+virDomainVnumaIsEnabled;
+virDomainVnumaParseXML;
+virDomainVnumaSetEnabled;
+virDomainVnumaSetMemory;
 
 
 # conf/nwfilter_conf.h
-- 
2.21.0



