[libvirt PATCH 08/13] ch_cgroup: methods for cgroup mgmt in ch driver

Daniel Henrique Barboza danielhb413 at gmail.com
Fri Nov 12 20:45:50 UTC 2021



On 10/21/21 16:31, Vineeth Pillai wrote:
> Signed-off-by: Vineeth Pillai <viremana at linux.microsoft.com>
> Signed-off-by: Praveen K Paladugu <prapal at linux.microsoft.com>
> ---
>   po/POTFILES.in      |   1 +
>   src/ch/ch_cgroup.c  | 457 ++++++++++++++++++++++++++++++++++++++++++++
>   src/ch/ch_cgroup.h  |  45 +++++
>   src/ch/ch_conf.c    |   2 +
>   src/ch/ch_conf.h    |   4 +-
>   src/ch/ch_domain.c  |  33 ++++
>   src/ch/ch_domain.h  |   3 +-
>   src/ch/ch_monitor.c | 125 ++++++++++--
>   src/ch/ch_monitor.h |  54 +++++-
>   src/ch/ch_process.c | 288 +++++++++++++++++++++++++++-
>   src/ch/ch_process.h |   3 +
>   src/ch/meson.build  |   2 +
>   12 files changed, 991 insertions(+), 26 deletions(-)
>   create mode 100644 src/ch/ch_cgroup.c
>   create mode 100644 src/ch/ch_cgroup.h
> 
> diff --git a/po/POTFILES.in b/po/POTFILES.in
> index b554cf08ca..3a8db501bc 100644
> --- a/po/POTFILES.in
> +++ b/po/POTFILES.in
> @@ -19,6 +19,7 @@
>   @SRCDIR@src/bhyve/bhyve_parse_command.c
>   @SRCDIR@src/bhyve/bhyve_process.c
>   @SRCDIR@src/ch/ch_conf.c
> +@SRCDIR@src/ch/ch_cgroup.c
>   @SRCDIR@src/ch/ch_domain.c
>   @SRCDIR@src/ch/ch_driver.c
>   @SRCDIR@src/ch/ch_monitor.c
> diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c
> new file mode 100644
> index 0000000000..6be2184cf1
> --- /dev/null
> +++ b/src/ch/ch_cgroup.c
> @@ -0,0 +1,457 @@
> +/*
> + * ch_cgroup.c: CH cgroup management
> + *
> + * Copyright Microsoft Corp. 2020-2021
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <config.h>
> +
> +#include "ch_cgroup.h"
> +#include "ch_domain.h"
> +#include "ch_process.h"
> +#include "vircgroup.h"
> +#include "virlog.h"
> +#include "viralloc.h"
> +#include "virerror.h"
> +#include "domain_audit.h"
> +#include "domain_cgroup.h"
> +#include "virscsi.h"
> +#include "virstring.h"
> +#include "virfile.h"
> +#include "virtypedparam.h"
> +#include "virnuma.h"
> +#include "virdevmapper.h"
> +#include "virutil.h"
> +
> +#define VIR_FROM_THIS VIR_FROM_CH
> +
> +VIR_LOG_INIT("ch.ch_cgroup");
> +
> +static int
> +chSetupBlkioCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
> +        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("Block I/O tuning is not available on this host"));
> +            return -1;
> +        } else {
> +            return 0;
> +        }
> +    }
> +
> +    return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio);
> +}
> +
> +
> +static int
> +chSetupMemoryCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
> +        if (virMemoryLimitIsSet(vm->def->mem.hard_limit) ||
> +            virMemoryLimitIsSet(vm->def->mem.soft_limit) ||
> +            virMemoryLimitIsSet(vm->def->mem.swap_hard_limit)) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("Memory cgroup is not available on this host"));
> +            return -1;
> +        } else {
> +            return 0;
> +        }
> +    }
> +
> +    return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem);
> +}
> +
> +static int
> +chSetupCpusetCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return 0;
> +
> +    if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
> +        return -1;
> +
> +    return 0;
> +}
> +
> +
> +static int
> +chSetupCpuCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> +        if (vm->def->cputune.sharesSpecified) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("CPU tuning is not available on this host"));
> +            return -1;
> +        } else {
> +            return 0;
> +        }
> +    }
> +
> +    if (vm->def->cputune.sharesSpecified) {
> +
> +        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
> +            return -1;
> +
> +    }
> +
> +    return 0;
> +}
> +
> +
> +static int
> +chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
> +
> +    if (!priv->driver->privileged)
> +        return 0;
> +
> +    if (!virCgroupAvailable())
> +        return 0;
> +
> +    virCgroupFree(priv->cgroup);
> +
> +    if (!vm->def->resource) {
> +        virDomainResourceDef *res;
> +
> +        res = g_new0(virDomainResourceDef, 1);
> +
> +        res->partition = g_strdup("/machine");
> +
> +        vm->def->resource = res;
> +    }
> +
> +    if (vm->def->resource->partition[0] != '/') {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
> +                       _("Resource partition '%s' must start with '/'"),
> +                       vm->def->resource->partition);
> +        return -1;
> +    }
> +
> +    if (virCgroupNewMachine(priv->machineName, "ch", vm->def->uuid, NULL, vm->pid, false, nnicindexes, nicindexes, vm->def->resource->partition, cfg->cgroupControllers, 0,     /* maxThreadsPerProc */


Please break this very long call across several lines, so that each line is at most 100 characters.


> +                            &priv->cgroup) < 0) {
> +        if (virCgroupNewIgnoreError())
> +            return 0;
> +
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static void
> +chRestoreCgroupState(virDomainObj * vm)
> +{
> +    g_autofree char *mem_mask = NULL;
> +    g_autofree char *nodeset = NULL;
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +    size_t i = 0;
> +
> +    g_autoptr(virBitmap) all_nodes = NULL;
> +    virCgroup *cgroup_temp = NULL;
> +
> +    if (!virNumaIsAvailable() ||
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return;
> +
> +    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
> +        goto error;
> +
> +    if (!(mem_mask = virBitmapFormat(all_nodes)))
> +        goto error;
> +
> +    if ((virCgroupHasEmptyTasks(priv->cgroup,
> +                                VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
> +        goto error;
> +
> +    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
> +        goto error;
> +
> +    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
> +        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> +        if (!vcpu->online)
> +            continue;
> +
> +        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
> +                               false, &cgroup_temp) < 0 ||
> +            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> +            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> +            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> +            goto cleanup;
> +
> +        g_free(nodeset);
> +        virCgroupFree(cgroup_temp);
> +    }
> +
> +    for (i = 0; i < vm->def->niothreadids; i++) {
> +        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
> +                               vm->def->iothreadids[i]->iothread_id,
> +                               false, &cgroup_temp) < 0 ||
> +            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> +            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> +            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> +            goto cleanup;
> +
> +        g_free(nodeset);
> +        virCgroupFree(cgroup_temp);
> +    }
> +
> +    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
> +                           false, &cgroup_temp) < 0 ||
> +        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> +        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> +        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> +        goto cleanup;
> +
> +    cleanup:
> +        virCgroupFree(cgroup_temp);
> +        return;

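In the cgroup_temp declaration above you can use

g_autoptr(virCgroup) cgroup_temp = NULL;

and then you won't need a 'cleanup' label. (The explicit virCgroupFree(cgroup_temp) calls inside the loops would then also have to clear the pointer, e.g. with g_clear_pointer(), so that it is not freed a second time when it goes out of scope.)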

> +
> +    error:

Wrong label indentation: the 'error' label (like 'cleanup' above) should be indented by a single space, not by four.


> +        virResetLastError();
> +        VIR_DEBUG("Couldn't restore cgroups to meaningful state");
> +        goto cleanup;
> +}
> +
> +int
> +chConnectCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
> +
> +    if (!priv->driver->privileged)
> +        return 0;
> +
> +    if (!virCgroupAvailable())
> +        return 0;
> +
> +    virCgroupFree(priv->cgroup);
> +
> +    if (virCgroupNewDetectMachine(vm->def->name,
> +                                  "ch",
> +                                  vm->pid,
> +                                  cfg->cgroupControllers,
> +                                  priv->machineName, &priv->cgroup) < 0)
> +        return -1;
> +
> +    chRestoreCgroupState(vm);
> +    return 0;
> +}
> +
> +int
> +chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!vm->pid) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("Cannot setup cgroups until process is started"));
> +        return -1;
> +    }
> +
> +    if (chInitCgroup(vm, nnicindexes, nicindexes) < 0)
> +        return -1;
> +
> +    if (!priv->cgroup)
> +        return 0;
> +
> +    if (chSetupBlkioCgroup(vm) < 0)
> +        return -1;
> +
> +    if (chSetupMemoryCgroup(vm) < 0)
> +        return -1;
> +
> +    if (chSetupCpuCgroup(vm) < 0)
> +        return -1;
> +
> +    if (chSetupCpusetCgroup(vm) < 0)
> +        return -1;
> +
> +    return 0;
> +}
> +
> +int
> +chSetupCgroupVcpuBW(virCgroup * cgroup,
> +                    unsigned long long period, long long quota)
> +{
> +    return virCgroupSetupCpuPeriodQuota(cgroup, period, quota);
> +}
> +
> +
> +int
> +chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask)
> +{
> +    return virCgroupSetupCpusetCpus(cgroup, cpumask);
> +}
> +
> +int
> +chSetupGlobalCpuCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +    unsigned long long period = vm->def->cputune.global_period;
> +    long long quota = vm->def->cputune.global_quota;
> +    g_autofree char *mem_mask = NULL;
> +    virDomainNumatuneMemMode mem_mode;
> +
> +    if ((period || quota) &&
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                       _("cgroup cpu is required for scheduler tuning"));
> +        return -1;
> +    }
> +
> +    /*
> +     * If CPU cgroup controller is not initialized here, then we need
> +     * neither period nor quota settings.  And if CPUSET controller is
> +     * not initialized either, then there's nothing to do anyway.
> +     */
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return 0;
> +
> +
> +    if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
> +        virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
> +                                            priv->autoNodeset,
> +                                            &mem_mask, -1) < 0)
> +        return -1;
> +
> +    if (period || quota) {
> +        if (chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0)
> +            return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +
> +int
> +chRemoveCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (priv->cgroup == NULL)
> +        return 0;               /* Not supported, so claim success */
> +
> +    if (virCgroupTerminateMachine(priv->machineName) < 0) {
> +        if (!virCgroupNewIgnoreError())
> +            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
> +    }
> +
> +    return virCgroupRemove(priv->cgroup);
> +}
> +
> +
> +static void
> +chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data)
> +{
> +    if (!data)
> +        return;
> +
> +    virCgroupFree(data->emulatorCgroup);
> +    g_free(data->emulatorMemMask);
> +    g_free(data);
> +}
> +
> +
> +/**
> + * chCgroupEmulatorAllNodesAllow:
> + * @cgroup: domain cgroup pointer
> + * @retData: filled with structure used to roll back the operation
> + *
> + * Allows all NUMA nodes for the cloud hypervisor thread temporarily. This is
> + * necessary when hotplugging cpus since it requires memory allocated in the
> + * DMA region. Afterwards the operation can be reverted by
> + * chCgroupEmulatorAllNodesRestore.
> + *
> + * Returns 0 on success -1 on error
> + */
> +int
> +chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
> +                              chCgroupEmulatorAllNodesData ** retData)
> +{
> +    chCgroupEmulatorAllNodesData *data = NULL;
> +    g_autofree char *all_nodes_str = NULL;
> +

There is an extra blank line in the middle of the variable declarations; please remove it.

> +    g_autoptr(virBitmap) all_nodes = NULL;
> +    int ret = -1;
> +
> +    if (!virNumaIsAvailable() ||
> +        !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return 0;
> +
> +    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
> +        goto cleanup;
> +
> +    if (!(all_nodes_str = virBitmapFormat(all_nodes)))
> +        goto cleanup;
> +
> +    data = g_new0(chCgroupEmulatorAllNodesData, 1);
> +
> +    if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
> +                           false, &data->emulatorCgroup) < 0)
> +        goto cleanup;
> +
> +    if (virCgroupGetCpusetMems(data->emulatorCgroup, &data->emulatorMemMask) < 0
> +        || virCgroupSetCpusetMems(data->emulatorCgroup, all_nodes_str) < 0)
> +        goto cleanup;
> +
> +    *retData = g_steal_pointer(&data);
> +    ret = 0;
> +
> +    cleanup:

Wrong label indentation: the 'cleanup' label should be indented by a single space, not by four.

> +        chCgroupEmulatorAllNodesDataFree(data);
> +
> +    return ret;
> +}
> +
> +
> +/**
> + * chCgroupEmulatorAllNodesRestore:
> + * @data: data structure created by chCgroupEmulatorAllNodesAllow
> + *
> + * Rolls back the setting done by chCgroupEmulatorAllNodesAllow and frees the
> + * associated data.
> + */
> +void
> +chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data)
> +{
> +    virError *err;
> +
> +    if (!data)
> +        return;
> +
> +    virErrorPreserveLast(&err);
> +    virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask);
> +    virErrorRestore(&err);
> +
> +    chCgroupEmulatorAllNodesDataFree(data);
> +}
> diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h
> new file mode 100644
> index 0000000000..0152b5477c
> --- /dev/null
> +++ b/src/ch/ch_cgroup.h
> @@ -0,0 +1,45 @@
> +/*
> + * ch_cgroup.h: CH cgroup management
> + *
> + * Copyright Microsoft Corp. 2020-2021
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#pragma once
> +
> +#include "virusb.h"
> +#include "vircgroup.h"
> +#include "domain_conf.h"
> +#include "ch_conf.h"
> +
> +int chConnectCgroup(virDomainObj * vm);
> +int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes);
> +int chSetupCgroupVcpuBW(virCgroup * cgroup,
> +                        unsigned long long period, long long quota);
> +int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask);
> +int chSetupGlobalCpuCgroup(virDomainObj * vm);
> +int chRemoveCgroup(virDomainObj * vm);
> +
> +typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData;
> +
> +struct _chCgroupEmulatorAllNodesData {
> +    virCgroup *emulatorCgroup;
> +    char *emulatorMemMask;
> +};
> +
> +int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
> +                                  chCgroupEmulatorAllNodesData ** data);
> +void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data);
> diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c
> index ed0fffe5d6..7f70452296 100644
> --- a/src/ch/ch_conf.c
> +++ b/src/ch/ch_conf.c
> @@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged)
>       if (!(cfg = virObjectNew(virCHDriverConfigClass)))
>           return NULL;
>   
> +    cfg->cgroupControllers = -1; /* Auto detect */
> +
>       if (privileged) {
>           if (virGetUserID(CH_USER, &cfg->user) < 0)
>               return NULL;
> diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h
> index 49f286f97a..19deb8e568 100644
> --- a/src/ch/ch_conf.h
> +++ b/src/ch/ch_conf.h
> @@ -35,11 +35,13 @@ struct _virCHDriverConfig {
>   
>       char *stateDir;
>       char *logDir;
> -
> +    int cgroupControllers;
>       uid_t user;
>       gid_t group;
>   };
>   
> +G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
> +
>   struct _virCHDriver
>   {
>       virMutex lock;
> diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c
> index e1030800aa..d0aaeed1f4 100644
> --- a/src/ch/ch_domain.c
> +++ b/src/ch/ch_domain.c
> @@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
>                          _("Serial can only be enabled for a PTY"));
>           return -1;
>       }
> +    return 0;
> +}
> +int
> +virCHDomainRefreshThreadInfo(virDomainObj *vm)
> +{
> +    size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
> +    virCHMonitorThreadInfo *info = NULL;
> +    size_t nthreads, ncpus = 0;
> +    size_t i;
> +
> +    nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
> +                                         true, &info);
> +
> +    for (i = 0; i < nthreads; i++) {
> +        virCHDomainVcpuPrivate *vcpupriv;
> +        virDomainVcpuDef *vcpu;
> +        virCHMonitorCPUInfo *vcpuInfo;
> +
> +        if (info[i].type != virCHThreadTypeVcpu)
> +            continue;
> +
> +        // TODO: hotplug support
> +        vcpuInfo = &info[i].vcpuInfo;
> +        vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid);
> +        vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
> +        vcpupriv->tid = vcpuInfo->tid;
> +        ncpus++;
> +    }
> +
> +    // TODO: Remove the warning when hotplug is implemented.
> +    if (ncpus != maxvcpus)
> +        VIR_WARN("Mismatch in the number of cpus, expected: %ld, actual: %ld",
> +                 maxvcpus, ncpus);
>   
>       return 0;
>   }
> diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h
> index 3ac3421015..2ce3e2cef3 100644
> --- a/src/ch/ch_domain.h
> +++ b/src/ch/ch_domain.h
> @@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job)
>   void
>   virCHDomainObjEndJob(virDomainObj *obj);
>   
> -int virCHDomainRefreshVcpuInfo(virDomainObj *vm);
> +int virCHDomainRefreshThreadInfo(virDomainObj *vm);
> +
>   pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid);
>   bool virCHDomainHasVcpuPids(virDomainObj *vm);
>   
> diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c
> index c0ae031200..095779cb3f 100644
> --- a/src/ch/ch_monitor.c
> +++ b/src/ch/ch_monitor.c
> @@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
>   
>   static virClass *virCHMonitorClass;
>   static void virCHMonitorDispose(void *obj);
> +static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
>   
>   static int virCHMonitorOnceInit(void)
>   {
> @@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque)
>       virCHMonitor *mon = opaque;
>   
>       VIR_DEBUG("mon=%p", mon);
> +    virCHMonitorThreadInfoFree(mon);
>       virObjectUnref(mon->vm);
>   }
>   
> @@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response
>       return ret;
>   }
>   
> +/**
> + * virCHMonitorGetInfo:
> + * @mon: Pointer to the monitor
> + * @info: Get VM info
> + *
> + * Retrieve the VM info and store in @info
> + *
> + * Returns 0 on success.
> + */
> +int
> +virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
> +{
> +    return virCHMonitorGet(mon, URL_VM_INFO, info);
> +}
> +
> +static void
> +virCHMonitorThreadInfoFree(virCHMonitor *mon)
> +{
> +    mon->nthreads = 0;
> +    if (mon->threads)
> +        VIR_FREE(mon->threads);
> +}
> +
> +static size_t
> +virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
> +{
> +    virCHMonitorThreadInfo *info = NULL;
> +    g_autofree pid_t *tids = NULL;
> +    virDomainObj *vm = mon->vm;
> +    size_t ntids = 0;
> +    size_t i;
> +
> +
> +    virCHMonitorThreadInfoFree(mon);
> +    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0) {
> +        mon->threads = NULL;
> +        return 0;
> +    }
> +
> +    info = g_new0(virCHMonitorThreadInfo, ntids);
> +    for (i = 0; i < ntids; i++) {
> +        g_autofree char *proc = NULL;
> +        g_autofree char *data = NULL;
> +
> +        proc = g_strdup_printf("/proc/%d/task/%d/comm",
> +                (int)vm->pid, (int)tids[i]);
> +
> +        if (virFileReadAll(proc, (1<<16), &data) < 0) {
> +            continue;
> +        }
> +
> +        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
> +                (int)vm->pid, (int)tids[i], data);
> +        if (STRPREFIX(data, "vcpu")) {
> +            int cpuid;
> +            char *tmp;
> +            if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
> +                VIR_WARN("Index is not specified correctly");
> +                continue;
> +            }
> +            info[i].type = virCHThreadTypeVcpu;
> +            info[i].vcpuInfo.tid = tids[i];
> +            info[i].vcpuInfo.online = true;
> +            info[i].vcpuInfo.cpuid = cpuid;
> +            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, tids[i]);
> +        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
> +                   STRPREFIX(data, "_rng")) {
> +        /* Prefixes used by cloud-hypervisor for IO Threads are captured at
> +        https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs */
> +            info[i].type = virCHThreadTypeIO;
> +            info[i].ioInfo.tid = tids[i];
> +            virStrcpy(info[i].ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
> +        }else {
> +            info[i].type = virCHThreadTypeEmulator;
> +            info[i].emuInfo.tid = tids[i];
> +            virStrcpy(info[i].emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
> +        }
> +        mon->nthreads++;
> +
> +    }
> +    mon->threads = info;
> +
> +    return mon->nthreads;
> +}
> +
> +/**
> + * virCHMonitorGetThreadInfo:
> + * @mon: Pointer to the monitor
> + * @refresh: Refresh thread info or not
> + *
> + * Retrive thread info and store to @threads
> + *
> + * Returns count of threads on success.
> + */
> +size_t
> +virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
> +                          virCHMonitorThreadInfo **threads)
> +{
> +    int nthreads = 0;
> +
> +    if (refresh)
> +        nthreads = virCHMonitorRefreshThreadInfo(mon);
> +
> +    *threads = mon->threads;
> +
> +    return nthreads;
> +}
> +
>   int
>   virCHMonitorShutdownVMM(virCHMonitor *mon)
>   {
> @@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon)
>   {
>       return virCHMonitorPutNoContent(mon, URL_VM_RESUME);
>   }
> -
> -/**
> - * virCHMonitorGetInfo:
> - * @mon: Pointer to the monitor
> - * @info: Get VM info
> - *
> - * Retrieve the VM info and store in @info
> - *
> - * Returns 0 on success.
> - */
> -int
> -virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
> -{
> -    return virCHMonitorGet(mon, URL_VM_INFO, info);
> -}
> diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h
> index 8ca9e17a9a..f8c3fa75e8 100644
> --- a/src/ch/ch_monitor.h
> +++ b/src/ch/ch_monitor.h
> @@ -37,6 +37,50 @@
>   #define URL_VM_RESUME "vm.resume"
>   #define URL_VM_INFO "vm.info"
>   
> +#define VIRCH_THREAD_NAME_LEN   16
> +
> +typedef enum {
> +    virCHThreadTypeEmulator,
> +    virCHThreadTypeVcpu,
> +    virCHThreadTypeIO,
> +    virCHThreadTypeMax
> +} virCHThreadType;
> +
> +typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
> +
> +struct _virCHMonitorCPUInfo {
> +    int cpuid;
> +    pid_t tid;
> +
> +    bool online;
> +};
> +
> +typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
> +
> +struct _virCHMonitorEmuThreadInfo {
> +    char    thrName[VIRCH_THREAD_NAME_LEN];
> +    pid_t   tid;
> +};
> +
> +typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
> +
> +struct _virCHMonitorIOThreadInfo {
> +    char    thrName[VIRCH_THREAD_NAME_LEN];
> +    pid_t   tid;
> +};
> +
> +typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
> +
> +struct _virCHMonitorThreadInfo {
> +    virCHThreadType type;
> +
> +    union {
> +        virCHMonitorCPUInfo vcpuInfo;
> +        virCHMonitorEmuThreadInfo emuInfo;
> +        virCHMonitorIOThreadInfo ioInfo;
> +    };
> +};
> +
>   typedef struct _virCHMonitor virCHMonitor;
>   
>   struct _virCHMonitor {
> @@ -49,6 +93,9 @@ struct _virCHMonitor {
>       pid_t pid;
>   
>       virDomainObj *vm;
> +
> +    size_t nthreads;
> +    virCHMonitorThreadInfo *threads;
>   };
>   
>   virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
> @@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
>   int virCHMonitorResumeVM(virCHMonitor *mon);
>   int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
>   
> -typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
> -struct _virCHMonitorCPUInfo {
> -    pid_t tid;
> -    bool online;
> -};
>   void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
>   int virCHMonitorGetCPUInfo(virCHMonitor *mon,
>                          virCHMonitorCPUInfo **vcpus,
>                          size_t maxvcpus);
> +size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
> +                                 virCHMonitorThreadInfo **threads);
> diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c
> index 3b7f6fcddf..8dce737adb 100644
> --- a/src/ch/ch_process.c
> +++ b/src/ch/ch_process.c
> @@ -26,6 +26,8 @@
>   #include "ch_domain.h"
>   #include "ch_monitor.h"
>   #include "ch_process.h"
> +#include "ch_cgroup.h"
> +#include "virnuma.h"
>   #include "viralloc.h"
>   #include "virerror.h"
>   #include "virjson.h"
> @@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm)
>       return 0;
>   }
>   
> +static int
> +virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
> +{
> +    *cpumapRet = NULL;
> +
> +    if (!virHostCPUHasBitmap())
> +        return 0;
> +
> +    if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
> +        return -1;
> +
> +    return 0;
> +}
> +
> +#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
> +static int
> +virCHProcessInitCpuAffinity(virDomainObj *vm)
> +{
> +    g_autoptr(virBitmap) cpumapToSet = NULL;
> +    virDomainNumatuneMemMode mem_mode;
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!vm->pid) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("Cannot setup CPU affinity until process is started"));
> +        return -1;
> +    }
> +
> +    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
> +        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
> +        virBitmap *nodeset = NULL;
> +
> +        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
> +                                             priv->autoNodeset,
> +                                             &nodeset,
> +                                             -1) < 0)
> +            return -1;
> +
> +        if (virNumaNodesetToCPUset(nodeset, &cpumapToSet) < 0)
> +            return -1;
> +    } else if (vm->def->cputune.emulatorpin) {
> +        if (!(cpumapToSet = virBitmapNewCopy(vm->def->cputune.emulatorpin)))
> +            return -1;
> +    } else {
> +        if (virCHProcessGetAllCpuAffinity(&cpumapToSet) < 0)
> +            return -1;
> +    }
> +
> +    if (cpumapToSet &&
> +        virProcessSetAffinity(vm->pid, cpumapToSet, false) < 0) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
> +static int
> +virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
> +{
> +    return 0;
> +}
> +#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
> +
> +/**
> + * virCHProcessSetupPid:
> + * @vm: domain object; its definition and private data (cgroup,
> + *      autoCpuset, autoNodeset) are consulted
> + * @pid: PID/TID that is added to the per-thread cgroup and has its
> + *       affinity and scheduler configured
> + * @nameval: thread type used, together with @id, to create the
> + *           per-thread cgroup
> + * @id: identifier passed to virCgroupNewThread() along with @nameval
> + * @cpumask: CPU mask to use, or NULL to fall back to the domain-wide
> + *           placement (see the selection logic in the body)
> + * @period: CPU bandwidth period handed to chSetupCgroupVcpuBW()
> + * @quota: CPU bandwidth quota handed to chSetupCgroupVcpuBW()
> + * @sched: optional scheduler policy/priority; ignored when NULL or
> + *         when @nameval is VIR_CGROUP_THREAD_EMULATOR
> + *
> + * This function sets resource properties (affinity, cgroups,
> + * scheduler) for any PID associated with a domain.  It should be used
> + * to set up emulator PIDs as well as vCPU and I/O thread pids to
> + * ensure they are all handled the same way.
> + *
> + * If a non-zero @period or @quota is given but the CPU cgroup
> + * controller is not available, VIR_ERR_CONFIG_UNSUPPORTED is reported.
> + * If a step fails after the per-thread cgroup has been created, that
> + * cgroup is removed again before returning.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +static int
> +virCHProcessSetupPid(virDomainObj *vm,
> +                     pid_t pid,
> +                     virCgroupThreadName nameval,
> +                     int id,
> +                     virBitmap *cpumask,
> +                     unsigned long long period,
> +                     long long quota,
> +                     virDomainThreadSchedParam *sched)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +    virDomainNumatuneMemMode mem_mode;
> +    virCgroup *cgroup = NULL;
> +    virBitmap *use_cpumask = NULL; /* borrowed; mask written to the cpuset cgroup */
> +    virBitmap *affinity_cpumask = NULL; /* borrowed; mask passed to virProcessSetAffinity() */
> +    g_autoptr(virBitmap) hostcpumap = NULL; /* owns the bitmap affinity_cpumask may alias */
> +    g_autofree char *mem_mask = NULL;
> +    int ret = -1;
> +
> +    if ((period || quota) &&
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                       _("cgroup cpu is required for scheduler tuning"));
> +        goto cleanup;
> +    }
> +
> +    /* Infer which cpumask shall be used.  Precedence: the caller's
> +     * @cpumask, then priv->autoCpuset for automatic placement, then
> +     * the domain-wide vm->def->cpumask.  If none of these applies,
> +     * use_cpumask stays NULL and only the affinity is set, using the
> +     * map returned by virCHProcessGetAllCpuAffinity(). */
> +    if (cpumask) {
> +        use_cpumask = cpumask;
> +    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
> +        use_cpumask = priv->autoCpuset;
> +    } else if (vm->def->cpumask) {
> +        use_cpumask = vm->def->cpumask;
> +    } else {
> +        /* we can't assume cloud-hypervisor itself is running on all pCPUs,
> +         * so we need to explicitly set the spawned instance to all pCPUs. */
> +        if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
> +            goto cleanup;
> +        affinity_cpumask = hostcpumap;
> +    }
> +
> +    /*
> +     * If CPU cgroup controller is not initialized here, then we need
> +     * neither period nor quota settings.  And if CPUSET controller is
> +     * not initialized either, then there's nothing to do anyway.
> +     */
> +    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
> +        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> +
> +        if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +            mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && /* mem_mask is only formatted for strict mode */
> +            virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
> +                                                priv->autoNodeset,
> +                                                &mem_mask, -1) < 0)
> +            goto cleanup;
> +
> +        if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
> +            goto cleanup;
> +
> +        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> +            if (use_cpumask &&
> +                chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
> +                goto cleanup;
> +
> +            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
> +                goto cleanup;
> +
> +        }
> +
> +        if ((period || quota) &&
> +            chSetupCgroupVcpuBW(cgroup, period, quota) < 0)
> +            goto cleanup;
> +
> +        /* Move the thread to the sub dir */
> +        VIR_INFO("Adding pid %d to cgroup", pid);
> +        if (virCgroupAddThread(cgroup, pid) < 0)
> +            goto cleanup;
> +
> +    }
> +
> +    if (!affinity_cpumask) /* no host-wide fallback map was computed above */
> +        affinity_cpumask = use_cpumask;
> +
> +    /* Setup legacy affinity. */
> +    if (affinity_cpumask && virProcessSetAffinity(pid, affinity_cpumask, false) < 0)
> +        goto cleanup;
> +
> +    /* Set scheduler type and priority, but not for the main thread. */
> +    if (sched &&
> +        nameval != VIR_CGROUP_THREAD_EMULATOR &&
> +        virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
> +        goto cleanup;
> +
> +    ret = 0;
> + cleanup:
> +    if (cgroup) {
> +        if (ret < 0)
> +            virCgroupRemove(cgroup); /* roll back the per-thread cgroup created above */
> +        virCgroupFree(cgroup);
> +    }
> +
> +    return ret;
> +}
> +
> +/**
> + * virCHProcessSetupVcpu:
> + * @vm: domain object
> + * @vcpuid: id of VCPU to set defaults
> + *
> + * This function sets resource properties (cgroups, affinity, scheduler) for a
> + * vCPU. This function expects that the vCPU is online and the vCPU pids were
> + * correctly detected at the point when it's called.
> + *
> + * The vCPU's own cpumask and scheduler settings are used, while the
> + * bandwidth period and quota are taken from the domain-wide
> + * vm->def->cputune values.
> + *
> + * NOTE(review): the result of virDomainDefGetVcpu() is dereferenced
> + * without a NULL check, so callers presumably guarantee that @vcpuid
> + * is in range -- confirm.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +int
> +virCHProcessSetupVcpu(virDomainObj *vm,
> +                      unsigned int vcpuid)
> +{
> +    pid_t vcpupid = virCHDomainGetVcpuPid(vm, vcpuid);
> +    virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, vcpuid);
> +
> +    return virCHProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
> +                                vcpuid, vcpu->cpumask,
> +                                vm->def->cputune.period,
> +                                vm->def->cputune.quota,
> +                                &vcpu->sched);
> +}
> +/**
> + * virCHProcessSetupVcpus:
> + * @vm: domain object
> + *
> + * Applies per-vCPU resource settings to every online vCPU of @vm by
> + * calling virCHProcessSetupVcpu() for each of them, stopping at the
> + * first failure.
> + *
> + * If a cputune period or quota is configured but the CPU cgroup
> + * controller is not available, VIR_ERR_CONFIG_UNSUPPORTED is reported.
> + *
> + * If virCHDomainHasVcpuPids() reports that no vCPU pids are known, no
> + * per-vCPU setup is done; in that case an online vCPU whose cpumask
> + * differs from vm->def->cpumask is rejected with
> + * VIR_ERR_OPERATION_INVALID.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +static int
> +virCHProcessSetupVcpus(virDomainObj *vm)
> +{
> +    virDomainVcpuDef *vcpu;
> +    unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
> +    size_t i;
> +
> +    if ((vm->def->cputune.period || vm->def->cputune.quota) &&
> +        !virCgroupHasController(((virCHDomainObjPrivate *) vm->privateData)->cgroup,
> +                                VIR_CGROUP_CONTROLLER_CPU)) {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                       _("cgroup cpu is required for scheduler tuning"));
> +        return -1;
> +    }
> +
> +    if (!virCHDomainHasVcpuPids(vm)) {
> +        /* If any CPU has custom affinity that differs from the
> +         * VM default affinity, we must reject it */
> +        for (i = 0; i < maxvcpus; i++) {
> +            vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> +            if (!vcpu->online)
> +                continue;
> +
> +            if (vcpu->cpumask &&
> +                !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
> +                virReportError(VIR_ERR_OPERATION_INVALID, "%s",
> +                                _("cpu affinity is not supported"));
> +                return -1;
> +            }
> +        }
> +
> +        return 0; /* nothing to tune without per-vCPU pids */
> +    }
> +
> +    for (i = 0; i < maxvcpus; i++) {
> +        vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> +        if (!vcpu->online) /* offline vCPUs are skipped */
> +            continue;
> +
> +        if (virCHProcessSetupVcpu(vm, i) < 0)
> +            return -1;
> +    }
> +
> +    return 0;
> +}
> +
>   /**
>    * virCHProcessStart:
>    * @driver: pointer to driver structure
> @@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver,
>           }
>       }
>   
> +    vm->pid = priv->monitor->pid;
> +    vm->def->id = vm->pid;
> +    priv->machineName = virCHDomainGetMachineName(vm);
> +
> +    if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0)
> +        goto cleanup;
> +
> +    if (virCHProcessInitCpuAffinity(vm) < 0)
> +        goto cleanup;
> +
>       if (virCHMonitorBootVM(priv->monitor) < 0) {
>           virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>                          _("failed to boot guest VM"));
>           goto cleanup;
>       }
>   
> -    priv->machineName = virCHDomainGetMachineName(vm);
> -    vm->pid = priv->monitor->pid;
> -    vm->def->id = vm->pid;
> +    virCHDomainRefreshThreadInfo(vm);
>   
> -    virCHProcessUpdateInfo(vm);
> +    VIR_DEBUG("Setting global CPU cgroup (if required)");
> +    if (chSetupGlobalCpuCgroup(vm) < 0)
> +        goto cleanup;
> +
> +    VIR_DEBUG("Setting vCPU tuning/settings");
> +    if (virCHProcessSetupVcpus(vm) < 0)
> +        goto cleanup;
>   
> +    virCHProcessUpdateInfo(vm);
>       virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
>   
>       return 0;
> @@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
>                        virDomainObj *vm,
>                        virDomainShutoffReason reason)
>   {
> +    int ret;
> +    int retries = 0;
>       virCHDomainObjPrivate *priv = vm->privateData;
>   
>       VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
> @@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
>           priv->monitor = NULL;
>       }
>   
> +    retry:

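Wrong label indentation: the 'retry:' label should be indented by a single space (like ' cleanup:' in virCHProcessSetupPid above), and the block that follows it should use the same 4-space indentation as the surrounding code rather than 8.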


Daniel

> +        if ((ret = chRemoveCgroup(vm)) < 0) {
> +            if (ret == -EBUSY && (retries++ < 5)) {
> +                g_usleep(200*1000);
> +                goto retry;
> +            }
> +            VIR_WARN("Failed to remove cgroup for %s",
> +                    vm->def->name);
> +        }
> +
>       vm->pid = -1;
>       vm->def->id = -1;
>   
> diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h
> index abc4915979..800e3f4e23 100644
> --- a/src/ch/ch_process.h
> +++ b/src/ch/ch_process.h
> @@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
>   int virCHProcessStop(virCHDriver *driver,
>                        virDomainObj *vm,
>                        virDomainShutoffReason reason);
> +
> +int virCHProcessSetupVcpu(virDomainObj *vm,
> +                          unsigned int vcpuid);
> diff --git a/src/ch/meson.build b/src/ch/meson.build
> index 2b2bdda26c..0b20de56fd 100644
> --- a/src/ch/meson.build
> +++ b/src/ch/meson.build
> @@ -1,6 +1,8 @@
>   ch_driver_sources = [
>     'ch_conf.c',
>     'ch_conf.h',
> +  'ch_cgroup.c',
> +  'ch_cgroup.h',
>     'ch_domain.c',
>     'ch_domain.h',
>     'ch_driver.c',
> 




More information about the libvir-list mailing list