[libvirt PATCH 08/13] ch_cgroup: methods for cgroup mgmt in ch driver
Daniel Henrique Barboza
danielhb413 at gmail.com
Fri Nov 12 20:45:50 UTC 2021
On 10/21/21 16:31, Vineeth Pillai wrote:
> Signed-off-by: Vineeth Pillai <viremana at linux.microsoft.com>
> Signed-off-by: Praveen K Paladugu <prapal at linux.microsoft.com>
> ---
> po/POTFILES.in | 1 +
> src/ch/ch_cgroup.c | 457 ++++++++++++++++++++++++++++++++++++++++++++
> src/ch/ch_cgroup.h | 45 +++++
> src/ch/ch_conf.c | 2 +
> src/ch/ch_conf.h | 4 +-
> src/ch/ch_domain.c | 33 ++++
> src/ch/ch_domain.h | 3 +-
> src/ch/ch_monitor.c | 125 ++++++++++--
> src/ch/ch_monitor.h | 54 +++++-
> src/ch/ch_process.c | 288 +++++++++++++++++++++++++++-
> src/ch/ch_process.h | 3 +
> src/ch/meson.build | 2 +
> 12 files changed, 991 insertions(+), 26 deletions(-)
> create mode 100644 src/ch/ch_cgroup.c
> create mode 100644 src/ch/ch_cgroup.h
>
> diff --git a/po/POTFILES.in b/po/POTFILES.in
> index b554cf08ca..3a8db501bc 100644
> --- a/po/POTFILES.in
> +++ b/po/POTFILES.in
> @@ -19,6 +19,7 @@
> @SRCDIR@src/bhyve/bhyve_parse_command.c
> @SRCDIR@src/bhyve/bhyve_process.c
> @SRCDIR@src/ch/ch_conf.c
> +@SRCDIR@src/ch/ch_cgroup.c
> @SRCDIR@src/ch/ch_domain.c
> @SRCDIR@src/ch/ch_driver.c
> @SRCDIR@src/ch/ch_monitor.c
> diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c
> new file mode 100644
> index 0000000000..6be2184cf1
> --- /dev/null
> +++ b/src/ch/ch_cgroup.c
> @@ -0,0 +1,457 @@
> +/*
> + * ch_cgroup.c: CH cgroup management
> + *
> + * Copyright Microsoft Corp. 2020-2021
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <config.h>
> +
> +#include "ch_cgroup.h"
> +#include "ch_domain.h"
> +#include "ch_process.h"
> +#include "vircgroup.h"
> +#include "virlog.h"
> +#include "viralloc.h"
> +#include "virerror.h"
> +#include "domain_audit.h"
> +#include "domain_cgroup.h"
> +#include "virscsi.h"
> +#include "virstring.h"
> +#include "virfile.h"
> +#include "virtypedparam.h"
> +#include "virnuma.h"
> +#include "virdevmapper.h"
> +#include "virutil.h"
> +
> +#define VIR_FROM_THIS VIR_FROM_CH
> +
> +VIR_LOG_INIT("ch.ch_cgroup");
> +
> +static int
> +chSetupBlkioCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
> + if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("Block I/O tuning is not available on this host"));
> + return -1;
> + } else {
> + return 0;
> + }
> + }
> +
> + return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio);
> +}
> +
> +
> +static int
> +chSetupMemoryCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
> + if (virMemoryLimitIsSet(vm->def->mem.hard_limit) ||
> + virMemoryLimitIsSet(vm->def->mem.soft_limit) ||
> + virMemoryLimitIsSet(vm->def->mem.swap_hard_limit)) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("Memory cgroup is not available on this host"));
> + return -1;
> + } else {
> + return 0;
> + }
> + }
> +
> + return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem);
> +}
> +
> +static int
> +chSetupCpusetCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> + return 0;
> +
> + if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
> + return -1;
> +
> + return 0;
> +}
> +
> +
> +static int
> +chSetupCpuCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> + if (vm->def->cputune.sharesSpecified) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("CPU tuning is not available on this host"));
> + return -1;
> + } else {
> + return 0;
> + }
> + }
> +
> + if (vm->def->cputune.sharesSpecified) {
> +
> + if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
> + return -1;
> +
> + }
> +
> + return 0;
> +}
> +
> +
> +static int
> +chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
> +
> + if (!priv->driver->privileged)
> + return 0;
> +
> + if (!virCgroupAvailable())
> + return 0;
> +
> + virCgroupFree(priv->cgroup);
> +
> + if (!vm->def->resource) {
> + virDomainResourceDef *res;
> +
> + res = g_new0(virDomainResourceDef, 1);
> +
> + res->partition = g_strdup("/machine");
> +
> + vm->def->resource = res;
> + }
> +
> + if (vm->def->resource->partition[0] != '/') {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
> + _("Resource partition '%s' must start with '/'"),
> + vm->def->resource->partition);
> + return -1;
> + }
> +
> + if (virCgroupNewMachine(priv->machineName, "ch", vm->def->uuid, NULL, vm->pid, false, nnicindexes, nicindexes, vm->def->resource->partition, cfg->cgroupControllers, 0, /* maxThreadsPerProc */
Please break this long call up so that no line exceeds 100 characters.
> + &priv->cgroup) < 0) {
> + if (virCgroupNewIgnoreError())
> + return 0;
> +
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static void
> +chRestoreCgroupState(virDomainObj * vm)
> +{
> + g_autofree char *mem_mask = NULL;
> + g_autofree char *nodeset = NULL;
> + virCHDomainObjPrivate *priv = vm->privateData;
> + size_t i = 0;
> +
> + g_autoptr(virBitmap) all_nodes = NULL;
> + virCgroup *cgroup_temp = NULL;
> +
> + if (!virNumaIsAvailable() ||
> + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> + return;
> +
> + if (!(all_nodes = virNumaGetHostMemoryNodeset()))
> + goto error;
> +
> + if (!(mem_mask = virBitmapFormat(all_nodes)))
> + goto error;
> +
> + if ((virCgroupHasEmptyTasks(priv->cgroup,
> + VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
> + goto error;
> +
> + if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
> + goto error;
> +
> + for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
> + virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> + if (!vcpu->online)
> + continue;
> +
> + if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
> + false, &cgroup_temp) < 0 ||
> + virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> + virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> + virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> + goto cleanup;
> +
> + g_free(nodeset);
> + virCgroupFree(cgroup_temp);
> + }
> +
> + for (i = 0; i < vm->def->niothreadids; i++) {
> + if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
> + vm->def->iothreadids[i]->iothread_id,
> + false, &cgroup_temp) < 0 ||
> + virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> + virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> + virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> + goto cleanup;
> +
> + g_free(nodeset);
> + virCgroupFree(cgroup_temp);
> + }
> +
> + if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
> + false, &cgroup_temp) < 0 ||
> + virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> + virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> + virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> + goto cleanup;
> +
> + cleanup:
> + virCgroupFree(cgroup_temp);
> + return;
Up there in the cgroup_temp declaration you can use
g_autoptr(virCgroup) cgroup_temp = NULL;
And then you won't need a 'cleanup' label.
> +
> + error:
Wrong label indentation.
> + virResetLastError();
> + VIR_DEBUG("Couldn't restore cgroups to meaningful state");
> + goto cleanup;
> +}
> +
> +int
> +chConnectCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
> +
> + if (!priv->driver->privileged)
> + return 0;
> +
> + if (!virCgroupAvailable())
> + return 0;
> +
> + virCgroupFree(priv->cgroup);
> +
> + if (virCgroupNewDetectMachine(vm->def->name,
> + "ch",
> + vm->pid,
> + cfg->cgroupControllers,
> + priv->machineName, &priv->cgroup) < 0)
> + return -1;
> +
> + chRestoreCgroupState(vm);
> + return 0;
> +}
> +
> +int
> +chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (!vm->pid) {
> + virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Cannot setup cgroups until process is started"));
> + return -1;
> + }
> +
> + if (chInitCgroup(vm, nnicindexes, nicindexes) < 0)
> + return -1;
> +
> + if (!priv->cgroup)
> + return 0;
> +
> + if (chSetupBlkioCgroup(vm) < 0)
> + return -1;
> +
> + if (chSetupMemoryCgroup(vm) < 0)
> + return -1;
> +
> + if (chSetupCpuCgroup(vm) < 0)
> + return -1;
> +
> + if (chSetupCpusetCgroup(vm) < 0)
> + return -1;
> +
> + return 0;
> +}
> +
> +int
> +chSetupCgroupVcpuBW(virCgroup * cgroup,
> + unsigned long long period, long long quota)
> +{
> + return virCgroupSetupCpuPeriodQuota(cgroup, period, quota);
> +}
> +
> +
> +int
> +chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask)
> +{
> + return virCgroupSetupCpusetCpus(cgroup, cpumask);
> +}
> +
> +int
> +chSetupGlobalCpuCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> + unsigned long long period = vm->def->cputune.global_period;
> + long long quota = vm->def->cputune.global_quota;
> + g_autofree char *mem_mask = NULL;
> + virDomainNumatuneMemMode mem_mode;
> +
> + if ((period || quota) &&
> + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("cgroup cpu is required for scheduler tuning"));
> + return -1;
> + }
> +
> + /*
> + * If CPU cgroup controller is not initialized here, then we need
> + * neither period nor quota settings. And if CPUSET controller is
> + * not initialized either, then there's nothing to do anyway.
> + */
> + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
> + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> + return 0;
> +
> +
> + if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
> + virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
> + priv->autoNodeset,
> + &mem_mask, -1) < 0)
> + return -1;
> +
> + if (period || quota) {
> + if (chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0)
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +
> +int
> +chRemoveCgroup(virDomainObj * vm)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (priv->cgroup == NULL)
> + return 0; /* Not supported, so claim success */
> +
> + if (virCgroupTerminateMachine(priv->machineName) < 0) {
> + if (!virCgroupNewIgnoreError())
> + VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
> + }
> +
> + return virCgroupRemove(priv->cgroup);
> +}
> +
> +
> +static void
> +chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data)
> +{
> + if (!data)
> + return;
> +
> + virCgroupFree(data->emulatorCgroup);
> + g_free(data->emulatorMemMask);
> + g_free(data);
> +}
> +
> +
> +/**
> + * chCgroupEmulatorAllNodesAllow:
> + * @cgroup: domain cgroup pointer
> + * @retData: filled with structure used to roll back the operation
> + *
> + * Allows all NUMA nodes for the cloud hypervisor thread temporarily. This is
> + * necessary when hotplugging cpus since it requires memory allocated in the
> + * DMA region. Afterwards the operation can be reverted by
> + * chCgroupEmulatorAllNodesRestore.
> + *
> + * Returns 0 on success -1 on error
> + */
> +int
> +chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
> + chCgroupEmulatorAllNodesData ** retData)
> +{
> + chCgroupEmulatorAllNodesData *data = NULL;
> + g_autofree char *all_nodes_str = NULL;
> +
Extra blank line in the middle of the variable declarations.
> + g_autoptr(virBitmap) all_nodes = NULL;
> + int ret = -1;
> +
> + if (!virNumaIsAvailable() ||
> + !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> + return 0;
> +
> + if (!(all_nodes = virNumaGetHostMemoryNodeset()))
> + goto cleanup;
> +
> + if (!(all_nodes_str = virBitmapFormat(all_nodes)))
> + goto cleanup;
> +
> + data = g_new0(chCgroupEmulatorAllNodesData, 1);
> +
> + if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
> + false, &data->emulatorCgroup) < 0)
> + goto cleanup;
> +
> + if (virCgroupGetCpusetMems(data->emulatorCgroup, &data->emulatorMemMask) < 0
> + || virCgroupSetCpusetMems(data->emulatorCgroup, all_nodes_str) < 0)
> + goto cleanup;
> +
> + *retData = g_steal_pointer(&data);
> + ret = 0;
> +
> + cleanup:
Wrong label indentation.
> + chCgroupEmulatorAllNodesDataFree(data);
> +
> + return ret;
> +}
> +
> +
> +/**
> + * chCgroupEmulatorAllNodesRestore:
> + * @data: data structure created by chCgroupEmulatorAllNodesAllow
> + *
> + * Rolls back the setting done by chCgroupEmulatorAllNodesAllow and frees the
> + * associated data.
> + */
> +void
> +chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data)
> +{
> + virError *err;
> +
> + if (!data)
> + return;
> +
> + virErrorPreserveLast(&err);
> + virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask);
> + virErrorRestore(&err);
> +
> + chCgroupEmulatorAllNodesDataFree(data);
> +}
> diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h
> new file mode 100644
> index 0000000000..0152b5477c
> --- /dev/null
> +++ b/src/ch/ch_cgroup.h
> @@ -0,0 +1,45 @@
> +/*
> + * ch_cgroup.h: CH cgroup management
> + *
> + * Copyright Microsoft Corp. 2020-2021
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#pragma once
> +
> +#include "virusb.h"
> +#include "vircgroup.h"
> +#include "domain_conf.h"
> +#include "ch_conf.h"
> +
> +int chConnectCgroup(virDomainObj * vm);
> +int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes);
> +int chSetupCgroupVcpuBW(virCgroup * cgroup,
> + unsigned long long period, long long quota);
> +int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask);
> +int chSetupGlobalCpuCgroup(virDomainObj * vm);
> +int chRemoveCgroup(virDomainObj * vm);
> +
> +typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData;
> +
> +struct _chCgroupEmulatorAllNodesData {
> + virCgroup *emulatorCgroup;
> + char *emulatorMemMask;
> +};
> +
> +int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
> + chCgroupEmulatorAllNodesData ** data);
> +void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data);
> diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c
> index ed0fffe5d6..7f70452296 100644
> --- a/src/ch/ch_conf.c
> +++ b/src/ch/ch_conf.c
> @@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged)
> if (!(cfg = virObjectNew(virCHDriverConfigClass)))
> return NULL;
>
> + cfg->cgroupControllers = -1; /* Auto detect */
> +
> if (privileged) {
> if (virGetUserID(CH_USER, &cfg->user) < 0)
> return NULL;
> diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h
> index 49f286f97a..19deb8e568 100644
> --- a/src/ch/ch_conf.h
> +++ b/src/ch/ch_conf.h
> @@ -35,11 +35,13 @@ struct _virCHDriverConfig {
>
> char *stateDir;
> char *logDir;
> -
> + int cgroupControllers;
> uid_t user;
> gid_t group;
> };
>
> +G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
> +
> struct _virCHDriver
> {
> virMutex lock;
> diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c
> index e1030800aa..d0aaeed1f4 100644
> --- a/src/ch/ch_domain.c
> +++ b/src/ch/ch_domain.c
> @@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
> _("Serial can only be enabled for a PTY"));
> return -1;
> }
> + return 0;
> +}
> +int
> +virCHDomainRefreshThreadInfo(virDomainObj *vm)
> +{
> + size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
> + virCHMonitorThreadInfo *info = NULL;
> + size_t nthreads, ncpus = 0;
> + size_t i;
> +
> + nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
> + true, &info);
> +
> + for (i = 0; i < nthreads; i++) {
> + virCHDomainVcpuPrivate *vcpupriv;
> + virDomainVcpuDef *vcpu;
> + virCHMonitorCPUInfo *vcpuInfo;
> +
> + if (info[i].type != virCHThreadTypeVcpu)
> + continue;
> +
> + // TODO: hotplug support
> + vcpuInfo = &info[i].vcpuInfo;
> + vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid);
> + vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
> + vcpupriv->tid = vcpuInfo->tid;
> + ncpus++;
> + }
> +
> + // TODO: Remove the warning when hotplug is implemented.
> + if (ncpus != maxvcpus)
> + VIR_WARN("Mismatch in the number of cpus, expected: %ld, actual: %ld",
> + maxvcpus, ncpus);
>
> return 0;
> }
> diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h
> index 3ac3421015..2ce3e2cef3 100644
> --- a/src/ch/ch_domain.h
> +++ b/src/ch/ch_domain.h
> @@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job)
> void
> virCHDomainObjEndJob(virDomainObj *obj);
>
> -int virCHDomainRefreshVcpuInfo(virDomainObj *vm);
> +int virCHDomainRefreshThreadInfo(virDomainObj *vm);
> +
> pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid);
> bool virCHDomainHasVcpuPids(virDomainObj *vm);
>
> diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c
> index c0ae031200..095779cb3f 100644
> --- a/src/ch/ch_monitor.c
> +++ b/src/ch/ch_monitor.c
> @@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
>
> static virClass *virCHMonitorClass;
> static void virCHMonitorDispose(void *obj);
> +static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
>
> static int virCHMonitorOnceInit(void)
> {
> @@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque)
> virCHMonitor *mon = opaque;
>
> VIR_DEBUG("mon=%p", mon);
> + virCHMonitorThreadInfoFree(mon);
> virObjectUnref(mon->vm);
> }
>
> @@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response
> return ret;
> }
>
> +/**
> + * virCHMonitorGetInfo:
> + * @mon: Pointer to the monitor
> + * @info: Get VM info
> + *
> + * Retrieve the VM info and store in @info
> + *
> + * Returns 0 on success.
> + */
> +int
> +virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
> +{
> + return virCHMonitorGet(mon, URL_VM_INFO, info);
> +}
> +
> +static void
> +virCHMonitorThreadInfoFree(virCHMonitor *mon)
> +{
> + mon->nthreads = 0;
> + if (mon->threads)
> + VIR_FREE(mon->threads);
> +}
> +
> +static size_t
> +virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
> +{
> + virCHMonitorThreadInfo *info = NULL;
> + g_autofree pid_t *tids = NULL;
> + virDomainObj *vm = mon->vm;
> + size_t ntids = 0;
> + size_t i;
> +
> +
> + virCHMonitorThreadInfoFree(mon);
> + if (virProcessGetPids(vm->pid, &ntids, &tids) < 0) {
> + mon->threads = NULL;
> + return 0;
> + }
> +
> + info = g_new0(virCHMonitorThreadInfo, ntids);
> + for (i = 0; i < ntids; i++) {
> + g_autofree char *proc = NULL;
> + g_autofree char *data = NULL;
> +
> + proc = g_strdup_printf("/proc/%d/task/%d/comm",
> + (int)vm->pid, (int)tids[i]);
> +
> + if (virFileReadAll(proc, (1<<16), &data) < 0) {
> + continue;
> + }
> +
> + VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
> + (int)vm->pid, (int)tids[i], data);
> + if (STRPREFIX(data, "vcpu")) {
> + int cpuid;
> + char *tmp;
> + if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
> + VIR_WARN("Index is not specified correctly");
> + continue;
> + }
> + info[i].type = virCHThreadTypeVcpu;
> + info[i].vcpuInfo.tid = tids[i];
> + info[i].vcpuInfo.online = true;
> + info[i].vcpuInfo.cpuid = cpuid;
> + VIR_DEBUG("vcpu%d -> tid: %d", cpuid, tids[i]);
> + } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
> + STRPREFIX(data, "_rng")) {
> + /* Prefixes used by cloud-hypervisor for IO Threads are captured at
> + https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs */
> + info[i].type = virCHThreadTypeIO;
> + info[i].ioInfo.tid = tids[i];
> + virStrcpy(info[i].ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
> + }else {
> + info[i].type = virCHThreadTypeEmulator;
> + info[i].emuInfo.tid = tids[i];
> + virStrcpy(info[i].emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
> + }
> + mon->nthreads++;
> +
> + }
> + mon->threads = info;
> +
> + return mon->nthreads;
> +}
> +
> +/**
> + * virCHMonitorGetThreadInfo:
> + * @mon: Pointer to the monitor
> + * @refresh: Refresh thread info or not
> + *
> + * Retrieve thread info and store it in @threads
> + *
> + * Returns count of threads on success.
> + */
> +size_t
> +virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
> + virCHMonitorThreadInfo **threads)
> +{
> + int nthreads = 0;
> +
> + if (refresh)
> + nthreads = virCHMonitorRefreshThreadInfo(mon);
> +
> + *threads = mon->threads;
> +
> + return nthreads;
> +}
> +
> int
> virCHMonitorShutdownVMM(virCHMonitor *mon)
> {
> @@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon)
> {
> return virCHMonitorPutNoContent(mon, URL_VM_RESUME);
> }
> -
> -/**
> - * virCHMonitorGetInfo:
> - * @mon: Pointer to the monitor
> - * @info: Get VM info
> - *
> - * Retrieve the VM info and store in @info
> - *
> - * Returns 0 on success.
> - */
> -int
> -virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
> -{
> - return virCHMonitorGet(mon, URL_VM_INFO, info);
> -}
> diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h
> index 8ca9e17a9a..f8c3fa75e8 100644
> --- a/src/ch/ch_monitor.h
> +++ b/src/ch/ch_monitor.h
> @@ -37,6 +37,50 @@
> #define URL_VM_RESUME "vm.resume"
> #define URL_VM_INFO "vm.info"
>
> +#define VIRCH_THREAD_NAME_LEN 16
> +
> +typedef enum {
> + virCHThreadTypeEmulator,
> + virCHThreadTypeVcpu,
> + virCHThreadTypeIO,
> + virCHThreadTypeMax
> +} virCHThreadType;
> +
> +typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
> +
> +struct _virCHMonitorCPUInfo {
> + int cpuid;
> + pid_t tid;
> +
> + bool online;
> +};
> +
> +typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
> +
> +struct _virCHMonitorEmuThreadInfo {
> + char thrName[VIRCH_THREAD_NAME_LEN];
> + pid_t tid;
> +};
> +
> +typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
> +
> +struct _virCHMonitorIOThreadInfo {
> + char thrName[VIRCH_THREAD_NAME_LEN];
> + pid_t tid;
> +};
> +
> +typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
> +
> +struct _virCHMonitorThreadInfo {
> + virCHThreadType type;
> +
> + union {
> + virCHMonitorCPUInfo vcpuInfo;
> + virCHMonitorEmuThreadInfo emuInfo;
> + virCHMonitorIOThreadInfo ioInfo;
> + };
> +};
> +
> typedef struct _virCHMonitor virCHMonitor;
>
> struct _virCHMonitor {
> @@ -49,6 +93,9 @@ struct _virCHMonitor {
> pid_t pid;
>
> virDomainObj *vm;
> +
> + size_t nthreads;
> + virCHMonitorThreadInfo *threads;
> };
>
> virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
> @@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
> int virCHMonitorResumeVM(virCHMonitor *mon);
> int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
>
> -typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
> -struct _virCHMonitorCPUInfo {
> - pid_t tid;
> - bool online;
> -};
> void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
> int virCHMonitorGetCPUInfo(virCHMonitor *mon,
> virCHMonitorCPUInfo **vcpus,
> size_t maxvcpus);
> +size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
> + virCHMonitorThreadInfo **threads);
> diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c
> index 3b7f6fcddf..8dce737adb 100644
> --- a/src/ch/ch_process.c
> +++ b/src/ch/ch_process.c
> @@ -26,6 +26,8 @@
> #include "ch_domain.h"
> #include "ch_monitor.h"
> #include "ch_process.h"
> +#include "ch_cgroup.h"
> +#include "virnuma.h"
> #include "viralloc.h"
> #include "virerror.h"
> #include "virjson.h"
> @@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm)
> return 0;
> }
>
> +static int
> +virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
> +{
> + *cpumapRet = NULL;
> +
> + if (!virHostCPUHasBitmap())
> + return 0;
> +
> + if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
> + return -1;
> +
> + return 0;
> +}
> +
> +#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
> +static int
> +virCHProcessInitCpuAffinity(virDomainObj *vm)
> +{
> + g_autoptr(virBitmap) cpumapToSet = NULL;
> + virDomainNumatuneMemMode mem_mode;
> + virCHDomainObjPrivate *priv = vm->privateData;
> +
> + if (!vm->pid) {
> + virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Cannot setup CPU affinity until process is started"));
> + return -1;
> + }
> +
> + if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
> + virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
> + virBitmap *nodeset = NULL;
> +
> + if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
> + priv->autoNodeset,
> + &nodeset,
> + -1) < 0)
> + return -1;
> +
> + if (virNumaNodesetToCPUset(nodeset, &cpumapToSet) < 0)
> + return -1;
> + } else if (vm->def->cputune.emulatorpin) {
> + if (!(cpumapToSet = virBitmapNewCopy(vm->def->cputune.emulatorpin)))
> + return -1;
> + } else {
> + if (virCHProcessGetAllCpuAffinity(&cpumapToSet) < 0)
> + return -1;
> + }
> +
> + if (cpumapToSet &&
> + virProcessSetAffinity(vm->pid, cpumapToSet, false) < 0) {
> + return -1;
> + }
> +
> + return 0;
> +}
> +#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
> +static int
> +virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
> +{
> + return 0;
> +}
> +#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
> +
> +/**
> + * virCHProcessSetupPid:
> + *
> + * This function sets resource properties (affinity, cgroups,
> + * scheduler) for any PID associated with a domain. It should be used
> + * to set up emulator PIDs as well as vCPU and I/O thread pids to
> + * ensure they are all handled the same way.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +static int
> +virCHProcessSetupPid(virDomainObj *vm,
> + pid_t pid,
> + virCgroupThreadName nameval,
> + int id,
> + virBitmap *cpumask,
> + unsigned long long period,
> + long long quota,
> + virDomainThreadSchedParam *sched)
> +{
> + virCHDomainObjPrivate *priv = vm->privateData;
> + virDomainNumatuneMemMode mem_mode;
> + virCgroup *cgroup = NULL;
> + virBitmap *use_cpumask = NULL;
> + virBitmap *affinity_cpumask = NULL;
> + g_autoptr(virBitmap) hostcpumap = NULL;
> + g_autofree char *mem_mask = NULL;
> + int ret = -1;
> +
> + if ((period || quota) &&
> + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("cgroup cpu is required for scheduler tuning"));
> + goto cleanup;
> + }
> +
> + /* Infer which cpumask shall be used. */
> + if (cpumask) {
> + use_cpumask = cpumask;
> + } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
> + use_cpumask = priv->autoCpuset;
> + } else if (vm->def->cpumask) {
> + use_cpumask = vm->def->cpumask;
> + } else {
> + /* we can't assume cloud-hypervisor itself is running on all pCPUs,
> + * so we need to explicitly set the spawned instance to all pCPUs. */
> + if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
> + goto cleanup;
> + affinity_cpumask = hostcpumap;
> + }
> +
> + /*
> + * If CPU cgroup controller is not initialized here, then we need
> + * neither period nor quota settings. And if CPUSET controller is
> + * not initialized either, then there's nothing to do anyway.
> + */
> + if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
> + virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> +
> + if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
> + virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
> + priv->autoNodeset,
> + &mem_mask, -1) < 0)
> + goto cleanup;
> +
> + if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
> + goto cleanup;
> +
> + if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> + if (use_cpumask &&
> + chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
> + goto cleanup;
> +
> + if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
> + goto cleanup;
> +
> + }
> +
> + if ((period || quota) &&
> + chSetupCgroupVcpuBW(cgroup, period, quota) < 0)
> + goto cleanup;
> +
> + /* Move the thread to the sub dir */
> + VIR_INFO("Adding pid %d to cgroup", pid);
> + if (virCgroupAddThread(cgroup, pid) < 0)
> + goto cleanup;
> +
> + }
> +
> + if (!affinity_cpumask)
> + affinity_cpumask = use_cpumask;
> +
> + /* Setup legacy affinity. */
> + if (affinity_cpumask && virProcessSetAffinity(pid, affinity_cpumask, false) < 0)
> + goto cleanup;
> +
> + /* Set scheduler type and priority, but not for the main thread. */
> + if (sched &&
> + nameval != VIR_CGROUP_THREAD_EMULATOR &&
> + virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
> + goto cleanup;
> +
> + ret = 0;
> + cleanup:
> + if (cgroup) {
> + if (ret < 0)
> + virCgroupRemove(cgroup);
> + virCgroupFree(cgroup);
> + }
> +
> + return ret;
> +}
> +
> +/**
> + * virCHProcessSetupVcpu:
> + * @vm: domain object
> + * @vcpuid: id of VCPU to set defaults
> + *
> + * This function sets resource properties (cgroups, affinity, scheduler) for a
> + * vCPU. This function expects that the vCPU is online and the vCPU pids were
> + * correctly detected at the point when it's called.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +int
> +virCHProcessSetupVcpu(virDomainObj *vm,
> + unsigned int vcpuid)
> +{
> + pid_t vcpupid = virCHDomainGetVcpuPid(vm, vcpuid);
> + virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, vcpuid);
> +
> + return virCHProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
> + vcpuid, vcpu->cpumask,
> + vm->def->cputune.period,
> + vm->def->cputune.quota,
> + &vcpu->sched);
> +}
> +
> +static int
> +virCHProcessSetupVcpus(virDomainObj *vm)
> +{
> + virDomainVcpuDef *vcpu;
> + unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
> + size_t i;
> +
> + if ((vm->def->cputune.period || vm->def->cputune.quota) &&
> + !virCgroupHasController(((virCHDomainObjPrivate *) vm->privateData)->cgroup,
> + VIR_CGROUP_CONTROLLER_CPU)) {
> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("cgroup cpu is required for scheduler tuning"));
> + return -1;
> + }
> +
> + if (!virCHDomainHasVcpuPids(vm)) {
> + /* If any CPU has custom affinity that differs from the
> + * VM default affinity, we must reject it */
> + for (i = 0; i < maxvcpus; i++) {
> + vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> + if (!vcpu->online)
> + continue;
> +
> + if (vcpu->cpumask &&
> + !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
> + virReportError(VIR_ERR_OPERATION_INVALID, "%s",
> + _("cpu affinity is not supported"));
> + return -1;
> + }
> + }
> +
> + return 0;
> + }
> +
> + for (i = 0; i < maxvcpus; i++) {
> + vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> + if (!vcpu->online)
> + continue;
> +
> + if (virCHProcessSetupVcpu(vm, i) < 0)
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> /**
> * virCHProcessStart:
> * @driver: pointer to driver structure
> @@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver,
> }
> }
>
> + vm->pid = priv->monitor->pid;
> + vm->def->id = vm->pid;
> + priv->machineName = virCHDomainGetMachineName(vm);
> +
> + if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0)
> + goto cleanup;
> +
> + if (virCHProcessInitCpuAffinity(vm) < 0)
> + goto cleanup;
> +
> if (virCHMonitorBootVM(priv->monitor) < 0) {
> virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> _("failed to boot guest VM"));
> goto cleanup;
> }
>
> - priv->machineName = virCHDomainGetMachineName(vm);
> - vm->pid = priv->monitor->pid;
> - vm->def->id = vm->pid;
> + virCHDomainRefreshThreadInfo(vm);
>
> - virCHProcessUpdateInfo(vm);
> + VIR_DEBUG("Setting global CPU cgroup (if required)");
> + if (chSetupGlobalCpuCgroup(vm) < 0)
> + goto cleanup;
> +
> + VIR_DEBUG("Setting vCPU tuning/settings");
> + if (virCHProcessSetupVcpus(vm) < 0)
> + goto cleanup;
>
> + virCHProcessUpdateInfo(vm);
> virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
>
> return 0;
> @@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
> virDomainObj *vm,
> virDomainShutoffReason reason)
> {
> + int ret;
> + int retries = 0;
> virCHDomainObjPrivate *priv = vm->privateData;
>
> VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
> @@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
> priv->monitor = NULL;
> }
>
> + retry:
Wrong label indentation.
Daniel
> + if ((ret = chRemoveCgroup(vm)) < 0) {
> + if (ret == -EBUSY && (retries++ < 5)) {
> + g_usleep(200*1000);
> + goto retry;
> + }
> + VIR_WARN("Failed to remove cgroup for %s",
> + vm->def->name);
> + }
> +
> vm->pid = -1;
> vm->def->id = -1;
>
> diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h
> index abc4915979..800e3f4e23 100644
> --- a/src/ch/ch_process.h
> +++ b/src/ch/ch_process.h
> @@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
> int virCHProcessStop(virCHDriver *driver,
> virDomainObj *vm,
> virDomainShutoffReason reason);
> +
> +int virCHProcessSetupVcpu(virDomainObj *vm,
> + unsigned int vcpuid);
> diff --git a/src/ch/meson.build b/src/ch/meson.build
> index 2b2bdda26c..0b20de56fd 100644
> --- a/src/ch/meson.build
> +++ b/src/ch/meson.build
> @@ -1,6 +1,8 @@
> ch_driver_sources = [
> 'ch_conf.c',
> 'ch_conf.h',
> + 'ch_cgroup.c',
> + 'ch_cgroup.h',
> 'ch_domain.c',
> 'ch_domain.h',
> 'ch_driver.c',
>
More information about the libvir-list
mailing list