[libvirt] [PATCH] qemu: Rework setting process affinity

Daniel P. Berrangé berrange at redhat.com
Wed Jan 30 14:34:38 UTC 2019


On Wed, Jan 30, 2019 at 02:56:46PM +0100, Michal Privoznik wrote:
> https://bugzilla.redhat.com/show_bug.cgi?id=1503284
> 
> The way we currently start qemu from CPU affinity POV is as
> follows:
> 
> 1) the child process is set affinity to all online CPUs (unless
> some vcpu pinning was given in the domain XML)
> 
> 2) Once qemu is running, cpuset cgroup is configured taking
> memory pinning into account
> 
> Problem is that we let qemu allocate its memory just anywhere in
> 1) and then rely in 2) to be able to move the memory to
> configured NUMA nodes. This might not be always possible (e.g.
> qemu might lock some parts of its memory) and is very suboptimal
> (copying large memory between NUMA nodes takes significant amount
> of time). The solution is to set the affinity correctly from the
> beginning and then possibly refine it later via cgroups.
> 
> Signed-off-by: Michal Privoznik <mprivozn at redhat.com>
> ---
>  src/qemu/qemu_process.c | 152 ++++++++++++++++++++++------------------
>  1 file changed, 83 insertions(+), 69 deletions(-)
> 
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 9ccc3601a2..a4668f6773 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -2435,6 +2435,44 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
>  }
>  
>  
> +static int
> +qemuProcessGetAllCpuAffinity(pid_t pid,
> +                             virBitmapPtr *cpumapRet)
> +{
> +    VIR_AUTOPTR(virBitmap) cpumap = NULL;
> +    VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
> +    int hostcpus;
> +
> +    *cpumapRet = NULL;
> +
> +    if (!virHostCPUHasBitmap())
> +        return 0;
> +
> +    if (!(hostcpumap = virHostCPUGetOnlineBitmap()) ||
> +        !(cpumap = virProcessGetAffinity(pid)))
> +        return -1;
> +
> +    if (!virBitmapEqual(hostcpumap, cpumap)) {
> +        /* setaffinity fails if you set bits for CPUs which
> +         * aren't present, so we have to limit ourselves */
> +        if ((hostcpus = virHostCPUGetCount()) < 0)
> +            return -1;
> +
> +        if (hostcpus > QEMUD_CPUMASK_LEN)
> +            hostcpus = QEMUD_CPUMASK_LEN;
> +
> +        virBitmapFree(cpumap);
> +        if (!(cpumap = virBitmapNew(hostcpus)))
> +            return -1;
> +
> +        virBitmapSetAll(cpumap);
> +    }
> +
> +    VIR_STEAL_PTR(*cpumapRet, cpumap);

IIUC, if the QEMU process affinity matches the current
set of online host CPUs, we just return the current QEMU
affinity.  Otherwise we construct a mask of all host CPUs,
but seemingly ignoring whether the host CPUs are online
or not.  Seems a bit odd to return list of online host CPUs
in one case, or a list of all possible host CPUs in the
other cases.

So I guess I'm wondering why this method is not simply
reduced to 1 single line

   *cpumapRet = virHostCPUGetOnlineBitmap();

?



> @@ -2454,59 +2492,36 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
>          return -1;
>      }
>  
> -    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
> -        VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
> -        cpumapToSet = priv->autoCpuset;
> +    /* Here is the deal, we can't set cpuset.mems before qemu is
> +     * started as it clashes with KVM allocation. Therefore we
> +     * used to let qemu allocate its memory anywhere as we would
> +     * then move the memory to desired NUMA node via CGroups.
> +     * However, that might not be always possible because qemu
> +     * might lock some parts of its memory (e.g. due to VFIO).
> +     * Solution is to set some temporary affinity now and then
> +     * fix it later, once qemu is already running. */
> +    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
> +        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
> +        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
> +                                             priv->autoNodeset,
> +                                             &cpumapToSet,
> +                                             -1) < 0)
> +            goto cleanup;
> +    } else if (vm->def->cputune.emulatorpin) {
> +        cpumapToSet = vm->def->cputune.emulatorpin;
>      } else {
> -        VIR_DEBUG("Set CPU affinity with specified cpuset");
> -        if (vm->def->cpumask) {
> -            cpumapToSet = vm->def->cpumask;
> -        } else {
> -            /* You may think this is redundant, but we can't assume libvirtd
> -             * itself is running on all pCPUs, so we need to explicitly set
> -             * the spawned QEMU instance to all pCPUs if no map is given in
> -             * its config file */
> -            int hostcpus;
> -
> -            if (virHostCPUHasBitmap()) {
> -                hostcpumap = virHostCPUGetOnlineBitmap();
> -                cpumap = virProcessGetAffinity(vm->pid);
> -            }
> -
> -            if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
> -                /* we're using all available CPUs, no reason to set
> -                 * mask. If libvirtd is running without explicit
> -                 * affinity, we can use hotplugged CPUs for this VM */
> -                ret = 0;
> -                goto cleanup;
> -            } else {
> -                /* setaffinity fails if you set bits for CPUs which
> -                 * aren't present, so we have to limit ourselves */
> -                if ((hostcpus = virHostCPUGetCount()) < 0)
> -                    goto cleanup;
> -
> -                if (hostcpus > QEMUD_CPUMASK_LEN)
> -                    hostcpus = QEMUD_CPUMASK_LEN;
> -
> -                virBitmapFree(cpumap);
> -                if (!(cpumap = virBitmapNew(hostcpus)))
> -                    goto cleanup;
> -
> -                virBitmapSetAll(cpumap);
> -
> -                cpumapToSet = cpumap;
> -            }
> -        }
> +        if (qemuProcessGetAllCpuAffinity(vm->pid, &hostcpumap) < 0)
> +            goto cleanup;
> +        cpumapToSet = hostcpumap;

IIUC, the end up setting affinity to one of (in priority order)

   - The CPUs associated with NUMA memory affinity mask
   - The CPUs associated with emulator pinning
   - All online host CPUs

Later (once QEMU has allocated its memory) we then change this
again to be simpler

   - The CPUs associated with emulator pinning
   - All online host CPUs

I think it would be nice to mention this explicitly in the
commit message, as it clarifies how we're solving the problem
the commit message mentions.

>      }
>  
> -    if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
> +    if (cpumapToSet &&
> +        virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
>          goto cleanup;
>  
>      ret = 0;
> -
>   cleanup:
> -    virBitmapFree(cpumap);
> -    virBitmapFree(hostcpumap);
>      return ret;
>  }
>  #else /* !defined(HAVE_SCHED_GETAFFINITY) && !defined(HAVE_BSD_CPU_AFFINITY) */
> @@ -2586,7 +2601,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
>      qemuDomainObjPrivatePtr priv = vm->privateData;
>      virDomainNumatuneMemMode mem_mode;
>      virCgroupPtr cgroup = NULL;
> -    virBitmapPtr use_cpumask;
> +    virBitmapPtr use_cpumask = NULL;
> +    VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
>      char *mem_mask = NULL;
>      int ret = -1;
>  
> @@ -2598,12 +2614,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
>      }
>  
>      /* Infer which cpumask shall be used. */
> -    if (cpumask)
> +    if (cpumask) {
>          use_cpumask = cpumask;
> -    else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
> +    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
>          use_cpumask = priv->autoCpuset;
> -    else
> +    } else if (vm->def->cpumask) {
>          use_cpumask = vm->def->cpumask;
> +    } else if (virHostCPUHasBitmap()) {
> +        /* You may think this is redundant, but we can't assume libvirtd
> +         * itself is running on all pCPUs, so we need to explicitly set
> +         * the spawned QEMU instance to all pCPUs if no map is given in
> +         * its config file */
> +        if (qemuProcessGetAllCpuAffinity(pid, &hostcpumap) < 0)
> +            goto cleanup;
> +        use_cpumask = hostcpumap;
> +    }
>  
>      /*
>       * If CPU cgroup controller is not initialized here, then we need
> @@ -2628,13 +2653,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
>                  qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
>                  goto cleanup;
>  
> -            /*
> -             * Don't setup cpuset.mems for the emulator, they need to
> -             * be set up after initialization in order for kvm
> -             * allocations to succeed.
> -             */
> -            if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
> -                mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
> +            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
>                  goto cleanup;
>  
>          }
> @@ -6634,12 +6653,7 @@ qemuProcessLaunch(virConnectPtr conn,
>  
>      /* This must be done after cgroup placement to avoid resetting CPU
>       * affinity */
> -    if (!vm->def->cputune.emulatorpin &&
> -        qemuProcessInitCpuAffinity(vm) < 0)
> -        goto cleanup;
> -
> -    VIR_DEBUG("Setting emulator tuning/settings");
> -    if (qemuProcessSetupEmulator(vm) < 0)
> +    if (qemuProcessInitCpuAffinity(vm) < 0)
>          goto cleanup;
>  
>      VIR_DEBUG("Setting cgroup for external devices (if required)");
> @@ -6708,10 +6722,6 @@ qemuProcessLaunch(virConnectPtr conn,
>      if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
>          goto cleanup;
>  
> -    VIR_DEBUG("Setting up post-init cgroup restrictions");
> -    if (qemuSetupCpusetMems(vm) < 0)
> -        goto cleanup;
> -
>      VIR_DEBUG("setting up hotpluggable cpus");
>      if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
>          if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
> @@ -6737,6 +6747,10 @@ qemuProcessLaunch(virConnectPtr conn,
>      if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
>          goto cleanup;
>  
> +    VIR_DEBUG("Setting emulator tuning/settings");
> +    if (qemuProcessSetupEmulator(vm) < 0)
> +        goto cleanup;
> +
>      VIR_DEBUG("Setting global CPU cgroup (if required)");
>      if (qemuSetupGlobalCpuCgroup(vm) < 0)
>          goto cleanup;
> -- 
> 2.19.2
> 
> --
> libvir-list mailing list
> libvir-list at redhat.com
> https://www.redhat.com/mailman/listinfo/libvir-list

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|




More information about the libvir-list mailing list