[libvirt] [PATCH v8 1/5] nodeinfo: Fix output on PPC64 KVM hosts
Shivaprasad bhat
shivaprasadbhat at gmail.com
Wed Jul 29 07:42:37 UTC 2015
On Mon, Jul 27, 2015 at 1:38 PM, Andrea Bolognani <abologna at redhat.com> wrote:
> From: Shivaprasad G Bhat <sbhat at linux.vnet.ibm.com>
>
> The nodeinfo is reporting incorrect number of cpus and incorrect host
> topology on PPC64 KVM hosts. The KVM hypervisor on PPC64 needs only
> the primary thread in a core to be online, and the secondaries offlined.
> While scheduling a guest in, the kvm scheduler wakes up the secondaries to
> run in guest context.
>
> The host scheduling of the guests happens at the core level (as only primary
> thread is online). The kvm scheduler exploits as many threads of the core
> as needed by guest. Further, starting POWER8, the processor allows splitting
> a physical core into multiple subcores with 2 or 4 threads each. Again, only
> the primary thread in a subcore is online in the host. The KVM-PPC
> scheduler allows guests to exploit all the offline threads in the subcore,
> by bringing them online when needed.
> (Kernel patches on split-core http://www.spinics.net/lists/kvm-ppc/msg09121.html)
>
> The recent dynamic micro-threading changes in ppc-kvm make sure to
> utilize all the offline cpus across guests, even across guests with
> different cpu topologies.
> (https://www.mail-archive.com/kvm@vger.kernel.org/msg115978.html)
>
> Since the offline cpus are brought online in the guest context, it is safe
> to count them as online. Nodeinfo today discounts these offline cpus from
> cpu count/topology calculation, and the nodeinfo output is not of any help
> and the host appears overcommitted when it is actually not.
>
> The patch carefully counts those offline threads whose primary threads are
> online. The host topology displayed by the nodeinfo is also fixed when the
> host is in valid kvm state.
>
> Signed-off-by: Shivaprasad G Bhat <sbhat at linux.vnet.ibm.com>
> Signed-off-by: Andrea Bolognani <abologna at redhat.com>
> ---
> src/libvirt_private.syms | 1 +
> src/nodeinfo.c | 144 +++++++++++++++++++++++++++++++++++++++++++++--
> src/nodeinfo.h | 1 +
> 3 files changed, 142 insertions(+), 4 deletions(-)
>
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index ff322d6..0517c24 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -1010,6 +1010,7 @@ nodeGetMemoryParameters;
> nodeGetMemoryStats;
> nodeGetOnlineCPUBitmap;
> nodeGetPresentCPUBitmap;
> +nodeGetThreadsPerSubcore;
> nodeSetMemoryParameters;
>
>
> diff --git a/src/nodeinfo.c b/src/nodeinfo.c
> index ba633a1..53bbc54 100644
> --- a/src/nodeinfo.c
> +++ b/src/nodeinfo.c
> @@ -31,6 +31,12 @@
> #include <dirent.h>
> #include <sys/utsname.h>
> #include "conf/domain_conf.h"
> +#include <fcntl.h>
> +#include <sys/ioctl.h>
> +
> +#if HAVE_LINUX_KVM_H
> +# include <linux/kvm.h>
> +#endif
>
> #if defined(__FreeBSD__) || defined(__APPLE__)
> # include <sys/time.h>
> @@ -392,13 +398,14 @@ virNodeParseSocket(const char *dir,
> * filling arguments */
> static int
> ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(3)
> -ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(5)
> -ATTRIBUTE_NONNULL(6) ATTRIBUTE_NONNULL(7)
> -ATTRIBUTE_NONNULL(8)
> +ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(6)
> +ATTRIBUTE_NONNULL(7) ATTRIBUTE_NONNULL(8)
> +ATTRIBUTE_NONNULL(9)
> virNodeParseNode(const char *node,
> virArch arch,
> virBitmapPtr present_cpus_map,
> virBitmapPtr online_cpus_map,
> + int threads_per_subcore,
> int *sockets,
> int *cores,
> int *threads,
> @@ -492,7 +499,18 @@ virNodeParseNode(const char *node,
> continue;
>
> if (!virBitmapIsBitSet(online_cpus_map, cpu)) {
> - (*offline)++;
> + if (threads_per_subcore > 0 &&
> + cpu % threads_per_subcore != 0 &&
> + virBitmapIsBitSet(online_cpus_map,
> + cpu - (cpu % threads_per_subcore))) {
> + /* Secondary offline threads are counted as online when
> + * subcores are in use and the corresponding primary
> + * thread is online */
> + processors++;
> + } else {
> + /* But they are counted as offline otherwise */
> + (*offline)++;
> + }
> continue;
> }
>
> @@ -545,6 +563,12 @@ virNodeParseNode(const char *node,
> *cores = core;
> }
>
> + if (threads_per_subcore > 0) {
> + /* The thread count ignores offline threads, which means that only
> + * primary threads have been considered so far. If subcores
> + * are in use, we need to also account for secondary threads */
> + *threads *= threads_per_subcore;
> + }
> ret = processors;
>
> cleanup:
> @@ -563,6 +587,48 @@ virNodeParseNode(const char *node,
> return ret;
> }
>
> +/* Check whether the host subcore configuration is valid.
> + *
> + * A valid configuration is one where no secondary thread is online;
> + * the primary thread in a subcore is always the first one */
> +static bool
> +nodeHasValidSubcoreConfiguration(const char *sysfs_prefix,
> + int threads_per_subcore)
> +{
> + virBitmapPtr online_cpus = NULL;
> + int nonline_cpus;
> + int cpu;
> + bool ret = false;
> +
> + /* No point in checking if subcores are not in use */
> + if (threads_per_subcore <= 0)
> + goto cleanup;
> +
> + online_cpus = nodeGetOnlineCPUBitmap(sysfs_prefix);
> + if (online_cpus == NULL)
> + goto cleanup;
> +
> + nonline_cpus = virBitmapSize(online_cpus);
> +
> + for (cpu = 0; cpu < nonline_cpus; cpu++) {
> +
> + /* Skip primary threads */
> + if (cpu % threads_per_subcore == 0)
> + continue;
> +
> + /* A single online subthread is enough to make the
> + * configuration invalid */
> + if (virBitmapIsBitSet(online_cpus, cpu))
> + goto cleanup;
> + }
> +
> + ret = true;
> +
> + cleanup:
> + virBitmapFree(online_cpus);
> + return ret;
> +}
> +
> int
> linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
> FILE *cpuinfo,
> @@ -576,6 +642,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
> DIR *nodedir = NULL;
> struct dirent *nodedirent = NULL;
> int cpus, cores, socks, threads, offline = 0;
> + int threads_per_subcore = 0;
> unsigned int node;
> int ret = -1;
> char *sysfs_nodedir = NULL;
> @@ -683,6 +750,36 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
> goto fallback;
> }
>
> + /* PPC-KVM needs the secondary threads of a core to be offline on the
> + * host. The kvm scheduler brings the secondary threads online in the
> + * guest context. Moreover, P8 processor has split-core capability
> + * where, there can be 1,2 or 4 subcores per core. The primaries of the
> + * subcores alone will be online on the host for a subcore in the
> + * host. Even though the actual threads per core for P8 processor is 8,
> + * depending on the subcores_per_core = 1, 2 or 4, the threads per
> + * subcore will vary accordingly to 8, 4 and 2 respectively.
> + * So, on host threads_per_core what is arrived at from sysfs in the
> + * current logic is actually the subcores_per_core. Threads per subcore
> + * can only be obtained from the kvm device. For example, on P8 with 1
> + * core having 8 threads, sub_cores_percore=4, the threads 0,2,4 & 6
> + * will be online. The sysfs reflects this and in the current logic
> + * variable 'threads' will be 4 which is nothing but subcores_per_core.
> + * If the user tampers the cpu online/offline states using chcpu or other
> + * means, then it is an unsupported configuration for kvm.
> + * The code below tries to keep in mind
> + * - when the libvirtd is run inside a KVM guest or Phyp based guest.
> + * - Or on the kvm host where user manually tampers the cpu states to
> + * offline/online randomly.
> + * On hosts other than POWER this will be 0, in which case a simpler
> + * thread-counting logic will be used */
> + if ((threads_per_subcore = nodeGetThreadsPerSubcore(arch)) < 0)
> + goto cleanup;
> +
> + /* If the subcore configuration is not valid, just pretend subcores
> + * are not in use and count threads one by one */
> + if (!nodeHasValidSubcoreConfiguration(sysfs_prefix, threads_per_subcore))
> + threads_per_subcore = 0;
> +
> while ((direrr = virDirRead(nodedir, &nodedirent, sysfs_nodedir)) > 0) {
> if (sscanf(nodedirent->d_name, "node%u", &node) != 1)
> continue;
> @@ -696,6 +793,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
> if ((cpus = virNodeParseNode(sysfs_cpudir, arch,
> present_cpus_map,
> online_cpus_map,
> + threads_per_subcore,
> &socks, &cores,
> &threads, &offline)) < 0)
> goto cleanup;
> @@ -729,6 +827,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
> if ((cpus = virNodeParseNode(sysfs_cpudir, arch,
> present_cpus_map,
> online_cpus_map,
> + threads_per_subcore,
> &socks, &cores,
> &threads, &offline)) < 0)
> goto cleanup;
> @@ -2248,3 +2347,40 @@ nodeAllocPages(unsigned int npages,
> cleanup:
> return ret;
> }
> +
> +/* Get the number of threads per subcore.
> + *
> + * This will be 2, 4 or 8 on POWER hosts, depending on the current
> + * micro-threading configuration, and 0 everywhere else.
> + *
> + * Returns the number of threads per subcore if subcores are in use, zero
> + * if subcores are not in use, and a negative value on error */
> +int
> +nodeGetThreadsPerSubcore(virArch arch)
> +{
> + int kvmfd;
> + int threads_per_subcore = 0;
> +
> +#if HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT)
> + if (ARCH_IS_PPC64(arch)) {
> +
> + kvmfd = open("/dev/kvm", O_RDONLY);
> + if (kvmfd < 0) {
> + threads_per_subcore = -1;
It's okay for a guest to not have kvm/qemu packages installed, and open()
would fail.
The caller goes to cleanup because of -1 and we get this error
error: failed to get node information
error: An error occurred, but the cause is unknown
If we remove the -1 assignment we should be good. Even on a host, the
user might not want to use kvm; that case should also be handled with
the usual cpu counting.
Thanks,
Shivaprasad
> + goto out;
> + }
> +
> + /* For Phyp and KVM based guests the ioctl for KVM_CAP_PPC_SMT
> + * returns zero and both primary and secondary threads will be
> + * online */
> + threads_per_subcore = ioctl(kvmfd,
> + KVM_CHECK_EXTENSION,
> + KVM_CAP_PPC_SMT);
> +
> + out:
> + VIR_FORCE_CLOSE(kvmfd);
> + }
> +#endif /* HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT) */
> +
> + return threads_per_subcore;
> +}
> diff --git a/src/nodeinfo.h b/src/nodeinfo.h
> index 1810c1c..ac96dca 100644
> --- a/src/nodeinfo.h
> +++ b/src/nodeinfo.h
> @@ -47,6 +47,7 @@ int nodeGetMemory(unsigned long long *mem,
> virBitmapPtr nodeGetPresentCPUBitmap(const char *sysfs_prefix);
> virBitmapPtr nodeGetOnlineCPUBitmap(const char *sysfs_prefix);
> int nodeGetCPUCount(const char *sysfs_prefix);
> +int nodeGetThreadsPerSubcore(virArch arch);
>
> int nodeGetMemoryParameters(virTypedParameterPtr params,
> int *nparams,
> --
> 2.4.3
>
More information about the libvir-list
mailing list