[libvirt] [PATCH v1 1/1] qemu_process.c: add CAP_IPC_LOCK when using libcap-ng
Daniel Henrique Barboza
danielhb413 at gmail.com
Wed Feb 6 17:05:37 UTC 2019
On 2/6/19 2:34 PM, Alex Williamson wrote:
> [cc +Alexey]
>
> On Wed, 6 Feb 2019 13:44:53 -0200
> Daniel Henrique Barboza <danielhb413 at gmail.com> wrote:
>
>> QEMU virtual machines with PCI passthrough of certain devices,
>> such as the NVIDIA Tesla V100 GPU, require allocation of a
>> number of memory pages that can break KVM limits. When that
>> happens, the KVM module checks if the process is IPC_LOCK capable
>> and, if it is not, it refuses to allocate the mem pages,
>> causing the guest to misbehave.
>>
>> When Libvirt is compiled to use libcap-ng support, the resulting
>> QEMU guest Libvirt creates does not have CAP_IPC_LOCK, hurting
>> guests that are doing PCI passthrough and that require extra
>> memory to work.
>>
>> This patch addresses this issue by checking whether we're
>> running with libcap-ng support (WITH_CAPNG) and if the
>> guest is using PCI passthrough with the VFIO driver. If those
>> conditions are met, allow CAP_IPC_LOCK capability to the
>> QEMU process.
> That effectively allows the QEMU process to lock all the memory it
> wants and thereby generate a denial of service on the host, which is
> the thing we're originally trying to prevent by accounting pinned pages
> and enforcing the user's locked memory limits. The commit log doesn't
> mention that this is only seen on POWER, but I infer that from the
> previous thread. Does this happen for any assigned device on POWER or
> is this something to do with the new NVLink support that Alexey has been
> adding? Thanks,
You're right on both counts. The behavior I am addressing here is seen
on a Power 9 server using a QEMU build with NVLink2 passthrough support
from Alexey.
Not all assigned devices on Power want to lock more memory than KVM allows,
at least as far as I can tell.
Regarding your argument that this allows QEMU to lock all memory, I agree
that is a problem. Aside from trusting that QEMU will not get greedy and lock
down all the host memory (which is what I'm doing here, in a sense), I
can only speculate about alternatives. I'd rather wait for Alexey's input.
Thanks,
DHB
>
> Alex
>
>> Signed-off-by: Daniel Henrique Barboza <danielhb413 at gmail.com>
>> ---
>> src/libvirt_private.syms | 1 +
>> src/qemu/qemu_process.c | 34 ++++++++++++++++++++++++++++++++++
>> src/util/virhostdev.c | 2 +-
>> src/util/virhostdev.h | 4 ++++
>> 4 files changed, 40 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>> index 6b401aa5e0..b7384a67a8 100644
>> --- a/src/libvirt_private.syms
>> +++ b/src/libvirt_private.syms
>> @@ -1992,6 +1992,7 @@ virHostCPUStatsAssign;
>>
>> # util/virhostdev.h
>> virHostdevFindUSBDevice;
>> +virHostdevGetPCIHostDeviceList;
>> virHostdevIsMdevDevice;
>> virHostdevIsSCSIDevice;
>> virHostdevManagerGetDefault;
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 0583eb03f2..24aef5904f 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -4997,6 +4997,38 @@ qemuProcessSetupRawIO(virQEMUDriverPtr driver,
>> }
>>
>>
>> +static void
>> +qemuProcessSetupVFIOCaps(virDomainObjPtr vm ATTRIBUTE_UNUSED,
>> + virCommandPtr cmd ATTRIBUTE_UNUSED)
>> +{
>> +#if WITH_CAPNG
>> + virPCIDeviceListPtr pcidevs = NULL;
>> + bool has_vfio = false;
>> + size_t i;
>> +
>> + pcidevs = virHostdevGetPCIHostDeviceList(vm->def->hostdevs,
>> + vm->def->nhostdevs);
>> + if (!pcidevs)
>> + return;
>> +
>> + for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) {
>> + virPCIDevicePtr pci = virPCIDeviceListGet(pcidevs, i);
>> + if (virPCIDeviceGetStubDriver(pci) == VIR_PCI_STUB_DRIVER_VFIO) {
>> + has_vfio = true;
>> + break;
>> + }
>> + }
>> +
>> + if (has_vfio) {
>> + VIR_DEBUG("Adding CAP_IPC_LOCK to QEMU process");
>> + virCommandAllowCap(cmd, CAP_IPC_LOCK);
>> + }
>> +
>> + virObjectUnref(pcidevs);
>> +#endif
>> +}
>> +
>> +
>> static int
>> qemuProcessSetupBalloon(virQEMUDriverPtr driver,
>> virDomainObjPtr vm,
>> @@ -6569,6 +6601,8 @@ qemuProcessLaunch(virConnectPtr conn,
>> if (qemuProcessSetupRawIO(driver, vm, cmd) < 0)
>> goto cleanup;
>>
>> + qemuProcessSetupVFIOCaps(vm, cmd);
>> +
>> virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData);
>> virCommandSetMaxProcesses(cmd, cfg->maxProcesses);
>> virCommandSetMaxFiles(cmd, cfg->maxFiles);
>> diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
>> index 6be395cdda..ee389a1729 100644
>> --- a/src/util/virhostdev.c
>> +++ b/src/util/virhostdev.c
>> @@ -218,7 +218,7 @@ virHostdevManagerGetDefault(void)
>> return virObjectRef(manager);
>> }
>>
>> -static virPCIDeviceListPtr
>> +virPCIDeviceListPtr
>> virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs, int nhostdevs)
>> {
>> virPCIDeviceListPtr pcidevs;
>> diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h
>> index 7263f320a2..9235581242 100644
>> --- a/src/util/virhostdev.h
>> +++ b/src/util/virhostdev.h
>> @@ -57,6 +57,10 @@ struct _virHostdevManager {
>> };
>>
>> virHostdevManagerPtr virHostdevManagerGetDefault(void);
>> +virPCIDeviceListPtr
>> +virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs,
>> + int nhostdevs)
>> + ATTRIBUTE_NONNULL(1);
>> int
>> virHostdevPreparePCIDevices(virHostdevManagerPtr hostdev_mgr,
>> const char *drv_name,
More information about the libvir-list
mailing list