[libvirt] [PATCH v1 1/1] qemu_process.c: add CAP_IPC_LOCK when using libcap-ng

Daniel Henrique Barboza danielhb413 at gmail.com
Wed Feb 6 17:05:37 UTC 2019



On 2/6/19 2:34 PM, Alex Williamson wrote:
> [cc +Alexey]
>
> On Wed,  6 Feb 2019 13:44:53 -0200
> Daniel Henrique Barboza <danielhb413 at gmail.com> wrote:
>
>> QEMU virtual machines with PCI passthrough of certain devices,
>> such as the NVIDIA Tesla V100 GPU, requires allocation of an
>> amount of memory pages that can break KVM limits. When that
>> happens, the KVM module checks if the process is IPC_LOCK capable
>> and, in case it doesn't, it refuses to allocate the mem pages,
>> causing the guest to misbehave.
>>
>> When Libvirt is compiled to use libcap-ng support, the resulting
>> QEMU guest Libvirt creates does not have CAP_IPC_LOCK, hurting
>> guests that are doing PCI passthrough and that requires extra
>> memory to work.
>>
>> This patch addresses this issue by checking whether we're
>> running with libcap-ng support (WITH_CAPNG) and if the
>> guest is using PCI passthrough with the VFIO driver. If those
>> conditions are met, allow CAP_IPC_LOCK capability to the
>> QEMU process.
> That effectively allows the QEMU process to lock all the memory it
> wants and thereby generate a denial of service on the host, which is
> the thing we're originally trying to prevent by accounting pinned pages
> and enforcing the user's locked memory limits.  The commit log doesn't
> mention that this is only seen on POWER, but I infer that from the
> previous thread.  Does this happen for any assigned device on POWER or
> is this something to do with the new NVLink support that Alexey has been
> adding?  Thanks,

You're right on both counts. The behavior I am addressing here is seen on a
Power 9 server using a QEMU build with NVLink2 passthrough support
from Alexey.

Not all assigned devices on Power want to lock more memory than KVM allows,
at least as far as I can tell.

About your point that this allows QEMU to lock all the memory it wants, I
agree that it is a problem. Aside from trusting that QEMU will not get greedy
and lock down all the host memory (which is what I'm doing here, in a sense),
I can only speculate about alternatives. I'd rather wait for Alexey's input.



Thanks,


DHB




>
> Alex
>
>> Signed-off-by: Daniel Henrique Barboza <danielhb413 at gmail.com>
>> ---
>>   src/libvirt_private.syms |  1 +
>>   src/qemu/qemu_process.c  | 34 ++++++++++++++++++++++++++++++++++
>>   src/util/virhostdev.c    |  2 +-
>>   src/util/virhostdev.h    |  4 ++++
>>   4 files changed, 40 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>> index 6b401aa5e0..b7384a67a8 100644
>> --- a/src/libvirt_private.syms
>> +++ b/src/libvirt_private.syms
>> @@ -1992,6 +1992,7 @@ virHostCPUStatsAssign;
>>   
>>   # util/virhostdev.h
>>   virHostdevFindUSBDevice;
>> +virHostdevGetPCIHostDeviceList;
>>   virHostdevIsMdevDevice;
>>   virHostdevIsSCSIDevice;
>>   virHostdevManagerGetDefault;
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 0583eb03f2..24aef5904f 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -4997,6 +4997,38 @@ qemuProcessSetupRawIO(virQEMUDriverPtr driver,
>>   }
>>   
>>   
>> +static void
>> +qemuProcessSetupVFIOCaps(virDomainObjPtr vm ATTRIBUTE_UNUSED,
>> +                         virCommandPtr cmd ATTRIBUTE_UNUSED)
>> +{
>> +#if WITH_CAPNG
>> +    virPCIDeviceListPtr pcidevs = NULL;
>> +    bool has_vfio = false;
>> +    size_t i;
>> +
>> +    pcidevs = virHostdevGetPCIHostDeviceList(vm->def->hostdevs,
>> +		                             vm->def->nhostdevs);
>> +    if (!pcidevs)
>> +        return;
>> +
>> +    for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) {
>> +        virPCIDevicePtr pci = virPCIDeviceListGet(pcidevs, i);
>> +        if (virPCIDeviceGetStubDriver(pci) == VIR_PCI_STUB_DRIVER_VFIO) {
>> +	    has_vfio = true;
>> +	    break;
>> +	}
>> +    }
>> +
>> +    if (has_vfio) {
>> +        VIR_DEBUG("Adding CAP_IPC_LOCK to QEMU process");
>> +        virCommandAllowCap(cmd, CAP_IPC_LOCK);
>> +    }
>> +
>> +    virObjectUnref(pcidevs);
>> +#endif
>> +}
>> +
>> +
>>   static int
>>   qemuProcessSetupBalloon(virQEMUDriverPtr driver,
>>                           virDomainObjPtr vm,
>> @@ -6569,6 +6601,8 @@ qemuProcessLaunch(virConnectPtr conn,
>>       if (qemuProcessSetupRawIO(driver, vm, cmd) < 0)
>>           goto cleanup;
>>   
>> +    qemuProcessSetupVFIOCaps(vm, cmd);
>> +
>>       virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData);
>>       virCommandSetMaxProcesses(cmd, cfg->maxProcesses);
>>       virCommandSetMaxFiles(cmd, cfg->maxFiles);
>> diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
>> index 6be395cdda..ee389a1729 100644
>> --- a/src/util/virhostdev.c
>> +++ b/src/util/virhostdev.c
>> @@ -218,7 +218,7 @@ virHostdevManagerGetDefault(void)
>>       return virObjectRef(manager);
>>   }
>>   
>> -static virPCIDeviceListPtr
>> +virPCIDeviceListPtr
>>   virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs, int nhostdevs)
>>   {
>>       virPCIDeviceListPtr pcidevs;
>> diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h
>> index 7263f320a2..9235581242 100644
>> --- a/src/util/virhostdev.h
>> +++ b/src/util/virhostdev.h
>> @@ -57,6 +57,10 @@ struct _virHostdevManager {
>>   };
>>   
>>   virHostdevManagerPtr virHostdevManagerGetDefault(void);
>> +virPCIDeviceListPtr
>> +virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs,
>> +		               int nhostdevs)
>> +    ATTRIBUTE_NONNULL(1);
>>   int
>>   virHostdevPreparePCIDevices(virHostdevManagerPtr hostdev_mgr,
>>                               const char *drv_name,




More information about the libvir-list mailing list