[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

Re: [vfio-users] vfio passthrough devices behind pcie switcher problem



Sorry, here is the updated information.


I installed CentOS 7.3 on my 8-GPU machine yesterday and the passthrough succeeded: the VM guest OS can use the GPU with no problems. So I think this is a software problem, and I need to apply some patches.

I also ran a test on my 4-GPU machine without any software changes, and it succeeded. The 4 GPUs are attached directly to the PCI root without a PCIe switch, so I think the software problem is related to the PCIe switch.

Thank you, Alex.

[root 64 /data]# lspci|grep NV
04:00.0 3D controller: NVIDIA Corporation Device 17fd (rev a1)
05:00.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)
08:00.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)
09:00.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)
85:00.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)
86:00.0 3D controller: NVIDIA Corporation Device 17fd (rev a1)
89:00.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)
8a:00.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)


[root 64 /data]# lspci -t
-+-[0000:ff]-+-08.0
 |           +-08.2
 |           +-08.3
 |           +-09.0
 |           +-09.2
 |           +-09.3
 |           +-0b.0
 |           +-0b.1
 |           +-0b.2
 |           +-0b.3
 |           +-0c.0
 |           +-0c.1
 |           +-0c.2
 |           +-0c.3
 |           +-0c.4
 |           +-0c.5
 |           +-0c.6
 |           +-0c.7
 |           +-0d.0
 |           +-0d.1
 |           +-0d.2
 |           +-0d.3
 |           +-0d.4
 |           +-0d.5
 |           +-0f.0
 |           +-0f.1
 |           +-0f.2
 |           +-0f.3
 |           +-0f.4
 |           +-0f.5
 |           +-0f.6
 |           +-10.0
 |           +-10.1
 |           +-10.5
 |           +-10.6
 |           +-10.7
 |           +-12.0
 |           +-12.1
 |           +-12.4
 |           +-12.5
 |           +-13.0
 |           +-13.1
 |           +-13.2
 |           +-13.3
 |           +-13.6
 |           +-13.7
 |           +-14.0
 |           +-14.1
 |           +-14.2
 |           +-14.3
 |           +-14.4
 |           +-14.5
 |           +-14.6
 |           +-14.7
 |           +-16.0
 |           +-16.1
 |           +-16.2
 |           +-16.3
 |           +-16.6
 |           +-16.7
 |           +-17.0
 |           +-17.1
 |           +-17.2
 |           +-17.3
 |           +-17.4
 |           +-17.5
 |           +-17.6
 |           +-17.7
 |           +-1e.0
 |           +-1e.1
 |           +-1e.2
 |           +-1e.3
 |           +-1e.4
 |           +-1f.0
 |           \-1f.2
 +-[0000:80]-+-00.0-[81]--+-00.0
 |           |            \-00.1
 |           +-01.0-[82]----00.0
 |           +-02.0-[83-86]----00.0-[84-86]--+-08.0-[85]----00.0
 |           |                                            \-10.0-[86]----00.0
 |           +-03.0-[87-8a]----00.0-[88-8a]--+-08.0-[89]----00.0
 |           |                                            \-10.0-[8a]----00.0
 |           +-04.0
 |           +-04.1
 |           +-04.2
 |           +-04.3
 |           +-04.4
 |           +-04.5
 |           +-04.6
 |           +-04.7
 |           +-05.0
 |           +-05.1
 |           +-05.2
 |           \-05.4
 +-[0000:7f]-+-08.0
 |           +-08.2
 |           +-08.3
 |           +-09.0
 |           +-09.2
 |           +-09.3
 |           +-0b.0
 |           +-0b.1
 |           +-0b.2
 |           +-0b.3
 |           +-0c.0
 |           +-0c.1
 |           +-0c.2
 |           +-0c.3
 |           +-0c.4
 |           +-0c.5
 |           +-0c.6
 |           +-0c.7
 |           +-0d.0
 |           +-0d.1
 |           +-0d.2
 |           +-0d.3
 |           +-0d.4
 |           +-0d.5
 |           +-0f.0
 |           +-0f.1
 |           +-0f.2
 |           +-0f.3
 |           +-0f.4
 |           +-0f.5
 |           +-0f.6
 |           +-10.0
 |           +-10.1
 |           +-10.5
 |           +-10.6
 |           +-10.7
 |           +-12.0
 |           +-12.1
 |           +-12.4
 |           +-12.5
 |           +-13.0
 |           +-13.1
 |           +-13.2
 |           +-13.3
 |           +-13.6
 |           +-13.7
 |           +-14.0
 |           +-14.1
 |           +-14.2
 |           +-14.3
 |           +-14.4
 |           +-14.5
 |           +-14.6
 |           +-14.7
 |           +-16.0
 |           +-16.1
 |           +-16.2
 |           +-16.3
 |           +-16.6
 |           +-16.7
 |           +-17.0
 |           +-17.1
 |           +-17.2
 |           +-17.3
 |           +-17.4
 |           +-17.5
 |           +-17.6
 |           +-17.7
 |           +-1e.0
 |           +-1e.1
 |           +-1e.2
 |           +-1e.3
 |           +-1e.4
 |           +-1f.0
 |           \-1f.2
 \-[0000:00]-+-00.0
             +-01.0-[01]--
             +-02.0-[02-05]----00.0-[03-05]--+-08.0-[04]----00.0
             |                                            \-10.0-[05]----00.0
             +-03.0-[06-09]----00.0-[07-09]--+-08.0-[08]----00.0
             |                                            \-10.0-[09]----00.0
             +-04.0
             +-04.1
             +-04.2
             +-04.3
             +-04.4
             +-04.5
             +-04.6
             +-04.7
             +-05.0
             +-05.1
             +-05.2
             +-05.4
             +-11.0
             +-11.4
             +-14.0
             +-16.0
             +-16.1
             +-1a.0
             +-1c.0-[0a]--
             +-1c.7-[0b-0c]----00.0-[0c]----00.0
             +-1d.0
             +-1f.0
             +-1f.2


And here is the VM XML:

<domain type='kvm'>
  <name>win</name>
  <uuid>a2021423-89d8-4a33-aaa5-07102ae7ad4e</uuid>
  <memory unit='KiB'>8388608</memory>
  <currentMemory unit='KiB'>8388608</currentMemory>
  <vcpu placement='static' cpuset='0-8'>8</vcpu>
  <sysinfo type='smbios'>
    <system>
      <entry name='serial'>21aa32e5-8233-40d4-b323-128824f6becf</entry>
      <entry name='uuid'>a2021423-89d8-4a33-aaa5-07102ae7ad4e</entry>
    </system>
  </sysinfo>
  <os>
    <type arch='x86_64' machine='pc'>hvm</type>
    <boot dev='hd'/>
    <smbios mode='sysinfo'/>
  </os>
  <features>
    <acpi/>
    <apic/>
    <pae/>
    <hap/>
    <hyperv>
      <relaxed state='on'/>
    </hyperv>
  </features>
  <cpu>
    <topology sockets='2' cores='6' threads='2'/>
  </cpu>
  <clock offset='localtime'>
    <timer name='pit' tickpolicy='delay'/>
    <timer name='rtc' tickpolicy='catchup' track='guest'/>
    <timer name='hpet' present='no'/>
  </clock>
  <on_poweroff>destroy</on_poweroff>
  <on_reboot>restart</on_reboot>
  <on_crash>restart</on_crash>
  <devices>
    <emulator>/usr/local/bin/qemu-system-x86_64</emulator>
    <disk type='file' device='disk'>
      <driver name='qemu' type='qcow2' cache='none'/>
      <source file='/data/win.qcow2'/>
      <target dev='vda' bus='virtio'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x0'/>
    </disk>
    <controller type='usb' index='0'>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x2'/>
    </controller>
    <serial type='pty'>
      <target port='0'/>
    </serial>
    <console type='pty'>
      <target type='serial' port='0'/>
    </console>
    <input type='tablet' bus='usb'/>
    <input type='mouse' bus='ps2'/>
    <graphics type='vnc' port='-1' autoport='yes' listen='0.0.0.0' keymap='en-us'>
      <listen type='address' address='0.0.0.0'/>
    </graphics>
    <video>
      <model type='cirrus' vram='9216' heads='1'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/>
    </video>
    <memballoon model='virtio'>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x05' function='0x0'/>
    </memballoon>
    <hostdev mode='subsystem' type='pci' managed='yes'>
      <source>
        <address domain='0x0000' bus='0x84' slot='0x00' function='0x0'/>
      </source>
    </hostdev>
  </devices>
</domain>



2017-03-09 21:49 GMT+08:00 Alex Williamson <alex williamson redhat com>:
On Thu, 9 Mar 2017 11:47:32 +0800
rhett rhett <rhett kernel gmail com> wrote:

> Can somebody help me?

I asked for VM commandline or XML, you haven't provided it.  I asked
for lspci info, you haven't provided it.  Help us help you.

> 2017-03-08 14:34 GMT+08:00 rhett rhett <rhett kernel gmail com>:
>
> > here's some more error log from centos guest:
> >
> > Mar  7 05:38:07 localhost kernel: NVRM: loading NVIDIA UNIX x86_64 Kernel
> > Module  375.39  Tue Jan 31 20:47:00 PST 2017 (using threaded interrupts)
> > Mar  7 05:38:08 localhost kernel: nvidia-modeset: Loading NVIDIA Kernel
> > Mode Setting Driver for UNIX platforms  375.39  Tue Jan 31 19:41:48 PST 2017
> > Mar  7 05:39:27 localhost kernel: NVRM: RmInitAdapter failed!
> > (0x24:0x51:1060)
> > Mar  7 05:39:27 localhost kernel: NVRM: rm_init_adapter failed for device
> > bearing minor number 0
> > Mar  7 05:43:40 localhost kernel: NVRM: RmInitAdapter failed!
> > (0x24:0x51:1060)
> > Mar  7 05:43:40 localhost kernel: NVRM: rm_init_adapter failed for device
> > bearing minor number 0
> > Mar  8 05:07:47 localhost kernel: nvidia: module license 'NVIDIA' taints
> > kernel.
> > Mar  8 05:07:47 localhost kernel: NVRM: loading NVIDIA UNIX x86_64 Kernel
> > Module
> >
> > 2017-03-08 14:31 GMT+08:00 rhett rhett <rhett kernel gmail com>:
> >
> >> I have two guests, a Windows Server 2008 and a CentOS 7.2. In Windows,
> >> the Device Manager says the GPU can't start (error code 10).
> >> In CentOS, when I run nvidia-smi, it reports that no device was found.
> >>
> >> There is no special VM configuration; with the same config, I can use the
> >> GPU successfully on my two-GPU server. The biggest difference is that that
> >> server has no PCIe switch.
> >>
> >> 2017-03-08 11:55 GMT+08:00 Alex Williamson <alex williamson redhat com>:
> >>
> >>> On Wed, 8 Mar 2017 11:26:17 +0800
> >>> rhett rhett <rhett kernel gmail com> wrote:
> >>>
> >>> > The two GPUs share the same IRQ, and I found the reason: because MSI is
> >>> > disabled later, IRQ 140 is being reused.
> >>> >
> >>> > But I don't know why somebody calls vfio_pci_ioctl to disable MSI.
> >>>
> >>> vfio just does what the guest requests, but you're really providing
> >>> hardly any more information than when you asked off list.  My wild
> >>> guess, is that maybe you're running a Windows guest and not configuring
> >>> the VM for a vCPU type where Windows supports MSI.  For more
> >>> assistance, please provide basic information, like the QEMU command
> >>> line or VM XML, also the PCI information from the host (sudo lspci
> >>> -vvv), and of course any error codes in the guest or an actual
> >>> description of how the device doesn't work in the guest.  Thanks,
> >>>
> >>> Alex
> >>>
> >>>
> >>> > 2017-03-08 10:55 GMT+08:00 rhett rhett <rhett kernel gmail com>:
> >>> >
> >>> > > I have a question about VFIO; here is my description.
> >>> > >
> >>> > > I have 8 GPUs in my server machine, but they are all behind a PCIe
> >>> > > bridge. When I set up VFIO passthrough, I can't use the GPUs in my
> >>> > > guest OS.
> >>> > > dmesg shows the following message
> >>> > >
> >>> > > [  662.208072] vfio-pci 0000:87:00.0: irq 140 for MSI/MSI-X
> >>> > > [  725.761623] vfio-pci 0000:04:00.0: irq 140 for MSI/MSI-X
> >>> > >
> >>> > > I started two VMs, one using 87 and the other using 04; dmesg shows that
> >>> > > they share the same IRQ 140. Is this normal?
> >>> > >
> >>> > > I also looked at the IOMMU groups: each GPU is in a separate group, with
> >>> > > no other device in the group. So does this mean ACS works correctly?
> >>> > >
> >>> > > Hope to get your help!
> >>> > >
> >>>
> >>>
> >>
> >



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]