[RFC] hw/display: add blocklist for known bad drivers

Message ID 20250221160101.2318357-1-alex.bennee@linaro.org
State New
Series [RFC] hw/display: add blocklist for known bad drivers

Commit Message

Alex Bennée Feb. 21, 2025, 4:01 p.m. UTC
While running the new GPU tests it was noted that the proprietary
nVidia driver barfed when run under the sanitiser:

  2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
  EOTF mode SDR and colorimetry mode default.
  2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
  profile: stock sRGB color profile

  and that's the last thing it outputs.

  The sanitizer reports that when the framework sends the SIGTERM
  because of the timeout we get a write to a NULL pointer (but
  interestingly not this time in an atexit callback):

  UndefinedBehaviorSanitizer:DEADLYSIGNAL
  ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
  0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
  T471863)
  ==471863==The signal is caused by a WRITE memory access.
  ==471863==Hint: address points to the zero page.
      #0 0x7a18ceaafe80
  (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
  (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
      #1 0x7a18ce9e72c0
  (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
  (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
      #2 0x7a18ce9f11bb
  (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
  (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
      #3 0x7a18ce6dc9d1
  (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
  (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
      #4 0x7a18e7d15326 in vrend_renderer_create_fence
  /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
      #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd

The #dri-devel channel confirmed:

  <digetx> stsquad: nv driver is known to not work with venus, don't use
      it for testing

So let's implement a blocklist to stop users starting a known-bad
setup.

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Dmitry Osipenko <dmitry.osipenko@collabora.com>
---
 meson.build                               |   4 +
 include/qemu/host-gpu.h                   |  23 +++++
 hw/display/virtio-gpu.c                   |   4 +
 stubs/host-gpu.c                          |  17 ++++
 util/host-gpu.c                           | 102 ++++++++++++++++++++++
 stubs/meson.build                         |   4 +
 tests/functional/test_aarch64_virt_gpu.py |   2 +
 util/meson.build                          |   2 +
 8 files changed, 158 insertions(+)
 create mode 100644 include/qemu/host-gpu.h
 create mode 100644 stubs/host-gpu.c
 create mode 100644 util/host-gpu.c

Comments

Philippe Mathieu-Daudé Feb. 21, 2025, 4:34 p.m. UTC | #1
On 21/2/25 17:01, Alex Bennée wrote:
> While running the new GPU tests it was noted that the proprietary
> nVidia driver barfed when run under the sanitiser:
> 
>    2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
>    EOTF mode SDR and colorimetry mode default.
>    2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
>    profile: stock sRGB color profile
> 
>    and that's the last thing it outputs.
> 
>    The sanitizer reports that when the framework sends the SIGTERM
>    because of the timeout we get a write to a NULL pointer (but
>    interesting not this time in an atexit callback):
> 
>    UndefinedBehaviorSanitizer:DEADLYSIGNAL
>    ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
>    0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
>    T471863)
>    ==471863==The signal is caused by a WRITE memory access.
>    ==471863==Hint: address points to the zero page.
>        #0 0x7a18ceaafe80
>    (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
>    (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>        #1 0x7a18ce9e72c0
>    (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
>    (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>        #2 0x7a18ce9f11bb
>    (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
>    (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>        #3 0x7a18ce6dc9d1
>    (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
>    (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>        #4 0x7a18e7d15326 in vrend_renderer_create_fence
>    /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
>        #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd
> 
> The #dri-devel channel confirmed:
> 
>    <digetx> stsquad: nv driver is known to not work with venus, don't use
>        it for testing
> 
> So lets implement a blocklist to stop users starting a known bad
> setup.
> 
> Reported-by: Peter Maydell <peter.maydell@linaro.org>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Dmitry Osipenko <dmitry.osipenko@collabora.com>
> ---
>   meson.build                               |   4 +
>   include/qemu/host-gpu.h                   |  23 +++++
>   hw/display/virtio-gpu.c                   |   4 +
>   stubs/host-gpu.c                          |  17 ++++
>   util/host-gpu.c                           | 102 ++++++++++++++++++++++
>   stubs/meson.build                         |   4 +
>   tests/functional/test_aarch64_virt_gpu.py |   2 +
>   util/meson.build                          |   2 +
>   8 files changed, 158 insertions(+)
>   create mode 100644 include/qemu/host-gpu.h
>   create mode 100644 stubs/host-gpu.c
>   create mode 100644 util/host-gpu.c
> 
> diff --git a/meson.build b/meson.build
> index 4588bfd864..8f4a431445 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -1373,12 +1373,16 @@ if not get_option('qatzip').auto() or have_system
>   endif
>   
>   virgl = not_found
> +vulkan = not_found
>   
>   have_vhost_user_gpu = have_tools and host_os == 'linux' and pixman.found()
>   if not get_option('virglrenderer').auto() or have_system or have_vhost_user_gpu
>     virgl = dependency('virglrenderer',
>                        method: 'pkg-config',
>                        required: get_option('virglrenderer'))
> +  vulkan = dependency('vulkan',
> +                      method: 'pkg-config',
> +                      required: get_option('virglrenderer'))
>   endif
>   rutabaga = not_found
>   if not get_option('rutabaga_gfx').auto() or have_system or have_vhost_user_gpu
> diff --git a/include/qemu/host-gpu.h b/include/qemu/host-gpu.h
> new file mode 100644
> index 0000000000..45053c2f77
> --- /dev/null
> +++ b/include/qemu/host-gpu.h
> @@ -0,0 +1,23 @@
> +/*
> + * Utility functions to probe host GPU
> + *
> + * Copyright (c) 2025 Linaro Ltd
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +#ifndef HOST_GPU_H
> +#define HOST_GPU_H
> +
> +#include "qapi/error.h"
> +
> +/**
> + * validate_vulkan_backend() - verify working backend
> + *
> + * errp: error pointer
> + *
> + * If the system vulkan implementation is known to not work return
> + * false otherwise true.
> + */
> +bool validate_vulkan_backend(Error **errp);
> +
> +#endif /* HOST_GPU_H */
> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
> index 11a7a85750..816eedf838 100644
> --- a/hw/display/virtio-gpu.c
> +++ b/hw/display/virtio-gpu.c
> @@ -32,6 +32,7 @@
>   #include "qemu/module.h"
>   #include "qapi/error.h"
>   #include "qemu/error-report.h"
> +#include "qemu/host-gpu.h"
>   
>   #define VIRTIO_GPU_VM_VERSION 1
>   
> @@ -1498,6 +1499,9 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
>               error_setg(errp, "venus requires enabled blob and hostmem options");
>               return;
>           }

Why don't we check VIRTIO_GPU_FLAG_VENUS_ENABLED in 
virtio_gpu_gl_device_realize()?

> +        if (!validate_vulkan_backend(errp)) {
> +            return;
> +        }
>       #else
>           error_setg(errp, "old virglrenderer, venus unsupported");
>           return;
> diff --git a/stubs/host-gpu.c b/stubs/host-gpu.c
> new file mode 100644
> index 0000000000..7bf76ee4f6
> --- /dev/null
> +++ b/stubs/host-gpu.c
> @@ -0,0 +1,17 @@
> +/*
> + * Stub of utility functions to probe host GPU
> + *
> + * Copyright (c) 2025 Linaro Ltd
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/host-gpu.h"
> +
> +bool validate_vulkan_backend(Error **errp)
> +{
> +    error_setg(errp, "No vulkan library present");
> +    return false;

Do we really fail virtio_gpu_device_realize() in this case?

> +}
> diff --git a/util/host-gpu.c b/util/host-gpu.c
> new file mode 100644
> index 0000000000..5e7bf2557c
> --- /dev/null
> +++ b/util/host-gpu.c
> @@ -0,0 +1,102 @@
> +/*
> + * Utility functions to probe host GPU
> + *
> + * Copyright (c) 2025 Linaro Ltd
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/host-gpu.h"
> +
> +#include <vulkan/vulkan.h>
> +

static

> +const char *extensions[] = {
> +    /* needed to query the driver details */
> +    "VK_KHR_get_physical_device_properties2",
> +};
> +
> +/*
> + * Info for known broken drivers. Sadly driver version info tends to
> + * be in the driverInfo text field which is free form so tricky to
> + * parse.
> + */
> +struct VkDriverBlockList {
> +    VkDriverId id;
> +    const char *reason;
> +};
> +

static const

> +struct VkDriverBlockList vulkan_blocklist[] = {
> +    /* at least 535.183.01 is reported to SEGV in libnvidia-eglcore.so */
> +    { VK_DRIVER_ID_NVIDIA_PROPRIETARY, "proprietary nVidia driver is broken" },
> +};
> +
> +static bool is_driver_blocked(VkPhysicalDevice dev, Error **errp)
> +{
> +    VkPhysicalDeviceDriverProperties driverProperties = {
> +        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES,
> +        .pNext = NULL
> +    };
> +    VkPhysicalDeviceProperties2 deviceProperties2 = {
> +        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
> +        .pNext = &driverProperties
> +    };
> +    VkPhysicalDeviceProperties *deviceProperties = &deviceProperties2.properties;
> +
> +    vkGetPhysicalDeviceProperties2(dev, &deviceProperties2);
> +
> +    for (int i = 0; i < ARRAY_SIZE(vulkan_blocklist); i++) {
> +        if (driverProperties.driverID == vulkan_blocklist[i].id) {
> +            error_setg(errp, "Blocked GPU %s because %s",
> +                       deviceProperties->deviceName,
> +                       vulkan_blocklist[i].reason);
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +bool validate_vulkan_backend(Error **errp)
> +{

static const

> +    VkInstanceCreateInfo instance_info = {
> +        VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
> +        NULL, /* pNext extension */
> +        0,    /* VkInstanceCreateFlags */
> +        NULL, /* Application Info */
> +        0, NULL, /* no Enabled Layers */
> +        ARRAY_SIZE(extensions), extensions, /* Extensions */
> +    };
> +
> +    VkInstance inst;
> +    VkResult res;
> +
> +    res = vkCreateInstance(&instance_info, NULL, &inst);
> +
> +    if ( res == VK_SUCCESS ) {
> +        uint32_t count;

g_autofree

> +        VkPhysicalDevice *devices;
> +
> +        /* Do the enumeration two-step */
> +        vkEnumeratePhysicalDevices(inst, &count, NULL);
> +        devices = g_new0(VkPhysicalDevice, count);
> +        vkEnumeratePhysicalDevices(inst, &count, devices);
> +
> +        for (int i = 0; i  < count; i++) {
> +            if (is_driver_blocked(devices[i], errp)) {
> +                return false;
> +            }
> +        }
> +    } else {
> +        error_setg(errp, "Could not initialise a Vulkan instance");
> +        return false;
> +    }
> +
> +    /*
> +     * It would be nice to g_autofree the instance, but returning
> +     * false will abort start-up anyway.
> +     */
> +    vkDestroyInstance(inst, NULL);
> +    return true;
> +}
> diff --git a/stubs/meson.build b/stubs/meson.build
> index b0fee37e05..c18501aa6d 100644
> --- a/stubs/meson.build
> +++ b/stubs/meson.build
> @@ -89,3 +89,7 @@ if have_system or have_user
>     stub_ss.add(files('hotplug-stubs.c'))
>     stub_ss.add(files('sysbus.c'))
>   endif
> +
> +if not vulkan.found()
> +  stubs_ss.add(files('host-gpu.c'))
> +endif
> diff --git a/tests/functional/test_aarch64_virt_gpu.py b/tests/functional/test_aarch64_virt_gpu.py
> index 7a8471d1ca..9a0e694049 100755
> --- a/tests/functional/test_aarch64_virt_gpu.py
> +++ b/tests/functional/test_aarch64_virt_gpu.py
> @@ -79,6 +79,8 @@ def _run_virt_gpu_test(self, gpu_device,  weston_cmd, weston_pattern):
>                   self.skipTest("Can't access host DRM render node")
>               elif "'type' does not accept value 'egl-headless'" in excp.output:
>                   self.skipTest("egl-headless support is not available")
> +            elif "Blocked GPU" in excp.output:
> +                self.skipTest("GPU is in block list")
>               else:
>                   self.log.info(f"unhandled launch failure: {excp.output}")
>                   raise excp
> diff --git a/util/meson.build b/util/meson.build
> index 780b5977a8..7c6cc36e07 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -132,3 +132,5 @@ elif cpu in ['ppc', 'ppc64']
>   elif cpu in ['riscv32', 'riscv64']
>     util_ss.add(files('cpuinfo-riscv.c'))
>   endif
> +
> +util_ss.add(when: vulkan, if_true: files('host-gpu.c'))
Dmitry Osipenko Feb. 24, 2025, 4:50 a.m. UTC | #2
On 2/21/25 19:01, Alex Bennée wrote:
...
> diff --git a/stubs/host-gpu.c b/stubs/host-gpu.c
> new file mode 100644
> index 0000000000..7bf76ee4f6
> --- /dev/null
> +++ b/stubs/host-gpu.c

Looks okay

Nit: host-gpu is too generic a name, IMO. The name should reflect that
it is Vulkan-specific, like host-vk-gpu.c for example.
Daniel P. Berrangé Feb. 24, 2025, 10:09 a.m. UTC | #3
On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote:
> While running the new GPU tests it was noted that the proprietary
> nVidia driver barfed when run under the sanitiser:
> 
>   2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
>   EOTF mode SDR and colorimetry mode default.
>   2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
>   profile: stock sRGB color profile
> 
>   and that's the last thing it outputs.
> 
>   The sanitizer reports that when the framework sends the SIGTERM
>   because of the timeout we get a write to a NULL pointer (but
>   interesting not this time in an atexit callback):
> 
>   UndefinedBehaviorSanitizer:DEADLYSIGNAL
>   ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
>   0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
>   T471863)
>   ==471863==The signal is caused by a WRITE memory access.
>   ==471863==Hint: address points to the zero page.
>       #0 0x7a18ceaafe80
>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>       #1 0x7a18ce9e72c0
>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>       #2 0x7a18ce9f11bb
>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>       #3 0x7a18ce6dc9d1
>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>       #4 0x7a18e7d15326 in vrend_renderer_create_fence
>   /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
>       #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd
> 
> The #dri-devel channel confirmed:
> 
>   <digetx> stsquad: nv driver is known to not work with venus, don't use
>       it for testing
> 
> So lets implement a blocklist to stop users starting a known bad
> setup.

I don't much like the conceptual idea of blocking usage of QEMU itself
based on current point-in-time bugs in the host OS driver stack, because
it is making an assertion that all future versions of the driver will
also be broken and that's not generally valid.

If the user chose to use a dodgy graphics driver, they can deal with
the consequences of their choice.

Skipping only the functional test, without any qemu-system code changes,
though, is more palatable as that's not a hard block on usage.

With regards,
Daniel
Alex Bennée Feb. 24, 2025, 10:56 a.m. UTC | #4
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote:
>> While running the new GPU tests it was noted that the proprietary
>> nVidia driver barfed when run under the sanitiser:
>> 
>>   2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
>>   EOTF mode SDR and colorimetry mode default.
>>   2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
>>   profile: stock sRGB color profile
>> 
>>   and that's the last thing it outputs.
>> 
>>   The sanitizer reports that when the framework sends the SIGTERM
>>   because of the timeout we get a write to a NULL pointer (but
>>   interesting not this time in an atexit callback):
>> 
>>   UndefinedBehaviorSanitizer:DEADLYSIGNAL
>>   ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
>>   0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
>>   T471863)
>>   ==471863==The signal is caused by a WRITE memory access.
>>   ==471863==Hint: address points to the zero page.
>>       #0 0x7a18ceaafe80
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #1 0x7a18ce9e72c0
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #2 0x7a18ce9f11bb
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #3 0x7a18ce6dc9d1
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #4 0x7a18e7d15326 in vrend_renderer_create_fence
>>   /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
>>       #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd
>> 
>> The #dri-devel channel confirmed:
>> 
>>   <digetx> stsquad: nv driver is known to not work with venus, don't use
>>       it for testing
>> 
>> So lets implement a blocklist to stop users starting a known bad
>> setup.
>
> I don't much like the conceptual idea of blocking usage of QEMU itself
> based on current point-in-time bugs in the host OS driver stack, because
> it is making an assertion that all future versions of the driver will
> also be broken and that's not generally valid.
>
> If the user chose to use a dodgy graphics driver, they can deal with
> the consequences of their choice.
>
> Skipping only the functional test, without any qemu-system code changes
> though is more palettable as that's not a hard block on usage.

Well, how do you do one without the other? I don't want to always skip the
vulkan testing because some developer setups have broken drivers. Unless
you are suggesting something like:

  -device virtio-vga-gl,hostmem=4G,blob=on,venus=on,ignore-nvidia=on

or something like that?

Daniel P. Berrangé Feb. 24, 2025, 11:07 a.m. UTC | #5
On Mon, Feb 24, 2025 at 10:56:12AM +0000, Alex Bennée wrote:
> Daniel P. Berrangé <berrange@redhat.com> writes:
> 
> > On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote:
> >> While running the new GPU tests it was noted that the proprietary
> >> nVidia driver barfed when run under the sanitiser:
> >> 
> >>   2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
> >>   EOTF mode SDR and colorimetry mode default.
> >>   2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
> >>   profile: stock sRGB color profile
> >> 
> >>   and that's the last thing it outputs.
> >> 
> >>   The sanitizer reports that when the framework sends the SIGTERM
> >>   because of the timeout we get a write to a NULL pointer (but
> >>   interesting not this time in an atexit callback):
> >> 
> >>   UndefinedBehaviorSanitizer:DEADLYSIGNAL
> >>   ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
> >>   0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
> >>   T471863)
> >>   ==471863==The signal is caused by a WRITE memory access.
> >>   ==471863==Hint: address points to the zero page.
> >>       #0 0x7a18ceaafe80
> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
> >>       #1 0x7a18ce9e72c0
> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
> >>       #2 0x7a18ce9f11bb
> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
> >>       #3 0x7a18ce6dc9d1
> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
> >>       #4 0x7a18e7d15326 in vrend_renderer_create_fence
> >>   /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
> >>       #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd
> >> 
> >> The #dri-devel channel confirmed:
> >> 
> >>   <digetx> stsquad: nv driver is known to not work with venus, don't use
> >>       it for testing
> >> 
> >> So lets implement a blocklist to stop users starting a known bad
> >> setup.
> >
> > I don't much like the conceptual idea of blocking usage of QEMU itself
> > based on current point-in-time bugs in the host OS driver stack, because
> > it is making an assertion that all future versions of the driver will
> > also be broken and that's not generally valid.
> >
> > If the user chose to use a dodgy graphics driver, they can deal with
> > the consequences of their choice.
> >
> > Skipping only the functional test, without any qemu-system code changes
> > though is more palettable as that's not a hard block on usage.
> 
> Well how do you do one without the other? I don't want to always skip the
> vulkan testing because some developer setups have broken drivers. Unless
> you are suggesting something like:
> 
>   -device virtio-vga-gl,hostmem=4G,blob=on,venus=on,ignore-nvidia=on
> 
> or something like that?

I was thinking that test_aarch64_virt_gpu.py would dynamically check
the kernel driver and use that in its @skip annotation.


With regards,
Daniel
Alex Bennée Feb. 24, 2025, 11:42 a.m. UTC | #6
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Mon, Feb 24, 2025 at 10:56:12AM +0000, Alex Bennée wrote:
>> Daniel P. Berrangé <berrange@redhat.com> writes:
>> 
>> > On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote:
>> >> While running the new GPU tests it was noted that the proprietary
>> >> nVidia driver barfed when run under the sanitiser:
>> >> 
>> >>   2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
>> >>   EOTF mode SDR and colorimetry mode default.
>> >>   2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
>> >>   profile: stock sRGB color profile
>> >> 
>> >>   and that's the last thing it outputs.
>> >> 
>> >>   The sanitizer reports that when the framework sends the SIGTERM
>> >>   because of the timeout we get a write to a NULL pointer (but
>> >>   interesting not this time in an atexit callback):
>> >> 
>> >>   UndefinedBehaviorSanitizer:DEADLYSIGNAL
>> >>   ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
>> >>   0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
>> >>   T471863)
>> >>   ==471863==The signal is caused by a WRITE memory access.
>> >>   ==471863==Hint: address points to the zero page.
>> >>       #0 0x7a18ceaafe80
>> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
>> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>> >>       #1 0x7a18ce9e72c0
>> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
>> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>> >>       #2 0x7a18ce9f11bb
>> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
>> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>> >>       #3 0x7a18ce6dc9d1
>> >>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
>> >>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>> >>       #4 0x7a18e7d15326 in vrend_renderer_create_fence
>> >>   /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
>> >>       #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd
>> >> 
>> >> The #dri-devel channel confirmed:
>> >> 
>> >>   <digetx> stsquad: nv driver is known to not work with venus, don't use
>> >>       it for testing
>> >> 
>> >> So lets implement a blocklist to stop users starting a known bad
>> >> setup.
>> >
>> > I don't much like the conceptual idea of blocking usage of QEMU itself
>> > based on current point-in-time bugs in the host OS driver stack, because
>> > it is making an assertion that all future versions of the driver will
>> > also be broken and that's not generally valid.
>> >
>> > If the user chose to use a dodgy graphics driver, they can deal with
>> > the consequences of their choice.
>> >
>> > Skipping only the functional test, without any qemu-system code changes
>> > though is more palettable as that's not a hard block on usage.
>> 
>> Well how do you do one without the other? I don't want to always skip the
>> vulkan testing because some developer setups have broken drivers. Unless
>> you are suggesting something like:
>> 
>>   -device virtio-vga-gl,hostmem=4G,blob=on,venus=on,ignore-nvidia=on
>> 
>> or something like that?
>
> I was thinking that test_aarch64_virt_gpu.py would dynamically check
> the kernel driver and use that in its @skip annotation.

If we can make the vulkaninfo tool a dependency we could certainly do
that; otherwise the host-gpu code would need to be built as a
command-line helper.

>
>
> With regards,
> Daniel
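
As a rough illustration of the dynamic skip discussed above, the functional
test could probe the host driver itself before deciding whether to run the
venus tests. This is only a sketch: it assumes the host has the vulkaninfo
utility from vulkan-tools installed and that its output contains a driverID
line such as DRIVER_ID_NVIDIA_PROPRIETARY, neither of which the patch
currently requires.

  # Hypothetical helper for tests/functional/test_aarch64_virt_gpu.py
  import shutil
  import subprocess

  KNOWN_BAD_VK_DRIVERS = ("DRIVER_ID_NVIDIA_PROPRIETARY",)

  def host_vulkan_driver_blocked():
      """Return True if vulkaninfo reports a driver on the block list."""
      vulkaninfo = shutil.which("vulkaninfo")
      if vulkaninfo is None:
          # cannot tell without the tool; let the test run normally
          return False
      try:
          out = subprocess.run([vulkaninfo], capture_output=True,
                               text=True, timeout=30).stdout
      except (subprocess.SubprocessError, OSError):
          return False
      return any(drv in out for drv in KNOWN_BAD_VK_DRIVERS)

  # which a test could then use as, for example:
  #   @skipIf(host_vulkan_driver_blocked(), "host Vulkan driver is known bad")

Keeping the check in the test makes it a soft skip rather than a hard block
in qemu-system, which is the distinction Daniel raises above.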

Patch

diff --git a/meson.build b/meson.build
index 4588bfd864..8f4a431445 100644
--- a/meson.build
+++ b/meson.build
@@ -1373,12 +1373,16 @@  if not get_option('qatzip').auto() or have_system
 endif
 
 virgl = not_found
+vulkan = not_found
 
 have_vhost_user_gpu = have_tools and host_os == 'linux' and pixman.found()
 if not get_option('virglrenderer').auto() or have_system or have_vhost_user_gpu
   virgl = dependency('virglrenderer',
                      method: 'pkg-config',
                      required: get_option('virglrenderer'))
+  vulkan = dependency('vulkan',
+                      method: 'pkg-config',
+                      required: get_option('virglrenderer'))
 endif
 rutabaga = not_found
 if not get_option('rutabaga_gfx').auto() or have_system or have_vhost_user_gpu
diff --git a/include/qemu/host-gpu.h b/include/qemu/host-gpu.h
new file mode 100644
index 0000000000..45053c2f77
--- /dev/null
+++ b/include/qemu/host-gpu.h
@@ -0,0 +1,23 @@ 
+/*
+ * Utility functions to probe host GPU
+ *
+ * Copyright (c) 2025 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#ifndef HOST_GPU_H
+#define HOST_GPU_H
+
+#include "qapi/error.h"
+
+/**
+ * validate_vulkan_backend() - verify working backend
+ *
+ * errp: error pointer
+ *
+ * If the system vulkan implementation is known to not work return
+ * false otherwise true.
+ */
+bool validate_vulkan_backend(Error **errp);
+
+#endif /* HOST_GPU_H */
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 11a7a85750..816eedf838 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -32,6 +32,7 @@ 
 #include "qemu/module.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/host-gpu.h"
 
 #define VIRTIO_GPU_VM_VERSION 1
 
@@ -1498,6 +1499,9 @@  void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
             error_setg(errp, "venus requires enabled blob and hostmem options");
             return;
         }
+        if (!validate_vulkan_backend(errp)) {
+            return;
+        }
     #else
         error_setg(errp, "old virglrenderer, venus unsupported");
         return;
diff --git a/stubs/host-gpu.c b/stubs/host-gpu.c
new file mode 100644
index 0000000000..7bf76ee4f6
--- /dev/null
+++ b/stubs/host-gpu.c
@@ -0,0 +1,17 @@ 
+/*
+ * Stub of utility functions to probe host GPU
+ *
+ * Copyright (c) 2025 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/host-gpu.h"
+
+bool validate_vulkan_backend(Error **errp)
+{
+    error_setg(errp, "No vulkan library present");
+    return false;
+}
diff --git a/util/host-gpu.c b/util/host-gpu.c
new file mode 100644
index 0000000000..5e7bf2557c
--- /dev/null
+++ b/util/host-gpu.c
@@ -0,0 +1,102 @@ 
+/*
+ * Utility functions to probe host GPU
+ *
+ * Copyright (c) 2025 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/host-gpu.h"
+
+#include <vulkan/vulkan.h>
+
+const char *extensions[] = {
+    /* needed to query the driver details */
+    "VK_KHR_get_physical_device_properties2",
+};
+
+/*
+ * Info for known broken drivers. Sadly driver version info tends to
+ * be in the driverInfo text field which is free form so tricky to
+ * parse.
+ */
+struct VkDriverBlockList {
+    VkDriverId id;
+    const char *reason;
+};
+
+struct VkDriverBlockList vulkan_blocklist[] = {
+    /* at least 535.183.01 is reported to SEGV in libnvidia-eglcore.so */
+    { VK_DRIVER_ID_NVIDIA_PROPRIETARY, "proprietary nVidia driver is broken" },
+};
+
+static bool is_driver_blocked(VkPhysicalDevice dev, Error **errp)
+{
+    VkPhysicalDeviceDriverProperties driverProperties = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES,
+        .pNext = NULL
+    };
+    VkPhysicalDeviceProperties2 deviceProperties2 = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
+        .pNext = &driverProperties
+    };
+    VkPhysicalDeviceProperties *deviceProperties = &deviceProperties2.properties;
+
+    vkGetPhysicalDeviceProperties2(dev, &deviceProperties2);
+
+    for (int i = 0; i < ARRAY_SIZE(vulkan_blocklist); i++) {
+        if (driverProperties.driverID == vulkan_blocklist[i].id) {
+            error_setg(errp, "Blocked GPU %s because %s",
+                       deviceProperties->deviceName,
+                       vulkan_blocklist[i].reason);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool validate_vulkan_backend(Error **errp)
+{
+    VkInstanceCreateInfo instance_info = {
+        VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+        NULL, /* pNext extension */
+        0,    /* VkInstanceCreateFlags */
+        NULL, /* Application Info */
+        0, NULL, /* no Enabled Layers */
+        ARRAY_SIZE(extensions), extensions, /* Extensions */
+    };
+
+    VkInstance inst;
+    VkResult res;
+
+    res = vkCreateInstance(&instance_info, NULL, &inst);
+
+    if ( res == VK_SUCCESS ) {
+        uint32_t count;
+        VkPhysicalDevice *devices;
+
+        /* Do the enumeration two-step */
+        vkEnumeratePhysicalDevices(inst, &count, NULL);
+        devices = g_new0(VkPhysicalDevice, count);
+        vkEnumeratePhysicalDevices(inst, &count, devices);
+
+        for (int i = 0; i  < count; i++) {
+            if (is_driver_blocked(devices[i], errp)) {
+                return false;
+            }
+        }
+    } else {
+        error_setg(errp, "Could not initialise a Vulkan instance");
+        return false;
+    }
+
+    /*
+     * It would be nice to g_autofree the instance, but returning
+     * false will abort start-up anyway.
+     */
+    vkDestroyInstance(inst, NULL);
+    return true;
+}
diff --git a/stubs/meson.build b/stubs/meson.build
index b0fee37e05..c18501aa6d 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -89,3 +89,7 @@  if have_system or have_user
   stub_ss.add(files('hotplug-stubs.c'))
   stub_ss.add(files('sysbus.c'))
 endif
+
+if not vulkan.found()
+  stubs_ss.add(files('host-gpu.c'))
+endif
diff --git a/tests/functional/test_aarch64_virt_gpu.py b/tests/functional/test_aarch64_virt_gpu.py
index 7a8471d1ca..9a0e694049 100755
--- a/tests/functional/test_aarch64_virt_gpu.py
+++ b/tests/functional/test_aarch64_virt_gpu.py
@@ -79,6 +79,8 @@  def _run_virt_gpu_test(self, gpu_device,  weston_cmd, weston_pattern):
                 self.skipTest("Can't access host DRM render node")
             elif "'type' does not accept value 'egl-headless'" in excp.output:
                 self.skipTest("egl-headless support is not available")
+            elif "Blocked GPU" in excp.output:
+                self.skipTest("GPU is in block list")
             else:
                 self.log.info(f"unhandled launch failure: {excp.output}")
                 raise excp
diff --git a/util/meson.build b/util/meson.build
index 780b5977a8..7c6cc36e07 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -132,3 +132,5 @@  elif cpu in ['ppc', 'ppc64']
 elif cpu in ['riscv32', 'riscv64']
   util_ss.add(files('cpuinfo-riscv.c'))
 endif
+
+util_ss.add(when: vulkan, if_true: files('host-gpu.c'))