From e9d130a4867153b814427fb8f39bac53b53f9b88 Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Thu, 29 Jan 2026 09:11:22 +0100
Subject: [PATCH 01/10] ggml-virtgpu: regenerate_remoting.py: add the ability
 to deprecate a function

---
 ggml/src/ggml-virtgpu/regenerate_remoting.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/regenerate_remoting.py b/ggml/src/ggml-virtgpu/regenerate_remoting.py
index 4174a24327..aeb48a4087 100755
--- a/ggml/src/ggml-virtgpu/regenerate_remoting.py
+++ b/ggml/src/ggml-virtgpu/regenerate_remoting.py
@@ -116,7 +116,7 @@ class RemotingCodebaseGenerator:
                         'frontend_return': func_metadata.get('frontend_return', 'void'),
                         'frontend_extra_params': func_metadata.get('frontend_extra_params', []),
                         'group_description': group_description,
-                        'newly_added': func_metadata.get('newly_added', False)
+                        'deprecated': func_metadata.get('deprecated', False),
                     })
                     enum_value += 1
 
@@ -165,6 +165,9 @@ class RemotingCodebaseGenerator:
 
             signature = "uint32_t"
             params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx"
+            if func['deprecated']:
+                decl_lines.append(f"/* {func['enum_name']} is deprecated. Keeping the handler for backward compatibility. */")
+
             decl_lines.append(f"{signature} {func['backend_function']}({params});")
 
         # Switch cases
@@ -176,7 +179,9 @@ class RemotingCodebaseGenerator:
                 switch_lines.append(f"  /* {func['group_description']} */")
                 current_group = func['group_name']
 
-            switch_lines.append(f"  case {func['enum_name']}: return \"{func['backend_function']}\";")
+            deprecated = " (DEPRECATED)" if func['deprecated'] else ""
+
+            switch_lines.append(f"  case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";")
 
         # Dispatch table
         table_lines = []
@@ -188,7 +193,8 @@ class RemotingCodebaseGenerator:
                 table_lines.append("")
                 current_group = func['group_name']
 
-            table_lines.append(f"  /* {func['enum_name']}  = */ {func['backend_function']},")
+            deprecated = " /* DEPRECATED */" if func['deprecated'] else ""
+            table_lines.append(f"  /* {func['enum_name']}  = */ {func['backend_function']}{deprecated},")
 
         header_content = f'''\
 #pragma once
@@ -225,6 +231,10 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
                 decl_lines.append(f"/* {func['group_description']} */")
                 current_group = func['group_name']
 
+            if func['deprecated']:
+                decl_lines.append(f"/* {func['frontend_function']} is deprecated. */")
+                continue
+
             # Build parameter list
             params = [self.naming_patterns['frontend_base_param']]
             params.extend(func['frontend_extra_params'])
@@ -287,7 +297,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
         generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path]
 
         if not self.clang_format_available:
-            logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted."
+            logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted.\n"
                             "   Install clang-format to enable automatic code formatting.")
         else:
             logging.info("\n🎨 Formatting files with clang-format...")

From c6a60859721554d71c952997eb7015ab53657297 Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Wed, 28 Jan 2026 21:04:02 +0100
Subject: [PATCH 02/10] ggml-virtgpu: deprecate buffer_type is_host remoting

not necessary
---
 .../backend-dispatched-buffer-type.cpp        |  6 +++---
 .../backend/backend-dispatched.gen.h          |  5 +++--
 .../ggml-virtgpu/ggml-backend-buffer-type.cpp |  6 ------
 .../ggml-virtgpu/ggmlremoting_functions.yaml  |  4 +---
 .../virtgpu-forward-buffer-type.cpp           | 19 -------------------
 ggml/src/ggml-virtgpu/virtgpu-forward.gen.h   |  1 -
 6 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
index 8ea1bb4fb4..3a97fbd956 100644
--- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
+++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
@@ -42,12 +42,12 @@ uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec
     return 0;
 }
 
+/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */
 uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
     GGML_UNUSED(ctx);
-    ggml_backend_buffer_type_t buft;
-    buft = apir_decode_ggml_buffer_type(dec);
+    GGML_UNUSED(dec);
+    const bool is_host = false;
 
-    bool is_host = buft->iface.is_host(buft);
     apir_encode_bool_t(enc, &is_host);
 
     return 0;
diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h
index b81fd5039b..481d7f3150 100644
--- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h
+++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h
@@ -16,6 +16,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec,
 uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
 uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
 uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
+/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */
 uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
 uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
 uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
@@ -62,7 +63,7 @@ static inline const char * backend_dispatch_command_name(ApirBackendCommandType
         case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
             return "backend_buffer_type_get_max_size";
         case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
-            return "backend_buffer_type_is_host";
+            return "backend_buffer_type_is_host (DEPRECATED)";
         case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
             return "backend_buffer_type_alloc_buffer";
         case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
@@ -110,7 +111,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
     /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME  = */ backend_buffer_type_get_name,
     /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT  = */ backend_buffer_type_get_alignment,
     /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE  = */ backend_buffer_type_get_max_size,
-    /* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST  = */ backend_buffer_type_is_host,
+    /* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST  = */ backend_buffer_type_is_host /* DEPRECATED */,
     /* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER  = */ backend_buffer_type_alloc_buffer,
     /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE  = */ backend_buffer_type_get_alloc_size,
 
diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp
index 7f650659b8..a2fa5246aa 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp
@@ -60,12 +60,6 @@ static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer
     return max_size;
 }
 
-static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    virtgpu * gpu = BUFT_TO_GPU(buft);
-
-    return apir_buffer_type_is_host(gpu, buft);
-}
-
 static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
                                                                const ggml_tensor *        tensor) {
     virtgpu * gpu = BUFT_TO_GPU(buft);
diff --git a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml
index 0b7cccfe9c..775c5ca573 100644
--- a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml
+++ b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml
@@ -79,9 +79,7 @@ functions:
         - "ggml_backend_buffer_type_t buft"
 
       is_host:
-        frontend_return: "bool"
-        frontend_extra_params:
-        - "ggml_backend_buffer_type_t buft"
+        deprecated: true
 
       alloc_buffer:
         frontend_return: "apir_buffer_context_t"
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp
index 03cb09e064..0069b47a5b 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp
@@ -62,25 +62,6 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t b
     return max_size;
 }
 
-bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
-    apir_encoder *        encoder;
-    apir_decoder *        decoder;
-    ApirForwardReturnCode ret;
-
-    REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST);
-
-    apir_encode_ggml_buffer_type(encoder, buft);
-
-    REMOTE_CALL(gpu, encoder, decoder, ret);
-
-    bool is_host;
-    apir_decode_bool_t(decoder, &is_host);
-
-    remote_call_finish(gpu, encoder, decoder);
-
-    return is_host;
-}
-
 apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h
index c27c07f086..6216ebad1f 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h
@@ -20,7 +20,6 @@ apir_buffer_context_t          apir_device_buffer_from_ptr(struct virtgpu * gpu,
 const char *          apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
 size_t                apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
 size_t                apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
-bool                  apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
 apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *           gpu,
                                                     ggml_backend_buffer_type_t buffer_buft,
                                                     size_t                     size);

From 92390ad9f6e08d6b83c31171781e5112f559dc5e Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Wed, 28 Jan 2026 12:59:49 +0100
Subject: [PATCH 03/10] ggml-virtgpu: stop using static vars as cache

The static init isn't thread safe.
---
 .../backend/shared/apir_cs_ggml.h             |  4 ++
 .../ggml-virtgpu/ggml-backend-buffer-type.cpp | 21 ++-----
 ggml/src/ggml-virtgpu/ggml-backend-device.cpp | 59 +++++++++++--------
 ggml/src/ggml-virtgpu/ggml-backend-reg.cpp    | 55 +++++++++++++----
 .../ggml-virtgpu/ggmlremoting_functions.yaml  | 16 ++---
 .../virtgpu-forward-buffer-type.cpp           | 20 +++----
 .../ggml-virtgpu/virtgpu-forward-device.cpp   | 10 +---
 ggml/src/ggml-virtgpu/virtgpu-forward.gen.h   | 20 ++++---
 ggml/src/ggml-virtgpu/virtgpu.h               | 18 ++++++
 9 files changed, 140 insertions(+), 83 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
index 070c3b25fb..28f7f270ef 100644
--- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
+++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
@@ -71,6 +71,10 @@ static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decod
     return (ggml_backend_buffer_type_t) handle;
 }
 
+static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) {
+    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
 static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
     apir_buffer_type_host_handle_t handle;
 
diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp
index a2fa5246aa..68f378e516 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp
@@ -20,7 +20,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
         context->base         = context->apir_context.shmem.mmap_ptr;
         context->is_from_ptr  = true;
     } else {
-        context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size);
+        context->apir_context = apir_buffer_type_alloc_buffer(gpu, gpu->cached_buffer_type.host_handle, size);
         context->is_from_ptr  = false;
         context->base         = NULL;
     }
@@ -34,30 +34,19 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
 static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     virtgpu * gpu = BUFT_TO_GPU(buft);
 
-    return apir_buffer_type_get_name(gpu, buft);
+    return gpu->cached_buffer_type.name;
 }
 
 static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     virtgpu * gpu = BUFT_TO_GPU(buft);
 
-    static size_t align = 0;
-
-    if (align == 0) {
-        align = apir_buffer_type_get_alignment(gpu, buft);
-    }
-
-    return align;
+    return gpu->cached_buffer_type.alignment;
 }
 
 static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     virtgpu * gpu = BUFT_TO_GPU(buft);
 
-    static size_t max_size = 0;
-    if (max_size == 0) {
-        max_size = apir_buffer_type_get_max_size(gpu, buft);
-    }
-
-    return max_size;
+    return gpu->cached_buffer_type.max_size;
 }
 
 static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
@@ -70,7 +59,7 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff
         return ggml_nbytes(tensor);
     }
 
-    return apir_buffer_type_get_alloc_size(gpu, buft, tensor);
+    return apir_buffer_type_get_alloc_size(gpu, gpu->cached_buffer_type.host_handle, tensor);
 }
 
 const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = {
diff --git a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
index 579eb99078..3f98ee58d4 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
@@ -3,32 +3,27 @@
 static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
     virtgpu * gpu = DEV_TO_GPU(dev);
 
-    return apir_device_get_name(gpu);
+    return gpu->cached_device_info.name;
 }
 
 static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) {
     virtgpu * gpu = DEV_TO_GPU(dev);
 
-    return apir_device_get_description(gpu);
+    // Return the pre-cached description from the virtgpu structure
+    return gpu->cached_device_info.description;
 }
 
 static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) {
     virtgpu * gpu = DEV_TO_GPU(dev);
 
-    static enum ggml_backend_dev_type type;
-    static bool                       has_type = false;
-    if (!has_type) {
-        has_type = true;
-        type     = (enum ggml_backend_dev_type) apir_device_get_type(gpu);
-    }
-
-    return type;
+    return (enum ggml_backend_dev_type) gpu->cached_device_info.type;
 }
 
 static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     virtgpu * gpu = DEV_TO_GPU(dev);
 
-    return apir_device_get_memory(gpu, free, total);
+    *free = gpu->cached_device_info.memory_free;
+    *total = gpu->cached_device_info.memory_total;
 }
 
 static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
@@ -77,13 +72,22 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_
 ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
     virtgpu * gpu = DEV_TO_GPU(dev);
 
-    apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
+    static std::atomic<bool> initialized = false;
+    static ggml_backend_buffer_type buft;
 
-    static ggml_backend_buffer_type buft{
-        /* .iface    = */ ggml_backend_remoting_buffer_type_interface,
-        /* .device   = */ dev,
-        /* .context  = */ (void *) ctx,
-    };
+    if (!initialized) {
+        static std::mutex           mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (!initialized) {
+            buft = {
+                /* .iface    = */ ggml_backend_remoting_buffer_type_interface,
+                /* .device   = */ dev,
+                /* .context  = */ (void *) gpu->cached_buffer_type.host_handle,
+            };
+            initialized = true;
+        }
+    }
 
     return &buft;
 }
@@ -91,13 +95,22 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac
 static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
     virtgpu * gpu = DEV_TO_GPU(dev);
 
-    apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
+    static std::atomic<bool> initialized = false;
+    static ggml_backend_buffer_type buft;
 
-    static ggml_backend_buffer_type buft{
-        /* .iface    = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
-        /* .device   = */ dev,
-        /* .context  = */ (void *) ctx,
-    };
+    if (!initialized) {
+        static std::mutex           mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (!initialized) {
+            buft = {
+                /* .iface    = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
+                /* .device   = */ dev,
+                /* .context  = */ (void *) gpu->cached_buffer_type.host_handle,
+            };
+            initialized = true;
+        }
+    }
 
     return &buft;
 }
diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
index c46cf51c02..ca8235bb65 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
@@ -5,26 +5,57 @@
 #include <mutex>
 
 static virtgpu * apir_initialize() {
-    static virtgpu * apir_gpu_instance = NULL;
-    static bool      apir_initialized  = false;
+    static virtgpu *         gpu          = NULL;
+    static std::atomic<bool> initialized  = false;
+
+    if (initialized) {
+        // fast track
+        return gpu;
+    }
 
     {
         static std::mutex           mutex;
         std::lock_guard<std::mutex> lock(mutex);
 
-        if (apir_initialized) {
-            return apir_gpu_instance;
+        if (initialized) {
+            // thread safe
+            return gpu;
         }
 
-        apir_gpu_instance = create_virtgpu();
-        if (!apir_gpu_instance) {
+        gpu = create_virtgpu();
+        if (!gpu) {
             GGML_ABORT("failed to initialize the virtgpu");
         }
 
-        apir_initialized = true;
+        // Pre-fetch and cache all device information, it will not change
+        gpu->cached_device_info.description  = apir_device_get_description(gpu);
+        if (!gpu->cached_device_info.description) {
+            GGML_ABORT("failed to initialize the virtgpu device description");
+        }
+        gpu->cached_device_info.name         = apir_device_get_name(gpu);
+        if (!gpu->cached_device_info.name) {
+            GGML_ABORT("failed to initialize the virtgpu device name");
+        }
+        gpu->cached_device_info.device_count = apir_device_get_count(gpu);
+        gpu->cached_device_info.type         = apir_device_get_type(gpu);
+
+        apir_device_get_memory(gpu,
+                              &gpu->cached_device_info.memory_free,
+                              &gpu->cached_device_info.memory_total);
+
+        apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu);
+        gpu->cached_buffer_type.host_handle             = buft_host_handle;
+        gpu->cached_buffer_type.name                    = apir_buffer_type_get_name(gpu, buft_host_handle);
+        if (!gpu->cached_buffer_type.name) {
+            GGML_ABORT("failed to initialize the virtgpu buffer type name");
+        }
+        gpu->cached_buffer_type.alignment               = apir_buffer_type_get_alignment(gpu, buft_host_handle);
+        gpu->cached_buffer_type.max_size                = apir_buffer_type_get_max_size(gpu, buft_host_handle);
+
+        initialized = true;
     }
 
-    return apir_gpu_instance;
+    return gpu;
 }
 
 static int ggml_backend_remoting_get_device_count() {
@@ -34,7 +65,7 @@ static int ggml_backend_remoting_get_device_count() {
         return 0;
     }
 
-    return apir_device_get_count(gpu);
+    return gpu->cached_device_info.device_count;
 }
 
 static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
@@ -62,7 +93,11 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
         return;
     }
 
-    static bool initialized = false;
+    static std::atomic<bool> initialized = false;
+
+    if (initialized) {
+        return; // fast track
+    }
 
     {
         static std::mutex           mutex;
diff --git a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml
index 775c5ca573..14ef2433e4 100644
--- a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml
+++ b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml
@@ -24,10 +24,10 @@ functions:
         frontend_return: "int"
 
       get_name:
-        frontend_return: "const char *"
+        frontend_return: "char *"
 
       get_description:
-        frontend_return: "const char *"
+        frontend_return: "char *"
 
       get_type:
         frontend_return: "uint32_t"
@@ -64,19 +64,19 @@ functions:
     group_description: "buffer-type"
     functions:
       get_name:
-        frontend_return: "const char *"
+        frontend_return: "char *"
         frontend_extra_params:
-        - "ggml_backend_buffer_type_t buft"
+        - "apir_buffer_type_host_handle_t host_handle"
 
       get_alignment:
         frontend_return: "size_t"
         frontend_extra_params:
-        - "ggml_backend_buffer_type_t buft"
+        - "apir_buffer_type_host_handle_t host_handle"
 
       get_max_size:
         frontend_return: "size_t"
         frontend_extra_params:
-        - "ggml_backend_buffer_type_t buft"
+        - "apir_buffer_type_host_handle_t host_handle"
 
       is_host:
         deprecated: true
@@ -84,13 +84,13 @@ functions:
       alloc_buffer:
         frontend_return: "apir_buffer_context_t"
         frontend_extra_params:
-        - "ggml_backend_buffer_type_t buffer_buft"
+        - "apir_buffer_type_host_handle_t host_handle"
         - "size_t size"
 
       get_alloc_size:
         frontend_return: "size_t"
         frontend_extra_params:
-        - "ggml_backend_buffer_type_t buft"
+        - "apir_buffer_type_host_handle_t host_handle"
         - "const ggml_tensor *op"
 
   buffer:
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp
index 0069b47a5b..1c0083e8e3 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp
@@ -1,13 +1,13 @@
 #include "virtgpu-forward-impl.h"
 
-const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+char * apir_buffer_type_get_name(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
 
     REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME);
 
-    apir_encode_ggml_buffer_type(encoder, buft);
+    apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
 
     REMOTE_CALL(gpu, encoder, decoder, ret);
 
@@ -24,14 +24,14 @@ const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t
     return string;
 }
 
-size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+size_t apir_buffer_type_get_alignment(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
 
     REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT);
 
-    apir_encode_ggml_buffer_type(encoder, buft);
+    apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
 
     REMOTE_CALL(gpu, encoder, decoder, ret);
 
@@ -43,14 +43,14 @@ size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t
     return alignment;
 }
 
-size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
+size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
 
     REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE);
 
-    apir_encode_ggml_buffer_type(encoder, buft);
+    apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
 
     REMOTE_CALL(gpu, encoder, decoder, ret);
 
@@ -62,7 +62,7 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t b
     return max_size;
 }
 
-apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) {
+apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
@@ -71,7 +71,7 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_
 
     REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER);
 
-    apir_encode_ggml_buffer_type(encoder, buft);
+    apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
 
     apir_encode_size_t(encoder, &size);
 
@@ -84,14 +84,14 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_
     return buffer_context;
 }
 
-size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) {
+size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
 
     REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE);
 
-    apir_encode_ggml_buffer_type(encoder, buft);
+    apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
 
     apir_encode_ggml_tensor_inline(encoder, op);
 
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp
index 3e45e55bdc..627121b9ee 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp
@@ -21,11 +21,7 @@ int apir_device_get_count(virtgpu * gpu) {
     return dev_count;
 }
 
-const char * apir_device_get_name(virtgpu * gpu) {
-    static char * string = nullptr;
-    if (string) {
-        return string;
-    }
+char * apir_device_get_name(virtgpu * gpu) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
@@ -34,7 +30,7 @@ const char * apir_device_get_name(virtgpu * gpu) {
     REMOTE_CALL(gpu, encoder, decoder, ret);
 
     const size_t string_size = apir_decode_array_size_unchecked(decoder);
-    string                   = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
+    char            * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
     if (!string) {
         GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
         return NULL;
@@ -46,7 +42,7 @@ const char * apir_device_get_name(virtgpu * gpu) {
     return string;
 }
 
-const char * apir_device_get_description(virtgpu * gpu) {
+char * apir_device_get_description(virtgpu * gpu) {
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h
index 6216ebad1f..fe4cae2025 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h
@@ -3,8 +3,8 @@
 /* device */
 void                           apir_device_get_device_count(struct virtgpu * gpu);
 int                            apir_device_get_count(struct virtgpu * gpu);
-const char *                   apir_device_get_name(struct virtgpu * gpu);
-const char *                   apir_device_get_description(struct virtgpu * gpu);
+char *                         apir_device_get_name(struct virtgpu * gpu);
+char *                         apir_device_get_description(struct virtgpu * gpu);
 uint32_t                       apir_device_get_type(struct virtgpu * gpu);
 void                           apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total);
 bool                           apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op);
@@ -17,13 +17,15 @@ void                           apir_device_get_props(struct virtgpu * gpu,
 apir_buffer_context_t          apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size);
 
 /* buffer-type */
-const char *          apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
-size_t                apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
-size_t                apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
-apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *           gpu,
-                                                    ggml_backend_buffer_type_t buffer_buft,
-                                                    size_t                     size);
-size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op);
+char *                apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
+size_t                apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
+size_t                apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
+apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *               gpu,
+                                                    apir_buffer_type_host_handle_t host_handle,
+                                                    size_t                         size);
+size_t                apir_buffer_type_get_alloc_size(struct virtgpu *               gpu,
+                                                      apir_buffer_type_host_handle_t host_handle,
+                                                      const ggml_tensor *            op);
 
 /* buffer */
 void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
diff --git a/ggml/src/ggml-virtgpu/virtgpu.h b/ggml/src/ggml-virtgpu/virtgpu.h
index d4bb42e20b..6144319e4d 100644
--- a/ggml/src/ggml-virtgpu/virtgpu.h
+++ b/ggml/src/ggml-virtgpu/virtgpu.h
@@ -73,6 +73,24 @@ struct virtgpu {
     /* APIR communication pages */
     virtgpu_shmem reply_shmem;
     virtgpu_shmem data_shmem;
+
+    /* Cached device information to prevent memory leaks and race conditions */
+    struct {
+        char *   description;
+        char *   name;
+        int32_t  device_count;
+        uint32_t type;
+        size_t   memory_free;
+        size_t   memory_total;
+    } cached_device_info;
+
+    /* Cached buffer type information to prevent memory leaks and race conditions */
+    struct {
+        apir_buffer_type_host_handle_t host_handle;
+        char *                         name;
+        size_t                         alignment;
+        size_t                         max_size;
+    } cached_buffer_type;
 };
 
 static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) {

From fcc689071027d6d6fc3b9fc2d6744abddde69ab2 Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Wed, 28 Jan 2026 14:38:06 +0100
Subject: [PATCH 04/10] ggml-virtgpu: protect the use of the shared memory to
 transfer data

---
 .../ggml-virtgpu/virtgpu-forward-backend.cpp  | 12 ++++++++--
 .../ggml-virtgpu/virtgpu-forward-buffer.cpp   | 24 +++++++++++++++----
 ggml/src/ggml-virtgpu/virtgpu.cpp             |  7 ++++++
 ggml/src/ggml-virtgpu/virtgpu.h               |  3 +++
 4 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp
index bf3c41011a..201100e421 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp
@@ -18,9 +18,14 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
 
     virtgpu_shmem   temp_shmem;  // Local storage for large buffers
     virtgpu_shmem * shmem = &temp_shmem;
+    bool using_shared_shmem = false;
 
     if (cgraph_size <= gpu->data_shmem.mmap_size) {
-        // prefer the init-time allocated page, if large enough
+        // Lock mutex before using shared data_shmem buffer
+        if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
+            GGML_ABORT("Failed to lock data_shmem mutex");
+        }
+        using_shared_shmem = true;
         shmem = &gpu->data_shmem;
     } else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
         GGML_ABORT("Couldn't allocate the guest-host shared buffer");
@@ -42,7 +47,10 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
 
     remote_call_finish(gpu, encoder, decoder);
 
-    if (shmem != &gpu->data_shmem) {
+    // Unlock mutex before cleanup
+    if (using_shared_shmem) {
+        mtx_unlock(&gpu->data_shmem_mutex);
+    } else {
         virtgpu_shmem_destroy(gpu, shmem);
     }
 
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp
index 3181e39440..81ad66ec68 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp
@@ -36,9 +36,14 @@ void apir_buffer_set_tensor(virtgpu *               gpu,
 
     virtgpu_shmem   temp_shmem;  // Local storage for large buffers
     virtgpu_shmem * shmem = &temp_shmem;
+    bool using_shared_shmem = false;
 
     if (size <= gpu->data_shmem.mmap_size) {
-        // prefer the init-time allocated page, if large enough
+        // Lock mutex before using shared data_shmem buffer
+        if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
+            GGML_ABORT("Failed to lock data_shmem mutex");
+        }
+        using_shared_shmem = true;
         shmem = &gpu->data_shmem;
 
     } else if (virtgpu_shmem_create(gpu, size, shmem)) {
@@ -55,7 +60,10 @@ void apir_buffer_set_tensor(virtgpu *               gpu,
 
     remote_call_finish(gpu, encoder, decoder);
 
-    if (shmem != &gpu->data_shmem) {
+    // Unlock mutex before cleanup
+    if (using_shared_shmem) {
+        mtx_unlock(&gpu->data_shmem_mutex);
+    } else {
         virtgpu_shmem_destroy(gpu, shmem);
     }
 
@@ -79,9 +87,14 @@ void apir_buffer_get_tensor(virtgpu *               gpu,
 
     virtgpu_shmem   temp_shmem;  // Local storage for large buffers
     virtgpu_shmem * shmem = &temp_shmem;
+    bool using_shared_shmem = false;
 
     if (size <= gpu->data_shmem.mmap_size) {
-        // prefer the init-time allocated page, if large enough
+        // Lock mutex before using shared data_shmem buffer
+        if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
+            GGML_ABORT("Failed to lock data_shmem mutex");
+        }
+        using_shared_shmem = true;
         shmem = &gpu->data_shmem;
 
     } else if (virtgpu_shmem_create(gpu, size, shmem)) {
@@ -98,7 +111,10 @@ void apir_buffer_get_tensor(virtgpu *               gpu,
 
     remote_call_finish(gpu, encoder, decoder);
 
-    if (shmem != &gpu->data_shmem) {
+    // Unlock mutex before cleanup
+    if (using_shared_shmem) {
+        mtx_unlock(&gpu->data_shmem_mutex);
+    } else {
         virtgpu_shmem_destroy(gpu, shmem);
     }
 }
diff --git a/ggml/src/ggml-virtgpu/virtgpu.cpp b/ggml/src/ggml-virtgpu/virtgpu.cpp
index 005c8e21db..f44b0b6927 100644
--- a/ggml/src/ggml-virtgpu/virtgpu.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu.cpp
@@ -149,6 +149,13 @@ virtgpu * create_virtgpu() {
     gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr;
     util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024);
 
+    // Initialize mutex to protect shared data_shmem buffer
+    if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
+        delete gpu;
+        GGML_ABORT("%s: failed to initialize data_shmem mutex", __func__);
+        return NULL;
+    }
+
     if (virtgpu_open(gpu) != APIR_SUCCESS) {
         GGML_ABORT("%s: failed to open the virtgpu device", __func__);
         return NULL;
diff --git a/ggml/src/ggml-virtgpu/virtgpu.h b/ggml/src/ggml-virtgpu/virtgpu.h
index 6144319e4d..5c3709ec3b 100644
--- a/ggml/src/ggml-virtgpu/virtgpu.h
+++ b/ggml/src/ggml-virtgpu/virtgpu.h
@@ -74,6 +74,9 @@ struct virtgpu {
     virtgpu_shmem reply_shmem;
     virtgpu_shmem data_shmem;
 
+    /* Mutex to protect shared data_shmem buffer from concurrent access */
+    mtx_t data_shmem_mutex;
+
     /* Cached device information to prevent memory leaks and race conditions */
     struct {
         char *   description;

From 171ab8b4ad97833b36ece79ea10a9f4cdbc5eb7d Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Wed, 28 Jan 2026 14:55:30 +0100
Subject: [PATCH 05/10] ggml-virtgpu: make the remote calls thread-safe

---
 ggml/src/ggml-virtgpu/virtgpu.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/virtgpu.cpp b/ggml/src/ggml-virtgpu/virtgpu.cpp
index f44b0b6927..47af17ed38 100644
--- a/ggml/src/ggml-virtgpu/virtgpu.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu.cpp
@@ -340,9 +340,9 @@ apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type,
      * Prepare the command encoder and its buffer
      */
 
-    static char encoder_buffer[4096];
+    thread_local char encoder_buffer[4096];
 
-    static apir_encoder enc;
+    thread_local apir_encoder enc;
     enc = {
         .cur   = encoder_buffer,
         .start = encoder_buffer,

From 3864be3e5d67fe979b6cbbfd64f18c75250da488 Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Thu, 29 Jan 2026 09:25:09 +0100
Subject: [PATCH 06/10] ggml-virtgpu: backend: don't continue if couldn't
 allocate the tensor memory

---
 ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
index 28f7f270ef..207b930d24 100644
--- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
+++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
@@ -39,11 +39,17 @@ static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor
 
 static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
     const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
+
+    if (!apir_rpc_tensor) {
+        return NULL;
+    }
+
     ggml_init_params params{
         /*.mem_size   =*/ ggml_tensor_overhead(),
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ true,
     };
+
     ggml_context * ctx = ggml_init(params);
 
     const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);

From 07f41cafee6d6ba8fccd94562c89a40cd2eeb18b Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Thu, 29 Jan 2026 09:50:47 +0100
Subject: [PATCH 07/10] ggml-virtgpu: add a cleanup function for consistency

---
 ggml/src/ggml-virtgpu/ggml-backend-reg.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
index ca8235bb65..7256737c63 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
@@ -4,6 +4,8 @@
 #include <iostream>
 #include <mutex>
 
+void ggml_virtgpu_cleanup(virtgpu *gpu);
+
 static virtgpu * apir_initialize() {
     static virtgpu *         gpu          = NULL;
     static std::atomic<bool> initialized  = false;
@@ -170,3 +172,21 @@ ggml_backend_reg_t ggml_backend_virtgpu_reg() {
 }
 
 GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)
+
+// public function, not exposed in the GGML interface at the moment
+void ggml_virtgpu_cleanup(virtgpu *gpu) {
+    if (gpu->cached_device_info.name) {
+        free(gpu->cached_device_info.name);
+        gpu->cached_device_info.name = NULL;
+    }
+    if (gpu->cached_device_info.description) {
+        free(gpu->cached_device_info.description);
+        gpu->cached_device_info.description = NULL;
+    }
+    if (gpu->cached_buffer_type.name) {
+        free(gpu->cached_buffer_type.name);
+        gpu->cached_buffer_type.name = NULL;
+    }
+
+    mtx_destroy(&gpu->data_shmem_mutex);
+}

From f35b24e87c8893141ca44b8c6c926a1b8992adfb Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Thu, 29 Jan 2026 14:26:39 +0100
Subject: [PATCH 08/10] ggml-virtgpu: backend: don't crash if
 buft->iface.get_max_size is missing

---
 .../ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
index 3a97fbd956..d55eec2761 100644
--- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
+++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
@@ -36,7 +36,11 @@ uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec
     ggml_backend_buffer_type_t buft;
     buft = apir_decode_ggml_buffer_type(dec);
 
-    size_t value = buft->iface.get_max_size(buft);
+    size_t value = SIZE_MAX;
+    if (buft->iface.get_max_size) {
+        value = buft->iface.get_max_size(buft);
+    }
+
     apir_encode_size_t(enc, &value);
 
     return 0;

From f978082f834183fb2da46fa24b52b53b49ab049f Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Fri, 30 Jan 2026 13:59:05 +0100
Subject: [PATCH 09/10] fix style and ordering

---
 ggml/src/ggml-virtgpu/ggml-backend-reg.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
index 7256737c63..276edd8fdd 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <mutex>
 
-void ggml_virtgpu_cleanup(virtgpu *gpu);
+void ggml_virtgpu_cleanup(virtgpu * gpu);
 
 static virtgpu * apir_initialize() {
     static virtgpu *         gpu          = NULL;
@@ -171,10 +171,8 @@ ggml_backend_reg_t ggml_backend_virtgpu_reg() {
     return &reg;
 }
 
-GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)
-
 // public function, not exposed in the GGML interface at the moment
-void ggml_virtgpu_cleanup(virtgpu *gpu) {
+void ggml_virtgpu_cleanup(virtgpu * gpu) {
     if (gpu->cached_device_info.name) {
         free(gpu->cached_device_info.name);
         gpu->cached_device_info.name = NULL;
@@ -190,3 +188,5 @@ void ggml_virtgpu_cleanup(virtgpu *gpu) {
 
     mtx_destroy(&gpu->data_shmem_mutex);
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)

From e9e9d736bce22f79cc24a11545be413489edba6b Mon Sep 17 00:00:00 2001
From: Kevin Pouget <kpouget@redhat.com>
Date: Fri, 30 Jan 2026 20:38:51 +0100
Subject: [PATCH 10/10] Remove the static variable in apir_device_get_count

---
 ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp
index 627121b9ee..9cfc1ec171 100644
--- a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp
+++ b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp
@@ -2,11 +2,6 @@
 #include "virtgpu-shm.h"
 
 int apir_device_get_count(virtgpu * gpu) {
-    static int32_t dev_count = -1;
-    if (dev_count != -1) {
-        return dev_count;
-    }
-
     apir_encoder *        encoder;
     apir_decoder *        decoder;
     ApirForwardReturnCode ret;
@@ -14,6 +9,7 @@ int apir_device_get_count(virtgpu * gpu) {
     REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT);
     REMOTE_CALL(gpu, encoder, decoder, ret);
 
+    int32_t dev_count = -1;
     apir_decode_int32_t(decoder, &dev_count);
 
     remote_call_finish(gpu, encoder, decoder);