refactor(common): use generic backend API for VRAM heuristic
Replaces the Vulkan-specific calls with the generic ggml_backend_dev_memory API so the VRAM heuristic is backend-agnostic. Reverts the changes to ggml-vulkan.cpp/h and removes the vk_device_info example, per reviewer feedback.
parent e8bf9ed0ce
commit 4475a373d8
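
For reference, the new heuristic relies only on the generic device registry from ggml-backend.h rather than on ggml-vulkan.h. Below is a minimal standalone sketch of that query pattern; it is not part of this commit and assumes the current ggml-backend.h API (ggml_backend_load_all and ggml_backend_dev_name come from that header and do not appear in this diff):

#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    // when backends are built as dynamic modules they must be loaded first
    ggml_backend_load_all();

    size_t dev_count = ggml_backend_dev_count();
    for (size_t i = 0; i < dev_count; ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
            continue; // the heuristic only considers GPU devices
        }
        size_t free = 0, total = 0;
        ggml_backend_dev_memory(dev, &free, &total); // backend-agnostic free/total VRAM query
        printf("GPU %zu (%s): %zu MB free / %zu MB total\n",
               i, ggml_backend_dev_name(dev), free / 1024 / 1024, total / 1024 / 1024);
    }
    return 0;
}

The diff in common.cpp below uses the same enumerate-and-filter loop to locate the device matching params.main_gpu before reading its free/total memory.
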
@@ -9,9 +9,7 @@
 #include "log.h"
 #include "llama.h"
 
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
+#include "ggml-backend.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -1165,89 +1163,104 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-#ifdef GGML_USE_VULKAN
     else {
         // Dynamic VRAM heuristic
         int n_gpu_layers = 0;
 
-        // Ensure Vulkan is initialized
-        ggml_backend_vk_get_device_count();
-
-        // Get available VRAM
-        size_t free, total;
-        ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total);
-
-        // Parse GGUF to get model info
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ NULL,
-        };
-        struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
-
-        if (ctx) {
-            int n_layers = -1;
-
-            // Find block count from GGUF metadata
-            int n_kv = gguf_get_n_kv(ctx);
-            for (int i = 0; i < n_kv; i++) {
-                const char * key = gguf_get_key(ctx, i);
-
-                // Find block_count (e.g. llama.block_count, gemma2.block_count)
-                const char * suffix = ".block_count";
-                size_t key_len = strlen(key);
-                size_t suffix_len = strlen(suffix);
-                if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
-                    n_layers = gguf_get_val_u32(ctx, i);
-                }
-            }
-
-            if (n_layers > 0) {
-                size_t file_size = std::filesystem::file_size(params.model.path);
-
-                // Reserve overhead for KV cache, compute buffers, and system
-                // KV cache is allocated dynamically by llama.cpp based on offloaded layers
-                // Conservative overhead: 800MB covers KV cache + compute for most scenarios
-                const size_t overhead = 800 * 1024 * 1024;
-
-                if (free > overhead) {
-                    size_t available_for_model = free - overhead;
-                    size_t bytes_per_layer = file_size / n_layers;
-
-                    if (bytes_per_layer > 0) {
-                        n_gpu_layers = (int) (available_for_model / bytes_per_layer);
-                    }
-
-                    // Clamp to total layers
-                    if (n_gpu_layers > n_layers) {
-                        n_gpu_layers = n_layers;
-                    }
-                    if (n_gpu_layers < 0) {
-                        n_gpu_layers = 0;
-                    }
-
-                    LOG_INF(
-                        "%s: Vulkan dynamic heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
-                        "overhead=%zu MB, calculated_layers=%d\n",
-                        __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
-                        n_gpu_layers);
-                } else {
-                    LOG_WRN(
-                        "%s: Vulkan dynamic heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
-                        "disabling GPU offload\n",
-                        __func__, free / 1024 / 1024, overhead / 1024 / 1024);
-                    n_gpu_layers = 0;
-                }
-            }
-            gguf_free(ctx);
-        } else {
-            LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
-            // Fallback to CPU-only if GGUF fails
-            n_gpu_layers = 0;
-        }
+        // Find the main GPU
+        int count = 0;
+        size_t free = 0;
+        size_t total = 0;
+        bool found_gpu = false;
+
+        size_t dev_count = ggml_backend_dev_count();
+        for (size_t i = 0; i < dev_count; ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                if (count == params.main_gpu) {
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    found_gpu = true;
+                    break;
+                }
+                count++;
+            }
+        }
+
+        if (found_gpu) {
+            // Parse GGUF to get model info
+            struct gguf_init_params gguf_params = {
+                /*.no_alloc = */ true,
+                /*.ctx = */ NULL,
+            };
+            struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
+
+            if (ctx) {
+                int n_layers = -1;
+
+                // Find block count from GGUF metadata
+                int n_kv = gguf_get_n_kv(ctx);
+                for (int i = 0; i < n_kv; i++) {
+                    const char * key = gguf_get_key(ctx, i);
+
+                    // Find block_count (e.g. llama.block_count, gemma2.block_count)
+                    const char * suffix = ".block_count";
+                    size_t key_len = strlen(key);
+                    size_t suffix_len = strlen(suffix);
+                    if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
+                        n_layers = gguf_get_val_u32(ctx, i);
+                    }
+                }
+
+                if (n_layers > 0) {
+                    size_t file_size = std::filesystem::file_size(params.model.path);
+
+                    // Reserve overhead for KV cache, compute buffers, and system
+                    // KV cache is allocated dynamically by llama.cpp based on offloaded layers
+                    // Conservative overhead: 800MB covers KV cache + compute for most scenarios
+                    const size_t overhead = 800 * 1024 * 1024;
+
+                    if (free > overhead) {
+                        size_t available_for_model = free - overhead;
+                        size_t bytes_per_layer = file_size / n_layers;
+
+                        if (bytes_per_layer > 0) {
+                            n_gpu_layers = (int) (available_for_model / bytes_per_layer);
+                        }
+
+                        // Clamp to total layers
+                        if (n_gpu_layers > n_layers) {
+                            n_gpu_layers = n_layers;
+                        }
+                        if (n_gpu_layers < 0) {
+                            n_gpu_layers = 0;
+                        }
+
+                        LOG_INF(
+                            "%s: Dynamic VRAM heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
+                            "overhead=%zu MB, calculated_layers=%d\n",
+                            __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
+                            n_gpu_layers);
+                    } else {
+                        LOG_WRN(
+                            "%s: Dynamic VRAM heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
+                            "disabling GPU offload\n",
+                            __func__, free / 1024 / 1024, overhead / 1024 / 1024);
+                        n_gpu_layers = 0;
+                    }
+                }
+                gguf_free(ctx);
+            } else {
+                LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
+                // Fallback to CPU-only if GGUF fails
+                n_gpu_layers = 0;
+            }
+        } else {
+            LOG_WRN("%s: Dynamic VRAM heuristic: GPU %d not found, disabling GPU offload\n", __func__, params.main_gpu);
+            n_gpu_layers = 0;
+        }
 
         mparams.n_gpu_layers = n_gpu_layers;
     }
-#endif
 
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
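
For illustration, plugging hypothetical numbers into the heuristic above: a 4.0 GiB GGUF with 32 layers gives bytes_per_layer = 4096 MiB / 32 = 128 MiB; with 8192 MiB of free VRAM reported by ggml_backend_dev_memory and the fixed 800 MiB overhead, available_for_model = 7392 MiB, so n_gpu_layers = 7392 / 128 = 57, which is then clamped to the model's 32 layers. If free VRAM were at or below the 800 MiB overhead, the heuristic disables offload entirely (n_gpu_layers = 0).
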
@@ -34,9 +34,6 @@ else()
     add_subdirectory(training)
     add_subdirectory(diffusion)
     add_subdirectory(model-conversion)
-    if (GGML_VULKAN)
-        add_subdirectory(vk_device_info)
-    endif()
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading
@@ -1,5 +0,0 @@
-set(TARGET llama-vk-device-info)
-add_executable(${TARGET} vk_device_info.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -1,24 +0,0 @@
-#include "ggml-vulkan.h"
-#include <cstdio>
-
-int main(int argc, char ** argv) {
-    int device_count = ggml_backend_vk_get_device_count();
-    printf("Found %d Vulkan devices\n", device_count);
-
-    for (int i = 0; i < device_count; i++) {
-        ggml_vk_device_info info = ggml_backend_vk_get_device_info(i);
-        printf("\nDevice %d: %s\n", i, info.device_name);
-        printf(" Vendor ID: %04x\n", info.vendor_id);
-        printf(" Device ID: %04x\n", info.device_id);
-        printf(" API Version: 0x%08x\n", info.api_version);
-        printf(" Total Device Local Memory: %llu MB\n", info.total_device_local_memory / (1024 * 1024));
-        printf(" Has Memory Budget Ext: %s\n", info.has_memory_budget_ext ? "Yes" : "No");
-        printf(" Supports Float16: %s\n", info.supports_float16 ? "Yes" : "No");
-        printf(" Supports 16-bit Storage: %s\n", info.supports_16bit_storage ? "Yes" : "No");
-
-        int default_layers = ggml_backend_vk_get_default_gpu_layers(i, -1);
-        printf(" Default GPU Layers (heuristic): %d\n", default_layers);
-    }
-
-    return 0;
-}
@@ -24,20 +24,6 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
 
-typedef struct {
-    char device_name[256];
-    uint32_t vendor_id;
-    uint32_t device_id;
-    uint64_t total_device_local_memory;
-    bool has_memory_budget_ext;
-    bool supports_float16;
-    bool supports_16bit_storage;
-    uint32_t api_version;
-} ggml_vk_device_info;
-
-GGML_BACKEND_API ggml_vk_device_info ggml_backend_vk_get_device_info(int device);
-GGML_BACKEND_API int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers);
-
 #ifdef __cplusplus
 }
 #endif
@@ -13303,59 +13303,6 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
     }
 }
 
-ggml_vk_device_info ggml_backend_vk_get_device_info(int device) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
-    vk::PhysicalDeviceProperties props = vkdev.getProperties();
-
-    ggml_vk_device_info info = {};
-    snprintf(info.device_name, sizeof(info.device_name), "%s", props.deviceName.data());
-    info.vendor_id = props.vendorID;
-    info.device_id = props.deviceID;
-    info.api_version = props.apiVersion;
-
-    // Get memory info
-    size_t free, total;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    info.total_device_local_memory = total;
-    info.has_memory_budget_ext = vk_instance.device_supports_membudget[device];
-
-    // Check features
-    VkPhysicalDeviceFeatures2 device_features2;
-    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    VkPhysicalDeviceVulkan11Features vk11_features;
-    vk11_features.pNext = nullptr;
-    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-    device_features2.pNext = &vk11_features;
-    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
-
-    info.supports_16bit_storage = vk11_features.storageBuffer16BitAccess;
-
-    // Check for float16 support (shaderFloat16 or shaderInt8)
-    const std::vector<vk::ExtensionProperties> ext_props = vkdev.enumerateDeviceExtensionProperties();
-    bool fp16_compute = false;
-    for (const auto& properties : ext_props) {
-        if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
-            fp16_compute = true;
-            break;
-        }
-    }
-    info.supports_float16 = fp16_compute;
-
-    return info;
-}
-
-int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers) {
-    // The dynamic heuristic in common.cpp handles the default case (n_gpu_layers = -1).
-    // This function is kept for API compatibility but currently returns 0 to be safe
-    // if called directly without the heuristic logic.
-    (void)device;
-    (void)default_layers;
-    return 0;
-}
-
 static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
     GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
 