diff --git a/common/common.cpp b/common/common.cpp
index f3cc55247e..0b9a3c6533 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -9,6 +9,10 @@
 #include "log.h"
 #include "llama.h"
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #include
 #include
 #include
@@ -1161,6 +1165,89 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+#ifdef GGML_USE_VULKAN
+    else {
+        // Dynamic VRAM heuristic
+        int n_gpu_layers = 0;
+
+        // Ensure Vulkan is initialized
+        ggml_backend_vk_get_device_count();
+
+        // Get available VRAM
+        size_t free, total;
+        ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total);
+
+        // Parse GGUF to get model info
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
+
+        if (ctx) {
+            int n_layers = -1;
+
+            // Find block count from GGUF metadata
+            int n_kv = gguf_get_n_kv(ctx);
+            for (int i = 0; i < n_kv; i++) {
+                const char * key = gguf_get_key(ctx, i);
+
+                // Find block_count (e.g. llama.block_count, gemma2.block_count)
+                const char * suffix = ".block_count";
+                size_t key_len = strlen(key);
+                size_t suffix_len = strlen(suffix);
+                if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
+                    n_layers = gguf_get_val_u32(ctx, i);
+                }
+            }
+
+            if (n_layers > 0) {
+                size_t file_size = std::filesystem::file_size(params.model.path);
+
+                // Reserve overhead for KV cache, compute buffers, and system
+                // KV cache is allocated dynamically by llama.cpp based on offloaded layers
+                // Conservative overhead: 800MB covers KV cache + compute for most scenarios
+                const size_t overhead = 800 * 1024 * 1024;
+
+                if (free > overhead) {
+                    size_t available_for_model = free - overhead;
+                    size_t bytes_per_layer = file_size / n_layers;
+
+                    if (bytes_per_layer > 0) {
+                        n_gpu_layers = (int) (available_for_model / bytes_per_layer);
+                    }
+
+                    // Clamp to total layers
+                    if (n_gpu_layers > n_layers) {
+                        n_gpu_layers = n_layers;
+                    }
+                    if (n_gpu_layers < 0) {
+                        n_gpu_layers = 0;
+                    }
+
+                    LOG_INF(
+                        "%s: Vulkan dynamic heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
+                        "overhead=%zu MB, calculated_layers=%d\n",
+                        __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
+                        n_gpu_layers);
+                } else {
+                    LOG_WRN(
+                        "%s: Vulkan dynamic heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
+                        "disabling GPU offload\n",
+                        __func__, free / 1024 / 1024, overhead / 1024 / 1024);
+                    n_gpu_layers = 0;
+                }
+            }
+            gguf_free(ctx);
+        } else {
+            LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
+            // Fallback to CPU-only if GGUF fails
+            n_gpu_layers = 0;
+        }
+
+        mparams.n_gpu_layers = n_gpu_layers;
+    }
+#endif
 
     mparams.main_gpu   = params.main_gpu;
     mparams.split_mode = params.split_mode;
diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
index 22c4ad928b..2ca1074dbb 100644
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@@ -1,13 +1,13 @@
 #pragma once
 
-#include "ggml-backend.h"
 #include "ggml.h"
+#include "ggml-backend.h"
 
-#ifdef __cplusplus
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#define GGML_VK_NAME "Vulkan"
+#define GGML_VK_NAME "Vulkan"
 #define GGML_VK_MAX_DEVICES 16
 
 // backend API
@@ -38,6 +38,6 @@ typedef struct {
 GGML_BACKEND_API ggml_vk_device_info ggml_backend_vk_get_device_info(int device);
 GGML_BACKEND_API int                 ggml_backend_vk_get_default_gpu_layers(int device, int default_layers);
 
-#ifdef __cplusplus
+#ifdef __cplusplus
 }
 #endif
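For reviewers, a minimal standalone sketch of the arithmetic the heuristic in the common.cpp hunk performs: reserve a fixed overhead from free VRAM, divide the remainder by the model file's average bytes per layer, and clamp to the layer count. This is illustrative only; the function name estimate_gpu_layers and the parameters free_vram, model_file_size, and n_layers are placeholders, not identifiers from the patch.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Sketch of the VRAM heuristic: how many whole layers fit after a fixed reserve.
static int estimate_gpu_layers(size_t free_vram, size_t model_file_size, int n_layers) {
    const size_t overhead = 800u * 1024 * 1024;   // same 800 MB reserve as the patch
    if (n_layers <= 0 || free_vram <= overhead) {
        return 0;                                 // no headroom: keep everything on the CPU
    }
    const size_t bytes_per_layer = model_file_size / n_layers;
    if (bytes_per_layer == 0) {
        return 0;
    }
    const int fit = (int) ((free_vram - overhead) / bytes_per_layer);
    return std::min(fit, n_layers);               // clamp to the model's actual layer count
}

int main() {
    // 8 GiB free, 4700 MiB model with 32 layers: ~147 MiB/layer, 50 would fit, clamped to 32
    printf("%d\n", estimate_gpu_layers((size_t) 8 << 30, (size_t) 4700 << 20, 32));
    // 4 GiB free, same model: (4096 - 800) / ~147 ~= 22 layers offloaded
    printf("%d\n", estimate_gpu_layers((size_t) 4 << 30, (size_t) 4700 << 20, 32));
    return 0;
}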