diff --git a/common/common.cpp b/common/common.cpp index 0b9a3c6533..f3cc55247e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -9,10 +9,6 @@ #include "log.h" #include "llama.h" -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - #include #include #include @@ -1165,89 +1161,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } -#ifdef GGML_USE_VULKAN - else { - // Dynamic VRAM heuristic - int n_gpu_layers = 0; - - // Ensure Vulkan is initialized - ggml_backend_vk_get_device_count(); - - // Get available VRAM - size_t free, total; - ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total); - - // Parse GGUF to get model info - struct gguf_init_params gguf_params = { - /*.no_alloc = */ true, - /*.ctx = */ NULL, - }; - struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params); - - if (ctx) { - int n_layers = -1; - - // Find block count from GGUF metadata - int n_kv = gguf_get_n_kv(ctx); - for (int i = 0; i < n_kv; i++) { - const char * key = gguf_get_key(ctx, i); - - // Find block_count (e.g. llama.block_count, gemma2.block_count) - const char * suffix = ".block_count"; - size_t key_len = strlen(key); - size_t suffix_len = strlen(suffix); - if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) { - n_layers = gguf_get_val_u32(ctx, i); - } - } - - if (n_layers > 0) { - size_t file_size = std::filesystem::file_size(params.model.path); - - // Reserve overhead for KV cache, compute buffers, and system - // KV cache is allocated dynamically by llama.cpp based on offloaded layers - // Conservative overhead: 800MB covers KV cache + compute for most scenarios - const size_t overhead = 800 * 1024 * 1024; - - if (free > overhead) { - size_t available_for_model = free - overhead; - size_t bytes_per_layer = file_size / n_layers; - - if (bytes_per_layer > 0) { - n_gpu_layers = (int) (available_for_model / bytes_per_layer); - } - - // Clamp to total layers - if (n_gpu_layers > n_layers) { - n_gpu_layers = n_layers; - } - if (n_gpu_layers < 0) { - n_gpu_layers = 0; - } - - LOG_INF( - "%s: Vulkan dynamic heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, " - "overhead=%zu MB, calculated_layers=%d\n", - __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024, - n_gpu_layers); - } else { - LOG_WRN( - "%s: Vulkan dynamic heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), " - "disabling GPU offload\n", - __func__, free / 1024 / 1024, overhead / 1024 / 1024); - n_gpu_layers = 0; - } - } - gguf_free(ctx); - } else { - LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__); - // Fallback to CPU-only if GGUF fails - n_gpu_layers = 0; - } - - mparams.n_gpu_layers = n_gpu_layers; - } -#endif mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode;