reset branch
This commit is contained in:
parent
03fe95d545
commit
ed4ed397a3
|
|
@ -9,10 +9,6 @@
|
|||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
|
|
@ -1165,89 +1161,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
#ifdef GGML_USE_VULKAN
|
||||
else {
|
||||
// Dynamic VRAM heuristic
|
||||
int n_gpu_layers = 0;
|
||||
|
||||
// Ensure Vulkan is initialized
|
||||
ggml_backend_vk_get_device_count();
|
||||
|
||||
// Get available VRAM
|
||||
size_t free, total;
|
||||
ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total);
|
||||
|
||||
// Parse GGUF to get model info
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
|
||||
|
||||
if (ctx) {
|
||||
int n_layers = -1;
|
||||
|
||||
// Find block count from GGUF metadata
|
||||
int n_kv = gguf_get_n_kv(ctx);
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
|
||||
// Find block_count (e.g. llama.block_count, gemma2.block_count)
|
||||
const char * suffix = ".block_count";
|
||||
size_t key_len = strlen(key);
|
||||
size_t suffix_len = strlen(suffix);
|
||||
if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
|
||||
n_layers = gguf_get_val_u32(ctx, i);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_layers > 0) {
|
||||
size_t file_size = std::filesystem::file_size(params.model.path);
|
||||
|
||||
// Reserve overhead for KV cache, compute buffers, and system
|
||||
// KV cache is allocated dynamically by llama.cpp based on offloaded layers
|
||||
// Conservative overhead: 800MB covers KV cache + compute for most scenarios
|
||||
const size_t overhead = 800 * 1024 * 1024;
|
||||
|
||||
if (free > overhead) {
|
||||
size_t available_for_model = free - overhead;
|
||||
size_t bytes_per_layer = file_size / n_layers;
|
||||
|
||||
if (bytes_per_layer > 0) {
|
||||
n_gpu_layers = (int) (available_for_model / bytes_per_layer);
|
||||
}
|
||||
|
||||
// Clamp to total layers
|
||||
if (n_gpu_layers > n_layers) {
|
||||
n_gpu_layers = n_layers;
|
||||
}
|
||||
if (n_gpu_layers < 0) {
|
||||
n_gpu_layers = 0;
|
||||
}
|
||||
|
||||
LOG_INF(
|
||||
"%s: Vulkan dynamic heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
|
||||
"overhead=%zu MB, calculated_layers=%d\n",
|
||||
__func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
|
||||
n_gpu_layers);
|
||||
} else {
|
||||
LOG_WRN(
|
||||
"%s: Vulkan dynamic heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
|
||||
"disabling GPU offload\n",
|
||||
__func__, free / 1024 / 1024, overhead / 1024 / 1024);
|
||||
n_gpu_layers = 0;
|
||||
}
|
||||
}
|
||||
gguf_free(ctx);
|
||||
} else {
|
||||
LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
|
||||
// Fallback to CPU-only if GGUF fails
|
||||
n_gpu_layers = 0;
|
||||
}
|
||||
|
||||
mparams.n_gpu_layers = n_gpu_layers;
|
||||
}
|
||||
#endif
|
||||
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
|
|
|
|||
Loading…
Reference in New Issue