refactor(common): use generic backend API for VRAM heuristic
Replaces the Vulkan-specific calls with the generic ggml_backend_dev_memory API so the VRAM heuristic is backend-agnostic. Reverts the changes to ggml-vulkan.cpp/h and removes the vk_device_info example, per reviewer feedback.
parent e8bf9ed0ce
commit 4475a373d8
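
For reference, the new heuristic relies only on the generic device registry from ggml-backend.h rather than on ggml-vulkan.h. Below is a minimal standalone sketch of that query pattern; it is not part of this commit and assumes the current ggml-backend.h API (ggml_backend_load_all and ggml_backend_dev_name come from that header and do not appear in this diff):

#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    // when backends are built as dynamic modules they must be loaded first
    ggml_backend_load_all();

    size_t dev_count = ggml_backend_dev_count();
    for (size_t i = 0; i < dev_count; ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
            continue; // the heuristic only considers GPU devices
        }
        size_t free = 0, total = 0;
        ggml_backend_dev_memory(dev, &free, &total); // backend-agnostic free/total VRAM query
        printf("GPU %zu (%s): %zu MB free / %zu MB total\n",
               i, ggml_backend_dev_name(dev), free / 1024 / 1024, total / 1024 / 1024);
    }
    return 0;
}

The diff in common.cpp below uses the same enumerate-and-filter loop to locate the device matching params.main_gpu before reading its free/total memory.
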
@@ -9,9 +9,7 @@
 #include "log.h"
 #include "llama.h"
 
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
+#include "ggml-backend.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -1165,89 +1163,104 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-#ifdef GGML_USE_VULKAN
     else {
         // Dynamic VRAM heuristic
         int n_gpu_layers = 0;
 
-        // Ensure Vulkan is initialized
-        ggml_backend_vk_get_device_count();
-
-        // Get available VRAM
-        size_t free, total;
-        ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total);
-
-        // Parse GGUF to get model info
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ NULL,
-        };
-        struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
-
-        if (ctx) {
-            int n_layers = -1;
-
-            // Find block count from GGUF metadata
-            int n_kv = gguf_get_n_kv(ctx);
-            for (int i = 0; i < n_kv; i++) {
-                const char * key = gguf_get_key(ctx, i);
-
-                // Find block_count (e.g. llama.block_count, gemma2.block_count)
-                const char * suffix = ".block_count";
-                size_t key_len = strlen(key);
-                size_t suffix_len = strlen(suffix);
-                if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
-                    n_layers = gguf_get_val_u32(ctx, i);
-                }
-            }
-
-            if (n_layers > 0) {
-                size_t file_size = std::filesystem::file_size(params.model.path);
-
-                // Reserve overhead for KV cache, compute buffers, and system
-                // KV cache is allocated dynamically by llama.cpp based on offloaded layers
-                // Conservative overhead: 800MB covers KV cache + compute for most scenarios
-                const size_t overhead = 800 * 1024 * 1024;
-
-                if (free > overhead) {
-                    size_t available_for_model = free - overhead;
-                    size_t bytes_per_layer = file_size / n_layers;
-
-                    if (bytes_per_layer > 0) {
-                        n_gpu_layers = (int) (available_for_model / bytes_per_layer);
-                    }
-
-                    // Clamp to total layers
-                    if (n_gpu_layers > n_layers) {
-                        n_gpu_layers = n_layers;
-                    }
-                    if (n_gpu_layers < 0) {
-                        n_gpu_layers = 0;
-                    }
-
-                    LOG_INF(
-                        "%s: Vulkan dynamic heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
-                        "overhead=%zu MB, calculated_layers=%d\n",
-                        __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
-                        n_gpu_layers);
-                } else {
-                    LOG_WRN(
-                        "%s: Vulkan dynamic heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
-                        "disabling GPU offload\n",
-                        __func__, free / 1024 / 1024, overhead / 1024 / 1024);
-                    n_gpu_layers = 0;
-                }
-            }
-            gguf_free(ctx);
-        } else {
-            LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
-            // Fallback to CPU-only if GGUF fails
-            n_gpu_layers = 0;
-        }
+        // Find the main GPU
+        int count = 0;
+        size_t free = 0;
+        size_t total = 0;
+        bool found_gpu = false;
+
+        size_t dev_count = ggml_backend_dev_count();
+        for (size_t i = 0; i < dev_count; ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                if (count == params.main_gpu) {
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    found_gpu = true;
+                    break;
+                }
+                count++;
+            }
+        }
+
+        if (found_gpu) {
+            // Parse GGUF to get model info
+            struct gguf_init_params gguf_params = {
+                /*.no_alloc = */ true,
+                /*.ctx = */ NULL,
+            };
+            struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
+
+            if (ctx) {
+                int n_layers = -1;
+
+                // Find block count from GGUF metadata
+                int n_kv = gguf_get_n_kv(ctx);
+                for (int i = 0; i < n_kv; i++) {
+                    const char * key = gguf_get_key(ctx, i);
+
+                    // Find block_count (e.g. llama.block_count, gemma2.block_count)
+                    const char * suffix = ".block_count";
+                    size_t key_len = strlen(key);
+                    size_t suffix_len = strlen(suffix);
+                    if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
+                        n_layers = gguf_get_val_u32(ctx, i);
+                    }
+                }
+
+                if (n_layers > 0) {
+                    size_t file_size = std::filesystem::file_size(params.model.path);
+
+                    // Reserve overhead for KV cache, compute buffers, and system
+                    // KV cache is allocated dynamically by llama.cpp based on offloaded layers
+                    // Conservative overhead: 800MB covers KV cache + compute for most scenarios
+                    const size_t overhead = 800 * 1024 * 1024;
+
+                    if (free > overhead) {
+                        size_t available_for_model = free - overhead;
+                        size_t bytes_per_layer = file_size / n_layers;
+
+                        if (bytes_per_layer > 0) {
+                            n_gpu_layers = (int) (available_for_model / bytes_per_layer);
+                        }
+
+                        // Clamp to total layers
+                        if (n_gpu_layers > n_layers) {
+                            n_gpu_layers = n_layers;
+                        }
+                        if (n_gpu_layers < 0) {
+                            n_gpu_layers = 0;
+                        }
+
+                        LOG_INF(
+                            "%s: Dynamic VRAM heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
+                            "overhead=%zu MB, calculated_layers=%d\n",
+                            __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
+                            n_gpu_layers);
+                    } else {
+                        LOG_WRN(
+                            "%s: Dynamic VRAM heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
+                            "disabling GPU offload\n",
+                            __func__, free / 1024 / 1024, overhead / 1024 / 1024);
+                        n_gpu_layers = 0;
+                    }
+                }
+                gguf_free(ctx);
+            } else {
+                LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
+                // Fallback to CPU-only if GGUF fails
+                n_gpu_layers = 0;
+            }
+        } else {
+            LOG_WRN("%s: Dynamic VRAM heuristic: GPU %d not found, disabling GPU offload\n", __func__, params.main_gpu);
+            n_gpu_layers = 0;
+        }
 
         mparams.n_gpu_layers = n_gpu_layers;
     }
-#endif
 
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
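
For illustration, plugging hypothetical numbers into the heuristic above: a 4.0 GiB GGUF with 32 layers gives bytes_per_layer = 4096 MiB / 32 = 128 MiB; with 8192 MiB of free VRAM reported by ggml_backend_dev_memory and the fixed 800 MiB overhead, available_for_model = 7392 MiB, so n_gpu_layers = 7392 / 128 = 57, which is then clamped to the model's 32 layers. If free VRAM were at or below the 800 MiB overhead, the heuristic disables offload entirely (n_gpu_layers = 0).
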
@@ -34,9 +34,6 @@ else()
     add_subdirectory(training)
     add_subdirectory(diffusion)
     add_subdirectory(model-conversion)
-    if (GGML_VULKAN)
-        add_subdirectory(vk_device_info)
-    endif()
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading
@@ -1,5 +0,0 @@
-set(TARGET llama-vk-device-info)
-add_executable(${TARGET} vk_device_info.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -1,24 +0,0 @@
-#include "ggml-vulkan.h"
-#include <cstdio>
-
-int main(int argc, char ** argv) {
-    int device_count = ggml_backend_vk_get_device_count();
-    printf("Found %d Vulkan devices\n", device_count);
-
-    for (int i = 0; i < device_count; i++) {
-        ggml_vk_device_info info = ggml_backend_vk_get_device_info(i);
-        printf("\nDevice %d: %s\n", i, info.device_name);
-        printf(" Vendor ID: %04x\n", info.vendor_id);
-        printf(" Device ID: %04x\n", info.device_id);
-        printf(" API Version: 0x%08x\n", info.api_version);
-        printf(" Total Device Local Memory: %llu MB\n", info.total_device_local_memory / (1024 * 1024));
-        printf(" Has Memory Budget Ext: %s\n", info.has_memory_budget_ext ? "Yes" : "No");
-        printf(" Supports Float16: %s\n", info.supports_float16 ? "Yes" : "No");
-        printf(" Supports 16-bit Storage: %s\n", info.supports_16bit_storage ? "Yes" : "No");
-
-        int default_layers = ggml_backend_vk_get_default_gpu_layers(i, -1);
-        printf(" Default GPU Layers (heuristic): %d\n", default_layers);
-    }
-
-    return 0;
-}
@@ -24,20 +24,6 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
 
-typedef struct {
-    char device_name[256];
-    uint32_t vendor_id;
-    uint32_t device_id;
-    uint64_t total_device_local_memory;
-    bool has_memory_budget_ext;
-    bool supports_float16;
-    bool supports_16bit_storage;
-    uint32_t api_version;
-} ggml_vk_device_info;
-
-GGML_BACKEND_API ggml_vk_device_info ggml_backend_vk_get_device_info(int device);
-GGML_BACKEND_API int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers);
-
 #ifdef __cplusplus
 }
 #endif
@@ -13303,59 +13303,6 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
     }
 }
 
-ggml_vk_device_info ggml_backend_vk_get_device_info(int device) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
-    vk::PhysicalDeviceProperties props = vkdev.getProperties();
-
-    ggml_vk_device_info info = {};
-    snprintf(info.device_name, sizeof(info.device_name), "%s", props.deviceName.data());
-    info.vendor_id = props.vendorID;
-    info.device_id = props.deviceID;
-    info.api_version = props.apiVersion;
-
-    // Get memory info
-    size_t free, total;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    info.total_device_local_memory = total;
-    info.has_memory_budget_ext = vk_instance.device_supports_membudget[device];
-
-    // Check features
-    VkPhysicalDeviceFeatures2 device_features2;
-    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    VkPhysicalDeviceVulkan11Features vk11_features;
-    vk11_features.pNext = nullptr;
-    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-    device_features2.pNext = &vk11_features;
-    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
-
-    info.supports_16bit_storage = vk11_features.storageBuffer16BitAccess;
-
-    // Check for float16 support (shaderFloat16 or shaderInt8)
-    const std::vector<vk::ExtensionProperties> ext_props = vkdev.enumerateDeviceExtensionProperties();
-    bool fp16_compute = false;
-    for (const auto& properties : ext_props) {
-        if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
-            fp16_compute = true;
-            break;
-        }
-    }
-    info.supports_float16 = fp16_compute;
-
-    return info;
-}
-
-int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers) {
-    // The dynamic heuristic in common.cpp handles the default case (n_gpu_layers = -1).
-    // This function is kept for API compatibility but currently returns 0 to be safe
-    // if called directly without the heuristic logic.
-    (void)device;
-    (void)default_layers;
-    return 0;
-}
-
 static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
     GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
 