diff --git a/common/common.cpp b/common/common.cpp
index 0b9a3c6533..d0df5f8beb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -9,9 +9,7 @@
 #include "log.h"
 #include "llama.h"
 
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
+#include "ggml-backend.h"
 
 #include
 #include
@@ -1165,89 +1163,104 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-#ifdef GGML_USE_VULKAN
     else {
         // Dynamic VRAM heuristic
        int n_gpu_layers = 0;
 
-        // Ensure Vulkan is initialized
-        ggml_backend_vk_get_device_count();
+        // Find the main GPU
+        int count = 0;
+        size_t free = 0;
+        size_t total = 0;
+        bool found_gpu = false;
 
-        // Get available VRAM
-        size_t free, total;
-        ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total);
-
-        // Parse GGUF to get model info
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ NULL,
-        };
-        struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
-
-        if (ctx) {
-            int n_layers = -1;
-
-            // Find block count from GGUF metadata
-            int n_kv = gguf_get_n_kv(ctx);
-            for (int i = 0; i < n_kv; i++) {
-                const char * key = gguf_get_key(ctx, i);
-
-                // Find block_count (e.g. llama.block_count, gemma2.block_count)
-                const char * suffix = ".block_count";
-                size_t key_len = strlen(key);
-                size_t suffix_len = strlen(suffix);
-                if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
-                    n_layers = gguf_get_val_u32(ctx, i);
+        size_t dev_count = ggml_backend_dev_count();
+        for (size_t i = 0; i < dev_count; ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                if (count == params.main_gpu) {
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    found_gpu = true;
+                    break;
                 }
+                count++;
             }
+        }
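+
+        // With the main GPU's free VRAM known, estimate how many whole layers of the
+        // model file fit after reserving a fixed overhead. For example, a 4 GiB model
+        // with 32 layers is ~128 MiB per layer; with 3 GiB free and 800 MiB reserved,
+        // (3072 - 800) / 128 = 17 layers get offloaded.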
-            if (n_layers > 0) {
-                size_t file_size = std::filesystem::file_size(params.model.path);
+        if (found_gpu) {
+            // Parse GGUF to get model info
+            struct gguf_init_params gguf_params = {
+                /*.no_alloc = */ true,
+                /*.ctx = */ NULL,
+            };
+            struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
 
-                // Reserve overhead for KV cache, compute buffers, and system
-                // KV cache is allocated dynamically by llama.cpp based on offloaded layers
-                // Conservative overhead: 800MB covers KV cache + compute for most scenarios
-                const size_t overhead = 800 * 1024 * 1024;
+            if (ctx) {
+                int n_layers = -1;
 
-                if (free > overhead) {
-                    size_t available_for_model = free - overhead;
-                    size_t bytes_per_layer = file_size / n_layers;
+                // Find block count from GGUF metadata
+                int n_kv = gguf_get_n_kv(ctx);
+                for (int i = 0; i < n_kv; i++) {
+                    const char * key = gguf_get_key(ctx, i);
 
-                    if (bytes_per_layer > 0) {
-                        n_gpu_layers = (int) (available_for_model / bytes_per_layer);
+                    // Find block_count (e.g. llama.block_count, gemma2.block_count)
+                    const char * suffix = ".block_count";
+                    size_t key_len = strlen(key);
+                    size_t suffix_len = strlen(suffix);
+                    if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
+                        n_layers = gguf_get_val_u32(ctx, i);
                     }
+                }
 
-                    // Clamp to total layers
-                    if (n_gpu_layers > n_layers) {
-                        n_gpu_layers = n_layers;
-                    }
-                    if (n_gpu_layers < 0) {
+                if (n_layers > 0) {
+                    size_t file_size = std::filesystem::file_size(params.model.path);
+
+                    // Reserve overhead for KV cache, compute buffers, and system
+                    // KV cache is allocated dynamically by llama.cpp based on offloaded layers
+                    // Conservative overhead: 800MB covers KV cache + compute for most scenarios
+                    const size_t overhead = 800 * 1024 * 1024;
+
+                    if (free > overhead) {
+                        size_t available_for_model = free - overhead;
+                        size_t bytes_per_layer = file_size / n_layers;
+
+                        if (bytes_per_layer > 0) {
+                            n_gpu_layers = (int) (available_for_model / bytes_per_layer);
+                        }
+
+                        // Clamp to total layers
+                        if (n_gpu_layers > n_layers) {
+                            n_gpu_layers = n_layers;
+                        }
+                        if (n_gpu_layers < 0) {
+                            n_gpu_layers = 0;
+                        }
+
+                        LOG_INF(
+                            "%s: Dynamic VRAM heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
+                            "overhead=%zu MB, calculated_layers=%d\n",
+                            __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
+                            n_gpu_layers);
+                    } else {
+                        LOG_WRN(
+                            "%s: Dynamic VRAM heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
+                            "disabling GPU offload\n",
+                            __func__, free / 1024 / 1024, overhead / 1024 / 1024);
                         n_gpu_layers = 0;
                     }
-
-                    LOG_INF(
-                        "%s: Vulkan dynamic heuristic: available_vram=%zu MB, model_size=%zu MB, n_layers=%d, "
-                        "overhead=%zu MB, calculated_layers=%d\n",
-                        __func__, free / 1024 / 1024, file_size / 1024 / 1024, n_layers, overhead / 1024 / 1024,
-                        n_gpu_layers);
-                } else {
-                    LOG_WRN(
-                        "%s: Vulkan dynamic heuristic: Insufficient VRAM (%zu MB free, %zu MB overhead needed), "
-                        "disabling GPU offload\n",
-                        __func__, free / 1024 / 1024, overhead / 1024 / 1024);
-                    n_gpu_layers = 0;
                 }
+                gguf_free(ctx);
+            } else {
+                LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
+                // Fallback to CPU-only if GGUF fails
+                n_gpu_layers = 0;
             }
-            gguf_free(ctx);
         } else {
-            LOG_WRN("%s: Failed to open GGUF file for heuristic, disabling GPU offload\n", __func__);
-            // Fallback to CPU-only if GGUF fails
-            n_gpu_layers = 0;
+            LOG_WRN("%s: Dynamic VRAM heuristic: GPU %d not found, disabling GPU offload\n", __func__, params.main_gpu);
+            n_gpu_layers = 0;
         }
 
         mparams.n_gpu_layers = n_gpu_layers;
     }
-#endif
 
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 139b236c71..dab795fb90 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -34,9 +34,6 @@ else()
     add_subdirectory(training)
     add_subdirectory(diffusion)
     add_subdirectory(model-conversion)
-    if (GGML_VULKAN)
-        add_subdirectory(vk_device_info)
-    endif()
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading
diff --git a/examples/vk_device_info/CMakeLists.txt b/examples/vk_device_info/CMakeLists.txt
deleted file mode 100644
index 2a50cd0f24..0000000000
--- a/examples/vk_device_info/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-vk-device-info)
-add_executable(${TARGET} vk_device_info.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/vk_device_info/vk_device_info.cpp b/examples/vk_device_info/vk_device_info.cpp
deleted file mode 100644
index 4b944bf0d5..0000000000
--- a/examples/vk_device_info/vk_device_info.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "ggml-vulkan.h"
-#include
-
-int main(int argc, char ** argv) {
-    int device_count = ggml_backend_vk_get_device_count();
-    printf("Found %d Vulkan devices\n", device_count);
-
-    for (int i = 0; i < device_count; i++) {
-        ggml_vk_device_info info = ggml_backend_vk_get_device_info(i);
-        printf("\nDevice %d: %s\n", i, info.device_name);
-        printf("  Vendor ID: %04x\n", info.vendor_id);
-        printf("  Device ID: %04x\n", info.device_id);
-        printf("  API Version: 0x%08x\n", info.api_version);
-        printf("  Total Device Local Memory: %llu MB\n", info.total_device_local_memory / (1024 * 1024));
-        printf("  Has Memory Budget Ext: %s\n", info.has_memory_budget_ext ? "Yes" : "No");
-        printf("  Supports Float16: %s\n", info.supports_float16 ? "Yes" : "No");
-        printf("  Supports 16-bit Storage: %s\n", info.supports_16bit_storage ? "Yes" : "No");
-
-        int default_layers = ggml_backend_vk_get_default_gpu_layers(i, -1);
-        printf("  Default GPU Layers (heuristic): %d\n", default_layers);
-    }
-
-    return 0;
-}
diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
index 2ca1074dbb..ed5ea5f798 100644
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@@ -24,20 +24,6 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(voi
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
 
-typedef struct {
-    char device_name[256];
-    uint32_t vendor_id;
-    uint32_t device_id;
-    uint64_t total_device_local_memory;
-    bool has_memory_budget_ext;
-    bool supports_float16;
-    bool supports_16bit_storage;
-    uint32_t api_version;
-} ggml_vk_device_info;
-
-GGML_BACKEND_API ggml_vk_device_info ggml_backend_vk_get_device_info(int device);
-GGML_BACKEND_API int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index cf37ce6d84..bc8d3cdcb5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13303,59 +13303,6 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
     }
 }
 
-ggml_vk_device_info ggml_backend_vk_get_device_info(int device) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
-    vk::PhysicalDeviceProperties props = vkdev.getProperties();
-
-    ggml_vk_device_info info = {};
-    snprintf(info.device_name, sizeof(info.device_name), "%s", props.deviceName.data());
-    info.vendor_id = props.vendorID;
-    info.device_id = props.deviceID;
-    info.api_version = props.apiVersion;
-
-    // Get memory info
-    size_t free, total;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    info.total_device_local_memory = total;
-    info.has_memory_budget_ext = vk_instance.device_supports_membudget[device];
-
-    // Check features
-    VkPhysicalDeviceFeatures2 device_features2;
-    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    VkPhysicalDeviceVulkan11Features vk11_features;
-    vk11_features.pNext = nullptr;
-    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-    device_features2.pNext = &vk11_features;
-    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
-
-    info.supports_16bit_storage = vk11_features.storageBuffer16BitAccess;
-
-    // Check for float16 support (shaderFloat16 or shaderInt8)
-    const std::vector<vk::ExtensionProperties> ext_props = vkdev.enumerateDeviceExtensionProperties();
-    bool fp16_compute = false;
-    for (const auto& properties : ext_props) {
-        if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
-            fp16_compute = true;
-            break;
-        }
-    }
-    info.supports_float16 = fp16_compute;
-
-    return info;
-}
-
-int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers) {
-    // The dynamic heuristic in common.cpp handles the default case (n_gpu_layers = -1).
-    // This function is kept for API compatibility but currently returns 0 to be safe
-    // if called directly without the heuristic logic.
-    (void)device;
-    (void)default_layers;
-    return 0;
-}
-
 static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
     GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());