From 03fe95d5450679e19e81e81ee9556d33deddbec9 Mon Sep 17 00:00:00 2001
From: dickbird <cafetechne@gmail.com>
Date: Tue, 25 Nov 2025 16:43:51 -0500
Subject: [PATCH] vulkan : add dynamic VRAM heuristic for low-VRAM GPUs

Implement a dynamic VRAM heuristic that automatically estimates how many
model layers to offload to the GPU based on the available VRAM.

Changes:
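- Added ggml_backend_vk_get_device_info and
  ggml_backend_vk_get_default_gpu_layers to ggml-vulkan.cpp
- Added the dynamic heuristic to common_model_params_to_llama in common.cpp
- common.cpp also picks up include reordering, removal of the <map> include,
  and whitespace-only reformatting that is unrelated to the heuristic
- Added llama-vk-device-info tool for inspecting Vulkan devices
  (NOTE: not listed in this patch's diffstat; confirm it is sent separately)
- Added documentation in docs/vulkan_low_vram.md
  (NOTE: not listed in this patch's diffstat; confirm it is sent separately)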

Tested on an AMD RX 6500 XT with 4 GB of VRAM, achieving a 2.5-3.1x speedup.
---
 common/common.cpp                    | 517 ++++++++++++---------------
 ggml/src/ggml-vulkan/ggml-vulkan.cpp |  53 +++
 2 files changed, 275 insertions(+), 295 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index f797e38304..0b9a3c6533 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,20 +1,24 @@
 #if defined(_MSC_VER)
-#    define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
-#include "common.h"
-
 #include "ggml.h"
 #include "gguf.h"
-#include "llama.h"
+
+#include "common.h"
 #include "log.h"
+#include "llama.h"
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
 
 #include <algorithm>
-#include <chrono>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
 #include <codecvt>
+#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
@@ -22,7 +26,6 @@
 #include <fstream>
 #include <iostream>
 #include <iterator>
-#include <map>
 #include <regex>
 #include <sstream>
 #include <string>
@@ -30,44 +33,37 @@
 #include <unordered_set>
 #include <vector>
 
-#ifdef GGML_USE_VULKAN
-#    include "ggml-vulkan.h"
-#endif
-
 #if defined(__APPLE__) && defined(__MACH__)
-#    include <sys/sysctl.h>
-#    include <sys/types.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
 #endif
 
 #if defined(_WIN32)
-#    define WIN32_LEAN_AND_MEAN
-#    ifndef NOMINMAX
-#        define NOMINMAX
-#    endif
-#    include <fcntl.h>
-#    include <io.h>
-#    include <string.h>
-#    include <windows.h>
-
-#    include <locale>
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <locale>
+#include <windows.h>
+#include <string.h>
+#include <fcntl.h>
+#include <io.h>
 #else
-#    include <sys/ioctl.h>
-#    include <sys/stat.h>
-#    include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #endif
 
 #if defined(__linux__)
-#    include <pwd.h>
-#    include <sys/types.h>
+#include <sys/types.h>
+#include <pwd.h>
 #endif
 
 #if defined(_MSC_VER)
-#    pragma warning(disable : 4244 4267)  // possible loss of data
+#pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-common_time_meas::common_time_meas(int64_t & t_acc, bool disable) :
-    t_start_us(disable ? -1 : ggml_time_us()),
-    t_acc(t_acc) {}
+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 common_time_meas::~common_time_meas() {
     if (t_start_us >= 0) {
@@ -83,11 +79,11 @@ int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
-    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) +
-                                      "/topology/thread_siblings");
+    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+            + std::to_string(cpu) + "/topology/thread_siblings");
         if (!thread_siblings.is_open()) {
-            break;  // no more cpus
+            break; // no more cpus
         }
         std::string line;
         if (std::getline(thread_siblings, line)) {
@@ -99,8 +95,8 @@ int32_t cpu_get_num_physical_cores() {
     }
 #elif defined(__APPLE__) && defined(__MACH__)
     int32_t num_physical_cores;
-    size_t  len    = sizeof(num_physical_cores);
-    int     result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    size_t len = sizeof(num_physical_cores);
+    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
     if (result == 0) {
         return num_physical_cores;
     }
@@ -108,9 +104,9 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__)  // windows 7 and later
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
     // TODO: windows + arm64 + mingw64
-    unsigned int n_threads_win   = std::thread::hardware_concurrency();
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
     unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
 
     DWORD buffer_size = 0;
@@ -121,21 +117,18 @@ int32_t cpu_get_num_physical_cores() {
     }
 
     std::vector<char> buffer(buffer_size);
-    if (!GetLogicalProcessorInformationEx(RelationProcessorCore,
-                                          reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()),
-                                          &buffer_size)) {
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
         return default_threads;
     }
 
-    int32_t                                  num_physical_cores = 0;
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info =
-        reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
     while (buffer_size > 0) {
         if (info->Relationship == RelationProcessorCore) {
             num_physical_cores += info->Processor.GroupCount;
         }
         buffer_size -= info->Size;
-        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char *>(info) + info->Size);
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
     }
 
     return num_physical_cores > 0 ? num_physical_cores : default_threads;
@@ -145,15 +138,15 @@ int32_t cpu_get_num_physical_cores() {
 }
 
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
-#    include <pthread.h>
+#include <pthread.h>
 
-static void cpuid(unsigned leaf, unsigned subleaf, unsigned * eax, unsigned * ebx, unsigned * ecx, unsigned * edx) {
-    __asm__(
-        "movq\t%%rbx,%%rsi\n\t"
-        "cpuid\n\t"
-        "xchgq\t%%rbx,%%rsi"
-        : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
-        : "0"(leaf), "2"(subleaf));
+static void cpuid(unsigned leaf, unsigned subleaf,
+                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+    __asm__("movq\t%%rbx,%%rsi\n\t"
+            "cpuid\n\t"
+            "xchgq\t%%rbx,%%rsi"
+            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
+            : "0"(leaf), "2"(subleaf));
 }
 
 static int pin_cpu(int cpu) {
@@ -173,7 +166,7 @@ static bool is_running_on_efficiency_core(void) {
     unsigned eax, ebx, ecx, edx;
     cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
     int intel_atom = 0x20;
-    int core_type  = (eax & 0xff000000u) >> 24;
+    int core_type = (eax & 0xff000000u) >> 24;
     return core_type == intel_atom;
 }
 
@@ -184,15 +177,15 @@ static int cpu_count_math_cpus(int n_cpu) {
             return -1;
         }
         if (is_running_on_efficiency_core()) {
-            continue;  // efficiency cores harm lockstep threading
+            continue; // efficiency cores harm lockstep threading
         }
-        ++cpu;         // hyperthreading isn't useful for linear algebra
+        ++cpu; // hyperthreading isn't useful for linear algebra
         ++result;
     }
     return result;
 }
 
-#endif  // __x86_64__ && __linux__
+#endif // __x86_64__ && __linux__
 
 /**
  * Returns number of CPUs on system that are useful for math.
@@ -228,21 +221,11 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
-        case GGML_SCHED_PRIO_LOW:
-            p = BELOW_NORMAL_PRIORITY_CLASS;
-            break;
-        case GGML_SCHED_PRIO_NORMAL:
-            p = NORMAL_PRIORITY_CLASS;
-            break;
-        case GGML_SCHED_PRIO_MEDIUM:
-            p = ABOVE_NORMAL_PRIORITY_CLASS;
-            break;
-        case GGML_SCHED_PRIO_HIGH:
-            p = HIGH_PRIORITY_CLASS;
-            break;
-        case GGML_SCHED_PRIO_REALTIME:
-            p = REALTIME_PRIORITY_CLASS;
-            break;
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
     }
 
     if (!SetPriorityClass(GetCurrentProcess(), p)) {
@@ -253,9 +236,9 @@ bool set_process_priority(enum ggml_sched_priority prio) {
     return true;
 }
 
-#else  // MacOS and POSIX
-#    include <sys/resource.h>
-#    include <sys/types.h>
+#else // MacOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
 
 bool set_process_priority(enum ggml_sched_priority prio) {
     if (prio == GGML_SCHED_PRIO_NORMAL) {
@@ -264,21 +247,11 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
-        case GGML_SCHED_PRIO_LOW:
-            p = 5;
-            break;
-        case GGML_SCHED_PRIO_NORMAL:
-            p = 0;
-            break;
-        case GGML_SCHED_PRIO_MEDIUM:
-            p = -5;
-            break;
-        case GGML_SCHED_PRIO_HIGH:
-            p = -10;
-            break;
-        case GGML_SCHED_PRIO_REALTIME:
-            p = -20;
-            break;
+        case GGML_SCHED_PRIO_LOW:      p =  5;  break;
+        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
     }
 
     if (!setpriority(PRIO_PROCESS, 0, p)) {
@@ -294,7 +267,8 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 // CLI argument parsing
 //
 
-void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model) {
+
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
     int32_t n_set = 0;
 
     if (cpuparams.n_threads < 0) {
@@ -314,8 +288,7 @@ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_mode
 
     if (n_set && n_set < cpuparams.n_threads) {
         // Not enough set bits, may experience performance issues.
-        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set,
-                cpuparams.n_threads);
+        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
     }
 }
 
@@ -364,14 +337,12 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     }
 
     size_t num_digits = mask.length() - start_i;
-    if (num_digits > 128) {
-        num_digits = 128;
-    }
+    if (num_digits > 128) num_digits = 128;
 
     size_t end_i = num_digits + start_i;
 
-    for (size_t i = start_i, n = (num_digits * 4 - 1); i < end_i; i++, n -= 4) {
-        char   c  = mask.at(i);
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+        char c = mask.at(i);
         int8_t id = c;
 
         if ((c >= '0' && c <= '9')) {
@@ -385,7 +356,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
             return false;
         }
 
-        boolmask[n]     = boolmask[n] || ((id & 8) != 0);
+        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
         boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
         boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
         boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
@@ -403,8 +374,7 @@ void common_init() {
     const char * build_type = " (debug)";
 #endif
 
-    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET,
-            build_type);
+    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
 std::string common_params_get_system_info(const common_params & params) {
@@ -414,7 +384,7 @@ std::string common_params_get_system_info(const common_params & params) {
     if (params.cpuparams_batch.n_threads != -1) {
         os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
     }
-#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__)  // windows 7 and later
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
     // TODO: windows + arm64 + mingw64
     DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
     os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
@@ -435,9 +405,9 @@ std::string string_format(const char * fmt, ...) {
     va_start(ap, fmt);
     va_copy(ap2, ap);
     int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);  // NOLINT
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
     std::vector<char> buf(size + 1);
-    int               size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
     GGML_ASSERT(size2 == size);
     va_end(ap2);
     va_end(ap);
@@ -446,7 +416,7 @@ std::string string_format(const char * fmt, ...) {
 
 std::string string_strip(const std::string & str) {
     size_t start = 0;
-    size_t end   = str.size();
+    size_t end = str.size();
     while (start < end && std::isspace(str[start])) {
         start++;
     }
@@ -460,12 +430,12 @@ std::string string_get_sortable_timestamp() {
     using clock = std::chrono::system_clock;
 
     const clock::time_point current_time = clock::now();
-    const time_t            as_time_t    = clock::to_time_t(current_time);
-    char                    timestamp_no_ns[100];
+    const time_t as_time_t = clock::to_time_t(current_time);
+    char timestamp_no_ns[100];
     std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
 
-    const int64_t ns =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(current_time.time_since_epoch() % 1000000000).count();
+    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        current_time.time_since_epoch() % 1000000000).count();
     char timestamp_ns[11];
     snprintf(timestamp_ns, 11, "%09" PRId64, ns);
 
@@ -478,7 +448,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     }
     std::string builder;
     builder.reserve(s.length());
-    size_t pos      = 0;
+    size_t pos = 0;
     size_t last_pos = 0;
     while ((pos = s.find(search, last_pos)) != std::string::npos) {
         builder.append(s, last_pos, pos - last_pos);
@@ -490,7 +460,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 }
 
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
-    return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
 
 bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
@@ -535,13 +505,13 @@ std::string string_join(const std::vector<std::string> & values, const std::stri
 
 std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
     std::vector<std::string> parts;
-    size_t                   start = 0;
-    size_t                   end   = str.find(delimiter);
+    size_t start = 0;
+    size_t end = str.find(delimiter);
 
     while (end != std::string::npos) {
         parts.push_back(str.substr(start, end - start));
         start = end + delimiter.length();
-        end   = str.find(delimiter, start);
+        end = str.find(delimiter, start);
     }
 
     parts.push_back(str.substr(start));
@@ -625,10 +595,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 
         auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
-        buf << "\n"
-            << std::to_string(i) << ", token '" << detokenized << "'"
-            << ", pos " << std::to_string(batch.pos[i]) << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
-            << ", seq_id " << std::to_string(batch.seq_id[i][0]) << ", logits " << std::to_string(batch.logits[i]);
+        buf << "\n"          << std::to_string(i)
+            << ", token '"   << detokenized << "'"
+            << ", pos "      << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id "   << std::to_string(batch.seq_id[i][0])
+            << ", logits "   << std::to_string(batch.logits[i]);
     }
 
     buf << " ]";
@@ -637,36 +609,24 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 }
 
 void string_process_escapes(std::string & input) {
-    std::size_t input_len  = input.length();
+    std::size_t input_len = input.length();
     std::size_t output_idx = 0;
 
     for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
         if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
             switch (input[++input_idx]) {
-                case 'n':
-                    input[output_idx++] = '\n';
-                    break;
-                case 'r':
-                    input[output_idx++] = '\r';
-                    break;
-                case 't':
-                    input[output_idx++] = '\t';
-                    break;
-                case '\'':
-                    input[output_idx++] = '\'';
-                    break;
-                case '\"':
-                    input[output_idx++] = '\"';
-                    break;
-                case '\\':
-                    input[output_idx++] = '\\';
-                    break;
+                case 'n':  input[output_idx++] = '\n'; break;
+                case 'r':  input[output_idx++] = '\r'; break;
+                case 't':  input[output_idx++] = '\t'; break;
+                case '\'': input[output_idx++] = '\''; break;
+                case '\"': input[output_idx++] = '\"'; break;
+                case '\\': input[output_idx++] = '\\'; break;
                 case 'x':
                     // Handle \x12, etc
                     if (input_idx + 2 < input_len) {
-                        const char x[3]  = { input[input_idx + 1], input[input_idx + 2], 0 };
-                        char *     err_p = nullptr;
-                        const long val   = std::strtol(x, &err_p, 16);
+                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+                        char *err_p = nullptr;
+                        const long val = std::strtol(x, &err_p, 16);
                         if (err_p == x + 2) {
                             input_idx += 2;
                             input[output_idx++] = char(val);
@@ -674,10 +634,8 @@ void string_process_escapes(std::string & input) {
                         }
                     }
                     // fall through
-                default:
-                    input[output_idx++] = '\\';
-                    input[output_idx++] = input[input_idx];
-                    break;
+                default:   input[output_idx++] = '\\';
+                           input[output_idx++] = input[input_idx]; break;
             }
         } else {
             input[output_idx++] = input[input_idx];
@@ -699,11 +657,11 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
     sep++;
     if (strncmp(sep, "int:", 4) == 0) {
         sep += 4;
-        kvo.tag     = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
         kvo.val_i64 = std::atol(sep);
     } else if (strncmp(sep, "float:", 6) == 0) {
         sep += 6;
-        kvo.tag     = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
         kvo.val_f64 = std::atof(sep);
     } else if (strncmp(sep, "bool:", 5) == 0) {
         sep += 5;
@@ -790,16 +748,16 @@ bool fs_validate_filename(const std::string & filename) {
     // - Byte order mark (BOM)
     // - Illegal characters: / \ : * ? " < > |
     for (char32_t c : filename_utf32) {
-        if (c <= 0x1F                                         // Control characters (C0)
-            || c == 0x7F                                      // Control characters (DEL)
-            || (c >= 0x80 && c <= 0x9F)                       // Control characters (C1)
-            || c == 0xFF0E                                    // Fullwidth Full Stop (period equivalent)
-            || c == 0x2215                                    // Division Slash (forward slash equivalent)
-            || c == 0x2216                                    // Set Minus (backslash equivalent)
-            || (c >= 0xD800 && c <= 0xDFFF)                   // UTF-16 surrogate pairs
-            || c == 0xFFFD                                    // Replacement Character (UTF-8)
-            || c == 0xFEFF                                    // Byte Order Mark (BOM)
-            || c == '/' || c == '\\' || c == ':' || c == '*'  // Illegal characters
+        if (c <= 0x1F // Control characters (C0)
+            || c == 0x7F // Control characters (DEL)
+            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
+            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
+            || c == 0x2215 // Division Slash (forward slash equivalent)
+            || c == 0x2216 // Set Minus (backslash equivalent)
+            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
+            || c == 0xFFFD // Replacement Character (UTF-8)
+            || c == 0xFEFF // Byte Order Mark (BOM)
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
             || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
             return false;
         }
@@ -826,11 +784,12 @@ bool fs_validate_filename(const std::string & filename) {
 
 #include <iostream>
 
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
     std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring                                     wpath = converter.from_bytes(path);
+    std::wstring wpath = converter.from_bytes(path);
 
     // if the path already exists, check whether it's a directory
     const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -876,12 +835,12 @@ bool fs_create_directory_with_parents(const std::string & path) {
         return S_ISDIR(info.st_mode);
     }
 
-    size_t pos_slash = 1;  // skip leading slashes for directory creation
+    size_t pos_slash = 1; // skip leading slashes for directory creation
 
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
         const std::string subpath = path.substr(0, pos_slash);
-        struct stat       info;
+        struct stat info;
 
         // if the path already exists, ensure that it's a directory
         if (stat(subpath.c_str(), &info) == 0) {
@@ -900,12 +859,12 @@ bool fs_create_directory_with_parents(const std::string & path) {
     }
 
     return true;
-#endif  // _WIN32
+#endif // _WIN32
 }
 
 std::string fs_get_cache_directory() {
-    std::string cache_directory       = "";
-    auto        ensure_trailing_slash = [](std::string p) {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
         // Make sure to add trailing slash
         if (p.back() != DIRECTORY_SEPARATOR) {
             p += DIRECTORY_SEPARATOR;
@@ -921,24 +880,24 @@ std::string fs_get_cache_directory() {
         } else if (std::getenv("HOME")) {
             cache_directory = std::getenv("HOME") + std::string("/.cache/");
         } else {
-#    if defined(__linux__)
+#if defined(__linux__)
             /* no $HOME is defined, fallback to getpwuid */
-            struct passwd * pw = getpwuid(getuid());
+            struct passwd *pw = getpwuid(getuid());
             if ((!pw) || (!pw->pw_dir)) {
                 throw std::runtime_error("Failed to find $HOME directory");
             }
 
             cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
-#    else  /* defined(__linux__) */
+#else /* defined(__linux__) */
             throw std::runtime_error("Failed to find $HOME directory");
-#    endif /* defined(__linux__) */
+#endif /* defined(__linux__) */
         }
 #elif defined(__APPLE__)
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
 #else
-#    error Unknown architecture
+#  error Unknown architecture
 #endif
         cache_directory = ensure_trailing_slash(cache_directory);
         cache_directory += "llama.cpp";
@@ -949,7 +908,7 @@ std::string fs_get_cache_directory() {
 std::string fs_get_cache_file(const std::string & filename) {
     GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
     std::string cache_directory = fs_get_cache_directory();
-    const bool  success         = fs_create_directory_with_parents(cache_directory);
+    const bool success = fs_create_directory_with_parents(cache_directory);
     if (!success) {
         throw std::runtime_error("failed to create cache directory: " + cache_directory);
     }
@@ -958,9 +917,7 @@ std::string fs_get_cache_file(const std::string & filename) {
 
 std::vector<common_file_info> fs_list_files(const std::string & path) {
     std::vector<common_file_info> files;
-    if (path.empty()) {
-        return files;
-    }
+    if (path.empty()) return files;
 
     std::filesystem::path dir(path);
     if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
@@ -991,18 +948,19 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
     return files;
 }
 
+
 //
 // Model utils
 //
 
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
-    auto               mparams = common_model_params_to_llama(params);
+    auto mparams = common_model_params_to_llama(params);
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", __func__,
-                params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         return iparams;
     }
 
@@ -1012,8 +970,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR(
-            "%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
             __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
@@ -1025,12 +982,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.control_vectors.empty()) {
-        if (params.control_vector_layer_start <= 0) {
-            params.control_vector_layer_start = 1;
-        }
-        if (params.control_vector_layer_end <= 0) {
-            params.control_vector_layer_end = llama_model_n_layer(model);
-        }
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -1040,8 +993,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_apply_adapter_cvec(lctx, cvec.data.data(), cvec.data.size(), cvec.n_embd,
-                                           params.control_vector_layer_start, params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
@@ -1058,14 +1016,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             ok = false;
         }
 
-        bool has_eos           = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep           = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
         bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
         if (!has_eos && !has_sep && !has_rerank_prompt) {
-            LOG_WRN(
-                "%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n",
-                __func__);
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
             ok = false;
         } else if (!has_eos) {
             LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
@@ -1096,7 +1052,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         la.task_name = buf;
         llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
         la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora));  // copy to list of loaded adapters
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
@@ -1112,14 +1068,15 @@ struct common_init_result common_init_from_params(common_params & params) {
     for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
         if (llama_vocab_is_eog(vocab, i)) {
             LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({ i, -INFINITY });
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
 
     if (params.sampling.ignore_eos) {
         // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(params.sampling.logit_bias.end(), params.sampling.logit_bias_eog.begin(),
-                                          params.sampling.logit_bias_eog.end());
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
     }
 
     if (params.sampling.penalty_last_n == -1) {
@@ -1138,8 +1095,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_set_warmup(lctx, true);
 
         std::vector<llama_token> tmp;
-        llama_token              bos = llama_vocab_bos(vocab);
-        llama_token              eos = llama_vocab_eos(vocab);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
 
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
@@ -1179,14 +1136,12 @@ struct common_init_result common_init_from_params(common_params & params) {
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
-    const char * hf_endpoint_env    = getenv("HF_ENDPOINT");
-    const char * endpoint_env       = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
-    std::string  model_endpoint     = "https://huggingface.co/";
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
     if (endpoint_env) {
         model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') {
-            model_endpoint += '/';
-        }
+        if (model_endpoint.empty() || model_endpoint.back() != '/') model_endpoint += '/'; // back() on "" is UB
     }
     return model_endpoint;
 }
@@ -1212,7 +1167,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     }
 #ifdef GGML_USE_VULKAN
     else {
-        fprintf(stderr, "DEBUG: Entering Vulkan dynamic heuristic block\n");
         // Dynamic VRAM heuristic
         int n_gpu_layers = 0;
 
@@ -1221,9 +1175,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 
         // Get available VRAM
         size_t free, total;
-        fprintf(stderr, "DEBUG: Calling ggml_backend_vk_get_device_memory\n");
         ggml_backend_vk_get_device_memory(params.main_gpu, &free, &total);
-        fprintf(stderr, "DEBUG: Memory queried: free=%zu, total=%zu\n", free, total);
 
         // Parse GGUF to get model info
         struct gguf_init_params gguf_params = {
@@ -1233,11 +1185,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         struct gguf_context * ctx = gguf_init_from_file(params.model.path.c_str(), gguf_params);
 
         if (ctx) {
-            int n_layers     = -1;
-            int n_embd_k_gqa = -1;
-            int n_embd_v_gqa = -1;
+            int n_layers = -1;
 
-            // Find block count and embedding dimensions from GGUF metadata
+            // Find block count from GGUF metadata
             int n_kv = gguf_get_n_kv(ctx);
             for (int i = 0; i < n_kv; i++) {
                 const char * key = gguf_get_key(ctx, i);
@@ -1249,25 +1199,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
                 if (key_len >= suffix_len && strcmp(key + key_len - suffix_len, suffix) == 0) {
                     n_layers = gguf_get_val_u32(ctx, i);
                 }
-
-                // Find embedding dimensions for KV cache calculation
-                if (strstr(key, ".embedding_length") || strstr(key, ".n_embd_k_gqa")) {
-                    n_embd_k_gqa = gguf_get_val_u32(ctx, i);
-                }
-                if (strstr(key, ".n_embd_v_gqa")) {
-                    n_embd_v_gqa = gguf_get_val_u32(ctx, i);
-                }
-            }
-
-            // If we didn't find GQA dimensions, use embedding_length for both
-            if (n_embd_k_gqa == -1 && n_embd_v_gqa == -1) {
-                for (int i = 0; i < n_kv; i++) {
-                    const char * key = gguf_get_key(ctx, i);
-                    if (strstr(key, ".embedding_length")) {
-                        n_embd_k_gqa = n_embd_v_gqa = gguf_get_val_u32(ctx, i);
-                        break;
-                    }
-                }
             }
 
             if (n_layers > 0) {
@@ -1337,8 +1268,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.tensor_buft_overrides.empty()) {
         mparams.tensor_buft_overrides = NULL;
     } else {
-        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr &&
-                    "Tensor buffer overrides not terminated with empty pattern");
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }
 
@@ -1351,13 +1281,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
-    cparams.n_ctx     = params.n_ctx;
-    cparams.n_seq_max = params.n_parallel;
-    cparams.n_batch   = params.n_batch;
-    cparams.n_ubatch  = params.n_ubatch;
-    cparams.n_threads = params.cpuparams.n_threads;
-    cparams.n_threads_batch =
-        params.cpuparams_batch.n_threads == -1 ? params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+    cparams.n_ctx             = params.n_ctx;
+    cparams.n_seq_max         = params.n_parallel;
+    cparams.n_batch           = params.n_batch;
+    cparams.n_ubatch          = params.n_ubatch;
+    cparams.n_threads         = params.cpuparams.n_threads;
+    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
+                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
@@ -1387,7 +1317,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
     struct ggml_threadpool_params tpp;
 
-    ggml_threadpool_params_init(&tpp, params.n_threads);  // setup the defaults
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
 
     if (params.mask_valid) {
         std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
@@ -1408,20 +1338,21 @@ void common_batch_clear(struct llama_batch & batch) {
     batch.n_tokens = 0;
 }
 
-void common_batch_add(struct llama_batch &              batch,
-                      llama_token                       id,
-                      llama_pos                         pos,
-                      const std::vector<llama_seq_id> & seq_ids,
-                      bool                              logits) {
+void common_batch_add(
+                 struct llama_batch & batch,
+                        llama_token   id,
+                          llama_pos   pos,
+    const std::vector<llama_seq_id> & seq_ids,
+                               bool   logits) {
     GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
 
-    batch.token[batch.n_tokens]    = id;
-    batch.pos[batch.n_tokens]      = pos;
+    batch.token   [batch.n_tokens] = id;
+    batch.pos     [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = seq_ids.size();
     for (size_t i = 0; i < seq_ids.size(); ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
     }
-    batch.logits[batch.n_tokens] = logits;
+    batch.logits  [batch.n_tokens] = logits;
 
     batch.n_tokens++;
 }
@@ -1432,8 +1363,7 @@ void common_batch_add(struct llama_batch &              batch,
 
 size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
     size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {
-    }
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 
     return i;
 }
@@ -1491,32 +1421,31 @@ size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
 // Vocab utils
 //
 
-std::vector<llama_token> common_tokenize(const struct llama_context * ctx,
-                                         const std::string &          text,
-                                         bool                         add_special,
-                                         bool                         parse_special) {
+std::vector<llama_token> common_tokenize(
+  const struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_special,
+                        bool   parse_special) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
     return common_tokenize(vocab, text, add_special, parse_special);
 }
 
-std::vector<llama_token> common_tokenize(const struct llama_vocab * vocab,
-                                         const std::string &        text,
-                                         bool                       add_special,
-                                         bool                       parse_special) {
+std::vector<llama_token> common_tokenize(
+    const struct llama_vocab * vocab,
+           const std::string & text,
+                        bool   add_special,
+                        bool   parse_special) {
     // upper limit for the number of tokens
-    int                      n_tokens = text.length() + 2 * add_special;
+    int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens =
-        llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens == std::numeric_limits<int32_t>::min()) {
-        throw std::runtime_error(
-            "Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
     }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check =
-            llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1538,7 +1467,8 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token
         piece.resize(-n_chars);
         int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
-    } else {
+    }
+    else {
         piece.resize(n_chars);
     }
 
@@ -1554,14 +1484,11 @@ std::string common_detokenize(const struct llama_context * ctx, const std::vecto
 std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(),
-                                       false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(),
-                                   false, special);
-        GGML_ASSERT(n_chars <=
-                    (int32_t) text.size());  // whitespace trimming is performed after per-token detokenization
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
     text.resize(n_chars);
@@ -1578,24 +1505,24 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
     double sum = 0.0;
 
     switch (embd_norm) {
-        case -1:  // no normalisation
+        case -1: // no normalisation
             sum = 1.0;
             break;
-        case 0:  // max absolute
+        case 0: // max absolute
             for (int i = 0; i < n; i++) {
                 if (sum < std::abs(inp[i])) {
                     sum = std::abs(inp[i]);
                 }
             }
-            sum /= 32760.0;  // make an int16 range
+            sum /= 32760.0; // make an int16 range
             break;
-        case 2:              // euclidean
+        case 2: // euclidean
             for (int i = 0; i < n; i++) {
                 sum += inp[i] * inp[i];
             }
             sum = std::sqrt(sum);
             break;
-        default:  // p-norm (euclidean is p-norm p=2)
+        default: // p-norm (euclidean is p-norm p=2)
             for (int i = 0; i < n; i++) {
                 sum += std::pow(std::abs(inp[i]), embd_norm);
             }
@@ -1610,13 +1537,13 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
     }
 }
 
-float common_embd_similarity_cos(const float * embd1, const float * embd2, int n) {
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
     double sum  = 0.0;
     double sum1 = 0.0;
     double sum2 = 0.0;
 
     for (int i = 0; i < n; i++) {
-        sum += embd1[i] * embd2[i];
+        sum  += embd1[i] * embd2[i];
         sum1 += embd1[i] * embd1[i];
         sum2 += embd2[i] * embd2[i];
     }
@@ -1624,7 +1551,7 @@ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n
     // Handle the case where one or both vectors are zero vectors
     if (sum1 == 0.0 || sum2 == 0.0) {
         if (sum1 == 0.0 && sum2 == 0.0) {
-            return 1.0f;  // two zero vectors are similar
+            return 1.0f; // two zero vectors are similar
         }
         return 0.0f;
     }
@@ -1639,7 +1566,7 @@ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n
 static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
     common_control_vector_data result = { -1, {} };
 
-    ggml_context *          ctx              = nullptr;
+    ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ false,
         /* .ctx      = */ &ctx,
@@ -1694,8 +1621,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
         if (result.n_embd == -1) {
             result.n_embd = ggml_nelements(tensor);
         } else if (ggml_nelements(tensor) != result.n_embd) {
-            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__,
-                    load_info.fname.c_str());
+            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
             result.n_embd = -1;
             break;
         }
@@ -1704,10 +1630,11 @@ static common_control_vector_data common_control_vector_load_one(const common_co
         result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
 
         const float * src = (const float *) tensor->data;
-        float *       dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
         for (int j = 0; j < result.n_embd; j++) {
             dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
         }
+
     }
 
     if (result.n_embd == -1) {
@@ -1755,20 +1682,18 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
 
-ggml_opt_dataset_t common_opt_dataset_init(struct llama_context *           ctx,
-                                           const std::vector<llama_token> & tokens,
-                                           int64_t                          stride) {
-    const int64_t      ne_datapoint = llama_n_ctx(ctx);
-    const int64_t      ndata        = (tokens.size() - ne_datapoint - 1) / stride;
-    ggml_opt_dataset_t result =
-        ggml_opt_dataset_init(GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/1);
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
 
     llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
     llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
 
     for (int64_t idata = 0; idata < ndata; ++idata) {
-        memcpy(data + idata * ne_datapoint, tokens.data() + idata * stride + 0, ne_datapoint * sizeof(llama_token));
-        memcpy(labels + idata * ne_datapoint, tokens.data() + idata * stride + 1, ne_datapoint * sizeof(llama_token));
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
     }
 
     return result;
@@ -1783,13 +1708,13 @@ ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
 }
 
 // TODO make all command line args case-insensitive
-static inline bool eq_case_insensitive(const char * a, const char * b) {
+static inline bool eq_case_insensitive(char const* a, char const* b) {
     return !
 #if defined(_MSC_VER)
         _stricmp
 #else
         strcasecmp
-#endif  // defined(_MSC_VER)
+#endif // defined(_MSC_VER)
         (a, b);
 }
 
@@ -1804,7 +1729,7 @@ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
 }
 
 // TODO simplify to use just log and exp
-static const float k_log_2 = std::log(2.f);
+static float const k_log_2 = std::log(2.f);
 
 void lr_opt::init() {
     if (lr_min > 0 && lr_min < lr0) {
@@ -1820,7 +1745,9 @@ void lr_opt::init() {
 }
 
 float lr_opt::get_lr(float epoch) const {
-    float r = lr_min <= 0 ? lr0 : epoch >= decay_epochs ? lr_min : lr0 * std::pow(0.5f, epoch * scale_epoch);
+    float r = lr_min <= 0 ? lr0 :
+        epoch >= decay_epochs ? lr_min :
+        lr0 * std::pow(0.5f, epoch * scale_epoch);
     LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
     return r;
 }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index bc8d3cdcb5..cf37ce6d84 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13303,6 +13303,59 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
     }
 }
 
+// Returns a snapshot of Vulkan device `device`: name, vendor/device IDs, API version,
+// total memory as reported by ggml_backend_vk_get_device_memory, and capability flags.
+ggml_vk_device_info ggml_backend_vk_get_device_info(int device) {
+    // Reject negative indices too: they would pass the upper-bound checks and index out of range.
+    GGML_ASSERT(device >= 0 && device < (int) vk_instance.device_indices.size());
+    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
+
+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+    vk::PhysicalDeviceProperties props = vkdev.getProperties();
+
+    ggml_vk_device_info info = {};
+    snprintf(info.device_name, sizeof(info.device_name), "%s", props.deviceName.data());
+    info.vendor_id = props.vendorID;
+    info.device_id = props.deviceID;
+    info.api_version = props.apiVersion;
+
+    // Only the total is stored; `free` exists because the query takes both out-parameters.
+    size_t free = 0, total = 0;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    info.total_device_local_memory = total;
+    info.has_memory_budget_ext = vk_instance.device_supports_membudget[device];
+
+    // Value-initialize both structs so no field (pNext included) is read while indeterminate,
+    // even if the driver does not fill in the chained Vulkan 1.1 feature struct.
+    VkPhysicalDeviceVulkan11Features vk11_features = {};
+    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+    VkPhysicalDeviceFeatures2 device_features2 = {};
+    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+    device_features2.pNext = &vk11_features;
+    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
+    info.supports_16bit_storage = vk11_features.storageBuffer16BitAccess;
+
+    // Reports whether VK_KHR_shader_float16_int8 is advertised; the shaderFloat16 feature bit is not queried.
+    info.supports_float16 = false;
+    for (const auto & ext : vkdev.enumerateDeviceExtensionProperties()) {
+        if (strcmp("VK_KHR_shader_float16_int8", ext.extensionName) == 0) {
+            info.supports_float16 = true;
+            break;
+        }
+    }
+
+    return info;
+}
+
+// Compatibility stub: ignores both arguments and always returns 0.
+// The default case (n_gpu_layers = -1) is handled by the dynamic heuristic in common.cpp,
+// so a direct call that bypasses that logic gets 0 to stay on the safe side.
+int ggml_backend_vk_get_default_gpu_layers(int device, int default_layers) {
+    static_cast<void>(device);
+    static_cast<void>(default_layers);
+    return 0;
+}
+
 static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
     GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());