move llama_context_device_memory declaration to llama-ext.h

This commit is contained in:
Ruben Ortlam 2026-04-02 11:39:07 +02:00
parent 7e10ec8ff2
commit 7666cacf28
3 changed files with 8 additions and 6 deletions

View File

@ -1547,12 +1547,6 @@ extern "C" {
// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
// Returns the projected memory use (model + context + compute) in bytes
// for the given device within this context. Returns 0 if the device is not used.
LLAMA_API uint64_t llama_context_device_memory(
const struct llama_context * ctx,
ggml_backend_dev_t device);
//
// training
//

View File

@ -54,3 +54,9 @@ LLAMA_API void llama_quant_compute_types(
ggml_tensor ** tensors,
ggml_type * result_types,
size_t n_tensors);
// Returns the projected memory use (model + context + compute) in bytes
// for the given device within this context. Returns 0 if the device is not used.
LLAMA_API uint64_t llama_context_device_memory(
const struct llama_context * ctx,
ggml_backend_dev_t device);

View File

@ -7,6 +7,8 @@
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
#include <sheredom/subprocess.h>
#include "../../src/llama-ext.h"
#include <functional>
#include <algorithm>
#include <thread>