From 7666cacf28591c0179c499d1525121e7406b58e5 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Thu, 2 Apr 2026 11:39:07 +0200
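Subject: [PATCH] move llama_context_device_memory function to llama-ext.h

Remove the llama_context_device_memory() declaration from the public
header include/llama.h and declare it in src/llama-ext.h instead. The
prototype and its comment are unchanged.

tools/server/server-models.cpp now includes src/llama-ext.h through a
relative path.
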
---
 include/llama.h                | 6 ------
 src/llama-ext.h                | 6 ++++++
 tools/server/server-models.cpp | 2 ++
 3 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index de7c0670f5..a940f9d648 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1547,12 +1547,6 @@ extern "C" {
     // print a breakdown of per-device memory use via LLAMA_LOG:
     LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
 
-    // Returns the projected memory use (model + context + compute) in bytes
-    // for the given device within this context. Returns 0 if the device is not used.
-    LLAMA_API uint64_t llama_context_device_memory(
-            const struct llama_context * ctx,
-            ggml_backend_dev_t           device);
-
     //
     // training
     //
diff --git a/src/llama-ext.h b/src/llama-ext.h
index 2ffb77934e..ee6ff27be3 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -54,3 +54,9 @@ LLAMA_API void llama_quant_compute_types(
         ggml_tensor ** tensors,
         ggml_type * result_types,
         size_t n_tensors);
+
+// Returns the projected memory use (model + context + compute) in bytes
+// for the given device within this context. Returns 0 if the device is not used.
+LLAMA_API uint64_t llama_context_device_memory(
+        const struct llama_context * ctx,
+        ggml_backend_dev_t           device);
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index bb41f205a9..fe039b03ee 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -7,6 +7,8 @@
 #include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
 #include <sheredom/subprocess.h>
 
+#include "../../src/llama-ext.h"
+
 #include <functional>
 #include <algorithm>
 #include <thread>