move allocation workaround out of ggml-alloc.c

Johannes Gäßler 2026-02-11 15:21:58 +01:00
parent 4dc3d10e80
commit 76d9439276
6 changed files with 16 additions and 10 deletions


@@ -261,6 +261,9 @@ extern "C" {
     GGML_API enum ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
 
+    // temporary workaround to statically allocate tensors from a context in a deduplicated way:
+    GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+
     //
     // Backend registry
     //

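For reference, the llama.cpp call sites below all switch to the same pattern: check whether the buffer type is a meta buffer type and, if so, call the new dedicated entry point instead of the regular allocator from ggml-alloc.c. A minimal sketch in C (illustrative only, not part of the commit; the helper name alloc_ctx_tensors is made up, error handling is omitted, and the meta helpers are assumed to be visible via ggml-backend.h):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// choose the allocation entry point based on the buffer type
static ggml_backend_buffer_t alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    if (ggml_backend_buft_is_meta(buft)) {
        // meta buffer types go through the dedicated workaround entry point
        return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
    }
    // all other buffer types keep using the regular allocator from ggml-alloc.c
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}

This keeps ggml-alloc.c free of meta-specific branching while each caller decides which allocator applies.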

@@ -1,6 +1,5 @@
 #include "ggml-alloc.h"
 #include "ggml-backend-impl.h"
-#include "ggml-backend.h"
 #include "ggml.h"
 #include "ggml-impl.h"
 #include <assert.h>
@@ -1241,9 +1240,6 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    if (ggml_backend_buft_is_meta(buft)) {
-        return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
-    }
     size_t nbytes_total = 0;
     return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
 }


@@ -254,9 +254,6 @@ extern "C" {
 # define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
 #endif
 
-    // temporary workaround to statically allocate tensors from a context in a deduplicated way:
-    GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-
 #ifdef __cplusplus
 }
 #endif


@@ -187,7 +187,11 @@ llama_kv_cache::llama_kv_cache(
                 t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
             }
         } else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+            if (ggml_backend_buft_is_meta(buft)) {
+                buf = ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+            }
         }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");


@@ -1,5 +1,6 @@
 #include "llama-memory-recurrent.h"
 
+#include "ggml-backend.h"
 #include "llama-impl.h"
 #include "llama-io.h"
 #include "llama-batch.h"
@@ -101,7 +102,8 @@ llama_memory_recurrent::llama_memory_recurrent(
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        ggml_backend_buffer_t buf = ggml_backend_buft_is_meta(buft) ?
+            ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx.get(), buft) : ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for rs cache");
         }


@@ -7504,7 +7504,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
             }
         } else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            if (ggml_backend_buft_is_meta(buft)) {
+                buf = ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            }
         }
         if (buf == nullptr) {
             throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));