move allocation workaround out of ggml-alloc.c
commit 76d9439276
parent 4dc3d10e80
@@ -261,6 +261,9 @@ extern "C" {
 
     GGML_API enum ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
 
+    // temporary workaround to statically allocate tensors from a context in a deduplicated way:
+    GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+
     //
     // Backend registry
     //
@@ -1,6 +1,5 @@
 #include "ggml-alloc.h"
 #include "ggml-backend-impl.h"
-#include "ggml-backend.h"
 #include "ggml.h"
 #include "ggml-impl.h"
 #include <assert.h>
@@ -1241,9 +1240,6 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    if (ggml_backend_buft_is_meta(buft)) {
-        return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
-    }
     size_t nbytes_total = 0;
     return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
 }
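With the special case removed here, a caller that may see either a regular or a meta buffer type has to choose the allocator itself, which is exactly the pattern the llama.cpp hunks below adopt. A minimal caller-side sketch, assuming only the API visible in this diff (the helper name alloc_ctx_tensors_any is hypothetical):

    #include "ggml.h"
    #include "ggml-backend.h"

    // Hypothetical helper mirroring the dispatch that call sites now perform themselves.
    static ggml_backend_buffer_t alloc_ctx_tensors_any(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
        if (ggml_backend_buft_is_meta(buft)) {
            // deduplicated static allocation for meta buffer types (temporary workaround)
            return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
        }
        // regular static allocation for every other buffer type
        return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    }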
@@ -254,9 +254,6 @@ extern "C" {
 #    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
 #endif
 
-    // temporary workaround to statically allocate tensors from a context in a deduplicated way:
-    GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-
 #ifdef __cplusplus
 }
 #endif
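Together with the first hunk, the declaration now lives only in the public backend header, so code outside ggml no longer needs the backend-impl header to reach the workaround. A rough sketch of such a translation unit, under that assumption (ctx and meta_buft are placeholders obtained elsewhere):

    #include "ggml.h"
    #include "ggml-backend.h" // declares ggml_backend_meta_alloc_ctx_tensors_from_buft after this change

    // Hypothetical usage: statically allocate all tensors of ctx for a meta buffer type.
    int alloc_meta_ctx(struct ggml_context * ctx, ggml_backend_buffer_type_t meta_buft) {
        struct ggml_backend_buffer * buf = ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, meta_buft);
        return buf != NULL; // 0 on allocation failure
    }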
@@ -187,7 +187,11 @@ llama_kv_cache::llama_kv_cache(
                 t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
             }
         } else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+            if (ggml_backend_buft_is_meta(buft)) {
+                buf = ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+            }
         }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
@@ -1,5 +1,6 @@
 #include "llama-memory-recurrent.h"
 
+#include "ggml-backend.h"
 #include "llama-impl.h"
 #include "llama-io.h"
 #include "llama-batch.h"
@@ -101,7 +102,8 @@ llama_memory_recurrent::llama_memory_recurrent(
 
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        ggml_backend_buffer_t buf = ggml_backend_buft_is_meta(buft) ?
+            ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx.get(), buft) : ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for rs cache");
         }
@@ -7504,7 +7504,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
                 }
             } else {
-                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+                if (ggml_backend_buft_is_meta(buft)) {
+                    buf = ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+                } else {
+                    buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+                }
+            }
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));