From 557515be1e93ed8939dd8a7c7d08765fdbe8be31 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 23 Jan 2026 18:22:34 +0200
Subject: [PATCH] graph : utilize `ggml_build_forward_select()` to avoid
 reallocations (#18898)

* graph : avoid branches between embedding and token inputs

* models : make deepstack graphs (e.g. Qwen3 VL) have constant topology

* ci : enable -DGGML_SCHED_NO_REALLOC=ON for server CI

* cont : pad token embeddings to n_embd_inp
---
 .github/workflows/server.yml |  4 +--
 src/llama-context.cpp        |  2 +-
 src/llama-graph.cpp          | 65 ++++++++++++++++++++++++++----------
 src/llama-graph.h            |  9 +++--
 src/models/gemma3n-iswa.cpp  |  4 +--
 src/models/qwen3vl-moe.cpp   | 19 +++-------
 src/models/qwen3vl.cpp       | 19 +++-------
 7 files changed, 69 insertions(+), 53 deletions(-)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 9d9d6884d4..9f1ef48c82 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -72,7 +72,7 @@ jobs:
       - name: Build
        id: cmake_build
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
           cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
 
       - name: Python setup
@@ -108,7 +108,7 @@ jobs:
       - name: Build
        id: cmake_build
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
 
       - name: Python setup
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a6d5ddfa33..a35cf5a94b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2903,7 +2903,7 @@ void llama_context::opt_epoch_iter(
            };
            ctx_compute_opt = ggml_init(params);
        }
-        ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+        ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
        ggml_opt_alloc(opt_ctx, train);
 
        res->set_inputs(&ubatch);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 5ebd0cf8aa..f9ed87cce1 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -23,7 +23,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    }
 
    if (ubatch->embd) {
-        const int64_t n_embd = embd->ne[0];
+        GGML_ASSERT(n_embd == embd->ne[0]);
+
        const int64_t n_tokens = ubatch->n_tokens;
 
        ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -33,8 +34,8 @@
 bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
    bool res = true;
 
-    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
+    res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
+    res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
 
    return res;
 }
@@ -634,7 +635,8 @@ int64_t llm_graph_result::get_max_nodes() const {
 }
 
 void llm_graph_result::reset() {
-    t_tokens = nullptr;
+    t_inp_tokens = nullptr;
+    t_inp_embd = nullptr;
    t_logits = nullptr;
    t_embd = nullptr;
    t_embd_pooled = nullptr;
@@ -1338,17 +1340,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd = hparams.n_embd_inp();
+    const int64_t n_embd_inp = hparams.n_embd_inp();
+    const int64_t n_embd     = hparams.n_embd;
 
-    auto inp = std::make_unique<llm_graph_input_embd>();
+    assert(n_embd_inp >= n_embd);
 
-    ggml_tensor * cur = nullptr;
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
 
-    if (ubatch.token) {
-        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
-        //cb(inp->tokens, "inp_tokens", -1);
-        ggml_set_input(inp->tokens);
-        res->t_tokens = inp->tokens;
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+    cb(inp->tokens, "inp_tokens", -1);
+    ggml_set_input(inp->tokens);
+    res->t_inp_tokens = inp->tokens;
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
+    cb(inp->embd, "inp_embd", -1);
+    ggml_set_input(inp->embd);
+
+    // select one of the 2 inputs, based on the batch contents
+    // ref: https://github.com/ggml-org/llama.cpp/pull/18550
+    std::array<ggml_tensor *, 2> inps;
+
+    // token embeddings path (ubatch.token != nullptr)
+    {
+        auto & cur = inps[0];
 
        cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
@@ -1369,19 +1383,36 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
            cur = ggml_add(ctx0, cur, inpL_delta);
        }
-    } else {
-        inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
-        ggml_set_input(inp->embd);
+
+        if (n_embd_inp != n_embd) {
+            cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0);
+        }
+    }
+
+    // vector embeddings path (ubatch.embd != nullptr)
+    {
+        auto & cur = inps[1];
 
        cur = inp->embd;
    }
 
+    assert(ggml_are_same_shape (inps[0], inps[1]));
+    assert(ggml_are_same_stride(inps[0], inps[1]));
+
+    ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
+
+    if (n_embd_inp != n_embd) {
+        cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
+    }
+
+    res->t_inp_embd = cur;
+
    // For Granite architecture
    if (hparams.f_embedding_scale != 0.0f) {
        cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
    }
 
-    cb(cur, "inp_embd", -1);
+    cb(cur, "embd", -1);
 
    res->add_input(std::move(inp));
@@ -1480,7 +1511,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
    //}
 
    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
-    const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
    ggml_set_input(cur);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 93d32522d1..242a046d56 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -106,7 +106,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
-    llm_graph_input_embd() = default;
+    llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
    virtual ~llm_graph_input_embd() = default;
 
    void set_input(const llama_ubatch * ubatch) override;
@@ -115,6 +115,8 @@ public:
 
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+
+    const int64_t n_embd = 0;
 };
 
 class llm_graph_input_pos : public llm_graph_input_i {
@@ -566,7 +568,7 @@ public:
 
    virtual ~llm_graph_result() = default;
 
-    ggml_tensor * get_tokens() const { return t_tokens; }
+    ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
    ggml_tensor * get_logits() const { return t_logits; }
    ggml_tensor * get_embd() const { return t_embd; }
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@@ -593,7 +595,8 @@ public:
    void set_params(const llm_graph_params & params);
 
    // important graph nodes
-    ggml_tensor * t_tokens = nullptr;
+    ggml_tensor * t_inp_tokens = nullptr;
+    ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
    ggml_tensor * t_logits = nullptr;
    ggml_tensor * t_embd = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp
index 51acab1490..7db6d3bf4e 100644
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@@ -245,12 +245,12 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_altup, n_layer, n_tokens]
 ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
-    auto inp = std::make_unique<llm_graph_input_embd>();
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
    ggml_tensor * inp_per_layer;
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        ggml_set_input(inp->tokens);
-        res->t_tokens = inp->tokens;
+        res->t_inp_tokens = inp->tokens;
        inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp
index f72f80a837..e5e1a2150c 100644
--- a/src/models/qwen3vl-moe.cpp
+++ b/src/models/qwen3vl-moe.cpp
@@ -2,7 +2,8 @@
 
 llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = hparams.n_embd;
+
+    const int64_t n_embd = hparams.n_embd;
    const int64_t n_embd_head = hparams.n_embd_head_v;
 
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -16,17 +17,6 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
-    std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
-
-    if (ubatch.embd) {
-        // Image input: split main embd and deepstack embds
-        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
-        for (size_t i = 0; i < n_deepstack_layers; i++) {
-            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
-        }
-        inpL = inpL_main;
-    }
-
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
 
@@ -120,8 +110,9 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);
 
-        if (ubatch.embd && (size_t)il < n_deepstack_layers) {
-            cur = ggml_add(ctx0, cur, deepstack_features[il]);
+        if (il < (int) n_deepstack_layers) {
+            ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
+            cur = ggml_add(ctx0, cur, ds);
            cb(cur, "deepstack_out", il);
        }
 
diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp
index 0bae52239c..0f8315b324 100644
--- a/src/models/qwen3vl.cpp
+++ b/src/models/qwen3vl.cpp
@@ -2,7 +2,8 @@
 
 llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = hparams.n_embd;
+
+    const int64_t n_embd = hparams.n_embd;
    const int64_t n_embd_head = hparams.n_embd_head_v;
 
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -16,17 +17,6 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
-    std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
-
-    if (ubatch.embd) {
-        // Image input: split main embd and deepstack embds
-        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
-        for (size_t i = 0; i < n_deepstack_layers; i++) {
-            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
-        }
-        inpL = inpL_main;
-    }
-
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
 
@@ -113,8 +103,9 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);
 
-        if (ubatch.embd && (size_t)il < n_deepstack_layers) {
-            cur = ggml_add(ctx0, cur, deepstack_features[il]);
+        if (il < (int) n_deepstack_layers) {
+            ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
+            cur = ggml_add(ctx0, cur, ds);
            cb(cur, "deepstack_out", il);
        }
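
Note (not part of the patch): a minimal sketch of the selection pattern that build_inp_embd() introduces above, assuming only the ggml_build_forward_select() signature as it appears in the hunk (graph, candidate nodes, number of candidates, index of the active one). The helper name build_input_select and its parameters are hypothetical; in the patch itself the two paths additionally apply lora deltas and pad the token path to n_embd_inp before the select.

    #include <array>
    #include "ggml.h"

    // Hypothetical helper illustrating the constant-topology input selection:
    // both the token path and the vector-embeddings path are always added to
    // the graph, and ggml_build_forward_select() picks one per ubatch, so the
    // graph shape never changes and the scheduler does not have to reallocate.
    static ggml_tensor * build_input_select(
            ggml_context * ctx, ggml_cgraph * gf,
            ggml_tensor * tok_embd,   // F32 [n_embd, n_vocab] token embedding matrix
            ggml_tensor * inp_tokens, // I32 [n_tokens]
            ggml_tensor * inp_embd,   // F32 [n_embd, n_tokens]
            bool have_tokens) {
        std::array<ggml_tensor *, 2> inps;

        inps[0] = ggml_get_rows(ctx, tok_embd, inp_tokens); // token path
        inps[1] = inp_embd;                                  // vector embeddings path

        // both candidates must have the same shape and stride (see the asserts
        // in the hunk above); only the selected one feeds the rest of the graph
        return ggml_build_forward_select(gf, inps.data(), inps.size(), have_tokens ? 0 : 1);
    }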