From 765998f2d7cb9bf539da464f0f5d9dd6e528d2ae Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 26 Feb 2026 11:16:23 +0100 Subject: [PATCH] llama : enable static graph for multiple sampling outputs per sequence This commit makes the computation graph static when backend samplers process multiple outputs per sequence. Previously, each sampler was added to the graph once per output row it had in the current batch (inactive samplers, those without outputs in the current batch, were added once on the first row), so the shape of the graph depended on which samplers were active. This could cause graph reallocations if different samplers became active/inactive across batches, even when the number of outputs remained constant. Now every sampler is always built for max_outputs outputs; slots without a real output read row 0 and are not recorded in the results. --- src/llama-context.cpp | 3 --- src/llama-graph.cpp | 31 ++++++++++++++++++++----------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e9f000bf2b..60b980ff6e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1331,7 +1331,6 @@ static void copy_tensor_async_ints( } const std::vector & rows = it->second; - GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); for (size_t i = 0; i < tensors.size(); ++i) { const uint32_t row = rows[i]; @@ -1364,7 +1363,6 @@ static void copy_tensor_async_floats( } const std::vector & rows = it->second; - GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); for (size_t i = 0; i < tensors.size(); ++i) { const uint32_t row = rows[i]; @@ -1401,7 +1399,6 @@ static void copy_tensor_async_candidates( } const std::vector & rows = it->second; - GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); for (size_t i = 0; i < tensors.size(); ++i) { const uint32_t row = rows[i]; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 63e0276c83..1baafb0d82 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2605,17 +2605,18 @@ void llm_graph_context::build_sampling() const { for (const auto & [seq_id, sampler] : samplers) { const auto row_it = 
seq_to_logit_rows.find(seq_id); + const bool sampler_is_active = row_it != seq_to_logit_rows.end(); - // row_it is now a sequence id to list of row ids - static const std::vector default_row = {0}; - const std::vector & logit_rows = row_it != seq_to_logit_rows.end() ? row_it->second : default_row; - for (const int32_t row_idx : logit_rows) { + // Always build samplers for all possible outputs even if the sampler is + // not active (the sampler's sequence id is not in the current ubatch). + for (uint32_t i = 0; i < max_outputs; ++i) { + const bool real_output = sampler_is_active && i < row_it->second.size(); - // inactive samplers always work on the first row - const int i_out = row_it != seq_to_logit_rows.end() ? 1 : 0; + const int32_t row_idx = real_output ? row_it->second[i] : 0; + const int i_out = real_output ? 1 : 0; ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]); - ggml_format_name(logits_seq, "logits_seq_%d", seq_id); + ggml_format_name(logits_seq, "logits_seq_%d_%d", seq_id, i); struct llama_sampler_data data = { /*.logits =*/ logits_seq, @@ -2628,25 +2629,33 @@ void llm_graph_context::build_sampling() const { sampler->iface->backend_apply(sampler, ctx0, gf, &data); if (data.sampled != nullptr) { - res->t_sampled[seq_id].push_back(data.sampled); + if (real_output) { + res->t_sampled[seq_id].push_back(data.sampled); + } outs[1] = data.sampled; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.probs != nullptr) { - res->t_sampled_probs[seq_id].push_back(data.probs); + if (real_output) { + res->t_sampled_probs[seq_id].push_back(data.probs); + } outs[1] = data.probs; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.logits != nullptr) { - res->t_sampled_logits[seq_id].push_back(data.logits); + if (real_output) { + res->t_sampled_logits[seq_id].push_back(data.logits); + } outs[1] = data.logits; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } 
if (data.candidates != nullptr) { - res->t_candidates[seq_id].push_back(data.candidates); + if (real_output) { + res->t_candidates[seq_id].push_back(data.candidates); + } outs[1] = data.candidates; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); }