From 1138d5c2d95515a09d87978a561141636dd3562a Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 23 Feb 2026 14:31:29 +0100 Subject: [PATCH 1/5] sampling : support multiple outputs per sequence This commit adds support for multiple outputs per sequence in the backend sampling implementation. The main motivation for this change is to be able to support speculative decoding using backend samplers where multiple outputs for the same sequence would be needed. --- src/llama-context.cpp | 108 ++++++++++++++++----------------- src/llama-graph.cpp | 99 ++++++++++++++++-------------- src/llama-graph.h | 8 +-- tests/test-backend-sampler.cpp | 29 ++++++--- 4 files changed, 129 insertions(+), 115 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 98d055d34e..f8c3844539 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1296,8 +1296,8 @@ int llama_context::encode(const llama_batch & batch_inp) { return 0; } -static std::map build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) { - std::map seq_to_row; +static std::map> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) { + std::map> seq_to_row; // how many output tokens we have seen so far for this ubatch. uint32_t local = 0; for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { @@ -1308,96 +1308,114 @@ static std::map build_seq_to_output_row(const llama_ubat const llama_seq_id seq_id = ubatch.seq_id[i][0]; // row_offset is the number of output tokens before this ubatch. 
- seq_to_row[seq_id] = row_offset + local; + seq_to_row[seq_id].push_back(row_offset + local); ++local; } return seq_to_row; } static void copy_tensor_async_ints( - const std::map & tensor_map, + const std::map> & tensor_map, const buffer_view & sampled, - const std::map & seq_to_row, + const std::map> & seq_to_row, ggml_backend_sched_t sched) { if (!sampled.has_data()) { return; } - for (const auto & [seq_id, tensor] : tensor_map) { + for (const auto & [seq_id, tensors] : tensor_map) { auto it = seq_to_row.find(seq_id); if (it == seq_to_row.end()) { continue; } - const uint32_t row = it->second; - GGML_ASSERT(row < sampled.size); + const std::vector & rows = it->second; + GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); - GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy"); + for (size_t i = 0; i < tensors.size(); ++i) { + const uint32_t row = rows[i]; + ggml_tensor * tensor = tensors[i]; - ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor); - ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row])); + GGML_ASSERT(row < sampled.size); + GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy"); + + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor); + ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row])); + } } } static void copy_tensor_async_floats( - const std::map & tensor_map, + const std::map> & tensor_map, const buffer_view & dst, size_t stride, std::vector & counts, - const std::map & seq_to_row, + const std::map> & seq_to_row, ggml_backend_sched_t sched) { if (!dst.has_data()) { return; } - for (const auto & [seq_id, tensor] : tensor_map) { + for (const auto & [seq_id, tensors] : tensor_map) { auto it = seq_to_row.find(seq_id); if (it == seq_to_row.end()) { continue; } - const 
uint32_t row = it->second; - GGML_ASSERT(row < counts.size()); + const std::vector & rows = it->second; + GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); - GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy"); + for (size_t i = 0; i < tensors.size(); ++i) { + const uint32_t row = rows[i]; + ggml_tensor * tensor = tensors[i]; - ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor); - float * row_ptr = dst.data + (size_t) row * stride; - ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor)); + GGML_ASSERT(row < counts.size()); + GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy"); - // Update the actual number of logits/probabilities that were written for this row. - counts[row] = ggml_nelements(tensor); + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor); + float * row_ptr = dst.data + (size_t) row * stride; + ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor)); + + // Update the actual number of logits/probabilities that were written for this row. 
+ counts[row] = ggml_nelements(tensor); + } } } static void copy_tensor_async_candidates( - const std::map & tensor_map, + const std::map> & tensor_map, const buffer_view & dst, size_t stride, std::vector & counts, - const std::map & seq_to_row, + const std::map> & seq_to_row, ggml_backend_sched_t sched) { if (!dst.has_data()) { return; } - for (const auto & [seq_id, tensor] : tensor_map) { + for (const auto & [seq_id, tensors] : tensor_map) { auto it = seq_to_row.find(seq_id); if (it == seq_to_row.end()) { continue; } - const uint32_t row = it->second; - GGML_ASSERT(row < counts.size()); + const std::vector & rows = it->second; + GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); - GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy"); + for (size_t i = 0; i < tensors.size(); ++i) { + const uint32_t row = rows[i]; + ggml_tensor * tensor = tensors[i]; - ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor); - llama_token * row_ptr = dst.data + (size_t) row * stride; - ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor)); + GGML_ASSERT(row < counts.size()); + GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy"); - // Update the actual number of candidates that were written. - counts[row] = ggml_nelements(tensor); + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor); + llama_token * row_ptr = dst.data + (size_t) row * stride; + ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor)); + + // Update the actual number of candidates that were written. + counts[row] = ggml_nelements(tensor); + } } } @@ -1443,30 +1461,6 @@ int llama_context::decode(const llama_batch & batch_inp) { const uint32_t n_seq_max = cparams.kv_unified ? 
LLAMA_MAX_SEQ : cparams.n_seq_max; - // TODO: avoid this workaround in the future - if (has_samplers && batch_inp.logits) { - std::vector seq_output_count(n_seq_max, 0); - - for (int32_t i = 0; i < batch_inp.n_tokens; ++i) { - if (batch_inp.logits[i] == 0) { - continue; - } - - const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1; - - for (int32_t s = 0; s < ns; ++s) { - const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0; - - seq_output_count[seq_id]++; - if (seq_output_count[seq_id] > 1) { - LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n", - __func__, seq_id, seq_output_count[seq_id]); - return -1; - } - } - } - } - if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return -1; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 23a86ea290..76bf9f5f53 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -774,24 +774,24 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } - for (auto & [seq_id, t] : t_sampled) { - if (t != nullptr) { - ggml_set_output(t); + for (auto & [seq_id, tensors] : t_sampled) { + for (ggml_tensor * tensor : tensors) { + ggml_set_output(tensor); } } - for (auto & [seq_id, t] : t_sampled_probs) { - if (t != nullptr) { - ggml_set_output(t); + for (auto & [seq_id, tensors] : t_sampled_probs) { + for (ggml_tensor * tensor : tensors) { + ggml_set_output(tensor); } } - for (auto & [seq_id, t] : t_sampled_logits) { - if (t != nullptr) { - ggml_set_output(t); + for (auto & [seq_id, tensors] : t_sampled_logits) { + for (ggml_tensor * tensor : tensors) { + ggml_set_output(tensor); } } - for (auto & [seq_id, t] : t_candidates) { - if (t != nullptr) { - ggml_set_output(t); + for (auto & [seq_id, tensors] : t_candidates) { + for (ggml_tensor * tensor : tensors) { + ggml_set_output(tensor); } } } @@ 
-2580,13 +2580,13 @@ void llm_graph_context::build_sampling() const { auto inp_sampling = std::make_unique(samplers); res->add_input(std::move(inp_sampling)); - std::map seq_to_logit_row; + std::map> seq_to_logit_rows; int32_t logit_row_idx = 0; for (uint32_t i = 0; i < ubatch.n_tokens; i++) { if (ubatch.output[i]) { llama_seq_id seq_id = ubatch.seq_id[i][0]; - seq_to_logit_row[seq_id] = logit_row_idx; + seq_to_logit_rows[seq_id].push_back(logit_row_idx); logit_row_idx++; } } @@ -2600,47 +2600,52 @@ void llm_graph_context::build_sampling() const { ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0); for (const auto & [seq_id, sampler] : samplers) { - const auto it = seq_to_logit_row.find(seq_id); + const auto row_it = seq_to_logit_rows.find(seq_id); - // inactive samplers always work on the first row - const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0; - const int i_out = it != seq_to_logit_row.end() ? 1 : 0; + // row_it is now a sequence id to list of row ids + static const std::vector default_row = {0}; + const std::vector & logit_rows = row_it != seq_to_logit_rows.end() ? row_it->second : default_row; + for (const int32_t row_idx : logit_rows) { - ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]); - ggml_format_name(logits_seq, "logits_seq_%d", seq_id); + // inactive samplers always work on the first row + const int i_out = row_it != seq_to_logit_rows.end() ? 
1 : 0; - struct llama_sampler_data data = { - /*.logits =*/ logits_seq, - /*.probs =*/ nullptr, - /*.sampled =*/ nullptr, - /*.candidates =*/ nullptr, - }; + ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]); + ggml_format_name(logits_seq, "logits_seq_%d", seq_id); - assert(sampler->iface->backend_apply); - sampler->iface->backend_apply(sampler, ctx0, gf, &data); + struct llama_sampler_data data = { + /*.logits =*/ logits_seq, + /*.probs =*/ nullptr, + /*.sampled =*/ nullptr, + /*.candidates =*/ nullptr, + }; - if (data.sampled != nullptr) { - res->t_sampled[seq_id] = data.sampled; - outs[1] = data.sampled; - ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); - } + assert(sampler->iface->backend_apply); + sampler->iface->backend_apply(sampler, ctx0, gf, &data); - if (data.probs != nullptr) { - res->t_sampled_probs[seq_id] = data.probs; - outs[1] = data.probs; - ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); - } + if (data.sampled != nullptr) { + res->t_sampled[seq_id].push_back(data.sampled); + outs[1] = data.sampled; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); + } - if (data.logits != nullptr) { - res->t_sampled_logits[seq_id] = data.logits; - outs[1] = data.logits; - ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); - } + if (data.probs != nullptr) { + res->t_sampled_probs[seq_id].push_back(data.probs); + outs[1] = data.probs; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); + } - if (data.candidates != nullptr) { - res->t_candidates[seq_id] = data.candidates; - outs[1] = data.candidates; - ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); + if (data.logits != nullptr) { + res->t_sampled_logits[seq_id].push_back(data.logits); + outs[1] = data.logits; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); + } + + if (data.candidates != nullptr) { + res->t_candidates[seq_id].push_back(data.candidates); + 
outs[1] = data.candidates; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); + } } } diff --git a/src/llama-graph.h b/src/llama-graph.h index e8f006977d..0b4017df76 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -662,10 +662,10 @@ public: ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; - std::map t_sampled_logits; - std::map t_candidates; - std::map t_sampled; - std::map t_sampled_probs; + std::map> t_sampled_logits; + std::map> t_candidates; + std::map> t_sampled; + std::map> t_sampled_probs; std::vector inputs; diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp index d4cd62c71e..ddc942e340 100644 --- a/tests/test-backend-sampler.cpp +++ b/tests/test-backend-sampler.cpp @@ -968,7 +968,7 @@ static void test_backend_cpu_mixed_batch(const test_params & params) { printf("backend-cpu mixed batch test PASSED\n"); } -static void test_backend_max_outputs(const test_params & params) { +static void test_backend_multiple_outputs(const test_params & params) { const int seq_id = 0; const int32_t seed = 88; @@ -994,17 +994,32 @@ static void test_backend_max_outputs(const test_params & params) { } for (size_t i = 0; i < tokens.size(); i++) { - // set all tokens as output to trigger error + // set all tokens as output to get multiple outputs for a single sequence. 
common_batch_add(batch, tokens[i], i, { seq_id }, true); } - printf(">>> test_max_outputs expected error start:\n"); const int ret = llama_decode(test_ctx.ctx.get(), batch); - GGML_ASSERT(ret != 0 && "llama_decode should not succeed multiple outputs per sequence"); - printf("<<< test_max_outputs expected error end.\n"); + if (ret != 0) { + GGML_ASSERT(false && "Failed to decode sequence with multiple outputs"); + } + + std::vector sampled_tokens; + for (int i = 0; i < batch.n_tokens; i++) { + if (batch.logits[i]) { + llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), i); + const std::string token_str = test_ctx.token_to_piece(token, false); + //printf("Position %d: token id=%d, string='%s'\n", i, token, token_str.c_str()); + GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab); + sampled_tokens.push_back(token); + } + } + + GGML_ASSERT((int)sampled_tokens.size() == batch.n_tokens); + printf("Sampled %zu tokens for sequence %d\n", sampled_tokens.size(), seq_id); + llama_batch_free(batch); - printf("backend max outputs test PASSED\n"); + printf("backend multiple outputs test PASSED\n"); } struct backend_test_case { @@ -1023,7 +1038,7 @@ static const backend_test_case BACKEND_TESTS[] = { { "dist", test_backend_dist_sampling, true }, { "dist_and_cpu", test_backend_dist_sampling_and_cpu, true }, { "set_sampler", test_backend_set_sampler, true }, - { "max_outputs", test_backend_max_outputs, true }, + { "multiple_outputs",test_backend_multiple_outputs, true }, { "mixed", test_backend_mixed_sampling, true }, { "min_p", test_backend_min_p_sampling, true }, { "cpu_mixed", test_backend_cpu_mixed_batch, true }, From 1e8c02aa95e5467149bfb432e372681517b6f51e Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Feb 2026 15:35:22 +0100 Subject: [PATCH 2/5] llama : add n_sampling_outputs_max cparam This commit adds a compute graph parameter named n_sampling_outputs_max which is intended to be used as a max (cap) value for the number of output for backend 
sampling. The motivation for this is that it gives a configurable value instead of a hardcoded macro (LLAMA_MAX_SAMPLING_OUTPUTS) which has been removed. I'm not sure if this is the best option as having multiple outputs per sequence might not be the most common use case. I need to think a little bit more about this. I'll commit this to see that CI passes and also this parameter should be exposed as a common option for tools which I'll do in a follow-up commit. --- include/llama.h | 1 + src/llama-context.cpp | 27 ++++++++++++++++++++++----- src/llama-cparams.h | 2 ++ src/llama-graph.cpp | 4 ++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/llama.h b/include/llama.h index 077f66dc65..a93d63b774 100644 --- a/include/llama.h +++ b/include/llama.h @@ -376,6 +376,7 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + uint32_t n_sampling_outputs_max; }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f8c3844539..e9f000bf2b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -62,6 +62,7 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.n_sampling_outputs_max = params.n_sampling_outputs_max; // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. 
@@ -1947,14 +1948,29 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { + uint32_t res; if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) { - return std::max(n_tokens * 40, 32u * model.n_tensors()); + res = std::max(n_tokens * 40, 32u * model.n_tensors()); + } else { + res = std::max(1024u, 8u*model.n_tensors()); + for (const auto & lora : model.loras) { + res += lora->get_n_nodes(); + } } - uint32_t res = std::max(1024u, 8u*model.n_tensors()); - for (const auto & lora : model.loras) { - res += lora->get_n_nodes(); + + // Account for backend sampling with multiple outputs per sequence. + uint32_t sampling_nodes = 0; + if (!sampling.samplers.empty()) { + const uint32_t tensors_per_output = 50; + const uint32_t sampling_outputs = std::min(n_tokens, cparams.n_sampling_outputs_max); + + // Account for worst case (all sequences could have backend samplers). 
+ const uint32_t max_samplers = cparams.n_seq_max; + + sampling_nodes = tensors_per_output * sampling_outputs * max_samplers; } - return res; + + return res + sampling_nodes; } llm_graph_result * llama_context::get_gf_res_reserve() const { @@ -2795,6 +2811,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.n_sampling_outputs_max =*/ 32, }; return result; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2da3bbd6f9..1674a7dc0a 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -39,6 +39,8 @@ struct llama_cparams { enum llama_pooling_type pooling_type; + uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling + ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 76bf9f5f53..63e0276c83 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2599,6 +2599,10 @@ void llm_graph_context::build_sampling() const { // this is important in order to minimize graph reallocations ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0); + // During graph reservation, n_outputs can be very large (for example 512 for worst-case PP). + // We cap it to a user-configurable maximum since typical multi output scenarios use far fewer. + const uint32_t max_outputs = std::min(n_outputs, cparams.n_sampling_outputs_max); + for (const auto & [seq_id, sampler] : samplers) { const auto row_it = seq_to_logit_rows.find(seq_id); From 765998f2d7cb9bf539da464f0f5d9dd6e528d2ae Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 26 Feb 2026 11:16:23 +0100 Subject: [PATCH 3/5] llama : enable static graph for multiple sampling outputs per sequence This commit makes the computation graph static when backend samplers process multiple outputs per sequence. Previously, only active samplers, those with outputs in the current batch, were added to the graph. 
This could cause graph reallocations if different samplers become active/inactive across batches, even when the number of outputs remained constant. --- src/llama-context.cpp | 3 --- src/llama-graph.cpp | 31 ++++++++++++++++++++----------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e9f000bf2b..60b980ff6e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1331,7 +1331,6 @@ static void copy_tensor_async_ints( } const std::vector & rows = it->second; - GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); for (size_t i = 0; i < tensors.size(); ++i) { const uint32_t row = rows[i]; @@ -1364,7 +1363,6 @@ static void copy_tensor_async_floats( } const std::vector & rows = it->second; - GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); for (size_t i = 0; i < tensors.size(); ++i) { const uint32_t row = rows[i]; @@ -1401,7 +1399,6 @@ static void copy_tensor_async_candidates( } const std::vector & rows = it->second; - GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows"); for (size_t i = 0; i < tensors.size(); ++i) { const uint32_t row = rows[i]; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 63e0276c83..1baafb0d82 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2605,17 +2605,18 @@ void llm_graph_context::build_sampling() const { for (const auto & [seq_id, sampler] : samplers) { const auto row_it = seq_to_logit_rows.find(seq_id); + const bool sampler_is_active = row_it != seq_to_logit_rows.end(); - // row_it is now a sequence id to list of row ids - static const std::vector default_row = {0}; - const std::vector & logit_rows = row_it != seq_to_logit_rows.end() ? 
row_it->second : default_row; - for (const int32_t row_idx : logit_rows) { + // Always build samplers for all possible outputs even if the sampler is + // not active (the sampler's sequence id is not in the current ubatch). + for (uint32_t i = 0; i < max_outputs; ++i) { + const bool real_output = sampler_is_active && i < row_it->second.size(); - // inactive samplers always work on the first row - const int i_out = row_it != seq_to_logit_rows.end() ? 1 : 0; + const int32_t row_idx = real_output ? row_it->second[i] : 0; + const int i_out = real_output ? 1 : 0; ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]); - ggml_format_name(logits_seq, "logits_seq_%d", seq_id); + ggml_format_name(logits_seq, "logits_seq_%d_%d", seq_id, i); struct llama_sampler_data data = { /*.logits =*/ logits_seq, @@ -2628,25 +2629,33 @@ void llm_graph_context::build_sampling() const { sampler->iface->backend_apply(sampler, ctx0, gf, &data); if (data.sampled != nullptr) { - res->t_sampled[seq_id].push_back(data.sampled); + if (real_output) { + res->t_sampled[seq_id].push_back(data.sampled); + } outs[1] = data.sampled; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.probs != nullptr) { - res->t_sampled_probs[seq_id].push_back(data.probs); + if (real_output) { + res->t_sampled_probs[seq_id].push_back(data.probs); + } outs[1] = data.probs; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.logits != nullptr) { - res->t_sampled_logits[seq_id].push_back(data.logits); + if (real_output) { + res->t_sampled_logits[seq_id].push_back(data.logits); + } outs[1] = data.logits; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.candidates != nullptr) { - res->t_candidates[seq_id].push_back(data.candidates); + if (real_output) { + res->t_candidates[seq_id].push_back(data.candidates); + } outs[1] = data.candidates; ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } From 
2235b4be499a2a3c03a12a7ca0c07f2ece371cf2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Feb 2026 15:39:35 +0100 Subject: [PATCH 4/5] server : enable backend sampling for multiple outputs per sequence --- tools/server/server-context.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index eba463e4da..8a20225240 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1149,9 +1149,6 @@ private: backend_sampling &= task.params.sampling.backend_sampling; - // TODO: speculative decoding requires multiple samples per batch - not supported yet - backend_sampling &= !(slot.spec && task.params.speculative.n_max > 0); - // TODO: getting post/pre sampling logits is not yet supported with backend sampling backend_sampling &= !need_logits; From 5c92c76e9edd20b040c1b9f14c8b40483cfdb601 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Feb 2026 15:39:57 +0100 Subject: [PATCH 5/5] sampling : add clamping to backend dist sampler This commit adds clamping to the backend distribution sampler to avoid the case where idxf values are all zero. If this happens then we will incorrectly create an out of bounds idx value which will cause a crash. This can be reproduced by explicitly setting idxf to zero: ```c++ idxf = ggml_scale(ctx, idxf, 0.0f); ``` --- src/llama-sampler.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 9bbc5dbde2..a5a67d206f 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -1180,6 +1180,9 @@ static void llama_sampler_dist_backend_apply( struct ggml_tensor * idxf = ggml_sum(ctx, mask); ggml_set_name(idxf, "dist_index_f32"); + // Clamp to prevent out-of-bounds access when computing the index. 
+ idxf = ggml_clamp(ctx, idxf, 1.0f, mask->ne[0]); + // Use ggml_scale_bias to scale the index value by -1 and then add the size // of the mask to that value so we get the correct index ((-1 * idxf) + n). struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);