From 1e8c02aa95e5467149bfb432e372681517b6f51e Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Feb 2026 15:35:22 +0100 Subject: [PATCH] llama : add n_sampling_outputs_max cparam This commit adds a compute graph parameter named n_sampling_outputs_max which is intended to be used as a max (cap) value for the number of outputs for backend sampling. The motivation for this is that it gives a configurable value instead of a hardcoded macro (LLAMA_MAX_SAMPLING_OUTPUTS) which has been removed. I'm not sure if this is the best option as having multiple outputs per sequence might not be the most common use case. I need to think a little bit more about this. I'll commit this to see that CI passes and also this parameter should be exposed as a common option for tools which I'll do in a follow-up commit. --- include/llama.h | 1 + src/llama-context.cpp | 27 ++++++++++++++++++++++----- src/llama-cparams.h | 2 ++ src/llama-graph.cpp | 4 ++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/llama.h b/include/llama.h index 077f66dc65..a93d63b774 100644 --- a/include/llama.h +++ b/include/llama.h @@ -376,6 +376,7 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + uint32_t n_sampling_outputs_max; }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f8c3844539..e9f000bf2b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -62,6 +62,7 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.n_sampling_outputs_max = params.n_sampling_outputs_max; // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. 
@@ -1947,14 +1948,29 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { + uint32_t res; if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) { - return std::max(n_tokens * 40, 32u * model.n_tensors()); + res = std::max(n_tokens * 40, 32u * model.n_tensors()); + } else { + res = std::max(1024u, 8u*model.n_tensors()); + for (const auto & lora : model.loras) { + res += lora->get_n_nodes(); + } } - uint32_t res = std::max(1024u, 8u*model.n_tensors()); - for (const auto & lora : model.loras) { - res += lora->get_n_nodes(); + + // Account for backend sampling with multiple outputs per sequence. + uint32_t sampling_nodes = 0; + if (!sampling.samplers.empty()) { + const uint32_t tensors_per_output = 50; + const uint32_t sampling_outputs = std::min(n_tokens, cparams.n_sampling_outputs_max); + + // Account for worst case (all sequences could have backend samplers). 
+ const uint32_t max_samplers = cparams.n_seq_max; + + sampling_nodes = tensors_per_output * sampling_outputs * max_samplers; } - return res; + + return res + sampling_nodes; } llm_graph_result * llama_context::get_gf_res_reserve() const { @@ -2795,6 +2811,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.n_sampling_outputs_max =*/ 32, }; return result; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2da3bbd6f9..1674a7dc0a 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -39,6 +39,8 @@ struct llama_cparams { enum llama_pooling_type pooling_type; + uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling + ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 76bf9f5f53..63e0276c83 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2599,6 +2599,10 @@ void llm_graph_context::build_sampling() const { // this is important in order to minimize graph reallocations ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0); + // During graph reservation, n_outputs can be very large (for example 512 for worst-case PP). + // We cap it to a user-configurable maximum since typical multi output scenarios use far fewer. + const uint32_t max_outputs = std::min(n_outputs, cparams.n_sampling_outputs_max); + for (const auto & [seq_id, sampler] : samplers) { const auto row_it = seq_to_logit_rows.find(seq_id);