llama : add n_sampling_outputs_max cparam

This commit adds a compute graph parameter named n_sampling_outputs_max
which is intended to be used as a max (cap) value for the number of
outputs for backend sampling.

The motivation for this is that it gives a configurable value instead of
a hardcoded macro (LLAMA_MAX_SAMPLING_OUTPUTS) which has been removed.

I'm not sure if this is the best option as having multiple outputs per
sequence might not be the most common use case. I need to think a little
bit more about this. I'll commit this to see that CI passes; this
parameter should also be exposed as a common option for tools, which
I'll do in a follow-up commit.
This commit is contained in:
Daniel Bevenius 2026-02-25 15:35:22 +01:00
parent 1138d5c2d9
commit 1e8c02aa95
No known key found for this signature in database
4 changed files with 29 additions and 5 deletions

View File

@ -376,6 +376,7 @@ extern "C" {
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
struct llama_sampler_seq_config * samplers;
size_t n_samplers;
uint32_t n_sampling_outputs_max;
};
// model quantization parameters

View File

@ -62,6 +62,7 @@ llama_context::llama_context(
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.n_sampling_outputs_max = params.n_sampling_outputs_max;
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
@ -1947,14 +1948,29 @@ void llama_context::output_reorder() {
//
// Estimate an upper bound on the number of nodes in the compute graph for
// n_tokens. Used to size the graph allocation before building it.
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
    uint32_t res;
    if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
        // These architectures build noticeably larger graphs per token.
        res = std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
    } else {
        res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
        // Each LoRA adapter contributes its own nodes to the graph.
        for (const auto & lora : model.loras) {
            res += lora->get_n_nodes();
        }
    }

    // Account for backend sampling with multiple outputs per sequence.
    uint32_t sampling_nodes = 0;
    if (!sampling.samplers.empty()) {
        // Rough per-output node budget for a sampler chain.
        const uint32_t tensors_per_output = 50;
        // Cap the number of sampling outputs at the user-configurable maximum
        // so worst-case PP reservation (large n_outputs) does not over-allocate.
        const uint32_t sampling_outputs = std::min<uint32_t>(n_tokens, cparams.n_sampling_outputs_max);
        // Account for worst case (all sequences could have backend samplers).
        const uint32_t max_samplers = cparams.n_seq_max;
        sampling_nodes = tensors_per_output * sampling_outputs * max_samplers;
    }

    return res + sampling_nodes;
}
llm_graph_result * llama_context::get_gf_res_reserve() const {
@ -2795,6 +2811,7 @@ llama_context_params llama_context_default_params() {
/*.kv_unified =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
/*.n_sampling_outputs_max =*/ 32,
};
return result;

View File

@ -39,6 +39,8 @@ struct llama_cparams {
enum llama_pooling_type pooling_type;
uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};

View File

@ -2599,6 +2599,10 @@ void llm_graph_context::build_sampling() const {
// this is important in order to minimize graph reallocations
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
// During graph reservation, n_outputs can be very large (for example 512 for worst-case PP).
// We cap it to a user-configurable maximum since typical multi output scenarios use far fewer.
const uint32_t max_outputs = std::min<uint32_t>(n_outputs, cparams.n_sampling_outputs_max);
for (const auto & [seq_id, sampler] : samplers) {
const auto row_it = seq_to_logit_rows.find(seq_id);