From 1e8c02aa95e5467149bfb432e372681517b6f51e Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Feb 2026 15:35:22 +0100 Subject: [PATCH] llama : add n_sampling_outputs_max cparam This commit adds a compute graph parameter named n_sampling_outputs_max which is intended to be used as a max (cap) value for the number of outputs for backend sampling. The motivation for this is that it gives a configurable value instead of a hardcoded macro (LLAMA_MAX_SAMPLING_OUTPUTS) which has been removed. I'm not sure if this is the best option as having multiple outputs per sequence might not be the most common use case. I need to think a little bit more about this. I'll commit this to see that CI passes and also this parameter should be exposed as a common option for tools which I'll do in a follow-up commit. --- include/llama.h | 1 + src/llama-context.cpp | 27 ++++++++++++++++++++++----- src/llama-cparams.h | 2 ++ src/llama-graph.cpp | 4 ++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/llama.h b/include/llama.h index 077f66dc65..a93d63b774 100644 --- a/include/llama.h +++ b/include/llama.h @@ -376,6 +376,7 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + uint32_t n_sampling_outputs_max; }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f8c3844539..e9f000bf2b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -62,6 +62,7 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.n_sampling_outputs_max = params.n_sampling_outputs_max; // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. 
@@ -1947,14 +1948,29 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { + uint32_t res; if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) { - return std::max(n_tokens * 40, 32u * model.n_tensors()); + res = std::max(n_tokens * 40, 32u * model.n_tensors()); + } else { + res = std::max(1024u, 8u*model.n_tensors()); + for (const auto & lora : model.loras) { + res += lora->get_n_nodes(); + } } - uint32_t res = std::max(1024u, 8u*model.n_tensors()); - for (const auto & lora : model.loras) { - res += lora->get_n_nodes(); + + // Account for backend sampling with multiple outputs per sequence. + uint32_t sampling_nodes = 0; + if (!sampling.samplers.empty()) { + const uint32_t tensors_per_output = 50; + const uint32_t sampling_outputs = std::min(n_tokens, cparams.n_sampling_outputs_max); + + // Account for worst case (all sequences could have backend samplers). 
+ const uint32_t max_samplers = cparams.n_seq_max; + + sampling_nodes = tensors_per_output * sampling_outputs * max_samplers; } - return res; + + return res + sampling_nodes; } llm_graph_result * llama_context::get_gf_res_reserve() const { @@ -2795,6 +2811,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.n_sampling_outputs_max =*/ 32, }; return result; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2da3bbd6f9..1674a7dc0a 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -39,6 +39,8 @@ struct llama_cparams { enum llama_pooling_type pooling_type; + uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling + ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 76bf9f5f53..63e0276c83 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2599,6 +2599,10 @@ void llm_graph_context::build_sampling() const { // this is important in order to minimize graph reallocations ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0); + // During graph reservation, n_outputs can be very large (for example 512 for worst-case PP). + // We cap it to a user-configurable maximum since typical multi output scenarios use far fewer. + const uint32_t max_outputs = std::min(n_outputs, cparams.n_sampling_outputs_max); + for (const auto & [seq_id, sampler] : samplers) { const auto row_it = seq_to_logit_rows.find(seq_id);