llama : add n_sampling_outputs_max cparam

This commit adds a compute graph parameter named n_sampling_outputs_max
which is intended to be used as a max (cap) value for the number of
outputs for backend sampling.

The motivation for this is that it gives a configurable value instead of
a hardcoded macro (LLAMA_MAX_SAMPLING_OUTPUTS) which has been removed.

I'm not sure if this is the best option as having multiple outputs per
sequence might not be the most common use case. I need to think a little
bit more about this. I'll commit this to see that CI passes; this
parameter should also be exposed as a common option for tools, which
I'll do in a follow-up commit.
This commit is contained in:
Daniel Bevenius 2026-02-25 15:35:22 +01:00
parent 1138d5c2d9
commit 1e8c02aa95
No known key found for this signature in database
4 changed files with 29 additions and 5 deletions

View File

@ -376,6 +376,7 @@ extern "C" {
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
struct llama_sampler_seq_config * samplers;
size_t n_samplers;
uint32_t n_sampling_outputs_max;
};
// model quantization parameters

View File

@ -62,6 +62,7 @@ llama_context::llama_context(
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.n_sampling_outputs_max = params.n_sampling_outputs_max;
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
@ -1947,14 +1948,29 @@ void llama_context::output_reorder() {
//
// Estimate an upper bound on the number of nodes in the compute graph for
// n_tokens. Used to size the graph allocation before building it.
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
    uint32_t res;
    if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
        // These architectures build noticeably larger graphs per token.
        res = std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
    } else {
        res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
        // Each LoRA adapter contributes its own nodes to the graph.
        for (const auto & lora : model.loras) {
            res += lora->get_n_nodes();
        }
    }

    // Account for backend sampling with multiple outputs per sequence.
    uint32_t sampling_nodes = 0;
    if (!sampling.samplers.empty()) {
        // Rough per-output node budget for a sampler chain.
        const uint32_t tensors_per_output = 50;
        // Cap the number of sampling outputs at the user-configurable maximum
        // so worst-case PP reservation (large n_outputs) does not over-allocate.
        const uint32_t sampling_outputs = std::min<uint32_t>(n_tokens, cparams.n_sampling_outputs_max);
        // Account for worst case (all sequences could have backend samplers).
        const uint32_t max_samplers = cparams.n_seq_max;
        sampling_nodes = tensors_per_output * sampling_outputs * max_samplers;
    }

    return res + sampling_nodes;
}
llm_graph_result * llama_context::get_gf_res_reserve() const {
@ -2795,6 +2811,7 @@ llama_context_params llama_context_default_params() {
/*.kv_unified =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
/*.n_sampling_outputs_max =*/ 32,
};
return result;

View File

@ -39,6 +39,8 @@ struct llama_cparams {
enum llama_pooling_type pooling_type;
uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};

View File

@ -2599,6 +2599,10 @@ void llm_graph_context::build_sampling() const {
// this is important in order to minimize graph reallocations
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
// During graph reservation, n_outputs can be very large (for example 512 for worst-case PP).
// We cap it to a user-configurable maximum since typical multi output scenarios use far fewer.
const uint32_t max_outputs = std::min<uint32_t>(n_outputs, cparams.n_sampling_outputs_max);
for (const auto & [seq_id, sampler] : samplers) {
const auto row_it = seq_to_logit_rows.find(seq_id);