llama : add n_sampling_outputs_max cparam
This commit adds a compute graph parameter named n_sampling_outputs_max which is intended to be used as a maximum (cap) on the number of outputs for backend sampling. The motivation for this is that it provides a configurable value instead of a hardcoded macro (LLAMA_MAX_SAMPLING_OUTPUTS), which has been removed. I'm not sure if this is the best option, as having multiple outputs per sequence might not be the most common use case; I need to think a little more about it. I'll commit this to see that CI passes. This parameter should also be exposed as a common option for tools, which I'll do in a follow-up commit.
This commit is contained in:
parent
1138d5c2d9
commit
1e8c02aa95
|
|
@ -376,6 +376,7 @@ extern "C" {
|
|||
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
|
||||
struct llama_sampler_seq_config * samplers;
|
||||
size_t n_samplers;
|
||||
uint32_t n_sampling_outputs_max;
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ llama_context::llama_context(
|
|||
cparams.cb_eval = params.cb_eval;
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
|
||||
cparams.n_sampling_outputs_max = params.n_sampling_outputs_max;
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
|
|
@ -1947,14 +1948,29 @@ void llama_context::output_reorder() {
|
|||
//
|
||||
|
||||
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||
uint32_t res;
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
|
||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
res = std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
} else {
|
||||
res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
for (const auto & lora : model.loras) {
|
||||
res += lora->get_n_nodes();
|
||||
}
|
||||
}
|
||||
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
for (const auto & lora : model.loras) {
|
||||
res += lora->get_n_nodes();
|
||||
|
||||
// Account for backend sampling with multiple outputs per sequence.
|
||||
uint32_t sampling_nodes = 0;
|
||||
if (!sampling.samplers.empty()) {
|
||||
const uint32_t tensors_per_output = 50;
|
||||
const uint32_t sampling_outputs = std::min<uint32_t>(n_tokens, cparams.n_sampling_outputs_max);
|
||||
|
||||
// Account for worst case (all sequences could have backend samplers).
|
||||
const uint32_t max_samplers = cparams.n_seq_max;
|
||||
|
||||
sampling_nodes = tensors_per_output * sampling_outputs * max_samplers;
|
||||
}
|
||||
return res;
|
||||
|
||||
return res + sampling_nodes;
|
||||
}
|
||||
|
||||
llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||
|
|
@ -2795,6 +2811,7 @@ llama_context_params llama_context_default_params() {
|
|||
/*.kv_unified =*/ false,
|
||||
/*.sampler =*/ nullptr,
|
||||
/*.n_sampler =*/ 0,
|
||||
/*.n_sampling_outputs_max =*/ 32,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -39,6 +39,8 @@ struct llama_cparams {
|
|||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -2599,6 +2599,10 @@ void llm_graph_context::build_sampling() const {
|
|||
// this is important in order to minimize graph reallocations
|
||||
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
|
||||
|
||||
// During graph reservation, n_outputs can be very large (for example 512 for worst-case PP).
|
||||
// We cap it to a user-configurable maximum since typical multi output scenarios use far fewer.
|
||||
const uint32_t max_outputs = std::min<uint32_t>(n_outputs, cparams.n_sampling_outputs_max);
|
||||
|
||||
for (const auto & [seq_id, sampler] : samplers) {
|
||||
const auto row_it = seq_to_logit_rows.find(seq_id);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue