added getter for nextn layer count and server slot has_mtp property

Aaron Lee 2025-08-10 23:52:54 -04:00
parent 79c1160b07
commit db60623e79
3 changed files with 17 additions and 1 deletion


@@ -495,6 +495,8 @@ extern "C" {
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);

+    LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model);
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
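A minimal caller-side sketch of the new getter (not part of this commit; it assumes the stock llama.cpp loading API, i.e. llama_model_default_params / llama_model_load_from_file / llama_model_free):

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            return 1;
        }

        // a non-zero count means the GGUF carries NextN/MTP layers
        const int32_t n_nextn = llama_model_n_nextn_layer(model);
        printf("nextn layers: %d\n", n_nextn);

        llama_model_free(model);
        return 0;
    }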


@@ -18587,6 +18587,10 @@ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
     return nullptr;
 }

+int32_t llama_model_n_nextn_layer(const llama_model * model) {
+    return model->hparams.nextn_predict_layers;
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);


@@ -1294,7 +1294,8 @@ struct server_slot {
     mtmd_context * mctx = nullptr;

     common_speculative * spec = nullptr;
+    bool has_mtp = false;

     std::vector<common_adapter_lora_info> lora;

     // the index relative to completion multi-task request
@@ -2121,6 +2122,15 @@ struct server_context {
                 common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
             }
         }
+        else if (llama_model_n_nextn_layer(model) > 0) {
+            SRV_INF("model has nextn layers = %d\n", llama_model_n_nextn_layer(model));
+            slot.has_mtp = true;
+
+            // assume one speculative token (true of all well-known MTP models so far)
+            slot.batch_spec = llama_batch_init(2, 0, 1);
+            params_base.speculative.n_min = 0;
+            params_base.speculative.n_max = 1;
+        }

         SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
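The batch capacity of 2 follows from the one-draft-token assumption: each speculative step decodes the last accepted token plus the single MTP draft. A hypothetical sketch of filling such a batch (the helper name and its arguments are invented here, not from this commit; common_batch_clear / common_batch_add are the existing helpers from common.h):

    #include "common.h"
    #include "llama.h"

    // hypothetical helper: build the 2-token batch that verifies one MTP draft token,
    // sized to match the slot's llama_batch_init(2, 0, 1) above
    static llama_batch make_mtp_verify_batch(llama_token id_last, llama_token id_draft, llama_pos n_past) {
        llama_batch batch = llama_batch_init(2, 0, 1); // capacity 2, no embeddings, 1 sequence
        common_batch_clear(batch);
        common_batch_add(batch, id_last,  n_past,     { 0 }, true); // last accepted token
        common_batch_add(batch, id_draft, n_past + 1, { 0 }, true); // the single speculative token
        return batch; // caller releases it with llama_batch_free()
    }

With n_max clamped to 1, the server never drafts more than one token per step, which is consistent with MTP models that ship a single NextN layer.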