Added a getter for the model's NextN (MTP) layer count and a `has_mtp` property on the server slot.
This commit is contained in:
parent
79c1160b07
commit
db60623e79
|
|
@ -495,6 +495,8 @@ extern "C" {
|
|||
|
||||
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
|
||||
|
||||
LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model);
|
||||
|
||||
// Functions to access the model's GGUF metadata scalar values
|
||||
// - The functions return the length of the string on success, or -1 on failure
|
||||
// - The output string is always null-terminated and cleared on failure
|
||||
|
|
|
|||
|
|
@ -18587,6 +18587,10 @@ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
int32_t llama_model_n_nextn_layer(const llama_model * model) {
|
||||
return model->hparams.nextn_predict_layers;
|
||||
}
|
||||
|
||||
// deprecated
|
||||
int32_t llama_n_ctx_train(const llama_model * model) {
|
||||
return llama_model_n_ctx_train(model);
|
||||
|
|
|
|||
|
|
@ -1294,7 +1294,8 @@ struct server_slot {
|
|||
mtmd_context * mctx = nullptr;
|
||||
|
||||
common_speculative * spec = nullptr;
|
||||
|
||||
bool has_mtp = false;
|
||||
|
||||
std::vector<common_adapter_lora_info> lora;
|
||||
|
||||
// the index relative to completion multi-task request
|
||||
|
|
@ -2121,6 +2122,15 @@ struct server_context {
|
|||
common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
|
||||
}
|
||||
}
|
||||
else if (llama_model_n_nextn_layer(model) > 0) {
|
||||
SRV_INF("model has nextn layers = %d\n", llama_model_n_nextn_layer(model));
|
||||
slot.has_mtp = true;
|
||||
|
||||
// assume one speculative token (true of all well-known MTP models so far)
|
||||
slot.batch_spec = llama_batch_init(2, 0, 1);
|
||||
params_base.speculative.n_min = 0;
|
||||
params_base.speculative.n_max = 1;
|
||||
}
|
||||
|
||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue