diff --git a/include/llama.h b/include/llama.h
index 545e957e5f..3bade3ae71 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -495,6 +495,8 @@ extern "C" {
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
 
+    LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model);
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 58ca7df707..2351478c2f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -18587,6 +18587,10 @@ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
     return nullptr;
 }
 
+int32_t llama_model_n_nextn_layer(const llama_model * model) {
+    return model->hparams.nextn_predict_layers;
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index a255d481a4..7a931cc6b0 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1294,7 +1294,8 @@ struct server_slot {
     mtmd_context * mctx = nullptr;
 
     common_speculative * spec = nullptr;
-
+    bool has_mtp = false;
+
     std::vector<common_adapter_lora_info> lora;
 
     // the index relative to completion multi-task request
@@ -2121,6 +2122,15 @@ struct server_context {
                     common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
                 }
             }
+            else if (llama_model_n_nextn_layer(model) > 0) {
+                SRV_INF("model has nextn layers = %d\n", llama_model_n_nextn_layer(model));
+                slot.has_mtp = true;
+
+                // assume one speculative token (true of all well-known MTP models so far)
+                slot.batch_spec = llama_batch_init(2, 0, 1);
+                params_base.speculative.n_min = 0;
+                params_base.speculative.n_max = 1;
+            }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
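
For context, a minimal sketch (not part of the patch) of how a client could consume the new `llama_model_n_nextn_layer()` accessor to detect MTP-capable models before deciding whether to enable self-speculation. Only the accessor itself comes from this change; the CLI handling, messages, and control flow are illustrative assumptions built on the existing public llama.h API.

```cpp
// Standalone sketch, assuming only the public llama.h API plus the accessor
// introduced above. Error handling and messages are illustrative.
#include <cstdio>

#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model: %s\n", argv[1]);
        llama_backend_free();
        return 1;
    }

    // > 0 means the GGUF declares nextn/MTP prediction layers, so a server
    // could draft an extra token per step without a separate draft model.
    const int32_t n_nextn = llama_model_n_nextn_layer(model);
    if (n_nextn > 0) {
        printf("model has %d nextn (MTP) layer(s): self-speculation is possible\n", n_nextn);
    } else {
        printf("model has no nextn layers: regular decoding only\n");
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```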