added getter for nextn layer count and server slot has_mtp property

Aaron Lee 2025-08-10 23:52:54 -04:00
parent 79c1160b07
commit db60623e79
3 changed files with 17 additions and 1 deletion


@@ -495,6 +495,8 @@ extern "C" {
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);

+    LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model);
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
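A minimal caller-side sketch of the new getter (not part of this commit; it assumes the stock llama.cpp loading API, i.e. llama_model_default_params / llama_model_load_from_file / llama_model_free):

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            return 1;
        }

        // a non-zero count means the GGUF carries NextN/MTP layers
        const int32_t n_nextn = llama_model_n_nextn_layer(model);
        printf("nextn layers: %d\n", n_nextn);

        llama_model_free(model);
        return 0;
    }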


@@ -18587,6 +18587,10 @@ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
     return nullptr;
 }

+int32_t llama_model_n_nextn_layer(const llama_model * model) {
+    return model->hparams.nextn_predict_layers;
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);


@@ -1294,7 +1294,8 @@ struct server_slot {
     mtmd_context * mctx = nullptr;

     common_speculative * spec = nullptr;
+    bool has_mtp = false;

     std::vector<common_adapter_lora_info> lora;

     // the index relative to completion multi-task request
@@ -2121,6 +2122,15 @@ struct server_context {
                 common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
             }
         }
+        else if (llama_model_n_nextn_layer(model) > 0) {
+            SRV_INF("model has nextn layers = %d\n", llama_model_n_nextn_layer(model));
+            slot.has_mtp = true;
+
+            // assume one speculative token (true of all well-known MTP models so far)
+            slot.batch_spec = llama_batch_init(2, 0, 1);
+            params_base.speculative.n_min = 0;
+            params_base.speculative.n_max = 1;
+        }

         SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
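The batch capacity of 2 follows from the one-draft-token assumption: each speculative step decodes the last accepted token plus the single MTP draft. A hypothetical sketch of filling such a batch (the helper name and its arguments are invented here, not from this commit; common_batch_clear / common_batch_add are the existing helpers from common.h):

    #include "common.h"
    #include "llama.h"

    // hypothetical helper: build the 2-token batch that verifies one MTP draft token,
    // sized to match the slot's llama_batch_init(2, 0, 1) above
    static llama_batch make_mtp_verify_batch(llama_token id_last, llama_token id_draft, llama_pos n_past) {
        llama_batch batch = llama_batch_init(2, 0, 1); // capacity 2, no embeddings, 1 sequence
        common_batch_clear(batch);
        common_batch_add(batch, id_last,  n_past,     { 0 }, true); // last accepted token
        common_batch_add(batch, id_draft, n_past + 1, { 0 }, true); // the single speculative token
        return batch; // caller releases it with llama_batch_free()
    }

With n_max clamped to 1, the server never drafts more than one token per step, which is consistent with MTP models that ship a single NextN layer.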