diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 7c9aff2826..a5345ee2a4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -13,7 +13,6 @@
 #include 
 #include 
 #include 
-#include 
 
 //
 // llama_context
@@ -738,17 +737,6 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
-static double calculate_vector_sum(const float* vec, size_t size) {
-    if (!vec) {
-        return 0.0;
-    }
-    double sum = 0.0;
-    for (size_t i = 0; i < size; ++i) {
-        sum += vec[i];
-    }
-    return sum;
-}
-
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret, const llama_mtp_params & mtp_params) {
     if (mctx && !mctx->apply()) {
@@ -995,10 +983,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
 
     auto * kvd = static_cast(kv_cache_data);
-    // LLAMA_LOG_WARN("[DEBUG-DECODE-ENTRY] Entering llama_decode. update_mtp_kv=%s, use_mtp_head=%s\n",
-    //     batch_inp.update_mtp_kv ? "true" : "false",
-    //     batch_inp.use_mtp_head ? "true" : "false"
-    // );
 
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
@@ -1074,10 +1058,10 @@
                 }
             case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
                 {
-                    // if (use_last_main_model_sinfos) {
-                    //     LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
-                    //     return -1;
-                    // }
+                    if (kvd->forced_sinfos) {
+                        LLAMA_LOG_ERROR("%s: Mismatch between ubatches and sinfos during reuse.\n", __func__);
+                        return -1;
+                    }
 
                     if (!did_optimize) {
                         did_optimize = true;
@@ -1106,9 +1090,6 @@
     };
 
     int64_t n_outputs_prev = 0;
-    // const bool do_mtp_kv_update = batch_inp.update_mtp_kv;
-    // const bool use_mtp_head = batch_inp.use_mtp_head;
-    // const bool is_prompt_warmup = batch_inp.is_mtp_prompt_warmup;
 
     do {
         const auto & ubatch = mctx->get_ubatch();
@@ -1127,14 +1108,6 @@
             // needs to happen before the graph is built
             n_outputs = n_outputs_new;
         }
-        // if (do_mtp_kv_update) {
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] MTP KV Update ubatch: n_tokens=%d\n", ubatch.n_tokens);
-        //     std::string positions_str;
-        //     for (int i = 0; i < std::min((uint32_t)5, ubatch.n_tokens); ++i) {
-        //         positions_str += std::to_string(ubatch.pos[i]) + " ";
-        //     }
-        //     LLAMA_LOG_WARN("[DEBUG-MTP-UPDATE] Positions: %s...\n", positions_str.c_str());
-        // }
 
         ggml_status status;
         const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status, batch_inp.mtp_params);
         if (!res) {
@@ -1195,14 +1168,6 @@
             }
         }
 
-        // if (use_mtp_head) {
-        //     if (t_embd != nullptr) {
-        //         LLAMA_LOG_ERROR("[MTP-GRAPH-BUG] The MTP graph returned an embedding tensor when it shouldn't have! This will cause corruption.\n");
-        //     } else {
-        //         LLAMA_LOG_WARN("[MTP-GRAPH-OK] The MTP graph correctly did not return an embedding tensor.\n");
-        //     }
-        // }
-
         // extract embeddings
         if (t_embd && n_outputs > 0) {
             if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 56f2bae06c..ab7daee356 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -13829,11 +13829,6 @@ struct llm_build_glm4_moe : public llm_graph_context {
         // Final layer tensors are loaded but not processed in forward pass
         const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
         for (int il = 0; il < n_transformer_layers; ++il) {
-            // if (params.use_mtp_head) {
-            //     LLAMA_LOG_ERROR("[DEBUG-KV-ERROR] MTP path is running the main layer %d!\n", il);
-            // } else {
-            //     LLAMA_LOG_WARN("[DEBUG-KV] Main Head Path: Accessing layer %d\n", il);
-            // }
             ggml_tensor * inpSA = inpL;
 
             // Pre-attention norm
@@ -13976,7 +13971,6 @@ private:
         ggml_tensor * embd_copy = ggml_dup(ctx0, prev_embeddings);
 
         const int il = hparams.n_layer - 1;
-        // LLAMA_LOG_WARN("[DEBUG-KV] MTP Head Path: Accessing layer %d\n", il);
 
         ggml_tensor * sum_node = ggml_sum(ctx0, embd_copy);
         ggml_set_name(sum_node, "mtp_input_sum");
@@ -18311,12 +18305,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 }
 
 ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
-    const int64_t t_start_us = ggml_time_us();
     std::unique_ptr llm;
-
-    const bool build_mtp = params.mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED;
-
     switch (arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -18678,12 +18668,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
         // add on pooling layer
         llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
     }
-    const int64_t t_end_us = ggml_time_us();
-    // LLAMA_LOG_INFO(
-    //     "[PERF] Graph build time: %.2f ms (MTP path: %s)\n",
-    //     (t_end_us - t_start_us) / 1000.0,
-    //     params.use_mtp_head ? "yes" : "no"
-    // );
 
     return llm->res->get_gf();
 }
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 4ff69f005f..a24532c693 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3520,7 +3520,7 @@ struct server_context {
                 // Clean up the forced state to not affect subsequent decodes.
                 llama_mtp_cancel_sinfo_update(ctx);
             } else {
-                LOG_ERR("%s: Failed to prepare the MTP symphony for warmup.", __func__);
+                LOG_ERR("%s: Failed to prepare the MTP for warmup.", __func__);
             }
         }