From 6d957078270f58d4ea14e8c205f5ef4e49be33f3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Feb 2026 07:52:20 +0200 Subject: [PATCH] model : fix wavtokenizer embedding notions (#19479) --- src/llama-hparams.h | 1 - src/llama-model.cpp | 11 ++++++----- tools/tts/tts.cpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6c695bdbf6..706eda8441 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -42,7 +42,6 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_embd_features = 0; uint32_t n_layer; int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache uint32_t n_rot; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7a06e96c87..5816e9a954 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -523,7 +523,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { - ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); + ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); + ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl); ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); @@ -6046,9 +6047,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0); + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0); - conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); + conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd, hparams.posnet.n_embd}, 0); conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); // posnet @@ -6144,8 +6145,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); } - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); - output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0); + output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {hparams.n_embd_out()}, 0); } break; case LLM_ARCH_BAILINGMOE: { diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp index 8c39fce8ba..ac55a8b1ca 100644 --- a/tools/tts/tts.cpp +++ b/tools/tts/tts.cpp @@ -1036,7 +1036,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 #if 1 // spectral operations - const int n_embd = llama_model_n_embd(model_cts); + const int n_embd = llama_model_n_embd_out(model_cts); const float * embd = llama_get_embeddings(ctx_cts); auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);