diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp
index 4b6817c96a..ad982808bc 100644
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@@ -250,27 +250,29 @@ ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
 ggml_tensor * llm_build_gemma3n_iswa::build_inp_per_layer() {
     auto inp = std::make_unique(n_embd);
     ggml_tensor * inp_per_layer;
+    float tok_embd_scale = sqrtf((float) n_embd_altup);
     if (ubatch.token) {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         ggml_set_input(inp->tokens);
         res->t_inp_tokens = inp->tokens;
-        inp_per_layer = ggml_get_rows(ctx0, model.per_layer_tok_embd, inp->tokens);
+        inp_per_layer = ggml_get_rows (ctx0, model.per_layer_tok_embd, inp->tokens);
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
-        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
+        inp_per_layer = ggml_scale (ctx0, inp_per_layer, tok_embd_scale);
         cb(inp_per_layer, "inp_per_layer_selected", -1);
         res->add_input(std::move(inp));
     } else {
-        // Vision embedding path: use padding token (ID=0) embedding
+        // Multimodal embedding path: use padding token (ID=0) embedding
         // TODO: verify if this is the correct behavior in transformers implementation
         const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_altup * n_layer

         // Extract and dequantize padding token embedding (row 0)
         ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);

         // Reshape to [n_embd_altup, n_layer, 1]
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        cb(inp_per_layer, "inp_per_layer_multimodal", -1);
     }
     return inp_per_layer;
 }
diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp
index b3c6c5be2a..405cdadc13 100644
--- a/src/models/gemma4-iswa.cpp
+++ b/src/models/gemma4-iswa.cpp
@@ -265,6 +265,7 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {
     auto inp = std::make_unique(n_embd);
     ggml_tensor * inp_per_layer;

+    float tok_embd_scale = sqrtf((float) n_embd_per_layer);
     if (ubatch.token) {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         ggml_set_input(inp->tokens);
@@ -272,22 +273,23 @@

         inp_per_layer = ggml_get_rows (ctx0, model.per_layer_tok_embd, inp->tokens);
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
-        inp_per_layer = ggml_scale     (ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
+        inp_per_layer = ggml_scale     (ctx0, inp_per_layer, tok_embd_scale);
         cb(inp_per_layer, "inp_per_layer_selected", -1);
         res->add_input(std::move(inp));
     } else {
-        // Vision embedding path: use padding token (ID=0) embedding
+        // Multimodal embedding path: use padding token (ID=0) embedding
         // TODO: verify if this is the correct behavior in transformers implementation
         const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_per_layer * n_layer

         // Extract and dequantize padding token embedding (row 0)
         ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);

        // Reshape to [n_embd_per_layer, n_layer, 1]
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        cb(inp_per_layer, "inp_per_layer_multimodal", -1);
     }

     return inp_per_layer;
 }