model: fix multimodal padding token for gemma3n/gemma4 (#21625)

* model: fix multimodal padding token for gemma3n/gemma4
* nits
commit 057dba336e
parent 501aeed18f
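Both builders below make the same two changes: the per-layer embedding scale is hoisted into a local `tok_embd_scale`, and the multimodal (non-token) path now applies that scale to the padding-token embedding, which it previously returned unscaled (gemma4 uses `n_embd_per_layer` in place of `n_embd_altup`). A minimal standalone sketch of the factor at stake, assuming an illustrative `n_embd_altup` of 256 (the `0.5f` entry is a made-up value, not a model weight):

#include <cmath>
#include <cstdio>

int main() {
    const int n_embd_altup = 256;                     // illustrative size, not a real model value
    const float tok_embd_scale = sqrtf((float) n_embd_altup);

    const float raw = 0.5f;                           // pretend entry of padding row 0
    // Before the fix the multimodal path used `raw` as-is; after the fix it is
    // scaled exactly like the token path, a 16x difference at this size.
    printf("unscaled: %.3f  scaled: %.3f  (factor = sqrt(%d) = %.1f)\n",
           raw, raw * tok_embd_scale, n_embd_altup, tok_embd_scale);
    return 0;
}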
@@ -250,27 +250,29 @@ ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
 ggml_tensor * llm_build_gemma3n_iswa::build_inp_per_layer() {
     auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
     ggml_tensor * inp_per_layer;
+    float tok_embd_scale = sqrtf((float) n_embd_altup);
     if (ubatch.token) {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         ggml_set_input(inp->tokens);
         res->t_inp_tokens = inp->tokens;
-        inp_per_layer = ggml_get_rows(ctx0, model.per_layer_tok_embd, inp->tokens);
+        inp_per_layer = ggml_get_rows (ctx0, model.per_layer_tok_embd, inp->tokens);
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
-        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
+        inp_per_layer = ggml_scale (ctx0, inp_per_layer, tok_embd_scale);
         cb(inp_per_layer, "inp_per_layer_selected", -1);
         res->add_input(std::move(inp));
     } else {
-        // Vision embedding path: use padding token (ID=0) embedding
+        // Multimodal embedding path: use padding token (ID=0) embedding
         // TODO: verify if this is the correct behavior in transformers implementation
         const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_altup * n_layer

         // Extract and dequantize padding token embedding (row 0)
         ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);

         // Reshape to [n_embd_altup, n_layer, 1]
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        cb(inp_per_layer, "inp_per_layer_multimodal", -1);
     }
     return inp_per_layer;
 }
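The fixed else branch above can be exercised in isolation. Below is a self-contained sketch against the public ggml API (the tensor shapes, toy vocab size, and F16 storage type are assumptions for illustration; in llama.cpp the embedding table comes from the model weights and these ops run inside the graph-build context):

#include <cmath>
#include "ggml.h"

int main() {
    const int64_t n_embd_altup = 256; // illustrative
    const int64_t n_layer      = 30;  // illustrative
    const int64_t n_vocab      = 32;  // toy vocab, just to give the table some rows

    struct ggml_init_params params = {
        /*.mem_size   =*/ 64u * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Stand-in for model.per_layer_tok_embd: [n_embd_altup * n_layer, n_vocab]
    struct ggml_tensor * tok_embd =
        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd_altup * n_layer, n_vocab);

    // Row 0 holds the padding token embedding: view, dequantize, scale, reshape,
    // mirroring the fixed else branch above.
    const int64_t embd_size = tok_embd->ne[0];
    struct ggml_tensor * padding = ggml_view_1d(ctx, tok_embd, embd_size, 0);
    struct ggml_tensor * x = ggml_cast(ctx, padding, GGML_TYPE_F32);
    x = ggml_scale(ctx, x, sqrtf((float) n_embd_altup)); // the scale the fix adds
    x = ggml_reshape_3d(ctx, x, n_embd_altup, n_layer, 1);

    ggml_free(ctx);
    return 0;
}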
@@ -265,6 +265,7 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {
     auto inp = std::make_unique<llm_graph_input_embd>(n_embd);

     ggml_tensor * inp_per_layer;
+    float tok_embd_scale = sqrtf((float) n_embd_per_layer);
     if (ubatch.token) {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         ggml_set_input(inp->tokens);
@@ -272,22 +273,23 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {

         inp_per_layer = ggml_get_rows (ctx0, model.per_layer_tok_embd, inp->tokens);
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
-        inp_per_layer = ggml_scale (ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
+        inp_per_layer = ggml_scale (ctx0, inp_per_layer, tok_embd_scale);
         cb(inp_per_layer, "inp_per_layer_selected", -1);

         res->add_input(std::move(inp));
     } else {
-        // Vision embedding path: use padding token (ID=0) embedding
+        // Multimodal embedding path: use padding token (ID=0) embedding
         // TODO: verify if this is the correct behavior in transformers implementation
         const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_per_layer * n_layer

         // Extract and dequantize padding token embedding (row 0)
         ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);

         // Reshape to [n_embd_per_layer, n_layer, 1]
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        cb(inp_per_layer, "inp_per_layer_multimodal", -1);
     }
     return inp_per_layer;
 }
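A note on path selection: `ubatch.token` is only set when the ubatch carries token IDs. When it carries embeddings instead (the multimodal case, e.g. encoder output), the else branch runs and a single scaled copy of the padding row stands in for every position; the reshape to a trailing dimension of 1 presumably lets it broadcast downstream. The debug-label rename from "inp_per_layer_vision" to "inp_per_layer_multimodal" matches the updated comment, since the path is not specific to vision.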