From 63f8fe0ef4d134378fcdd67f50b05dff3cee179b Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 2 Apr 2026 17:10:32 +0200 Subject: [PATCH] model, mtmd: fix gguf conversion for audio/vision mmproj (#21309) * fix gguf conversion for audio/vision mmproj * fix test --- common/chat-auto-parser-generator.cpp | 111 ++++++++ common/chat-auto-parser.h | 4 + common/chat-diff-analyzer.cpp | 27 ++ common/chat.cpp | 48 ++++ convert_hf_to_gguf.py | 233 ++++++++++++++++- examples/eval-callback/eval-callback.cpp | 7 +- gguf-py/gguf/constants.py | 83 +++++- gguf-py/gguf/gguf_writer.py | 1 + gguf-py/gguf/tensor_mapping.py | 97 ++++++- src/CMakeLists.txt | 1 + src/llama-arch.cpp | 43 ++++ src/llama-arch.h | 7 + src/llama-hparams.h | 3 + src/llama-model.cpp | 126 ++++++++- src/llama-model.h | 7 + src/llama-vocab.cpp | 13 + src/models/gemma4-iswa.cpp | 311 +++++++++++++++++++++++ src/models/models.h | 11 + tests/test-llama-archs.cpp | 6 + tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-graph.h | 2 +- tools/mtmd/clip-impl.h | 27 +- tools/mtmd/clip-model.h | 29 ++- tools/mtmd/clip.cpp | 140 ++++++++-- tools/mtmd/models/gemma4v.cpp | 151 +++++++++++ tools/mtmd/models/models.h | 6 + tools/mtmd/mtmd.cpp | 8 + 27 files changed, 1462 insertions(+), 41 deletions(-) create mode 100644 src/models/gemma4-iswa.cpp create mode 100644 tools/mtmd/models/gemma4v.cpp diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index e38541fd04..a368a110a0 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -169,6 +169,8 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const return build_tool_parser_tag_json(ctx); case tool_format::TAG_WITH_TAGGED: return build_tool_parser_tag_tagged(ctx); + case tool_format::TAG_WITH_GEMMA4_DICT: + return build_tool_parser_tag_gemma4_dict(ctx); default: LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. 
" "Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or " @@ -433,4 +435,113 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte p.end(); } +common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const { + auto & p = ctx.p; + const auto & inputs = ctx.inputs; + bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED; + + // The Gemma4 string quote token used in place of JSON " + static const std::string QUOTE = "<|\"|>"; + + common_peg_parser tool_choice = p.choice(); + + foreach_function(inputs.tools, [&](const json & tool) { + const auto & func = tool.at("function"); + std::string name = func.at("name"); + const auto & params = func.at("parameters"); + + if (!params.contains("properties") || !params.at("properties").is_object()) { + // No arguments - just match the function name with empty braces + auto func_parser = p.atomic( + p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) + + p.tool_args(p.eps()) + + p.tool_close(p.literal("}"))); + tool_choice |= p.rule("tool-" + name, func_parser); + return; + } + + const auto & properties = params.at("properties"); + std::set required; + if (params.contains("required") && params.at("required").is_array()) { + params.at("required").get_to(required); + } + + // Build per-argument parsers, sorted alphabetically (matching template's dictsort) + struct arg_entry { + std::string param_name; + common_peg_parser parser; + }; + std::vector arg_entries; + + for (const auto & [param_name, param_schema] : properties.items()) { + std::string type = "object"; + auto type_v = param_schema.contains("type") ? param_schema.at("type") : json::object(); + if (type_v.is_string()) type_v.get_to(type); + + common_peg_parser value_parser = p.eps(); + if (type == "string") { + // String values are delimited by <|"|>...<|"|> + value_parser = + p.literal(QUOTE) + + p.tool_arg_string_value(p.schema(p.until(QUOTE), + "tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) + + p.literal(QUOTE); + } else { + // Numbers, booleans: raw text up to the next comma or closing brace + value_parser = p.tool_arg_value(p.until_one_of({",", "}"})); + } + + auto arg = p.tool_arg( + p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) + + value_parser + + p.tool_arg_close(p.eps())); + + arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)}); + } + + // Sort alphabetically to match Jinja's dictsort + std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) { + return a.param_name < b.param_name; + }); + + // Build arg sequence: any arg, then zero-or-more comma-separated additional args + common_peg_parser args_seq = p.eps(); + if (!arg_entries.empty()) { + common_peg_parser any_arg = p.choice(); + for (auto & entry : arg_entries) { + any_arg |= entry.parser; + } + args_seq = p.optional( + any_arg + p.repeat(p.literal(",") + any_arg, 0, (int) arg_entries.size() - 1)); + } + + // Full parser: call:name{args} + auto func_parser = p.atomic( + p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) + + p.tool_args(args_seq) + + p.tool_close(p.literal("}"))); + + tool_choice |= p.rule("tool-" + name, func_parser); + }); + + // Wrap each call in <|tool_call>... 
+ auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end); + + common_peg_parser tool_calls = p.eps(); + if (inputs.parallel_tool_calls) { + tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call)); + } else { + tool_calls = p.trigger_rule("tool-call", wrapped_call); + } + + if (!force_tools) { + tool_calls = p.optional(tool_calls); + } + + auto content_before_tools = p.until(format.per_call_start); + return ctx.reasoning_parser + + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + + tool_calls + p.end(); +} + } // namespace autoparser diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h index 73888276f4..514c76576e 100644 --- a/common/chat-auto-parser.h +++ b/common/chat-auto-parser.h @@ -144,6 +144,7 @@ enum class tool_format { JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}} TAG_WITH_JSON, // Tag-based with JSON args: {...} TAG_WITH_TAGGED, // Tag-based with tagged args: value + TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>} }; inline std::ostream & operator<<(std::ostream & os, const tool_format & format) { @@ -156,6 +157,8 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format) return os << "TAG_WITH_JSON"; case tool_format::TAG_WITH_TAGGED: return os << "TAG_WITH_TAGGED"; + case tool_format::TAG_WITH_GEMMA4_DICT: + return os << "TAG_WITH_GEMMA4_DICT"; default: return os << "UNKNOWN"; } @@ -350,6 +353,7 @@ struct analyze_tools : analyze_base { common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const; common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const; common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const; + common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const; }; // ============================================================================ diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 414ee892f8..042da92e4c 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -92,6 +92,33 @@ static std::vectorcall:name{key:<|"|>val<|"|>} + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) { + analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT; + analysis.tools.format.per_call_start = "<|tool_call>"; + analysis.tools.format.per_call_end = ""; + analysis.tools.format.section_start = ""; + analysis.tools.format.section_end = ""; + analysis.tools.function.name_prefix = "call:"; + analysis.tools.function.name_suffix = ""; + analysis.tools.arguments.start = "{"; + analysis.tools.arguments.end = "}"; + analysis.tools.arguments.name_suffix = ":"; + analysis.tools.arguments.separator = ","; + analysis.reasoning.mode = reasoning_mode::TAG_BASED; + analysis.reasoning.start = "<|channel>thought\n"; + analysis.reasoning.end = ""; + analysis.preserved_tokens.clear(); + analysis.preserved_tokens.push_back("<|tool_call>"); + analysis.preserved_tokens.push_back(""); + analysis.preserved_tokens.push_back("<|tool_response>"); + analysis.preserved_tokens.push_back(""); + analysis.preserved_tokens.push_back("<|\"|>"); + analysis.preserved_tokens.push_back("<|turn>"); + LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET); + } + }, // DeepSeek-R1-Distill-Qwen [](const common_chat_template & tmpl, autoparser & analysis) -> void { if (tmpl.src.find( diff 
--git a/common/chat.cpp b/common/chat.cpp index df13e0db09..f92b2bd290 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1545,6 +1545,50 @@ static void requires_non_null_content(json & messages) { } } +// Gemma4 uses a custom tool_responses field instead of role:tool messages. +// Convert consecutive role:tool messages into a single user message with tool_responses. +static void convert_tool_responses_gemma4(json & messages) { + json result = json::array(); + size_t i = 0; + while (i < messages.size()) { + if (messages[i].contains("role") && messages[i].at("role") == "tool") { + json tool_responses = json::array(); + while (i < messages.size() && + messages[i].contains("role") && + messages[i].at("role") == "tool") { + const auto & tool_msg = messages[i]; + std::string name; + if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) { + name = tool_msg.at("tool_call_id"); + } else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) { + name = tool_msg.at("name"); + } + json response; + if (tool_msg.contains("content")) { + const auto & content = tool_msg.at("content"); + if (content.is_string()) { + // Try to parse the content as JSON; fall back to raw string + try { + response = json::parse(content.get()); + } catch (...) { + response = content; + } + } else { + response = content; + } + } + tool_responses.push_back({{"name", name}, {"response", response}}); + i++; + } + result.push_back({{"role", "user"}, {"tool_responses", tool_responses}}); + } else { + result.push_back(messages[i]); + i++; + } + } + messages = result; +} + static void func_args_not_string(json & messages) { GGML_ASSERT(messages.is_array()); for (auto & message : messages) { @@ -1673,6 +1717,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ workaround::func_args_not_string(params.messages); } + if (src.find("'<|tool_call>call:'") != std::string::npos) { + workaround::convert_tool_responses_gemma4(params.messages); + } + params.add_generation_prompt = false; std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params); params.add_generation_prompt = true; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 51f0d1ab15..de1def3201 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1164,7 +1164,7 @@ class TextModel(ModelBase): if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: self.gguf_writer.add_expert_count(n_experts) logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None: + if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") if (n_expert_groups := self.hparams.get("n_group")) is not None: @@ -6878,7 +6878,9 @@ class Gemma2Model(TextModel): @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 - norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value + + def norm_shift(self, name: str) -> float: + return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value def set_vocab(self): if (self.dir_model / "tokenizer.model").is_file(): @@ -6916,17 +6918,22 @@ class Gemma3Model(TextModel): # remove OOV (out-of-vocabulary) rows in 
token_embd if "embed_tokens.weight" in name: + n_vocab_real = -1 if (self.dir_model / "tokenizer.model").is_file(): tokens = self._create_vocab_sentencepiece()[0] + n_vocab_real = len(tokens) else: - tokens = self.get_vocab_base()[0] - data_torch = data_torch[:len(tokens)] + with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"]) + data_torch = data_torch[:n_vocab_real] # ref code in Gemma3RMSNorm # output = output * (1.0 + self.weight.float()) # note: this is not the case on gemma3n - if name.endswith("norm.weight"): - data_torch = data_torch + self.norm_shift + f_shift = self.norm_shift(name) + if f_shift != 0.0: + data_torch = data_torch + f_shift yield from super().modify_tensors(data_torch, name, bid) @@ -7100,7 +7107,8 @@ class ConformerAudioModel(MmprojModel): assert data_torch.shape[2] == 1 data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - yield from super().modify_tensors(data_torch, name, bid) + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) @ModelBase.register("DeepseekOCRForCausalLM") @@ -7289,7 +7297,6 @@ class Gemma3nVisionAudioModel(ConformerAudioModel): @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") class Gemma3NModel(Gemma3Model): model_arch = gguf.MODEL_ARCH.GEMMA3N - norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code _altup_proj: list[Tensor] = [] _altup_unembd: list[Tensor] = [] @@ -7308,6 +7315,10 @@ class Gemma3NModel(Gemma3Model): torch.Tensor(), # to be replaced ] + def norm_shift(self, name: str) -> float: + del name + return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code + def set_vocab(self): # For Gemma3n multimodal models, we need the FULL vocab_size (262400) # which includes special tokens from 262144-262399 for vision/audio. 
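# Aside on the embed_tokens trimming earlier in this file: a checkpoint may
# carry more embedding rows than the tokenizer has tokens, and those extra OOV
# rows must not be written to the GGUF. A minimal standalone sketch of the same
# recount, assuming an HF-style tokenizer.json (the helper name is illustrative):
import json
from pathlib import Path

def real_vocab_size(dir_model: Path) -> int:
    # base vocab plus the tokens appended after training, as in modify_tensors above
    with open(dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
        tok = json.load(f)
    return len(tok["model"]["vocab"]) + len(tok["added_tokens"])

# usage: data_torch = data_torch[:real_vocab_size(self.dir_model)]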
@@ -7425,6 +7436,212 @@ class Gemma3NModel(Gemma3Model): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Gemma4ForConditionalGeneration") +class Gemma4Model(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA4 + + def norm_shift(self, name: str) -> float: + del name # unused + return 0.0 + + def set_vocab(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + visible_tokens = {"<|channel>", "", "<|tool_call>", "", "<|tool_response>", "", "<|\"|>"} + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + text_str = text.decode() + if text_str in visible_tokens: + # always render these tokens, so that the chat parser can read them + toktypes.append(gguf.TokenType.USER_DEFINED) + logger.info(f"Token '{text_str}' is set to USER_DEFINED") + else: + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + # TODO @ngxson : there are some known (rare) issues with the tokenizer during development + # but I don't have time to dive into them right now; + # using a dedicated tokenizer name so that we can fix later without re-converting GGUF + self.gguf_writer.add_tokenizer_model("gemma4") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + self.gguf_writer.add_add_bos_token(False) # already added via the chat template + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + num_kv_shared_layers = self.hparams["num_kv_shared_layers"] + self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers) + + # per-layer embedding is optional + n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0 + self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd) + + swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]] + self.gguf_writer.add_sliding_window_pattern(swa_layers) + + head_dim_full = self.hparams["global_head_dim"] + head_dim_swa = self.hparams["head_dim"] + # correct the head dim for global/swa layers + self.gguf_writer.add_key_length(head_dim_full) + self.gguf_writer.add_value_length(head_dim_full) + self.gguf_writer.add_key_length_swa(head_dim_swa) + self.gguf_writer.add_value_length_swa(head_dim_swa) + + expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"]) + if expert_intermediate_size is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers + use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False) + first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers + if use_double_wide_mlp: + n_ff = self.hparams["intermediate_size"] + n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)] + self.gguf_writer.add_feed_forward_length(n_ff_arr) + + # handle num_global_key_value_heads + num_key_value_heads_full = self.hparams.get("num_global_key_value_heads") + num_key_value_heads_swa = self.hparams.get("num_key_value_heads") + if num_key_value_heads_full is not None and num_key_value_heads_swa is not None: + value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers] + self.gguf_writer.add_head_count_kv(value_arr) + + # 
handle n_rot differently for global vs swa layers + partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) + n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors + n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) + self.gguf_writer.add_rope_dimension_count(n_rot_full) + self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # full layer uses "proportional" rope with partial_rotary_factor=0.25 + # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated), + # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention + # solution is to set specific freq_factors for the unrotated dims + + # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers + rope_params_full = self.hparams["rope_parameters"]["full_attention"] + assert rope_params_full["rope_type"] == "proportional" + head_dim_full = (self.hparams["global_head_dim"]) + partial_rotary_factor_full = rope_params_full["partial_rotary_factor"] + n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2) + n_unrot_full = int(head_dim_full / 2) - n_rot_full + values = [1.0] * n_rot_full + [1e30] * n_unrot_full + rope_freqs_full = torch.tensor(values, dtype=torch.float32) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("per_dim_scale") or name.endswith("layer_scalar"): + name = name + ".weight" + + if "language_model." not in name and "rope_freqs" not in name: + return # skip non-language model tensors + + name = name.replace("language_model.", "") + if name.endswith("router.scale"): + name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale") + yield (name, data_torch) + return + if ".per_expert_scale" in name: + # convert per-expert scale to FFN down scale + name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale") + yield (name, data_torch) + return + if ".experts." 
in name and not name.endswith(".weight"):
+ name += ".weight"
+
+ yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Gemma4ForConditionalGeneration")
+class Gemma4VisionAudioModel(MmprojModel):
+ has_audio_encoder = True
+ has_vision_encoder = True
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.hparams_vision is not None
+ self.hparams_vision["image_size"] = 224 # unused, but set to avoid error
+
+ # remap audio hparams
+ if self.hparams_audio:
+ self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
+ self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
+ else:
+ self.has_audio_encoder = False
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ # vision params
+ self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
+ self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+ # audio params
+ if self.hparams_audio:
+ self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+ self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+ self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+ def is_audio_tensor(self, name: str) -> bool:
+ return "audio_tower" in name or "embed_audio" in name
+
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
+ if self.is_audio_tensor(name):
+ if (".conv" in name or "_conv" in name) and ".weight" in name:
+ return gguf.GGMLQuantizationType.F32
+ if "position_embedding_table" in name:
+ return gguf.GGMLQuantizationType.F32
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ if name.startswith("model.language_model."):
+ return # skip
+
+ if len(data_torch.shape) == 0:
+ # convert scalar tensors (input/output min/max) to 1D tensors
+ data_torch = data_torch.unsqueeze(0)
+
+ if self.is_audio_tensor(name):
+ assert self.hparams_audio is not None
+ name = name.replace("model.audio_tower.", "conformer.")
+ name = name.replace(".linear.", ".")
+ if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"):
+ name = name + ".weight"
+ data_torch = torch.nn.functional.softplus(data_torch)
+ if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"):
+ assert data_torch.shape[1] == 1
+ data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
+ mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
+ yield (mapped_name, data_torch)
+
+ else:
+ name = name.replace("model.vision_tower.encoder.", "vision_model.model.")
+ name = name.replace(".linear.weight", ".weight")
+ if name.endswith("layer_scalar") or name.endswith("position_embedding_table"):
+ name = name + ".weight"
+ if name.endswith("patch_embedder.input_proj.weight"):
+ n_embd, ksize_sq_c = data_torch.shape
+ patch_size = int((ksize_sq_c // 3) ** 0.5)
+ data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3)
+ data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
+ mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
+ yield (mapped_name, data_torch)
+
+
 @ModelBase.register("Starcoder2ForCausalLM")
 class StarCoder2Model(TextModel):
 model_arch = gguf.MODEL_ARCH.STARCODER2
diff --git a/examples/eval-callback/eval-callback.cpp
b/examples/eval-callback/eval-callback.cpp index 902b0fdb56..8832468451 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -15,13 +15,18 @@ static bool run(llama_context * ctx, const common_params & params) { const bool add_bos = llama_vocab_get_add_bos(vocab); - std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = common_tokenize(ctx, params.prompt, add_bos, true); if (tokens.empty()) { LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__); return false; } + LOG_INF("number of input tokens = %zu\n", tokens.size()); + for (size_t i = 0; i < tokens.size(); ++i) { + LOG_INF(" %d\n", tokens[i]); + } + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b35c976e8f..3ebd9de5f6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -419,6 +419,7 @@ class MODEL_ARCH(IntEnum): GEMMA2 = auto() GEMMA3 = auto() GEMMA3N = auto() + GEMMA4 = auto() GEMMA_EMBEDDING = auto() STARCODER2 = auto() RWKV6 = auto() @@ -535,8 +536,11 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_INP = auto() FFN_GATE_INP_SHEXP = auto() FFN_NORM = auto() - FFN_PRE_NORM = auto() + FFN_PRE_NORM = auto() # alias of FFN_NORM + FFN_PRE_NORM_2 = auto() # gemma4 FFN_POST_NORM = auto() + FFN_POST_NORM_1 = auto() # gemma4 + FFN_POST_NORM_2 = auto() # gemma4 FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() @@ -558,6 +562,7 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() + LAYER_OUT_SCALE = auto() PER_LAYER_TOKEN_EMBD = auto() # gemma3n PER_LAYER_MODEL_PROJ = auto() # gemma3n PER_LAYER_INP_GATE = auto() # gemma3n @@ -722,8 +727,11 @@ class MODEL_TENSOR(IntEnum): V_ENC_FFN_UP = auto() V_ENC_FFN_GATE = auto() V_ENC_FFN_DOWN = auto() + V_ENC_ATTN_POST_NORM = auto() # gemma4 + V_ENC_FFN_POST_NORM = auto() V_LAYER_SCALE_1 = auto() V_LAYER_SCALE_2 = auto() + V_LAYER_OUT_SCALE = auto() V_PRE_NORM = auto() V_POST_NORM = auto() V_MM_POST_NORM = auto() @@ -761,6 +769,8 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm + V_STD_BIAS = auto() # gemma4 + V_STD_SCALE = auto() # gemma4 V_SAM_POS_EMBD = auto() # Deepseek-OCR V_SAM_PATCH_EMBD = auto() # Deepseek-OCR V_SAM_PRE_NORM = auto() # Deepseek-OCR @@ -781,6 +791,7 @@ class MODEL_TENSOR(IntEnum): A_ENC_EMBD_POS = auto() A_ENC_EMBD_NORM = auto() A_ENC_EMBD_TO_LOGITS = auto() # lfm2 + A_ENC_INP_PROJ = auto() # gemma4 A_ENC_CONV1D = auto() A_ENC_CONV1D_NORM = auto() # gemma3n A_PRE_NORM = auto() @@ -789,10 +800,13 @@ class MODEL_TENSOR(IntEnum): A_ENC_ATTN_Q = auto() A_ENC_ATTN_K = auto() A_ENC_ATTN_V = auto() + A_ENC_ATTN_POST_NORM = auto() + A_ENC_ATTN_PRE_NORM = auto() + A_ENC_ATTN_K_REL = auto() # gemma4 A_ENC_PER_DIM_SCALE = auto() # gemma3n A_ENC_INPUT_NORM = auto() - A_ENC_OUTPUT = auto() - A_ENC_OUTPUT_NORM = auto() + A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT + A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT A_ENC_FFN_UP = auto() A_ENC_FFN_NORM = auto() A_ENC_FFN_POST_NORM = auto() # gemma3n @@ -813,6 +827,8 @@ class MODEL_TENSOR(IntEnum): A_MM_HARD_EMB_NORM = auto() # gemma3n A_MM_SOFT_EMB_NORM = auto() # gemma3n A_MM_INP_PROJ = auto() # gemma3n + A_PER_DIM_K_SCALE = auto() # gemma4 + A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp NEXTN_EH_PROJ = 
auto() NEXTN_EMBED_TOKENS = auto() @@ -882,6 +898,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.GEMMA3N: "gemma3n", + MODEL_ARCH.GEMMA4: "gemma4", MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", @@ -1000,6 +1017,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", + MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4 + MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4 + MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4 MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", @@ -1019,6 +1039,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale", MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n @@ -1183,8 +1204,11 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate", MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm", + MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm", MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1", MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2", + MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale", MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", MODEL_TENSOR.V_POST_NORM: "v.post_ln", MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", @@ -1222,6 +1246,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_MM_GATE: "mm.gate", MODEL_TENSOR.V_TOK_BOI: "v.boi", MODEL_TENSOR.V_TOK_EOI: "v.eoi", + MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4 + MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4 # DeepSeek-OCR SAM MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd", MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd", @@ -1243,6 +1269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", + MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm", MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", @@ -1251,6 +1278,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q", MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k", MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v", + MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm", + MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm", + MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel", MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", @@ -1275,6 +1305,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n 
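# For orientation: the values in this table are str.format templates, and {bid}
# is filled with the block index when GGUF tensor names are emitted, e.g.
# (illustrative index) TENSOR_NAMES[MODEL_TENSOR.A_ENC_ATTN_K_REL].format(bid=3)
# yields "a.blk.3.attn_k_rel".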
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n + MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4 + MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4 # lfm2 audio MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv", MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos", @@ -1319,8 +1351,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_ENC_FFN_UP, MODEL_TENSOR.V_ENC_FFN_GATE, MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_ENC_ATTN_POST_NORM, + MODEL_TENSOR.V_ENC_FFN_POST_NORM, MODEL_TENSOR.V_LAYER_SCALE_1, MODEL_TENSOR.V_LAYER_SCALE_2, + MODEL_TENSOR.V_LAYER_OUT_SCALE, MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, MODEL_TENSOR.V_MM_POST_NORM, @@ -1358,6 +1393,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_MM_GATE, MODEL_TENSOR.V_TOK_BOI, MODEL_TENSOR.V_TOK_EOI, + MODEL_TENSOR.V_STD_BIAS, + MODEL_TENSOR.V_STD_SCALE, MODEL_TENSOR.V_SAM_POS_EMBD, MODEL_TENSOR.V_SAM_PATCH_EMBD, MODEL_TENSOR.V_SAM_PRE_NORM, @@ -1375,6 +1412,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS, + MODEL_TENSOR.A_ENC_INP_PROJ, MODEL_TENSOR.A_ENC_CONV1D, MODEL_TENSOR.A_ENC_CONV1D_NORM, MODEL_TENSOR.A_PRE_NORM, @@ -1383,6 +1421,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.A_ENC_ATTN_Q, MODEL_TENSOR.A_ENC_ATTN_K, MODEL_TENSOR.A_ENC_ATTN_V, + MODEL_TENSOR.A_ENC_ATTN_POST_NORM, + MODEL_TENSOR.A_ENC_ATTN_PRE_NORM, + MODEL_TENSOR.A_ENC_ATTN_K_REL, MODEL_TENSOR.A_ENC_PER_DIM_SCALE, MODEL_TENSOR.A_ENC_INPUT_NORM, MODEL_TENSOR.A_ENC_OUTPUT, @@ -1416,6 +1457,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.A_MM_SOFT_EMB_NORM, MODEL_TENSOR.A_MM_EMBEDDING, MODEL_TENSOR.A_MM_HARD_EMB_NORM, + MODEL_TENSOR.A_PER_DIM_K_SCALE, + MODEL_TENSOR.A_PER_DIM_SCALE, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2273,6 +2316,38 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.LAUREL_R, MODEL_TENSOR.LAUREL_POST_NORM, ], + MODEL_ARCH.GEMMA4: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_PRE_NORM_2, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.FFN_POST_NORM_1, + MODEL_TENSOR.FFN_POST_NORM_2, + MODEL_TENSOR.LAYER_OUT_SCALE, + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, + MODEL_TENSOR.PER_LAYER_MODEL_PROJ, + MODEL_TENSOR.PER_LAYER_INP_GATE, + MODEL_TENSOR.PER_LAYER_PROJ, + MODEL_TENSOR.PER_LAYER_PROJ_NORM, + MODEL_TENSOR.PER_LAYER_POST_NORM, + ], MODEL_ARCH.GEMMA_EMBEDDING: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT, @@ -4010,6 +4085,8 @@ class VisionProjectorType: GEMMA3 = "gemma3" GEMMA3NV = "gemma3nv" GEMMA3NA = "gemma3na" + GEMMA4V = "gemma4v" + GEMMA4A = "gemma4a" PHI4 = "phi4" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 37b9879930..90d500dc77 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -799,6 +799,7 @@ class 
GGUFWriter: def add_shared_kv_layers(self, value: int) -> None: self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value) + # if input is array, true means SWA and false means full_attention for each layer def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None: key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch) if isinstance(value, int): diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index df70577dbc..a7c7ce4640 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -401,6 +401,10 @@ class TensorNameMap: "model.layers.{bid}.pre_mlp_layernorm", # afmoe ), + MODEL_TENSOR.FFN_PRE_NORM_2: ( + "model.layers.{bid}.pre_feedforward_layernorm_2", # gemma4 + ), + # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 @@ -411,6 +415,14 @@ class TensorNameMap: "model.layers.{bid}.post_moe_norm", # grok-2 ), + MODEL_TENSOR.FFN_POST_NORM_1: ( + "model.layers.{bid}.post_feedforward_layernorm_1", # gemma4 + ), + + MODEL_TENSOR.FFN_POST_NORM_2: ( + "model.layers.{bid}.post_feedforward_layernorm_2", # gemma4 + ), + MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe @@ -428,6 +440,7 @@ class TensorNameMap: "layers.{bid}.gate", # mistral-large "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe "model.layers.{bid}.moe.gate", # step3.5 + "model.layers.{bid}.router.proj", # gemma4 ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -570,6 +583,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_UP_EXP: ( "model.layers.{bid}.mlp.experts.gate_up_proj", + "model.layers.{bid}.experts.gate_up_proj", # gemma4 ), MODEL_TENSOR.MOE_LATENT_DOWN: ( @@ -629,6 +643,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe "model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker "model.layers.{bid}.moe.down_proj", # step3.5 + "model.layers.{bid}.experts.down_proj", # gemma4 ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( @@ -693,6 +708,10 @@ class TensorNameMap: "model.layers.{bid}.final_layernorm", # bailingmoe2 ), + MODEL_TENSOR.LAYER_OUT_SCALE: ( + "model.layers.{bid}.layer_scalar", # gemma4 + ), + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: ( "model.embed_tokens_per_layer", # gemma3n ), @@ -1383,6 +1402,7 @@ class TensorNameMap: "model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP "siglip2.vision_model.embeddings.patch_embedding", "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL + "model.vision_tower.patch_embedder.input_proj", # gemma4 ), MODEL_TENSOR.V_ENC_EMBD_NORM: ( @@ -1400,6 +1420,7 @@ class TensorNameMap: "model.vision.patch_embedding.position_embedding", # cogvlm "visual.embeddings.position_embedding", # glm4v "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL + "model.vision_tower.patch_embedder.position_embedding_table", # gemma4 ), MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( @@ -1430,12 +1451,14 @@ class TensorNameMap: "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated + "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4 ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # 
Intern-S1 "visual.blocks.{bid}.attn.q_norm", # GLM-OCR + "vision_model.model.layers.{bid}.self_attn.q_norm", # gemma4 ), MODEL_TENSOR.V_ENC_ATTN_K: ( @@ -1450,12 +1473,14 @@ class TensorNameMap: "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4 ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1 "visual.blocks.{bid}.attn.k_norm", # GLM-OCR + "vision_model.model.layers.{bid}.self_attn.k_norm", # gemma4 ), MODEL_TENSOR.V_ENC_ATTN_V: ( @@ -1470,6 +1495,7 @@ class TensorNameMap: "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated + "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4 ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1480,7 +1506,7 @@ class TensorNameMap: "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral - "vision_model.model.layers.{bid}.input_layernorm", # llama4 + "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4 "visual.blocks.{bid}.norm1", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm @@ -1505,6 +1531,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL + "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4 ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( @@ -1522,6 +1549,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.layer_norm2", "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL + "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4 ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1540,12 +1568,14 @@ class TensorNameMap: "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1", "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL + "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4 ), MODEL_TENSOR.V_ENC_FFN_GATE: ( "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl + "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4 ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( @@ -1564,6 +1594,15 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL + "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4 + ), + + MODEL_TENSOR.V_ENC_ATTN_POST_NORM: ( + "vision_model.model.layers.{bid}.post_attention_layernorm", # gemma4 + ), + + MODEL_TENSOR.V_ENC_FFN_POST_NORM: ( 
+ "vision_model.model.layers.{bid}.post_feedforward_layernorm", # gemma4 ), MODEL_TENSOR.V_LAYER_SCALE_1: ( @@ -1576,6 +1615,10 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1 ), + MODEL_TENSOR.V_LAYER_OUT_SCALE: ( + "vision_model.model.layers.{bid}.layer_scalar", # gemma4 + ), + MODEL_TENSOR.V_PRE_NORM: ( "vision_tower.vision_model.pre_layrnorm", "vision_tower.ln_pre", # pixtral-hf @@ -1763,6 +1806,14 @@ class TensorNameMap: "model.vision.eoi", # cogvlm ), + MODEL_TENSOR.V_STD_BIAS: ( + "model.vision_tower.std_bias", # gemma4 + ), + + MODEL_TENSOR.V_STD_SCALE: ( + "model.vision_tower.std_scale", # gemma4 + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( @@ -1782,10 +1833,15 @@ class TensorNameMap: "audio_tower.conv{bid}", # ultravox "conformer.pre_encode.conv.{bid}", # lfm2 "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n + "conformer.subsample_conv_projection.layer{bid}.conv", # gemma4 ), MODEL_TENSOR.A_ENC_CONV1D_NORM: ( - "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n + "conformer.subsample_conv_projection.layer{bid}.norm", # gemma4 + ), + + MODEL_TENSOR.A_ENC_INP_PROJ: ( + "conformer.subsample_conv_projection.input_proj_linear", # gemma4 ), MODEL_TENSOR.A_PRE_NORM: (), @@ -1799,22 +1855,38 @@ class TensorNameMap: "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_q", # lfm2 "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n + "conformer.layers.{bid}.self_attn.q_proj", # gemma4 ), MODEL_TENSOR.A_ENC_ATTN_K: ( "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_k", # lfm2 "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n + "conformer.layers.{bid}.self_attn.k_proj", # gemma4 ), MODEL_TENSOR.A_ENC_ATTN_V: ( "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_v", # lfm2 "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n + "conformer.layers.{bid}.self_attn.v_proj", # gemma4 + ), + + MODEL_TENSOR.A_ENC_ATTN_K_REL: ( + "conformer.layers.{bid}.self_attn.relative_k_proj", # gemma4 + ), + + MODEL_TENSOR.A_ENC_ATTN_POST_NORM: ( + "conformer.layers.{bid}.norm_post_attn", # gemma4 + ), + + MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: ( + "conformer.layers.{bid}.norm_pre_attn", # gemma4 ), MODEL_TENSOR.A_ENC_PER_DIM_SCALE: ( "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n + "conformer.layers.{bid}.self_attn.per_dim_scale", # gemma3n ), MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: ( @@ -1831,6 +1903,7 @@ class TensorNameMap: "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_out", # lfm2 "conformer.layers.{bid}.attention.post", # gemma3n + "conformer.layers.{bid}.self_attn.post", # gemma4 ), MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( @@ -1842,10 +1915,12 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_FFN_NORM: ( "conformer.layers.{bid}.norm_feed_forward1", # lfm2 "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n + "conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_POST_NORM: ( "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n + "conformer.layers.{bid}.feed_forward1.post_layer_norm", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_SCALE: ( @@ -1856,6 +1931,7 @@ class TensorNameMap: "audio_tower.layers.{bid}.fc1", # ultravox "conformer.layers.{bid}.feed_forward1.linear1", # lfm2 "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n + 
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_GATE: (), @@ -1864,25 +1940,30 @@ class TensorNameMap: "audio_tower.layers.{bid}.fc2", # ultravox "conformer.layers.{bid}.feed_forward1.linear2", # lfm2 "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n + "conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_UP_1: ( "conformer.layers.{bid}.feed_forward2.linear1", # lfm2 "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n + "conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_DOWN_1: ( "conformer.layers.{bid}.feed_forward2.linear2", # lfm2 "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n + "conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_NORM_1: ( "conformer.layers.{bid}.norm_feed_forward2", # lfm2 "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n + "conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: ( "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n + "conformer.layers.{bid}.feed_forward2.post_layer_norm", # gemma4 ), MODEL_TENSOR.A_ENC_FFN_SCALE_1: ( @@ -1904,7 +1985,8 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_OUT: ( "conformer.pre_encode.out", # lfm2 - "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n + "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported) + "conformer.output_proj", # gemma4 ), # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors @@ -1918,6 +2000,7 @@ class TensorNameMap: MODEL_TENSOR.A_MMPROJ_FC: ( "audio.multi_modal_projector.linear", # qwen2audio "audio_tower.proj", # qwen2omni + "model.audio_tower.output_proj" # gemma4 ), MODEL_TENSOR.A_MM_NORM_PRE: ( @@ -1953,6 +2036,14 @@ class TensorNameMap: "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n ), + MODEL_TENSOR.A_PER_DIM_K_SCALE: ( + "conformer.layers.{bid}.attention.attn.per_dim_key_scale", # gemma4 + ), + + MODEL_TENSOR.A_PER_DIM_SCALE: ( + "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4 + ), + MODEL_TENSOR.A_MM_EMBEDDING: ( "model.embed_audio.embedding", # gemma3n ), diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 283823fa9c..121c21fed9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -73,6 +73,7 @@ add_library(llama models/gemma2-iswa.cpp models/gemma3.cpp models/gemma3n-iswa.cpp + models/gemma4-iswa.cpp models/glm4-moe.cpp models/glm4.cpp models/gpt2.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd6d28e5f2..e210dcdae2 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -56,6 +56,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_GEMMA3, "gemma3" }, { LLM_ARCH_GEMMA3N, "gemma3n" }, + { LLM_ARCH_GEMMA4, "gemma4" }, { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, @@ -165,6 +166,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" }, + { LLM_KV_EMBEDDING_LENGTH_PER_LAYER, "%s.embedding_length_per_layer_input" }, { LLM_KV_FEATURES_LENGTH, "%s.features_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, 
"%s.leading_dense_block_count" }, @@ -238,6 +240,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" }, { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, + { LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" }, @@ -364,6 +367,9 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" }, { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + { LLM_TENSOR_FFN_POST_NORM_1, "blk.%d.post_ffw_norm_1" }, + { LLM_TENSOR_FFN_POST_NORM_2, "blk.%d.post_ffw_norm_2" }, + { LLM_TENSOR_FFN_PRE_NORM_2, "blk.%d.pre_ffw_norm_2" }, { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, @@ -373,6 +379,7 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_LAYER_OUT_SCALE, "blk.%d.layer_output_scale" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_POS_EMBD, "position_embd" }, { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, @@ -1342,6 +1349,38 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_LAUREL_R, LLM_TENSOR_LAUREL_POST_NORM, }; + case LLM_ARCH_GEMMA4: + return { + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_UP_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_POST_NORM_1, + LLM_TENSOR_FFN_POST_NORM_2, + LLM_TENSOR_FFN_PRE_NORM_2, + LLM_TENSOR_LAYER_OUT_SCALE, + LLM_TENSOR_PER_LAYER_TOKEN_EMBD, + LLM_TENSOR_PER_LAYER_MODEL_PROJ, + LLM_TENSOR_PER_LAYER_PROJ_NORM, + LLM_TENSOR_PER_LAYER_INP_GATE, + LLM_TENSOR_PER_LAYER_PROJ, + LLM_TENSOR_PER_LAYER_POST_NORM, + }; case LLM_ARCH_GEMMA_EMBEDDING: return { LLM_TENSOR_TOKEN_EMBD, @@ -2654,11 +2693,15 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_FFN_PRE_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_FFN_POST_NORM_1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_FFN_POST_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_NORM_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_LAYER_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_LAYER_OUT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_Q_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_KV_A_NORM, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 4c5b6a1ad1..1b8737b747 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -60,6 +60,7 @@ enum llm_arch { LLM_ARCH_GEMMA2, LLM_ARCH_GEMMA3, LLM_ARCH_GEMMA3N, + LLM_ARCH_GEMMA4, LLM_ARCH_GEMMA_EMBEDDING, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, @@ -169,6 +170,7 @@ enum llm_kv { LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_EMBEDDING_LENGTH_OUT, + LLM_KV_EMBEDDING_LENGTH_PER_LAYER, LLM_KV_FEATURES_LENGTH, LLM_KV_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT, @@ -242,6 +244,7 @@ enum llm_kv { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, LLM_KV_ATTENTION_INDEXER_TOP_K, + LLM_KV_ATTENTION_SHARED_KV_LAYERS, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT_SWA, @@ -369,6 +372,9 @@ enum llm_tensor { LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_POST_NORM_1, + LLM_TENSOR_FFN_POST_NORM_2, + LLM_TENSOR_FFN_PRE_NORM_2, LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, @@ -393,6 +399,7 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_LAYER_OUT_SCALE, LLM_TENSOR_POST_ATTN_NORM, LLM_TENSOR_POST_MLP_NORM, LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 78c0bc27d4..c2000c77c3 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -209,6 +209,9 @@ struct llama_hparams { // qwen3vl deepstack uint32_t n_deepstack_layers = 0; + // gemma4 per-layer embedding + uint32_t n_embd_per_layer = 0; + // needed by encoder-decoder models (e.g. T5, FLAN-T5) // ref: https://github.com/ggml-org/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1a67e64e2b..ba935340fc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1261,6 +1261,31 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GEMMA4: + { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + + uint32_t n_kv_shared_layers = 0; + ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); + + hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers; + hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling) + + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, hparams.n_embd_per_layer); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); + + switch (hparams.n_layer) { + case 35: type = LLM_TYPE_E2B; break; + case 42: type = LLM_TYPE_E4B; break; // to confirm: E4B or E5B? 
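// Worked example of the shared-KV bookkeeping above, with a hypothetical
// shared-layer count: n_layer = 35 and n_kv_shared_layers = 10 give
// n_layer_kv_from_start = 25, so layers 0..24 allocate their own KV cache
// while layers 25..34 reuse the cache of an earlier layer (see the GEMMA4
// layer_reuse_cb wired up in create_memory below).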
+ default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_GEMMA_EMBEDDING: { hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC; @@ -4229,6 +4254,100 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0); } } break; + case LLM_ARCH_GEMMA4: + { + const uint32_t n_embd_per_layer = hparams.n_embd_per_layer; + const int64_t n_ff_exp = hparams.n_ff_exp; + + if (n_embd_head_k != n_embd_head_v) { + throw std::runtime_error("Gemma 4 requires n_embd_head_k == n_embd_head_v"); + } + if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { + throw std::runtime_error("Gemma 4 requires n_embd_head_k_swa == n_embd_head_v_swa"); + } + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + if (n_embd_per_layer > 0) { + tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_per_layer * n_layer, n_vocab}, 0); + per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_per_layer * n_layer}, 0); + per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_per_layer}, 0); + } + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + int rope_freqs_flag = 0; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + const int64_t n_head = hparams.n_head(i); + const int64_t n_embd_head = hparams.n_embd_head_k(i); + const int64_t n_embd_k = hparams.n_embd_k_gqa(i); + const int64_t n_embd_v = hparams.n_embd_v_gqa(i); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // note: use_alternative_attention (v_proj is optional, if it's not present, use k_proj) + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v}, TENSOR_NOT_REQUIRED); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head * n_head, n_embd}, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1u}, TENSOR_NOT_REQUIRED); + + if (!hparams.is_swa(i)) { + // full_attention layers use rope_freqs for proportional rope + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_embd_head/2}, rope_freqs_flag); + rope_freqs_flag = TENSOR_DUPLICATED; + } + + // handle use_double_wide_mlp + int64_t n_ff_cur = hparams.n_ff(i); + + // for expert layers, we use normal FFN as shared expert (same as python code) + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_cur}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur}, 0); + 
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + + // MoE router + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); + bool has_expert = layer.ffn_gate_inp != nullptr; + + // norm + if (has_expert) { + layer.ffn_gate_inp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "scale", i), {n_embd}, 0); + + layer.ffn_pre_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM_2, "weight", i), {n_embd}, 0); + layer.ffn_post_norm_1 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_1, "weight", i), {n_embd}, 0); + layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0); + + // MoE FFN + layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + + // per-expert scale will be loaded as down_exps_s at the end of the current switch case + } + + // per-layer embeddings + if (n_embd_per_layer > 0) { + layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_per_layer}, 0); + layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_per_layer, n_embd}, 0); + layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0); + } + } + } break; case LLM_ARCH_STARCODER2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8233,7 +8352,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } else { llama_memory_i::layer_reuse_cb reuse = nullptr; - if (arch == LLM_ARCH_GEMMA3N) { + if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { reuse = [&](int32_t il) { if (il >= (int32_t) hparams.n_layer_kv_from_start) { return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 
2 : 1);
@@ -8486,6 +8605,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
            } break;
+        case LLM_ARCH_GEMMA4:
+            {
+                llm = std::make_unique<llm_build_gemma4_iswa>(*this, params);
+            } break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            {
                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
            } break;
@@ -9006,6 +9129,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
+        case LLM_ARCH_GEMMA4:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
diff --git a/src/llama-model.h b/src/llama-model.h
index 96ab31cbb0..4f11008397 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -270,6 +270,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm   = nullptr;
     struct ggml_tensor * ffn_norm_b = nullptr;
     struct ggml_tensor * ffn_post_norm = nullptr;
+    struct ggml_tensor * ffn_post_norm_1 = nullptr; // gemma4
+    struct ggml_tensor * ffn_post_norm_2 = nullptr; // gemma4
+    struct ggml_tensor * ffn_pre_norm_2  = nullptr; // gemma4
     struct ggml_tensor * layer_out_norm   = nullptr;
     struct ggml_tensor * layer_out_norm_b = nullptr;
     struct ggml_tensor * ffn_norm_exps = nullptr;
@@ -285,6 +288,7 @@ struct llama_layer {
     // ff MoE
     struct ggml_tensor * ffn_gate_inp   = nullptr;
+    struct ggml_tensor * ffn_gate_inp_s = nullptr; // gemma4
     struct ggml_tensor * ffn_gate_exps  = nullptr;
     struct ggml_tensor * ffn_down_exps  = nullptr;
     struct ggml_tensor * ffn_up_exps    = nullptr;
@@ -483,6 +487,9 @@ struct llama_layer {
     struct ggml_tensor * indexer_attn_k   = nullptr;
     struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
 
+    // gemma4 layer output scale
+    struct ggml_tensor * out_scale = nullptr;
+
     struct llama_layer_posnet   posnet;
     struct llama_layer_convnext convnext;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f51b4badc1..bce9d837c7 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1863,6 +1863,18 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_sep_id  = LLAMA_TOKEN_NULL;
            special_pad_id  = 3; // <|plamo:pad|>
            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "gemma4") {
+            type = LLAMA_VOCAB_TYPE_SPM;
+
+            // default special tokens (to be read from GGUF)
+            special_bos_id  = LLAMA_TOKEN_NULL;
+            special_eos_id  = LLAMA_TOKEN_NULL;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+
+            tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }
@@ -2490,6 +2502,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                || t.first == "[EOS]" // Kimi-K2
                || t.first == "<|end_of_text|>"
                || t.first == "<end_of_utterance>" // smoldocling
+                || t.first == "" // gemma4
                || t.first == "<|end▁of▁sentence|>" // deepseek-ocr
                ) {
                special_eog_ids.insert(t.second);
diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp
new file mode 100644
index 0000000000..5bddb215d1
--- /dev/null
+++ b/src/models/gemma4-iswa.cpp
@@ -0,0 +1,311 @@
+#include "models.h"
+
+llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params),
+    model(model),
+    n_embd_per_layer(model.hparams.n_embd_per_layer) {
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // important: do not apply the sqrt(n_embd) scaling to raw embeddings input (i.e. encoded image embeddings)
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // TODO: is causal == true correct? might need some changes
+    auto * inp_attn = build_attn_inp_kv_iswa();
+
+    // inp_per_layer shape: [n_embd_per_layer, n_tokens, n_layer]
+    ggml_tensor * inp_per_layer = nullptr;
+    if (model.tok_embd_per_layer) {
+        inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
+    }
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const int64_t n_embd_head = hparams.n_embd_head_k(il);
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_v(il));
+
+        const int64_t n_head    = hparams.n_head(il);
+        const int64_t n_head_kv = hparams.n_head_kv(il);
+
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        const int   n_rot_l      = hparams.n_rot(il);
+
+        // norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        ggml_tensor * freq_factors = nullptr;
+        if (!hparams.is_swa(il)) {
+            // full_attention layers use rope_freqs for proportional rope
+            freq_factors = model.layers[il].rope_freqs;
+        }
+
+        // Q projection (shared for both non-KV and KV layers)
+        // this is to mirror Gemma4Attention in pytorch code
+        ggml_tensor * Qcur;
+        {
+            Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(Qcur, "Qcur_pos", il);
+        }
+
+        // self-attention
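+        // note: only the first n_layer_kv_from_start layers have their own K/V projections;
+        // the remaining layers reuse the KV cache of an earlier layer via the reuse
+        // callback set up in llama_model::create_memory()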
+        if (hparams.has_kv(il)) {
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = model.layers[il].wv
+                ? build_lora_mm(model.layers[il].wv, cur)
+                : Kcur; // if v_proj is not present, use Kcur as Vcur
+            cb(Vcur, "Vcur", il);
+
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
+
+            cb(Kcur, "Kcur_normed", il);
+            cb(Vcur, "Vcur_normed", il);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Kcur, "Kcur_pos", il);
+
+            cur = build_attn(inp_attn, model.layers[il].wo,
+                    nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+                    hparams.f_attention_scale, il);
+        } else {
+            // reuse KV cache of earlier layers
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, nullptr,
+                    Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+        }
+
+        // TODO @ngxson : strip unused tokens right after the last KV layer to speed up prompt processing
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+        cur = build_norm(cur,
+                model.layers[il].attn_post_norm, nullptr,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL);
+        cb(attn_out, "attn_out", il);
+
+        // feed-forward network
+        const bool is_moe_layer = model.layers[il].ffn_gate_inp != nullptr;
+        if (is_moe_layer) {
+            // MLP (shared exp)
+            ggml_tensor * cur_mlp = build_norm(attn_out,
+                    model.layers[il].ffn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur_mlp, "ffn_norm_1", il);
+
+            cur_mlp = build_ffn(cur_mlp,
+                    model.layers[il].ffn_up,   nullptr, nullptr,
+                    model.layers[il].ffn_gate, nullptr, nullptr,
+                    model.layers[il].ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cur_mlp = build_norm(cur_mlp,
+                    model.layers[il].ffn_post_norm_1, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur_mlp, "ffn_mlp", il);
+
+            // Expert FFN
+            ggml_tensor * cur_moe = build_norm(attn_out,
+                    model.layers[il].ffn_pre_norm_2, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur_moe, "ffn_norm_2", il);
+
+            // custom MoE logits calculation (router operates on attn_out, not cur)
+            ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps);
+            tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd));
+            tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s);
+            ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens]
+            cb(logits, "ffn_moe_logits", il);
+
+            cur_moe = build_moe_ffn(cur_moe,
+                    nullptr, // gate_inp
+                    nullptr, // up_exps
+                    nullptr, // gate_exps
+                    model.layers[il].ffn_down_exps,
+                    nullptr, // exp_probs_b (not used for gemma4)
+                    n_expert, n_expert_used,
+                    LLM_FFN_GELU, true,
+                    1.0f,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il, logits,
+                    model.layers[il].ffn_gate_up_exps,
+                    nullptr, // up_exps_s
+                    nullptr, // gate_exps_s
+                    model.layers[il].ffn_down_exps_s);
+            cur_moe = build_norm(cur_moe,
+                    model.layers[il].ffn_post_norm_2, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur_moe, "ffn_moe", il);
+
+            cur = ggml_add(ctx0, cur_mlp, cur_moe);
+            cb(cur, "ffn_moe_combined", il);
+        } else {
+            cur = build_norm(attn_out,
+                    model.layers[il].ffn_norm, nullptr,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   nullptr, nullptr,
+                    model.layers[il].ffn_gate, nullptr, nullptr,
+                    model.layers[il].ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+        cur = build_norm(cur,
+                model.layers[il].ffn_post_norm, nullptr,
+                LLM_NORM_RMS, -1);
+        cb(cur, "ffn_post_norm", il);
+
+        // residual connection
+        cur = ggml_add(ctx0, cur, attn_out);
+
+        // per-layer embedding
+        if (inp_per_layer) {
+            ggml_tensor * pe_in = cur;
+            cb(cur, "pe_in", il);
+
+            cur = build_lora_mm(model.layers[il].per_layer_inp_gate, cur); // [n_embd_per_layer, n_tokens]
+            cur = ggml_gelu(ctx0, cur);
+            ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_per_layer, n_tokens]
+
+            // TODO @ngxson : improve this
+            if (il == n_layer - 1 && inp_out_ids) {
+                inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids);
+            }
+
+            cur = ggml_mul(ctx0, cur, inp_this_layer);
+            cur = build_lora_mm(model.layers[il].per_layer_proj, cur); // [n_embd, n_tokens]
+            cur = build_norm(cur, model.layers[il].per_layer_post_norm, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "per_layer_embd_out", il);
+
+            // residual connection
+            cur = ggml_add(ctx0, pe_in, cur);
+        }
+
+        // layer_scalar
+        if (model.layers[il].out_scale) {
+            cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
+            cb(cur, "out_scaled", il);
+        }
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, nullptr,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
+ggml_tensor * llm_build_gemma4_iswa::view_2d_slice(ggml_tensor * x, int idx) {
+    GGML_ASSERT(idx < (int) x->ne[2]);
+    return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
+            idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
+}
+
+// equivalent to get_per_layer_inputs() in python code
+// output shape: [n_embd_per_layer, n_layer, n_tokens]
+ggml_tensor * llm_build_gemma4_iswa::get_per_layer_inputs() {
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
+    ggml_tensor * inp_per_layer;
+    if (ubatch.token) {
+        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+        ggml_set_input(inp->tokens);
+        res->t_inp_tokens = inp->tokens;
+        inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
+        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
+        cb(inp_per_layer, "inp_per_layer_selected", -1);
+        res->add_input(std::move(inp));
+    } else {
+        // Vision embedding path: use padding token (ID=0) embedding
+        // TODO: verify if this is the correct behavior in transformers implementation
+        const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_per_layer * n_layer
+
+        // Extract and dequantize padding token embedding (row 0)
+        ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
+        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+
+        // Reshape to [n_embd_per_layer, n_layer, 1]
+        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
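+        // note: ne[2] == 1 here, so the ggml_add in project_per_layer_inputs broadcasts
+        // this single padding-token row across all token positions of the batch
+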
cb(inp_per_layer, "inp_per_layer_vision", -1); + } + return inp_per_layer; +} + +// equivalent to project_per_layer_inputs() in python code +// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim +// inputs_embeds shape: [n_embd, n_tokens] +// inp_per_layer shape: [n_embd_per_layer, n_layer, n_tokens] (from get_per_layer_inputs) +// output shape: [n_embd_per_layer, n_tokens, n_layer] +ggml_tensor * llm_build_gemma4_iswa::project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) { + const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd); + const float per_layer_input_scale = 1.0f / sqrtf(2.0f); + + ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds); + per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale); + per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_per_layer, n_layer, n_tokens); + per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, nullptr, LLM_NORM_RMS, + -1); // [n_embd_per_layer, n_layer, n_tokens] + cb(per_layer_proj, "per_layer_proj", -1); + + inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer); + inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); + cb(inp_per_layer, "inp_per_layer", -1); + + // permute to shape: [n_embd_per_layer, n_tokens, n_layer] + inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3)); + return inp_per_layer; +} diff --git a/src/models/models.h b/src/models/models.h index a86b2b1ebd..8e6b9c238f 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -266,6 +266,17 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il); }; +struct llm_build_gemma4_iswa : public llm_graph_context { + const llama_model & model; + + const int64_t n_embd_per_layer; + + llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params); + ggml_tensor * view_2d_slice(ggml_tensor * x, int idx); + ggml_tensor * get_per_layer_inputs(); + ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer); +}; + struct llm_build_gemma_embedding : public llm_graph_context { llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params); }; diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index df21ced74b..d0ef675808 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -385,6 +385,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_CHAMELEON) { continue; // Only half-implemented and to be removed in the future. } + if (arch == LLM_ARCH_GEMMA4) { + continue; // FIXME @ngxson + } if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) { continue; // FIXME } @@ -451,6 +454,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (arch == LLM_ARCH_CHAMELEON) { continue; // Only half-implemented and to be removed in the future. } + if (arch == LLM_ARCH_GEMMA4) { + continue; // FIXME @ngxson + } if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { continue; // FIXME CUDA backend crashes. 
}
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index b3cf15f9ec..675464c6b5 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -17,6 +17,7 @@ add_library(mtmd
     models/models.h
     models/cogvlm.cpp
     models/conformer.cpp
+    models/gemma4v.cpp
     models/glm4v.cpp
     models/internvl.cpp
     models/kimivl.cpp
diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index 3604bf77e8..d3e7b1ed04 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -29,7 +29,7 @@ struct clip_graph {
     const int n_layer;
     const int n_mmproj_embd;
     const float eps;
-    const float kq_scale;
+    float kq_scale; // TODO: maybe move this to hparams
     const clip_flash_attn_type flash_attn_type;
 
     ggml_context_ptr ctx0_ptr;
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 011d76bcf6..5fa487367c 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -88,8 +88,11 @@
 #define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
 #define TN_LN_1            "%s.blk.%d.ln1.%s" // layer norm
 #define TN_LN_2            "%s.blk.%d.ln2.%s" // layer norm
-#define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale
-#define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale
+#define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale
+#define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale
+#define TN_LS_OUT          "%s.blk.%d.out_scale.%s" // layer out scale (gemma4)
+#define TN_ATTN_POST_NORM  "%s.blk.%d.attn_post_norm.%s" // post-attn norm (gemma4)
+#define TN_FFN_POST_NORM   "%s.blk.%d.ffn_post_norm.%s" // post-FFN norm (gemma4)
 #define TN_LN_PRE          "%s.pre_ln.%s"
 #define TN_LN_POST         "%s.post_ln.%s"
 #define TN_LLAVA_PROJ      "mm.%d.%s"
@@ -213,6 +216,10 @@
 #define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
 #define TN_MNV5_MSFA_NORM        "v.msfa.norm.weight"
 
+// gemma4
+#define TN_STD_BIAS  "v.std_bias"
+#define TN_STD_SCALE "v.std_scale"
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 
@@ -233,6 +240,8 @@ enum projector_type {
     PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_GEMMA3NV,
     PROJECTOR_TYPE_GEMMA3NA,
+    PROJECTOR_TYPE_GEMMA4V,
+    PROJECTOR_TYPE_GEMMA4A,
     PROJECTOR_TYPE_PHI4,
     PROJECTOR_TYPE_IDEFICS3,
     PROJECTOR_TYPE_PIXTRAL,
@@ -272,6 +281,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
     { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
     { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
+    { PROJECTOR_TYPE_GEMMA4V,   "gemma4v"},
+    { PROJECTOR_TYPE_GEMMA4A,   "gemma4a"},
     { PROJECTOR_TYPE_PHI4,      "phi4"},
     { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
     { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
@@ -476,6 +487,18 @@ static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
     return tokens;
 }
 
+// remove when moving to c++20
+inline bool string_starts_with(std::string_view str, std::string_view prefix) {
+    return str.size() >= prefix.size() &&
+           str.compare(0, prefix.size(), prefix) == 0;
+}
+
+// remove when moving to c++20
+inline bool string_ends_with(std::string_view str, std::string_view suffix) {
+    return str.size() >= suffix.size() &&
+           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
 //
 // gguf utils
 //
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index a73e9ba38b..70270d6e76 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -143,6 +143,10 @@ struct clip_hparams {
 };
 
 struct clip_layer {
+    // layernorm 1 (or layer input norm, or pre-attention norm)
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
+
     // attention
     ggml_tensor * k_w = nullptr;
     ggml_tensor * k_b = nullptr;
@@ -159,9 +163,7 @@ struct clip_layer {
     ggml_tensor * k_norm = nullptr;
     ggml_tensor * q_norm = nullptr;
 
-    // layernorm 1
-    ggml_tensor * ln_1_w = nullptr;
-    ggml_tensor * ln_1_b = nullptr;
+    ggml_tensor * attn_post_norm_w = nullptr;
 
     ggml_tensor * ff_up_w = nullptr;
     ggml_tensor * ff_up_b = nullptr;
@@ -170,13 +172,16 @@ struct clip_layer {
     ggml_tensor * ff_down_w = nullptr;
     ggml_tensor * ff_down_b = nullptr;
 
-    // layernorm 2
+    // layernorm 2 (or pre-FFN norm)
     ggml_tensor * ln_2_w = nullptr;
     ggml_tensor * ln_2_b = nullptr;
+    ggml_tensor * ff_post_norm_w = nullptr;
+
     // layer scale (no bias)
-    ggml_tensor * ls_1_w = nullptr;
-    ggml_tensor * ls_2_w = nullptr;
+    ggml_tensor * ls_1_w   = nullptr;
+    ggml_tensor * ls_2_w   = nullptr;
+    ggml_tensor * ls_out_w = nullptr; // gemma4
 
     // qwen3vl deepstack merger
     ggml_tensor * deepstack_norm_w = nullptr;
@@ -437,6 +442,18 @@ struct clip_model {
     ggml_tensor * pre_encode_out_w = nullptr;
     ggml_tensor * pre_encode_out_b = nullptr;
 
+    // gemma4
+    ggml_tensor * std_bias  = nullptr;
+    ggml_tensor * std_scale = nullptr;
+    // Gemma4ClippableLinear
+    struct clamp_info {
+        float inp_max;
+        float inp_min;
+        float out_max;
+        float out_min;
+    };
+    std::map<std::string, clamp_info> clamp_info_map;
+
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 2947fcf9a3..12517123e7 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include <cfloat>
 
 struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
             Vcur = ggml_add(ctx0, Vcur, layer.v_b);
         }
 
-        if (layer.q_norm) {
-            Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-            cb(Qcur, "Qcur_norm", il);
-        }
+        // if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
+        bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
 
-        if (layer.k_norm) {
-            Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-            cb(Kcur, "Kcur_norm", il);
+        if (!norm_per_head) {
+            if (layer.q_norm) {
+                Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                cb(Qcur, "Qcur_norm", il);
+            }
+            if (layer.k_norm) {
+                Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                cb(Kcur, "Kcur_norm", il);
+            }
         }
 
         Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
         Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
         Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+        if (norm_per_head) {
+            if (layer.q_norm) {
+                Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                cb(Qcur, "Qcur_norm_per_head", il);
+            }
+            if (layer.k_norm) {
+                Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                cb(Kcur, "Kcur_norm_per_head", il);
+            }
+        }
     }
 
     cb(Qcur, "Qcur", il);
@@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
         cb(Kcur, "Kcur_pos", il);
     }
 
+    if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
+        Vcur = ggml_rms_norm(ctx0, Vcur, eps);
+        cb(Vcur, "Vcur_normed", il);
+    }
+
     cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il);
     cb(cur, "attn_out", il);
@@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
         cb(cur, "attn_out_scaled", il);
     }
 
+    if (layer.attn_post_norm_w) {
+        cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
+        cb(cur, "attn_post_normed", il);
+    }
+
     // re-add the layer input, e.g., residual
     cur = ggml_add(ctx0, cur, inpL);
@@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
     cb(cur, "ffn_inp", il);
 
-    // layernorm2
+    // layernorm2 (pre-ffn norm)
     cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
     cb(cur, "ffn_inp_normed", il);
@@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
     cb(cur, "ffn_out", il);
 
+    if (layer.ff_post_norm_w) {
+        cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
+        cb(cur, "ffn_post_normed", il);
+    }
+
     if (layer.ls_2_w) {
         cur = ggml_mul(ctx0, cur, layer.ls_2_w);
         cb(cur, "ffn_out_scaled", il);
@@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
     cur = ggml_add(ctx0, inpL, cur);
     cb(cur, "layer_out", il);
 
+    if (layer.ls_out_w) {
+        cur = ggml_mul(ctx0, cur, layer.ls_out_w);
+        cb(cur, "layer_out_scaled", il);
+    }
+
     inpL = cur;
 }
@@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             builder = std::make_unique(ctx, img);
         } break;
+        case PROJECTOR_TYPE_GEMMA4V:
+            {
+                builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_LIGHTONOCR:
         {
@@ -1257,6 +1297,17 @@ struct clip_model_loader {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
+            case PROJECTOR_TYPE_GEMMA4V:
+                {
+                    hparams.rope_theta = 100.0f;
+                    hparams.n_merge = 3; // pooling_kernel_size
+                    hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+                    // @ngxson : the model performs quite poorly with small images, we need to bump minimum image tokens to 40 to avoid that
+                    hparams.set_limit_image_tokens(252, 280);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                } break;
+
             case PROJECTOR_TYPE_GEMMA3NV:
                 {
                     // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
@@ -1442,6 +1493,11 @@ struct clip_model_loader {
         std::map<std::string, size_t> tensor_offset;
         std::vector<ggml_tensor *> tensors_to_load;
 
+        auto fin = std::ifstream(fname, std::ios::binary);
+        if (!fin) {
+            throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
+        }
+
         // TODO @ngxson : support both audio and video in the future
         const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
@@ -1478,6 +1534,18 @@ struct clip_model_loader {
             return cur;
         };
 
+        auto get_scalar = [&](const std::string & name, float default_val) {
+            auto it = tensor_offset.find(name);
+            if (it == tensor_offset.end()) {
+                return default_val;
+            }
+            size_t offset = it->second;
+            fin.seekg(offset, std::ios::beg);
+            float value;
+            fin.read(reinterpret_cast<char *>(&value), sizeof(float));
+            return value;
+        };
+
         model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
         model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
@@ -1512,8 +1580,11 @@ struct clip_model_loader {
             layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
             layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
             layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
-            layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
-            layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
+            layer.ls_1_w   = get_tensor(string_format(TN_LS_1,   prefix, il, "weight"), false); // no bias
+            layer.ls_2_w   = get_tensor(string_format(TN_LS_2,   prefix, il, "weight"), false); // no bias
+            layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
+            layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
+            layer.ff_post_norm_w   = get_tensor(string_format(TN_FFN_POST_NORM,  prefix, il, "weight"), false); // no bias
 
             layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
             layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
@@ -1713,6 +1784,32 @@ struct clip_model_loader {
                     model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                     model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                 } break;
+            case PROJECTOR_TYPE_GEMMA4V:
+                {
+                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+                    model.std_bias  = get_tensor(TN_STD_BIAS,  false);
+                    model.std_scale = get_tensor(TN_STD_SCALE, false);
+                    // load scalar for Gemma4ClippableLinear
+                    for (auto * tensor : tensors_to_load) {
+                        std::string name = tensor->name;
+                        if (string_ends_with(name, ".weight")) {
+                            std::string name_inp_max = name;
+                            std::string name_inp_min = name;
+                            std::string name_out_max = name;
+                            std::string name_out_min = name;
+                            string_replace_all(name_inp_max, ".weight", ".input_max");
+                            string_replace_all(name_inp_min, ".weight", ".input_min");
+                            string_replace_all(name_out_max, ".weight", ".output_max");
+                            string_replace_all(name_out_min, ".weight", ".output_min");
+                            model.clamp_info_map[name] = {
+                                get_scalar(name_inp_max, FLT_MAX),
+                                get_scalar(name_inp_min, -FLT_MAX),
+                                get_scalar(name_out_max, FLT_MAX),
+                                get_scalar(name_out_min, -FLT_MAX)
+                            };
+                        }
+                    }
+                } break;
             case PROJECTOR_TYPE_GEMMA3NV:
                 {
                     model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
@@ -2042,11 +2139,6 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;
 
-            auto fin = std::ifstream(fname, std::ios::binary);
-            if (!fin) {
-                throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
-            }
-
             // alloc memory and offload data
             ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
             ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
@@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
         // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
         // we can remove this check when we implement audio support for Gemma 3N
-        skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
+        skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
+                  || ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
     }
 
     if (loader.has_audio && !skip_audio) {
@@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img
             n_patches = x_patch * y_patch;
         } break;
         case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_GEMMA4V:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
         case PROJECTOR_TYPE_NEMOTRON_V2_VL:
@@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch
             }
             set_input_i32("patches", patches);
         } break;
+        case PROJECTOR_TYPE_GEMMA4V:
+            {
+                // set (col, row) patch positions for learned positional embedding
+                const int n_cols = image_size_width / patch_size;
+                std::vector<int32_t> pos_x(num_patches), pos_y(num_patches);
+                for (int i = 0; i < num_patches; i++) {
+                    pos_x[i] = i % n_cols;
+                    pos_y[i] = i / n_cols;
+                }
+                set_input_i32("pos_x", pos_x);
+                set_input_i32("pos_y", pos_y);
+            } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 GGML_ASSERT(pos_w == pos_h);
@@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
+        case PROJECTOR_TYPE_GEMMA4V:
+            return ctx->model.mm_input_proj_w->ne[1];
         case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.mm_fc_w->ne[1];
         case PROJECTOR_TYPE_ULTRAVOX:
diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp
new file mode 100644
index 0000000000..4068a08aaf
--- /dev/null
+++ b/tools/mtmd/models/gemma4v.cpp
@@ -0,0 +1,151 @@
+#include "models.h"
+#include <cmath>
+
+ggml_cgraph * clip_graph_gemma4v::build() {
+    ggml_tensor * inp_raw = build_inp_raw();
+
+    // patches = 2 * (patches - 0.5)
+    // equivalent to: patches * 2 - 1
+    inp_raw = ggml_scale_bias(ctx0, inp_raw, 2.0f, -1.0f);
+    ggml_set_name(inp_raw, "inp_raw_scaled");
+
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+    ggml_set_name(inp, "inp");
+    // note: no patch bias
+
+    ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_x, "pos_x");
+    ggml_set_input(pos_x);
+
+    ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_y, "pos_y");
+    ggml_set_input(pos_y);
+
+    {
+        const int64_t pos_size = model.position_embeddings->ne[1];
+        const size_t  nb1 = ggml_row_size(model.position_embeddings->type, n_embd);
+
+        // positional embeddings are stored as lookup tables (one for x, one for y)
+        ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings,
+                n_embd, pos_size, nb1, 0);
+        ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings,
+                n_embd, pos_size, nb1, pos_size * nb1);
+
+        // ggml_get_rows: [n_embd, n_patches]
+        ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
+        ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);
+
+        inp = ggml_add(ctx0, inp, emb_x);
+        inp = ggml_add(ctx0, inp, emb_y);
+        cb(inp, "pos_embd", -1);
+    }
+
+    // similar to build_rope_2d, but use neox ordering
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        const int64_t n_dim  = cur->ne[0];
+        const int64_t n_head = cur->ne[1];
+        const int64_t n_pos  = cur->ne[2];
+
+        // first half
+        ggml_tensor * first;
+        {
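+            // non-contiguous view of the first n_dim/2 components of each head; the row
+            // stride (cur->nb[1]) still spans the full head, so only this half is rotated,
+            // using the pos_x positions
+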
first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_x, // positions + nullptr, // freq factors + n_dim/2, // n_dims + GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_y, // positions + nullptr, // freq factors + n_dim/2, // n_dims + GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; + }; + + kq_scale = 1.0f; + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // pos embd is already handled above + add_pos); + + // Gemma4VisionPooler + { + const int kernel_size = hparams.n_merge; + GGML_ASSERT(kernel_size > 0); + + // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1] + cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1); + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, + kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + const int out_x = n_patches_x / kernel_size; + const int out_y = n_patches_y / kernel_size; + // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y] + cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd)); + cb(cur, "pooled", -1); + } + + // hidden_states = (hidden_states - self.std_bias) * self.std_scale + if (model.std_bias && model.std_scale) { + cur = ggml_sub(ctx0, cur, model.std_bias); + cur = ggml_mul(ctx0, cur, model.std_scale); + cb(cur, "std_scaled", -1); + } + + // Gemma4MultimodalEmbedder + cur = build_mm(model.mm_input_proj_w, cur); + cb(cur, "projected", -1); + + // embedding_post_projection_norm + cur = ggml_rms_norm(ctx0, cur, hparams.eps); + cb(cur, "projected_normed", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} + +ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const { + // Gemma4ClippableLinear + + auto it = model.clamp_info_map.find(w->name); + if (it == model.clamp_info_map.end()) { + return ggml_mul_mat(ctx0, w, x); + } else { + const auto & clamp_info = it->second; + ggml_tensor * clamped = ggml_clamp(ctx0, x, clamp_info.inp_min, clamp_info.inp_max); + ggml_tensor * out = ggml_mul_mat(ctx0, w, clamped); + out = ggml_clamp(ctx0, out, clamp_info.out_min, clamp_info.out_max); + return out; + } +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 5705d7f21e..992eda04bb 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -12,6 +12,12 @@ struct clip_graph_siglip : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_gemma4v : clip_graph { + clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override; +}; + struct clip_graph_pixtral : clip_graph { clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 9c400ce104..35b4396fd8 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -394,6 +394,13 @@ struct mtmd_context { img_end = "<|IMAGE_END|>"; 
image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_GEMMA4V: + { + // <|image> ... (image embeddings) ... + img_beg = "<|image>"; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; case PROJECTOR_TYPE_DEEPSEEKOCR: { img_end = "\n"; // prevent empty batch on llama-server @@ -974,6 +981,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { bool mtmd_decode_use_non_causal(mtmd_context * ctx) { switch (ctx->proj_type_v()) { case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA4V: return true; default: return false;