model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)
* fix gguf conversion for audio/vision mmproj * fix test
This commit is contained in:
parent
223373742b
commit
63f8fe0ef4
|
|
@ -169,6 +169,8 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
||||||
return build_tool_parser_tag_json(ctx);
|
return build_tool_parser_tag_json(ctx);
|
||||||
case tool_format::TAG_WITH_TAGGED:
|
case tool_format::TAG_WITH_TAGGED:
|
||||||
return build_tool_parser_tag_tagged(ctx);
|
return build_tool_parser_tag_tagged(ctx);
|
||||||
|
case tool_format::TAG_WITH_GEMMA4_DICT:
|
||||||
|
return build_tool_parser_tag_gemma4_dict(ctx);
|
||||||
default:
|
default:
|
||||||
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
||||||
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
||||||
|
|
@ -433,4 +435,113 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||||
p.end();
|
p.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds the parser for Gemma4's dict-style tool calls:
//
//   <|tool_call>call:name{key:<|"|>text<|"|>,other:123}<tool_call|>
//
// For every tool in ctx.inputs.tools a rule "tool-<name>" is created that matches
// `<name_prefix><name>{` + args + `}`. Each declared property gets its own rule
// "tool-<name>-arg-<param>"; string-typed properties are delimited by the Gemma4
// quote token, every other type is read as raw text up to the next ',' or '}'.
//
// The returned parser is: reasoning parser, then (unless tool_choice is REQUIRED)
// optional content up to the per-call start marker, then the tool call(s), then end.
//
// NOTE: the schema's "required" list is not enforced here - every declared argument
// is accepted as optional, in any order, at most `number of properties` times in total.
// NOTE(review): non-string values stop at the first ',' or '}', so nested
// object/array values would presumably be cut short - confirm against the template.
common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    const bool   force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    // The Gemma4 string quote token used in place of JSON "
    static const std::string QUOTE = "<|\"|>";

    common_peg_parser tool_choice = p.choice();

    foreach_function(inputs.tools, [&](const json & tool) {
        const auto & func   = tool.at("function");
        std::string  name   = func.at("name");
        const auto & params = func.at("parameters");

        // One parser per declared argument. Stays empty when the tool declares no
        // (object-typed) "properties", in which case only `name{}` is accepted.
        struct arg_entry {
            std::string       param_name;
            common_peg_parser parser;
        };
        std::vector<arg_entry> arg_entries;

        if (params.contains("properties") && params.at("properties").is_object()) {
            const auto & properties = params.at("properties");

            for (const auto & [param_name, param_schema] : properties.items()) {
                // A missing or non-string "type" is treated as "object"
                std::string type   = "object";
                auto        type_v = param_schema.contains("type") ? param_schema.at("type") : json::object();
                if (type_v.is_string()) {
                    type_v.get_to(type);
                }

                common_peg_parser value_parser = p.eps();
                if (type == "string") {
                    // String values are delimited by <|"|>...<|"|>
                    value_parser =
                        p.literal(QUOTE) +
                        p.tool_arg_string_value(p.schema(p.until(QUOTE),
                            "tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
                        p.literal(QUOTE);
                } else {
                    // Numbers, booleans: raw text up to the next comma or closing brace
                    value_parser = p.tool_arg_value(p.until_one_of({",", "}"}));
                }

                auto arg = p.tool_arg(
                    p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) +
                    value_parser +
                    p.tool_arg_close(p.eps()));

                arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)});
            }
        }

        // Sort alphabetically to match Jinja's dictsort
        std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) {
            return a.param_name < b.param_name;
        });

        // Build arg sequence: any arg, then zero-or-more comma-separated additional args
        common_peg_parser args_seq = p.eps();
        if (!arg_entries.empty()) {
            common_peg_parser any_arg = p.choice();
            for (auto & entry : arg_entries) {
                any_arg |= entry.parser;
            }
            args_seq = p.optional(
                any_arg + p.repeat(p.literal(",") + any_arg, 0, static_cast<int>(arg_entries.size()) - 1));
        }

        // Full parser: call:name{args}
        auto func_parser = p.atomic(
            p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
            p.tool_args(args_seq) +
            p.tool_close(p.literal("}")));

        tool_choice |= p.rule("tool-" + name, func_parser);
    });

    // Wrap each call in <|tool_call>...</tool_call|>
    auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end);

    common_peg_parser tool_calls = p.eps();
    if (inputs.parallel_tool_calls) {
        tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
    } else {
        tool_calls = p.trigger_rule("tool-call", wrapped_call);
    }

    if (!force_tools) {
        tool_calls = p.optional(tool_calls);
    }

    auto content_before_tools = p.until(format.per_call_start);
    return ctx.reasoning_parser +
           (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) +
           tool_calls + p.end();
}
|
||||||
|
|
||||||
} // namespace autoparser
|
} // namespace autoparser
|
||||||
|
|
|
||||||
|
|
@ -144,6 +144,7 @@ enum class tool_format {
|
||||||
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
|
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
|
||||||
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
|
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
|
||||||
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
|
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
|
||||||
|
TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
|
||||||
};
|
};
|
||||||
|
|
||||||
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
|
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
|
||||||
|
|
@ -156,6 +157,8 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format)
|
||||||
return os << "TAG_WITH_JSON";
|
return os << "TAG_WITH_JSON";
|
||||||
case tool_format::TAG_WITH_TAGGED:
|
case tool_format::TAG_WITH_TAGGED:
|
||||||
return os << "TAG_WITH_TAGGED";
|
return os << "TAG_WITH_TAGGED";
|
||||||
|
case tool_format::TAG_WITH_GEMMA4_DICT:
|
||||||
|
return os << "TAG_WITH_GEMMA4_DICT";
|
||||||
default:
|
default:
|
||||||
return os << "UNKNOWN";
|
return os << "UNKNOWN";
|
||||||
}
|
}
|
||||||
|
|
@ -350,6 +353,7 @@ struct analyze_tools : analyze_base {
|
||||||
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
|
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
|
||||||
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
|
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
|
||||||
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
|
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
|
||||||
|
common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
|
||||||
|
|
@ -92,6 +92,33 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||||
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
|
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
// Gemma4 - custom dict format: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
|
||||||
|
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||||
|
if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||||
|
analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT;
|
||||||
|
analysis.tools.format.per_call_start = "<|tool_call>";
|
||||||
|
analysis.tools.format.per_call_end = "<tool_call|>";
|
||||||
|
analysis.tools.format.section_start = "";
|
||||||
|
analysis.tools.format.section_end = "";
|
||||||
|
analysis.tools.function.name_prefix = "call:";
|
||||||
|
analysis.tools.function.name_suffix = "";
|
||||||
|
analysis.tools.arguments.start = "{";
|
||||||
|
analysis.tools.arguments.end = "}";
|
||||||
|
analysis.tools.arguments.name_suffix = ":";
|
||||||
|
analysis.tools.arguments.separator = ",";
|
||||||
|
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
|
||||||
|
analysis.reasoning.start = "<|channel>thought\n";
|
||||||
|
analysis.reasoning.end = "<channel|>";
|
||||||
|
analysis.preserved_tokens.clear();
|
||||||
|
analysis.preserved_tokens.push_back("<|tool_call>");
|
||||||
|
analysis.preserved_tokens.push_back("<tool_call|>");
|
||||||
|
analysis.preserved_tokens.push_back("<|tool_response>");
|
||||||
|
analysis.preserved_tokens.push_back("<tool_response|>");
|
||||||
|
analysis.preserved_tokens.push_back("<|\"|>");
|
||||||
|
analysis.preserved_tokens.push_back("<|turn>");
|
||||||
|
LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET);
|
||||||
|
}
|
||||||
|
},
|
||||||
// DeepSeek-R1-Distill-Qwen
|
// DeepSeek-R1-Distill-Qwen
|
||||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||||
if (tmpl.src.find(
|
if (tmpl.src.find(
|
||||||
|
|
|
||||||
|
|
@ -1545,6 +1545,50 @@ static void requires_non_null_content(json & messages) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Gemma4 uses a custom tool_responses field instead of role:tool messages.
|
||||||
|
// Convert consecutive role:tool messages into a single user message with tool_responses.
|
||||||
|
static void convert_tool_responses_gemma4(json & messages) {
|
||||||
|
json result = json::array();
|
||||||
|
size_t i = 0;
|
||||||
|
while (i < messages.size()) {
|
||||||
|
if (messages[i].contains("role") && messages[i].at("role") == "tool") {
|
||||||
|
json tool_responses = json::array();
|
||||||
|
while (i < messages.size() &&
|
||||||
|
messages[i].contains("role") &&
|
||||||
|
messages[i].at("role") == "tool") {
|
||||||
|
const auto & tool_msg = messages[i];
|
||||||
|
std::string name;
|
||||||
|
if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) {
|
||||||
|
name = tool_msg.at("tool_call_id");
|
||||||
|
} else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) {
|
||||||
|
name = tool_msg.at("name");
|
||||||
|
}
|
||||||
|
json response;
|
||||||
|
if (tool_msg.contains("content")) {
|
||||||
|
const auto & content = tool_msg.at("content");
|
||||||
|
if (content.is_string()) {
|
||||||
|
// Try to parse the content as JSON; fall back to raw string
|
||||||
|
try {
|
||||||
|
response = json::parse(content.get<std::string>());
|
||||||
|
} catch (...) {
|
||||||
|
response = content;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
response = content;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tool_responses.push_back({{"name", name}, {"response", response}});
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
result.push_back({{"role", "user"}, {"tool_responses", tool_responses}});
|
||||||
|
} else {
|
||||||
|
result.push_back(messages[i]);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
messages = result;
|
||||||
|
}
|
||||||
|
|
||||||
static void func_args_not_string(json & messages) {
|
static void func_args_not_string(json & messages) {
|
||||||
GGML_ASSERT(messages.is_array());
|
GGML_ASSERT(messages.is_array());
|
||||||
for (auto & message : messages) {
|
for (auto & message : messages) {
|
||||||
|
|
@ -1673,6 +1717,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||||
workaround::func_args_not_string(params.messages);
|
workaround::func_args_not_string(params.messages);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||||
|
workaround::convert_tool_responses_gemma4(params.messages);
|
||||||
|
}
|
||||||
|
|
||||||
params.add_generation_prompt = false;
|
params.add_generation_prompt = false;
|
||||||
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
|
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
|
||||||
params.add_generation_prompt = true;
|
params.add_generation_prompt = true;
|
||||||
|
|
|
||||||
|
|
@ -1164,7 +1164,7 @@ class TextModel(ModelBase):
|
||||||
if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
|
if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
|
||||||
self.gguf_writer.add_expert_count(n_experts)
|
self.gguf_writer.add_expert_count(n_experts)
|
||||||
logger.info(f"gguf: expert count = {n_experts}")
|
logger.info(f"gguf: expert count = {n_experts}")
|
||||||
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
|
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
|
||||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||||
logger.info(f"gguf: experts used count = {n_experts_used}")
|
logger.info(f"gguf: experts used count = {n_experts_used}")
|
||||||
if (n_expert_groups := self.hparams.get("n_group")) is not None:
|
if (n_expert_groups := self.hparams.get("n_group")) is not None:
|
||||||
|
|
@ -6878,7 +6878,9 @@ class Gemma2Model(TextModel):
|
||||||
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
||||||
class Gemma3Model(TextModel):
|
class Gemma3Model(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.GEMMA3
|
model_arch = gguf.MODEL_ARCH.GEMMA3
|
||||||
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
|
||||||
|
def norm_shift(self, name: str) -> float:
|
||||||
|
return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
if (self.dir_model / "tokenizer.model").is_file():
|
if (self.dir_model / "tokenizer.model").is_file():
|
||||||
|
|
@ -6916,17 +6918,22 @@ class Gemma3Model(TextModel):
|
||||||
|
|
||||||
# remove OOV (out-of-vocabulary) rows in token_embd
|
# remove OOV (out-of-vocabulary) rows in token_embd
|
||||||
if "embed_tokens.weight" in name:
|
if "embed_tokens.weight" in name:
|
||||||
|
n_vocab_real = -1
|
||||||
if (self.dir_model / "tokenizer.model").is_file():
|
if (self.dir_model / "tokenizer.model").is_file():
|
||||||
tokens = self._create_vocab_sentencepiece()[0]
|
tokens = self._create_vocab_sentencepiece()[0]
|
||||||
|
n_vocab_real = len(tokens)
|
||||||
else:
|
else:
|
||||||
tokens = self.get_vocab_base()[0]
|
with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
|
||||||
data_torch = data_torch[:len(tokens)]
|
tokenizer_json = json.load(f)
|
||||||
|
n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"])
|
||||||
|
data_torch = data_torch[:n_vocab_real]
|
||||||
|
|
||||||
# ref code in Gemma3RMSNorm
|
# ref code in Gemma3RMSNorm
|
||||||
# output = output * (1.0 + self.weight.float())
|
# output = output * (1.0 + self.weight.float())
|
||||||
# note: this is not the case on gemma3n
|
# note: this is not the case on gemma3n
|
||||||
if name.endswith("norm.weight"):
|
f_shift = self.norm_shift(name)
|
||||||
data_torch = data_torch + self.norm_shift
|
if f_shift != 0.0:
|
||||||
|
data_torch = data_torch + f_shift
|
||||||
|
|
||||||
yield from super().modify_tensors(data_torch, name, bid)
|
yield from super().modify_tensors(data_torch, name, bid)
|
||||||
|
|
||||||
|
|
@ -7100,7 +7107,8 @@ class ConformerAudioModel(MmprojModel):
|
||||||
assert data_torch.shape[2] == 1
|
assert data_torch.shape[2] == 1
|
||||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
||||||
|
|
||||||
yield from super().modify_tensors(data_torch, name, bid)
|
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
|
||||||
|
yield (mapped_name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||||
|
|
@ -7289,7 +7297,6 @@ class Gemma3nVisionAudioModel(ConformerAudioModel):
|
||||||
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
|
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
|
||||||
class Gemma3NModel(Gemma3Model):
|
class Gemma3NModel(Gemma3Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
||||||
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
|
||||||
|
|
||||||
_altup_proj: list[Tensor] = []
|
_altup_proj: list[Tensor] = []
|
||||||
_altup_unembd: list[Tensor] = []
|
_altup_unembd: list[Tensor] = []
|
||||||
|
|
@ -7308,6 +7315,10 @@ class Gemma3NModel(Gemma3Model):
|
||||||
torch.Tensor(), # to be replaced
|
torch.Tensor(), # to be replaced
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def norm_shift(self, name: str) -> float:
|
||||||
|
del name
|
||||||
|
return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
# For Gemma3n multimodal models, we need the FULL vocab_size (262400)
|
# For Gemma3n multimodal models, we need the FULL vocab_size (262400)
|
||||||
# which includes special tokens from 262144-262399 for vision/audio.
|
# which includes special tokens from 262144-262399 for vision/audio.
|
||||||
|
|
@ -7425,6 +7436,212 @@ class Gemma3NModel(Gemma3Model):
|
||||||
yield from super().modify_tensors(data_torch, name, bid)
|
yield from super().modify_tensors(data_torch, name, bid)
|
||||||
|
|
||||||
|
|
||||||
|
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4Model(Gemma3Model):
    """Converter for the language-model part of Gemma4 checkpoints (GGUF arch GEMMA4).

    Tensors that do not belong to the language model are skipped here.
    """
    model_arch = gguf.MODEL_ARCH.GEMMA4

    def norm_shift(self, name: str) -> float:
        """Return the constant added to norm weights during conversion; always 0.0 for Gemma4."""
        del name  # unused
        return 0.0

    def set_vocab(self):
        """Write the tokenizer to GGUF under the dedicated tokenizer model name "gemma4".

        The chat/tool control tokens listed in `visible_tokens` are forced to
        USER_DEFINED so that they are always rendered and the chat parser can read them.
        """
        vocab = gguf.LlamaHfVocab(self.dir_model)
        tokens = []
        scores = []
        toktypes = []
        visible_tokens = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"}

        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            text_str = text.decode()
            if text_str in visible_tokens:
                # always render these tokens, so that the chat parser can read them
                toktypes.append(gguf.TokenType.USER_DEFINED)
                logger.info(f"Token '{text_str}' is set to USER_DEFINED")
            else:
                toktypes.append(toktype)

        assert len(tokens) == vocab.vocab_size

        # TODO @ngxson : there are some known (rare) issues with the tokenizer during development
        # but I don't have time to dive into them right now;
        # using a dedicated tokenizer name so that we can fix later without re-converting GGUF
        self.gguf_writer.add_tokenizer_model("gemma4")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)
        self.gguf_writer.add_add_space_prefix(False)
        self.gguf_writer.add_add_bos_token(False)  # already added via the chat template

    def set_gguf_parameters(self):
        """Write Gemma4-specific hyper-parameters on top of the inherited ones.

        Requires `num_kv_shared_layers`, `layer_types`, `global_head_dim` and `head_dim`
        in the hparams; everything else handled here is optional.
        """
        super().set_gguf_parameters()

        num_kv_shared_layers = self.hparams["num_kv_shared_layers"]
        self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers)

        # per-layer embedding is optional
        n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0
        self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd)

        swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]]
        self.gguf_writer.add_sliding_window_pattern(swa_layers)

        head_dim_full = self.hparams["global_head_dim"]
        head_dim_swa = self.hparams["head_dim"]
        # correct the head dim for global/swa layers
        self.gguf_writer.add_key_length(head_dim_full)
        self.gguf_writer.add_value_length(head_dim_full)
        self.gguf_writer.add_key_length_swa(head_dim_swa)
        self.gguf_writer.add_value_length_swa(head_dim_swa)

        # optional=True: checkpoints whose config has neither key must not fail the
        # lookup; the value may also be present but null, hence the None check
        expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"], optional=True)
        if expert_intermediate_size is not None:
            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)

        # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers
        use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False)
        if use_double_wide_mlp:
            first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers
            n_ff = self.hparams["intermediate_size"]
            # layers from the first kv-shared layer onwards use twice the FFN width
            n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)]
            self.gguf_writer.add_feed_forward_length(n_ff_arr)

        # handle num_global_key_value_heads
        num_key_value_heads_full = self.hparams.get("num_global_key_value_heads")
        num_key_value_heads_swa = self.hparams.get("num_key_value_heads")
        if num_key_value_heads_full is not None and num_key_value_heads_swa is not None:
            value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers]
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full)  # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
        self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Yield the ROPE_FREQS tensor: 1.0 for rotated dims, 1e30 for unrotated dims."""
        # full layer uses "proportional" rope with partial_rotary_factor=0.25
        # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated),
        # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention
        # solution is to set specific freq_factors for the unrotated dims

        # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers
        rope_params_full = self.hparams["rope_parameters"]["full_attention"]
        assert rope_params_full["rope_type"] == "proportional"
        head_dim_full = self.hparams["global_head_dim"]
        partial_rotary_factor_full = rope_params_full["partial_rotary_factor"]
        n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2)
        n_unrot_full = int(head_dim_full / 2) - n_rot_full
        values = [1.0] * n_rot_full + [1e30] * n_unrot_full
        rope_freqs_full = torch.tensor(values, dtype=torch.float32)
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename Gemma4 language-model tensors; yields nothing for non-language-model tensors."""
        # these tensors have no ".weight" suffix in the checkpoint
        if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
            name = name + ".weight"

        if "language_model." not in name and "rope_freqs" not in name:
            return  # skip non-language model tensors

        name = name.replace("language_model.", "")
        if name.endswith("router.scale"):
            name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale")
            yield (name, data_torch)
            return
        if ".per_expert_scale" in name:
            # convert per-expert scale to FFN down scale
            name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale")
            yield (name, data_torch)
            return
        if ".experts." in name and not name.endswith(".weight"):
            name += ".weight"

        yield from super().modify_tensors(data_torch, name, bid)
|
||||||
|
|
||||||
|
|
||||||
|
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4VisionAudioModel(MmprojModel):
    """Converter for the vision and (optional) audio encoders of Gemma4 checkpoints.

    Language-model tensors are skipped here. The audio encoder is disabled when the
    checkpoint provides no audio hparams.
    """
    has_audio_encoder = True
    has_vision_encoder = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        self.hparams_vision["image_size"] = 224  # unused, but set to avoid error

        # remap audio hparams
        if not self.hparams_audio:
            self.has_audio_encoder = False
            return
        audio_cfg = self.hparams_audio
        audio_cfg["feat_in"] = audio_cfg.get("input_feat_size", 128)
        audio_cfg["intermediate_size"] = audio_cfg["hidden_size"] * 4

    def set_gguf_parameters(self):
        """Write projector types and layernorm eps for vision, plus audio params when present."""
        super().set_gguf_parameters()
        writer = self.gguf_writer

        # vision params
        writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
        writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))

        # audio params
        if self.hparams_audio:
            writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
            writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
            writer.add_audio_attention_layernorm_eps(1e-5)

    def is_audio_tensor(self, name: str) -> bool:
        """Return True when the checkpoint tensor name belongs to the audio encoder."""
        return any(marker in name for marker in ("audio_tower", "embed_audio"))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Force F32 for audio conv tensors and position embedding tables."""
        if self.is_audio_tensor(name):
            # `and` binds tighter than `or`: any ".conv" tensor matches, while "_conv"
            # tensors match only when the name also contains ".weight".
            # NOTE(review): possibly meant `(".conv" or "_conv") and ".weight"` - confirm
            if (".conv" in name) or ("_conv" in name and ".weight" in name):
                return gguf.GGMLQuantizationType.F32
        if "position_embedding_table" in name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename/reshape one vision or audio tensor; yields nothing for language-model tensors."""
        del bid  # unused

        if name.startswith("model.language_model."):
            return  # skip

        if len(data_torch.shape) == 0:
            # convert scalar tensors (input/output_min/max) to 1D tensors
            data_torch = data_torch.unsqueeze(0)

        # suffixes that map_tensor_name is asked to try for both encoders
        try_suffixes = (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")

        if self.is_audio_tensor(name):
            assert self.hparams_audio is not None
            name = name.replace("model.audio_tower.", "conformer.").replace(".linear.", ".")
            if name.endswith(("per_dim_key_scale", "per_dim_scale")):
                name += ".weight"
                data_torch = torch.nn.functional.softplus(data_torch)
            if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"):
                # drop the singleton middle dimension
                assert data_torch.shape[1] == 1
                data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
        else:
            name = name.replace("model.vision_tower.encoder.", "vision_model.model.").replace(".linear.weight", ".weight")
            if name.endswith(("layer_scalar", "position_embedding_table")):
                name += ".weight"
            if name.endswith("patch_embedder.input_proj.weight"):
                # 2D (n_embd, patch*patch*3) -> 4D (n_embd, 3, patch, patch)
                n_embd, ksize_sq_c = data_torch.shape
                patch_size = int((ksize_sq_c // 3) ** 0.5)
                data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3)
                data_torch = data_torch.permute(0, 3, 1, 2).contiguous()

        yield (self.map_tensor_name(name, try_suffixes), data_torch)
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("Starcoder2ForCausalLM")
|
@ModelBase.register("Starcoder2ForCausalLM")
|
||||||
class StarCoder2Model(TextModel):
|
class StarCoder2Model(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||||
|
|
|
||||||
|
|
@ -15,13 +15,18 @@ static bool run(llama_context * ctx, const common_params & params) {
|
||||||
|
|
||||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
|
|
||||||
if (tokens.empty()) {
|
if (tokens.empty()) {
|
||||||
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
|
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG_INF("number of input tokens = %zu\n", tokens.size());
|
||||||
|
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||||
|
LOG_INF(" %d\n", tokens[i]);
|
||||||
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||||
LOG_ERR("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
||||||
|
|
@ -419,6 +419,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
GEMMA2 = auto()
|
GEMMA2 = auto()
|
||||||
GEMMA3 = auto()
|
GEMMA3 = auto()
|
||||||
GEMMA3N = auto()
|
GEMMA3N = auto()
|
||||||
|
GEMMA4 = auto()
|
||||||
GEMMA_EMBEDDING = auto()
|
GEMMA_EMBEDDING = auto()
|
||||||
STARCODER2 = auto()
|
STARCODER2 = auto()
|
||||||
RWKV6 = auto()
|
RWKV6 = auto()
|
||||||
|
|
@ -535,8 +536,11 @@ class MODEL_TENSOR(IntEnum):
|
||||||
FFN_GATE_INP = auto()
|
FFN_GATE_INP = auto()
|
||||||
FFN_GATE_INP_SHEXP = auto()
|
FFN_GATE_INP_SHEXP = auto()
|
||||||
FFN_NORM = auto()
|
FFN_NORM = auto()
|
||||||
FFN_PRE_NORM = auto()
|
FFN_PRE_NORM = auto() # alias of FFN_NORM
|
||||||
|
FFN_PRE_NORM_2 = auto() # gemma4
|
||||||
FFN_POST_NORM = auto()
|
FFN_POST_NORM = auto()
|
||||||
|
FFN_POST_NORM_1 = auto() # gemma4
|
||||||
|
FFN_POST_NORM_2 = auto() # gemma4
|
||||||
FFN_GATE = auto()
|
FFN_GATE = auto()
|
||||||
FFN_DOWN = auto()
|
FFN_DOWN = auto()
|
||||||
FFN_UP = auto()
|
FFN_UP = auto()
|
||||||
|
|
@ -558,6 +562,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
ATTN_Q_NORM = auto()
|
ATTN_Q_NORM = auto()
|
||||||
ATTN_K_NORM = auto()
|
ATTN_K_NORM = auto()
|
||||||
LAYER_OUT_NORM = auto()
|
LAYER_OUT_NORM = auto()
|
||||||
|
LAYER_OUT_SCALE = auto()
|
||||||
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
|
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
|
||||||
PER_LAYER_MODEL_PROJ = auto() # gemma3n
|
PER_LAYER_MODEL_PROJ = auto() # gemma3n
|
||||||
PER_LAYER_INP_GATE = auto() # gemma3n
|
PER_LAYER_INP_GATE = auto() # gemma3n
|
||||||
|
|
@ -722,8 +727,11 @@ class MODEL_TENSOR(IntEnum):
|
||||||
V_ENC_FFN_UP = auto()
|
V_ENC_FFN_UP = auto()
|
||||||
V_ENC_FFN_GATE = auto()
|
V_ENC_FFN_GATE = auto()
|
||||||
V_ENC_FFN_DOWN = auto()
|
V_ENC_FFN_DOWN = auto()
|
||||||
|
V_ENC_ATTN_POST_NORM = auto() # gemma4
|
||||||
|
V_ENC_FFN_POST_NORM = auto()
|
||||||
V_LAYER_SCALE_1 = auto()
|
V_LAYER_SCALE_1 = auto()
|
||||||
V_LAYER_SCALE_2 = auto()
|
V_LAYER_SCALE_2 = auto()
|
||||||
|
V_LAYER_OUT_SCALE = auto()
|
||||||
V_PRE_NORM = auto()
|
V_PRE_NORM = auto()
|
||||||
V_POST_NORM = auto()
|
V_POST_NORM = auto()
|
||||||
V_MM_POST_NORM = auto()
|
V_MM_POST_NORM = auto()
|
||||||
|
|
@ -761,6 +769,8 @@ class MODEL_TENSOR(IntEnum):
|
||||||
V_MM_GATE = auto() # cogvlm
|
V_MM_GATE = auto() # cogvlm
|
||||||
V_TOK_BOI = auto() # cogvlm
|
V_TOK_BOI = auto() # cogvlm
|
||||||
V_TOK_EOI = auto() # cogvlm
|
V_TOK_EOI = auto() # cogvlm
|
||||||
|
V_STD_BIAS = auto() # gemma4
|
||||||
|
V_STD_SCALE = auto() # gemma4
|
||||||
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
||||||
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
|
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
|
||||||
V_SAM_PRE_NORM = auto() # Deepseek-OCR
|
V_SAM_PRE_NORM = auto() # Deepseek-OCR
|
||||||
|
|
@ -781,6 +791,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
A_ENC_EMBD_POS = auto()
|
A_ENC_EMBD_POS = auto()
|
||||||
A_ENC_EMBD_NORM = auto()
|
A_ENC_EMBD_NORM = auto()
|
||||||
A_ENC_EMBD_TO_LOGITS = auto() # lfm2
|
A_ENC_EMBD_TO_LOGITS = auto() # lfm2
|
||||||
|
A_ENC_INP_PROJ = auto() # gemma4
|
||||||
A_ENC_CONV1D = auto()
|
A_ENC_CONV1D = auto()
|
||||||
A_ENC_CONV1D_NORM = auto() # gemma3n
|
A_ENC_CONV1D_NORM = auto() # gemma3n
|
||||||
A_PRE_NORM = auto()
|
A_PRE_NORM = auto()
|
||||||
|
|
@ -789,10 +800,13 @@ class MODEL_TENSOR(IntEnum):
|
||||||
A_ENC_ATTN_Q = auto()
|
A_ENC_ATTN_Q = auto()
|
||||||
A_ENC_ATTN_K = auto()
|
A_ENC_ATTN_K = auto()
|
||||||
A_ENC_ATTN_V = auto()
|
A_ENC_ATTN_V = auto()
|
||||||
|
A_ENC_ATTN_POST_NORM = auto()
|
||||||
|
A_ENC_ATTN_PRE_NORM = auto()
|
||||||
|
A_ENC_ATTN_K_REL = auto() # gemma4
|
||||||
A_ENC_PER_DIM_SCALE = auto() # gemma3n
|
A_ENC_PER_DIM_SCALE = auto() # gemma3n
|
||||||
A_ENC_INPUT_NORM = auto()
|
A_ENC_INPUT_NORM = auto()
|
||||||
A_ENC_OUTPUT = auto()
|
A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT
|
||||||
A_ENC_OUTPUT_NORM = auto()
|
A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT
|
||||||
A_ENC_FFN_UP = auto()
|
A_ENC_FFN_UP = auto()
|
||||||
A_ENC_FFN_NORM = auto()
|
A_ENC_FFN_NORM = auto()
|
||||||
A_ENC_FFN_POST_NORM = auto() # gemma3n
|
A_ENC_FFN_POST_NORM = auto() # gemma3n
|
||||||
|
|
@ -813,6 +827,8 @@ class MODEL_TENSOR(IntEnum):
|
||||||
A_MM_HARD_EMB_NORM = auto() # gemma3n
|
A_MM_HARD_EMB_NORM = auto() # gemma3n
|
||||||
A_MM_SOFT_EMB_NORM = auto() # gemma3n
|
A_MM_SOFT_EMB_NORM = auto() # gemma3n
|
||||||
A_MM_INP_PROJ = auto() # gemma3n
|
A_MM_INP_PROJ = auto() # gemma3n
|
||||||
|
A_PER_DIM_K_SCALE = auto() # gemma4
|
||||||
|
A_PER_DIM_SCALE = auto() # gemma4
|
||||||
# nextn/mtp
|
# nextn/mtp
|
||||||
NEXTN_EH_PROJ = auto()
|
NEXTN_EH_PROJ = auto()
|
||||||
NEXTN_EMBED_TOKENS = auto()
|
NEXTN_EMBED_TOKENS = auto()
|
||||||
|
|
@ -882,6 +898,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.GEMMA2: "gemma2",
|
MODEL_ARCH.GEMMA2: "gemma2",
|
||||||
MODEL_ARCH.GEMMA3: "gemma3",
|
MODEL_ARCH.GEMMA3: "gemma3",
|
||||||
MODEL_ARCH.GEMMA3N: "gemma3n",
|
MODEL_ARCH.GEMMA3N: "gemma3n",
|
||||||
|
MODEL_ARCH.GEMMA4: "gemma4",
|
||||||
MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding",
|
MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding",
|
||||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||||
MODEL_ARCH.RWKV6: "rwkv6",
|
MODEL_ARCH.RWKV6: "rwkv6",
|
||||||
|
|
@ -1000,6 +1017,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||||
|
MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4
|
||||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||||
|
|
@ -1019,6 +1039,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
|
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
|
||||||
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
|
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
|
||||||
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||||
|
MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale",
|
||||||
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
|
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
|
||||||
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
|
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
|
||||||
MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
|
MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
|
||||||
|
|
@ -1183,8 +1204,11 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
|
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
|
||||||
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
|
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
|
||||||
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
|
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
|
||||||
|
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm",
|
||||||
|
MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm",
|
||||||
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
|
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
|
||||||
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
|
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
|
||||||
|
MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale",
|
||||||
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
|
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
|
||||||
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
|
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
|
||||||
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
|
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
|
||||||
|
|
@ -1222,6 +1246,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.V_MM_GATE: "mm.gate",
|
MODEL_TENSOR.V_MM_GATE: "mm.gate",
|
||||||
MODEL_TENSOR.V_TOK_BOI: "v.boi",
|
MODEL_TENSOR.V_TOK_BOI: "v.boi",
|
||||||
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
|
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
|
||||||
|
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
|
||||||
|
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
|
||||||
# DeepSeek-OCR SAM
|
# DeepSeek-OCR SAM
|
||||||
MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
|
MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
|
||||||
MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
|
MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
|
||||||
|
|
@ -1243,6 +1269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
||||||
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
||||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
||||||
|
MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection",
|
||||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
|
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
|
||||||
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
||||||
|
|
@ -1251,6 +1278,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
|
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
|
||||||
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
|
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
|
||||||
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
|
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm",
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm",
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel",
|
||||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
|
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
|
||||||
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
|
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
|
||||||
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
|
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
|
||||||
|
|
@ -1275,6 +1305,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
|
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
|
||||||
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
|
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
|
||||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
|
MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
|
||||||
|
MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4
|
||||||
|
MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4
|
||||||
# lfm2 audio
|
# lfm2 audio
|
||||||
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
|
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
|
||||||
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
|
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
|
||||||
|
|
@ -1319,8 +1351,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.V_ENC_FFN_UP,
|
MODEL_TENSOR.V_ENC_FFN_UP,
|
||||||
MODEL_TENSOR.V_ENC_FFN_GATE,
|
MODEL_TENSOR.V_ENC_FFN_GATE,
|
||||||
MODEL_TENSOR.V_ENC_FFN_DOWN,
|
MODEL_TENSOR.V_ENC_FFN_DOWN,
|
||||||
|
MODEL_TENSOR.V_ENC_ATTN_POST_NORM,
|
||||||
|
MODEL_TENSOR.V_ENC_FFN_POST_NORM,
|
||||||
MODEL_TENSOR.V_LAYER_SCALE_1,
|
MODEL_TENSOR.V_LAYER_SCALE_1,
|
||||||
MODEL_TENSOR.V_LAYER_SCALE_2,
|
MODEL_TENSOR.V_LAYER_SCALE_2,
|
||||||
|
MODEL_TENSOR.V_LAYER_OUT_SCALE,
|
||||||
MODEL_TENSOR.V_PRE_NORM,
|
MODEL_TENSOR.V_PRE_NORM,
|
||||||
MODEL_TENSOR.V_POST_NORM,
|
MODEL_TENSOR.V_POST_NORM,
|
||||||
MODEL_TENSOR.V_MM_POST_NORM,
|
MODEL_TENSOR.V_MM_POST_NORM,
|
||||||
|
|
@ -1358,6 +1393,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.V_MM_GATE,
|
MODEL_TENSOR.V_MM_GATE,
|
||||||
MODEL_TENSOR.V_TOK_BOI,
|
MODEL_TENSOR.V_TOK_BOI,
|
||||||
MODEL_TENSOR.V_TOK_EOI,
|
MODEL_TENSOR.V_TOK_EOI,
|
||||||
|
MODEL_TENSOR.V_STD_BIAS,
|
||||||
|
MODEL_TENSOR.V_STD_SCALE,
|
||||||
MODEL_TENSOR.V_SAM_POS_EMBD,
|
MODEL_TENSOR.V_SAM_POS_EMBD,
|
||||||
MODEL_TENSOR.V_SAM_PATCH_EMBD,
|
MODEL_TENSOR.V_SAM_PATCH_EMBD,
|
||||||
MODEL_TENSOR.V_SAM_PRE_NORM,
|
MODEL_TENSOR.V_SAM_PRE_NORM,
|
||||||
|
|
@ -1375,6 +1412,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.A_ENC_EMBD_POS,
|
MODEL_TENSOR.A_ENC_EMBD_POS,
|
||||||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
||||||
|
MODEL_TENSOR.A_ENC_INP_PROJ,
|
||||||
MODEL_TENSOR.A_ENC_CONV1D,
|
MODEL_TENSOR.A_ENC_CONV1D,
|
||||||
MODEL_TENSOR.A_ENC_CONV1D_NORM,
|
MODEL_TENSOR.A_ENC_CONV1D_NORM,
|
||||||
MODEL_TENSOR.A_PRE_NORM,
|
MODEL_TENSOR.A_PRE_NORM,
|
||||||
|
|
@ -1383,6 +1421,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.A_ENC_ATTN_Q,
|
MODEL_TENSOR.A_ENC_ATTN_Q,
|
||||||
MODEL_TENSOR.A_ENC_ATTN_K,
|
MODEL_TENSOR.A_ENC_ATTN_K,
|
||||||
MODEL_TENSOR.A_ENC_ATTN_V,
|
MODEL_TENSOR.A_ENC_ATTN_V,
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_POST_NORM,
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM,
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_K_REL,
|
||||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
|
MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
|
||||||
MODEL_TENSOR.A_ENC_INPUT_NORM,
|
MODEL_TENSOR.A_ENC_INPUT_NORM,
|
||||||
MODEL_TENSOR.A_ENC_OUTPUT,
|
MODEL_TENSOR.A_ENC_OUTPUT,
|
||||||
|
|
@ -1416,6 +1457,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
|
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
|
||||||
MODEL_TENSOR.A_MM_EMBEDDING,
|
MODEL_TENSOR.A_MM_EMBEDDING,
|
||||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
||||||
|
MODEL_TENSOR.A_PER_DIM_K_SCALE,
|
||||||
|
MODEL_TENSOR.A_PER_DIM_SCALE,
|
||||||
],
|
],
|
||||||
MODEL_ARCH.LLAMA: [
|
MODEL_ARCH.LLAMA: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
|
@ -2273,6 +2316,38 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.LAUREL_R,
|
MODEL_TENSOR.LAUREL_R,
|
||||||
MODEL_TENSOR.LAUREL_POST_NORM,
|
MODEL_TENSOR.LAUREL_POST_NORM,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.GEMMA4: [
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_Q_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_K_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_UP_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_POST_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_PRE_NORM,
|
||||||
|
MODEL_TENSOR.FFN_PRE_NORM_2,
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM,
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM_1,
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM_2,
|
||||||
|
MODEL_TENSOR.LAYER_OUT_SCALE,
|
||||||
|
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
|
||||||
|
MODEL_TENSOR.PER_LAYER_INP_GATE,
|
||||||
|
MODEL_TENSOR.PER_LAYER_PROJ,
|
||||||
|
MODEL_TENSOR.PER_LAYER_PROJ_NORM,
|
||||||
|
MODEL_TENSOR.PER_LAYER_POST_NORM,
|
||||||
|
],
|
||||||
MODEL_ARCH.GEMMA_EMBEDDING: [
|
MODEL_ARCH.GEMMA_EMBEDDING: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT,
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
|
@ -4010,6 +4085,8 @@ class VisionProjectorType:
|
||||||
GEMMA3 = "gemma3"
|
GEMMA3 = "gemma3"
|
||||||
GEMMA3NV = "gemma3nv"
|
GEMMA3NV = "gemma3nv"
|
||||||
GEMMA3NA = "gemma3na"
|
GEMMA3NA = "gemma3na"
|
||||||
|
GEMMA4V = "gemma4v"
|
||||||
|
GEMMA4A = "gemma4a"
|
||||||
PHI4 = "phi4"
|
PHI4 = "phi4"
|
||||||
IDEFICS3 = "idefics3"
|
IDEFICS3 = "idefics3"
|
||||||
PIXTRAL = "pixtral"
|
PIXTRAL = "pixtral"
|
||||||
|
|
|
||||||
|
|
@ -799,6 +799,7 @@ class GGUFWriter:
|
||||||
def add_shared_kv_layers(self, value: int) -> None:
|
def add_shared_kv_layers(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
|
self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
# if input is array, true means SWA and false means full_attention for each layer
|
||||||
def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
|
def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
|
||||||
key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
|
key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
|
||||||
if isinstance(value, int):
|
if isinstance(value, int):
|
||||||
|
|
|
||||||
|
|
@ -401,6 +401,10 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.pre_mlp_layernorm", # afmoe
|
"model.layers.{bid}.pre_mlp_layernorm", # afmoe
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_PRE_NORM_2: (
|
||||||
|
"model.layers.{bid}.pre_feedforward_layernorm_2", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
# Post feed-forward norm
|
# Post feed-forward norm
|
||||||
MODEL_TENSOR.FFN_POST_NORM: (
|
MODEL_TENSOR.FFN_POST_NORM: (
|
||||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
|
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
|
||||||
|
|
@ -411,6 +415,14 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.post_moe_norm", # grok-2
|
"model.layers.{bid}.post_moe_norm", # grok-2
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM_1: (
|
||||||
|
"model.layers.{bid}.post_feedforward_layernorm_1", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM_2: (
|
||||||
|
"model.layers.{bid}.post_feedforward_layernorm_2", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP: (
|
MODEL_TENSOR.FFN_GATE_INP: (
|
||||||
"layers.{bid}.feed_forward.gate", # mixtral
|
"layers.{bid}.feed_forward.gate", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
|
"model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
|
||||||
|
|
@ -428,6 +440,7 @@ class TensorNameMap:
|
||||||
"layers.{bid}.gate", # mistral-large
|
"layers.{bid}.gate", # mistral-large
|
||||||
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
|
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
|
||||||
"model.layers.{bid}.moe.gate", # step3.5
|
"model.layers.{bid}.moe.gate", # step3.5
|
||||||
|
"model.layers.{bid}.router.proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||||
|
|
@ -570,6 +583,7 @@ class TensorNameMap:
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_UP_EXP: (
|
MODEL_TENSOR.FFN_GATE_UP_EXP: (
|
||||||
"model.layers.{bid}.mlp.experts.gate_up_proj",
|
"model.layers.{bid}.mlp.experts.gate_up_proj",
|
||||||
|
"model.layers.{bid}.experts.gate_up_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.MOE_LATENT_DOWN: (
|
MODEL_TENSOR.MOE_LATENT_DOWN: (
|
||||||
|
|
@ -629,6 +643,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
|
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
|
||||||
"model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
|
"model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
|
||||||
"model.layers.{bid}.moe.down_proj", # step3.5
|
"model.layers.{bid}.moe.down_proj", # step3.5
|
||||||
|
"model.layers.{bid}.experts.down_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||||
|
|
@ -693,6 +708,10 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.final_layernorm", # bailingmoe2
|
"model.layers.{bid}.final_layernorm", # bailingmoe2
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.LAYER_OUT_SCALE: (
|
||||||
|
"model.layers.{bid}.layer_scalar", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
|
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
|
||||||
"model.embed_tokens_per_layer", # gemma3n
|
"model.embed_tokens_per_layer", # gemma3n
|
||||||
),
|
),
|
||||||
|
|
@ -1383,6 +1402,7 @@ class TensorNameMap:
|
||||||
"model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
|
"model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
|
||||||
"siglip2.vision_model.embeddings.patch_embedding",
|
"siglip2.vision_model.embeddings.patch_embedding",
|
||||||
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
|
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
|
||||||
|
"model.vision_tower.patch_embedder.input_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_EMBD_NORM: (
|
MODEL_TENSOR.V_ENC_EMBD_NORM: (
|
||||||
|
|
@ -1400,6 +1420,7 @@ class TensorNameMap:
|
||||||
"model.vision.patch_embedding.position_embedding", # cogvlm
|
"model.vision.patch_embedding.position_embedding", # cogvlm
|
||||||
"visual.embeddings.position_embedding", # glm4v
|
"visual.embeddings.position_embedding", # glm4v
|
||||||
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
|
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
|
||||||
|
"model.vision_tower.patch_embedder.position_embedding_table", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||||
|
|
@ -1430,12 +1451,14 @@ class TensorNameMap:
|
||||||
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
|
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
|
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
|
||||||
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
|
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
|
||||||
|
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
|
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
|
||||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
|
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
|
||||||
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
|
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
|
||||||
"visual.blocks.{bid}.attn.q_norm", # GLM-OCR
|
"visual.blocks.{bid}.attn.q_norm", # GLM-OCR
|
||||||
|
"vision_model.model.layers.{bid}.self_attn.q_norm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_ATTN_K: (
|
MODEL_TENSOR.V_ENC_ATTN_K: (
|
||||||
|
|
@ -1450,12 +1473,14 @@ class TensorNameMap:
|
||||||
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
|
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
|
||||||
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
|
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||||
|
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
|
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
|
||||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
|
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
|
||||||
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
|
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
|
||||||
"visual.blocks.{bid}.attn.k_norm", # GLM-OCR
|
"visual.blocks.{bid}.attn.k_norm", # GLM-OCR
|
||||||
|
"vision_model.model.layers.{bid}.self_attn.k_norm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_ATTN_V: (
|
MODEL_TENSOR.V_ENC_ATTN_V: (
|
||||||
|
|
@ -1470,6 +1495,7 @@ class TensorNameMap:
|
||||||
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
|
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||||
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
|
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
|
||||||
|
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||||
|
|
@ -1480,7 +1506,7 @@ class TensorNameMap:
|
||||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
|
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
|
||||||
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
|
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
|
||||||
"vision_model.model.layers.{bid}.input_layernorm", # llama4
|
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
|
||||||
"visual.blocks.{bid}.norm1", # qwen2vl
|
"visual.blocks.{bid}.norm1", # qwen2vl
|
||||||
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
|
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
|
||||||
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
|
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
|
||||||
|
|
@ -1505,6 +1531,7 @@ class TensorNameMap:
|
||||||
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
|
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
||||||
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
||||||
|
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
|
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
|
||||||
|
|
@ -1522,6 +1549,7 @@ class TensorNameMap:
|
||||||
"model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
|
"model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
|
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||||
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
|
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
|
||||||
|
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||||
|
|
@ -1540,12 +1568,14 @@ class TensorNameMap:
|
||||||
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
|
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
|
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
|
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
|
||||||
|
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_FFN_GATE: (
|
MODEL_TENSOR.V_ENC_FFN_GATE: (
|
||||||
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
|
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
|
||||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
|
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
|
||||||
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
|
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
|
||||||
|
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||||
|
|
@ -1564,6 +1594,15 @@ class TensorNameMap:
|
||||||
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
|
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
|
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
|
||||||
|
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
|
||||||
|
"vision_model.model.layers.{bid}.post_attention_layernorm", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.V_ENC_FFN_POST_NORM: (
|
||||||
|
"vision_model.model.layers.{bid}.post_feedforward_layernorm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
||||||
|
|
@ -1576,6 +1615,10 @@ class TensorNameMap:
|
||||||
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
|
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.V_LAYER_OUT_SCALE: (
|
||||||
|
"vision_model.model.layers.{bid}.layer_scalar", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_PRE_NORM: (
|
MODEL_TENSOR.V_PRE_NORM: (
|
||||||
"vision_tower.vision_model.pre_layrnorm",
|
"vision_tower.vision_model.pre_layrnorm",
|
||||||
"vision_tower.ln_pre", # pixtral-hf
|
"vision_tower.ln_pre", # pixtral-hf
|
||||||
|
|
@ -1763,6 +1806,14 @@ class TensorNameMap:
|
||||||
"model.vision.eoi", # cogvlm
|
"model.vision.eoi", # cogvlm
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.V_STD_BIAS: (
|
||||||
|
"model.vision_tower.std_bias", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.V_STD_SCALE: (
|
||||||
|
"model.vision_tower.std_scale", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
# audio (mtmd)
|
# audio (mtmd)
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_EMBD_POS: (
|
MODEL_TENSOR.A_ENC_EMBD_POS: (
|
||||||
|
|
@ -1782,10 +1833,15 @@ class TensorNameMap:
|
||||||
"audio_tower.conv{bid}", # ultravox
|
"audio_tower.conv{bid}", # ultravox
|
||||||
"conformer.pre_encode.conv.{bid}", # lfm2
|
"conformer.pre_encode.conv.{bid}", # lfm2
|
||||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
|
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
|
||||||
|
"conformer.subsample_conv_projection.layer{bid}.conv", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
|
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
|
||||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
|
"conformer.subsample_conv_projection.layer{bid}.norm", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_ENC_INP_PROJ: (
|
||||||
|
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_PRE_NORM: (),
|
MODEL_TENSOR.A_PRE_NORM: (),
|
||||||
|
|
@ -1799,22 +1855,38 @@ class TensorNameMap:
|
||||||
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
|
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
|
||||||
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
||||||
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
||||||
|
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_ATTN_K: (
|
MODEL_TENSOR.A_ENC_ATTN_K: (
|
||||||
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
|
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
|
||||||
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
||||||
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
||||||
|
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_ATTN_V: (
|
MODEL_TENSOR.A_ENC_ATTN_V: (
|
||||||
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
|
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
|
||||||
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
||||||
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
||||||
|
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
|
||||||
|
"conformer.layers.{bid}.self_attn.relative_k_proj", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_POST_NORM: (
|
||||||
|
"conformer.layers.{bid}.norm_post_attn", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: (
|
||||||
|
"conformer.layers.{bid}.norm_pre_attn", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
|
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
|
||||||
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
|
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
|
||||||
|
"conformer.layers.{bid}.self_attn.per_dim_scale", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
|
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
|
||||||
|
|
@ -1831,6 +1903,7 @@ class TensorNameMap:
|
||||||
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
|
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
|
||||||
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
||||||
"conformer.layers.{bid}.attention.post", # gemma3n
|
"conformer.layers.{bid}.attention.post", # gemma3n
|
||||||
|
"conformer.layers.{bid}.self_attn.post", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
||||||
|
|
@ -1842,10 +1915,12 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
||||||
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
||||||
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
||||||
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward1.post_layer_norm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_SCALE: (
|
MODEL_TENSOR.A_ENC_FFN_SCALE: (
|
||||||
|
|
@ -1856,6 +1931,7 @@ class TensorNameMap:
|
||||||
"audio_tower.layers.{bid}.fc1", # ultravox
|
"audio_tower.layers.{bid}.fc1", # ultravox
|
||||||
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
||||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
||||||
|
|
@ -1864,25 +1940,30 @@ class TensorNameMap:
|
||||||
"audio_tower.layers.{bid}.fc2", # ultravox
|
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||||
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
||||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
||||||
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
||||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
||||||
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
||||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
||||||
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
||||||
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
||||||
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
|
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
|
||||||
|
"conformer.layers.{bid}.feed_forward2.post_layer_norm", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
|
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
|
||||||
|
|
@ -1904,7 +1985,8 @@ class TensorNameMap:
|
||||||
|
|
||||||
MODEL_TENSOR.A_ENC_OUT: (
|
MODEL_TENSOR.A_ENC_OUT: (
|
||||||
"conformer.pre_encode.out", # lfm2
|
"conformer.pre_encode.out", # lfm2
|
||||||
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
|
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported)
|
||||||
|
"conformer.output_proj", # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
# note: some tensors below have an "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
# note: some tensors below have an "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||||
|
|
@ -1918,6 +2000,7 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.A_MMPROJ_FC: (
|
MODEL_TENSOR.A_MMPROJ_FC: (
|
||||||
"audio.multi_modal_projector.linear", # qwen2audio
|
"audio.multi_modal_projector.linear", # qwen2audio
|
||||||
"audio_tower.proj", # qwen2omni
|
"audio_tower.proj", # qwen2omni
|
||||||
|
"model.audio_tower.output_proj" # gemma4
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_MM_NORM_PRE: (
|
MODEL_TENSOR.A_MM_NORM_PRE: (
|
||||||
|
|
@ -1953,6 +2036,14 @@ class TensorNameMap:
|
||||||
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_PER_DIM_K_SCALE: (
|
||||||
|
"conformer.layers.{bid}.attention.attn.per_dim_key_scale", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_PER_DIM_SCALE: (
|
||||||
|
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4
|
||||||
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.A_MM_EMBEDDING: (
|
MODEL_TENSOR.A_MM_EMBEDDING: (
|
||||||
"model.embed_audio.embedding", # gemma3n
|
"model.embed_audio.embedding", # gemma3n
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -73,6 +73,7 @@ add_library(llama
|
||||||
models/gemma2-iswa.cpp
|
models/gemma2-iswa.cpp
|
||||||
models/gemma3.cpp
|
models/gemma3.cpp
|
||||||
models/gemma3n-iswa.cpp
|
models/gemma3n-iswa.cpp
|
||||||
|
models/gemma4-iswa.cpp
|
||||||
models/glm4-moe.cpp
|
models/glm4-moe.cpp
|
||||||
models/glm4.cpp
|
models/glm4.cpp
|
||||||
models/gpt2.cpp
|
models/gpt2.cpp
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_GEMMA2, "gemma2" },
|
{ LLM_ARCH_GEMMA2, "gemma2" },
|
||||||
{ LLM_ARCH_GEMMA3, "gemma3" },
|
{ LLM_ARCH_GEMMA3, "gemma3" },
|
||||||
{ LLM_ARCH_GEMMA3N, "gemma3n" },
|
{ LLM_ARCH_GEMMA3N, "gemma3n" },
|
||||||
|
{ LLM_ARCH_GEMMA4, "gemma4" },
|
||||||
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
|
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
|
||||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||||
{ LLM_ARCH_MAMBA, "mamba" },
|
{ LLM_ARCH_MAMBA, "mamba" },
|
||||||
|
|
@ -165,6 +166,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||||
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
||||||
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
|
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
|
||||||
|
{ LLM_KV_EMBEDDING_LENGTH_PER_LAYER, "%s.embedding_length_per_layer_input" },
|
||||||
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
|
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
|
||||||
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
||||||
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
||||||
|
|
@ -238,6 +240,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
|
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
|
||||||
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
|
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
|
||||||
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
|
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
|
||||||
|
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
|
||||||
|
|
||||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||||
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
|
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
|
||||||
|
|
@ -364,6 +367,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
||||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||||
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
||||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_POST_NORM_1, "blk.%d.post_ffw_norm_1" },
|
||||||
|
{ LLM_TENSOR_FFN_POST_NORM_2, "blk.%d.post_ffw_norm_2" },
|
||||||
|
{ LLM_TENSOR_FFN_PRE_NORM_2, "blk.%d.pre_ffw_norm_2" },
|
||||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||||
|
|
@ -373,6 +379,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
||||||
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
||||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||||
|
{ LLM_TENSOR_LAYER_OUT_SCALE, "blk.%d.layer_output_scale" },
|
||||||
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||||
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
|
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
|
||||||
|
|
@ -1342,6 +1349,38 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||||
LLM_TENSOR_LAUREL_R,
|
LLM_TENSOR_LAUREL_R,
|
||||||
LLM_TENSOR_LAUREL_POST_NORM,
|
LLM_TENSOR_LAUREL_POST_NORM,
|
||||||
};
|
};
|
||||||
|
case LLM_ARCH_GEMMA4:
|
||||||
|
return {
|
||||||
|
LLM_TENSOR_ROPE_FREQS,
|
||||||
|
LLM_TENSOR_TOKEN_EMBD,
|
||||||
|
LLM_TENSOR_OUTPUT_NORM,
|
||||||
|
LLM_TENSOR_ATTN_NORM,
|
||||||
|
LLM_TENSOR_ATTN_Q,
|
||||||
|
LLM_TENSOR_ATTN_Q_NORM,
|
||||||
|
LLM_TENSOR_ATTN_K,
|
||||||
|
LLM_TENSOR_ATTN_K_NORM,
|
||||||
|
LLM_TENSOR_ATTN_V,
|
||||||
|
LLM_TENSOR_ATTN_OUT,
|
||||||
|
LLM_TENSOR_ATTN_POST_NORM,
|
||||||
|
LLM_TENSOR_FFN_NORM,
|
||||||
|
LLM_TENSOR_FFN_GATE,
|
||||||
|
LLM_TENSOR_FFN_DOWN,
|
||||||
|
LLM_TENSOR_FFN_UP,
|
||||||
|
LLM_TENSOR_FFN_GATE_UP_EXPS,
|
||||||
|
LLM_TENSOR_FFN_DOWN_EXPS,
|
||||||
|
LLM_TENSOR_FFN_GATE_INP,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM_1,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM_2,
|
||||||
|
LLM_TENSOR_FFN_PRE_NORM_2,
|
||||||
|
LLM_TENSOR_LAYER_OUT_SCALE,
|
||||||
|
LLM_TENSOR_PER_LAYER_TOKEN_EMBD,
|
||||||
|
LLM_TENSOR_PER_LAYER_MODEL_PROJ,
|
||||||
|
LLM_TENSOR_PER_LAYER_PROJ_NORM,
|
||||||
|
LLM_TENSOR_PER_LAYER_INP_GATE,
|
||||||
|
LLM_TENSOR_PER_LAYER_PROJ,
|
||||||
|
LLM_TENSOR_PER_LAYER_POST_NORM,
|
||||||
|
};
|
||||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||||
return {
|
return {
|
||||||
LLM_TENSOR_TOKEN_EMBD,
|
LLM_TENSOR_TOKEN_EMBD,
|
||||||
|
|
@ -2654,11 +2693,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||||
{LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_ATTN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
|
{LLM_TENSOR_FFN_PRE_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
|
{LLM_TENSOR_FFN_POST_NORM_1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
|
{LLM_TENSOR_FFN_POST_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_FFN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_FFN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_FFN_NORM_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_FFN_NORM_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_LAYER_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_LAYER_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
|
{LLM_TENSOR_LAYER_OUT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_ATTN_Q_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_Q_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
{LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
|
|
|
||||||
|
|
@ -60,6 +60,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_GEMMA2,
|
LLM_ARCH_GEMMA2,
|
||||||
LLM_ARCH_GEMMA3,
|
LLM_ARCH_GEMMA3,
|
||||||
LLM_ARCH_GEMMA3N,
|
LLM_ARCH_GEMMA3N,
|
||||||
|
LLM_ARCH_GEMMA4,
|
||||||
LLM_ARCH_GEMMA_EMBEDDING,
|
LLM_ARCH_GEMMA_EMBEDDING,
|
||||||
LLM_ARCH_STARCODER2,
|
LLM_ARCH_STARCODER2,
|
||||||
LLM_ARCH_MAMBA,
|
LLM_ARCH_MAMBA,
|
||||||
|
|
@ -169,6 +170,7 @@ enum llm_kv {
|
||||||
LLM_KV_CONTEXT_LENGTH,
|
LLM_KV_CONTEXT_LENGTH,
|
||||||
LLM_KV_EMBEDDING_LENGTH,
|
LLM_KV_EMBEDDING_LENGTH,
|
||||||
LLM_KV_EMBEDDING_LENGTH_OUT,
|
LLM_KV_EMBEDDING_LENGTH_OUT,
|
||||||
|
LLM_KV_EMBEDDING_LENGTH_PER_LAYER,
|
||||||
LLM_KV_FEATURES_LENGTH,
|
LLM_KV_FEATURES_LENGTH,
|
||||||
LLM_KV_BLOCK_COUNT,
|
LLM_KV_BLOCK_COUNT,
|
||||||
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
||||||
|
|
@ -242,6 +244,7 @@ enum llm_kv {
|
||||||
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
||||||
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
||||||
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
||||||
|
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
|
||||||
|
|
||||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||||
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
||||||
|
|
@ -369,6 +372,9 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||||
LLM_TENSOR_FFN_NORM,
|
LLM_TENSOR_FFN_NORM,
|
||||||
LLM_TENSOR_FFN_POST_NORM,
|
LLM_TENSOR_FFN_POST_NORM,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM_1,
|
||||||
|
LLM_TENSOR_FFN_POST_NORM_2,
|
||||||
|
LLM_TENSOR_FFN_PRE_NORM_2,
|
||||||
LLM_TENSOR_FFN_GATE,
|
LLM_TENSOR_FFN_GATE,
|
||||||
LLM_TENSOR_FFN_DOWN,
|
LLM_TENSOR_FFN_DOWN,
|
||||||
LLM_TENSOR_FFN_UP,
|
LLM_TENSOR_FFN_UP,
|
||||||
|
|
@ -393,6 +399,7 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_ATTN_Q_NORM,
|
LLM_TENSOR_ATTN_Q_NORM,
|
||||||
LLM_TENSOR_ATTN_K_NORM,
|
LLM_TENSOR_ATTN_K_NORM,
|
||||||
LLM_TENSOR_LAYER_OUT_NORM,
|
LLM_TENSOR_LAYER_OUT_NORM,
|
||||||
|
LLM_TENSOR_LAYER_OUT_SCALE,
|
||||||
LLM_TENSOR_POST_ATTN_NORM,
|
LLM_TENSOR_POST_ATTN_NORM,
|
||||||
LLM_TENSOR_POST_MLP_NORM,
|
LLM_TENSOR_POST_MLP_NORM,
|
||||||
LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
|
LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
|
||||||
|
|
|
||||||
|
|
@ -209,6 +209,9 @@ struct llama_hparams {
|
||||||
// qwen3vl deepstack
|
// qwen3vl deepstack
|
||||||
uint32_t n_deepstack_layers = 0;
|
uint32_t n_deepstack_layers = 0;
|
||||||
|
|
||||||
|
// gemma4 per-layer embedding
|
||||||
|
uint32_t n_embd_per_layer = 0;
|
||||||
|
|
||||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
||||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
|
||||||
|
|
@ -1261,6 +1261,31 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
default: type = LLM_TYPE_UNKNOWN;
|
default: type = LLM_TYPE_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_GEMMA4:
|
||||||
|
{
|
||||||
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
|
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||||
|
|
||||||
|
uint32_t n_kv_shared_layers = 0;
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
|
||||||
|
|
||||||
|
hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
|
||||||
|
hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
|
||||||
|
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, hparams.n_embd_per_layer);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||||
|
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 35: type = LLM_TYPE_E2B; break;
|
||||||
|
case 42: type = LLM_TYPE_E4B; break; // to confirm: E4B or E5B?
|
||||||
|
default: type = LLM_TYPE_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||||
{
|
{
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
|
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
|
||||||
|
|
@ -4229,6 +4254,100 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
|
layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_GEMMA4:
|
||||||
|
{
|
||||||
|
const uint32_t n_embd_per_layer = hparams.n_embd_per_layer;
|
||||||
|
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||||
|
|
||||||
|
if (n_embd_head_k != n_embd_head_v) {
|
||||||
|
throw std::runtime_error("Gemma 4 requires n_embd_head_k == n_embd_head_v");
|
||||||
|
}
|
||||||
|
if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) {
|
||||||
|
throw std::runtime_error("Gemma 4 requires n_embd_head_k_swa == n_embd_head_v_swa");
|
||||||
|
}
|
||||||
|
|
||||||
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||||
|
// if output is NULL, init from the input tok embed
|
||||||
|
if (output == NULL) {
|
||||||
|
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||||
|
}
|
||||||
|
|
||||||
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||||
|
|
||||||
|
if (n_embd_per_layer > 0) {
|
||||||
|
tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_per_layer * n_layer, n_vocab}, 0);
|
||||||
|
per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_per_layer * n_layer}, 0);
|
||||||
|
per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_per_layer}, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||||
|
|
||||||
|
int rope_freqs_flag = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = layers[i];
|
||||||
|
const int64_t n_head = hparams.n_head(i);
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_k(i);
|
||||||
|
const int64_t n_embd_k = hparams.n_embd_k_gqa(i);
|
||||||
|
const int64_t n_embd_v = hparams.n_embd_v_gqa(i);
|
||||||
|
|
||||||
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
// note: use_alternative_attention (v_proj is optional, if it's not present, use k_proj)
|
||||||
|
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head * n_head}, 0);
|
||||||
|
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
|
||||||
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v}, TENSOR_NOT_REQUIRED);
|
||||||
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head * n_head, n_embd}, 0);
|
||||||
|
|
||||||
|
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head}, 0);
|
||||||
|
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head}, 0);
|
||||||
|
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1u}, TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
|
if (!hparams.is_swa(i)) {
|
||||||
|
// full_attention layers use rope_freqs for proportional rope
|
||||||
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_embd_head/2}, rope_freqs_flag);
|
||||||
|
rope_freqs_flag = TENSOR_DUPLICATED;
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle use_double_wide_mlp
|
||||||
|
int64_t n_ff_cur = hparams.n_ff(i);
|
||||||
|
|
||||||
|
// for expert layers, we use normal FFN as shared expert (same as python code)
|
||||||
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_cur}, 0);
|
||||||
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur}, 0);
|
||||||
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
|
||||||
|
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
// MoE router
|
||||||
|
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
|
||||||
|
bool has_expert = layer.ffn_gate_inp != nullptr;
|
||||||
|
|
||||||
|
// norm
|
||||||
|
if (has_expert) {
|
||||||
|
layer.ffn_gate_inp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "scale", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
layer.ffn_pre_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM_2, "weight", i), {n_embd}, 0);
|
||||||
|
layer.ffn_post_norm_1 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_1, "weight", i), {n_embd}, 0);
|
||||||
|
layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
// MoE FFN
|
||||||
|
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
|
||||||
|
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||||
|
|
||||||
|
// per-expert scale will be loaded as down_exps_s at the end of the current switch case
|
||||||
|
}
|
||||||
|
|
||||||
|
// per-layer embeddings
|
||||||
|
if (n_embd_per_layer > 0) {
|
||||||
|
layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_per_layer}, 0);
|
||||||
|
layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_per_layer, n_embd}, 0);
|
||||||
|
layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_STARCODER2:
|
case LLM_ARCH_STARCODER2:
|
||||||
{
|
{
|
||||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||||
|
|
@ -8233,7 +8352,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||||
} else {
|
} else {
|
||||||
llama_memory_i::layer_reuse_cb reuse = nullptr;
|
llama_memory_i::layer_reuse_cb reuse = nullptr;
|
||||||
|
|
||||||
if (arch == LLM_ARCH_GEMMA3N) {
|
if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
|
||||||
reuse = [&](int32_t il) {
|
reuse = [&](int32_t il) {
|
||||||
if (il >= (int32_t) hparams.n_layer_kv_from_start) {
|
if (il >= (int32_t) hparams.n_layer_kv_from_start) {
|
||||||
return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
|
return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
|
||||||
|
|
@ -8486,6 +8605,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||||
{
|
{
|
||||||
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
|
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_GEMMA4:
|
||||||
|
{
|
||||||
|
llm = std::make_unique<llm_build_gemma4_iswa>(*this, params);
|
||||||
|
} break;
|
||||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||||
{
|
{
|
||||||
llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
|
llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
|
||||||
|
|
@ -9006,6 +9129,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||||
case LLM_ARCH_GEMMA2:
|
case LLM_ARCH_GEMMA2:
|
||||||
case LLM_ARCH_GEMMA3:
|
case LLM_ARCH_GEMMA3:
|
||||||
case LLM_ARCH_GEMMA3N:
|
case LLM_ARCH_GEMMA3N:
|
||||||
|
case LLM_ARCH_GEMMA4:
|
||||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||||
case LLM_ARCH_STARCODER2:
|
case LLM_ARCH_STARCODER2:
|
||||||
case LLM_ARCH_OPENELM:
|
case LLM_ARCH_OPENELM:
|
||||||
|
|
|
||||||
|
|
@ -270,6 +270,9 @@ struct llama_layer {
|
||||||
struct ggml_tensor * ffn_norm = nullptr;
|
struct ggml_tensor * ffn_norm = nullptr;
|
||||||
struct ggml_tensor * ffn_norm_b = nullptr;
|
struct ggml_tensor * ffn_norm_b = nullptr;
|
||||||
struct ggml_tensor * ffn_post_norm = nullptr;
|
struct ggml_tensor * ffn_post_norm = nullptr;
|
||||||
|
struct ggml_tensor * ffn_post_norm_1 = nullptr; // gemma4
|
||||||
|
struct ggml_tensor * ffn_post_norm_2 = nullptr; // gemma4
|
||||||
|
struct ggml_tensor * ffn_pre_norm_2 = nullptr; // gemma4
|
||||||
struct ggml_tensor * layer_out_norm = nullptr;
|
struct ggml_tensor * layer_out_norm = nullptr;
|
||||||
struct ggml_tensor * layer_out_norm_b = nullptr;
|
struct ggml_tensor * layer_out_norm_b = nullptr;
|
||||||
struct ggml_tensor * ffn_norm_exps = nullptr;
|
struct ggml_tensor * ffn_norm_exps = nullptr;
|
||||||
|
|
@ -285,6 +288,7 @@ struct llama_layer {
|
||||||
|
|
||||||
// ff MoE
|
// ff MoE
|
||||||
struct ggml_tensor * ffn_gate_inp = nullptr;
|
struct ggml_tensor * ffn_gate_inp = nullptr;
|
||||||
|
struct ggml_tensor * ffn_gate_inp_s = nullptr; // gemma4
|
||||||
struct ggml_tensor * ffn_gate_exps = nullptr;
|
struct ggml_tensor * ffn_gate_exps = nullptr;
|
||||||
struct ggml_tensor * ffn_down_exps = nullptr;
|
struct ggml_tensor * ffn_down_exps = nullptr;
|
||||||
struct ggml_tensor * ffn_up_exps = nullptr;
|
struct ggml_tensor * ffn_up_exps = nullptr;
|
||||||
|
|
@ -483,6 +487,9 @@ struct llama_layer {
|
||||||
struct ggml_tensor * indexer_attn_k = nullptr;
|
struct ggml_tensor * indexer_attn_k = nullptr;
|
||||||
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
|
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
|
||||||
|
|
||||||
|
// gemma4 layer output scale
|
||||||
|
struct ggml_tensor * out_scale = nullptr;
|
||||||
|
|
||||||
struct llama_layer_posnet posnet;
|
struct llama_layer_posnet posnet;
|
||||||
|
|
||||||
struct llama_layer_convnext convnext;
|
struct llama_layer_convnext convnext;
|
||||||
|
|
|
||||||
|
|
@ -1863,6 +1863,18 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
special_sep_id = LLAMA_TOKEN_NULL;
|
special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
special_pad_id = 3; // <|plamo:pad|>
|
special_pad_id = 3; // <|plamo:pad|>
|
||||||
special_mask_id = LLAMA_TOKEN_NULL;
|
special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
} else if (tokenizer_model == "gemma4") {
|
||||||
|
type = LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
|
||||||
|
// default special tokens (to be read from GGUF)
|
||||||
|
special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_eos_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_unk_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
|
special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
|
tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||||||
}
|
}
|
||||||
|
|
@ -2490,6 +2502,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
|| t.first == "[EOS]" // Kimi-K2
|
|| t.first == "[EOS]" // Kimi-K2
|
||||||
|| t.first == "<|end_of_text|>"
|
|| t.first == "<|end_of_text|>"
|
||||||
|| t.first == "<end_of_utterance>" // smoldocling
|
|| t.first == "<end_of_utterance>" // smoldocling
|
||||||
|
|| t.first == "<turn|>" // gemma4
|
||||||
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
|
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
|
||||||
) {
|
) {
|
||||||
special_eog_ids.insert(t.second);
|
special_eog_ids.insert(t.second);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,311 @@
|
||||||
|
#include "models.h"
|
||||||
|
|
||||||
|
// Builds the full forward graph for Gemma 4: scaled input embeddings, optional per-layer
// input embeddings, n_layer transformer blocks (attention + dense or MoE feed-forward),
// the final norm and the lm_head with optional logit soft-capping.
llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model),
    n_embd_per_layer(model.hparams.n_embd_per_layer) {
    ggml_tensor * cur = nullptr;

    ggml_tensor * inpL = build_inp_embd(model.tok_embd);

    // important: only token embeddings are scaled by sqrt(n_embd); raw embedding inputs
    // (i.e. encoded image embeddings) are passed through with a scale of 1
    const float inp_scale = ubatch.token ? sqrtf(n_embd) : 1.0f;
    inpL = ggml_scale(ctx0, inpL, inp_scale);
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    // TODO: is causal == true correct? might need some changes
    auto * inp_attn = build_attn_inp_kv_iswa();

    // inp_per_layer shape: [n_embd_per_layer, n_tokens, n_layer]
    // stays null when the model has no per-layer token embedding table
    ggml_tensor * inp_per_layer = nullptr;
    if (model.tok_embd_per_layer) {
        inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
    }

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];

        const int64_t n_embd_head = hparams.n_embd_head_k(il);
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_v(il));

        const int64_t n_head_l    = hparams.n_head(il);
        const int64_t n_head_kv_l = hparams.n_head_kv(il);

        const float rope_base  = model.get_rope_freq_base(cparams, il);
        const float rope_scale = model.get_rope_freq_scale(cparams, il);
        const int   rope_dims  = hparams.n_rot(il);

        // on the last layer, rows that are not requested as outputs get dropped
        const bool prune_outputs = (il == n_layer - 1) && inp_out_ids;

        // norm
        cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // full_attention layers use rope_freqs for proportional rope, SWA layers use none
        ggml_tensor * freq_factors = hparams.is_swa(il) ? nullptr : layer.rope_freqs;

        // Q projection (shared for both non-KV and KV layers)
        // this is to mirror Gemma4Attention in pytorch code
        ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
        cb(Qcur, "Qcur", il);

        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_l, n_tokens);

        Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
        cb(Qcur, "Qcur_normed", il);

        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, rope_dims, rope_type, n_ctx_orig,
                             rope_base, rope_scale, ext_factor, attn_factor, beta_fast, beta_slow);
        cb(Qcur, "Qcur_pos", il);

        // self-attention
        if (hparams.has_kv(il)) {
            ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
            cb(Kcur, "Kcur", il);

            // if v_proj is not present, use Kcur as Vcur
            ggml_tensor * Vcur = Kcur;
            if (layer.wv) {
                Vcur = build_lora_mm(layer.wv, cur);
            }
            cb(Vcur, "Vcur", il);

            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_l, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv_l, n_tokens);

            // K gets a learned RMS norm, V a weightless one
            Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
            Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);

            cb(Kcur, "Kcur_normed", il);
            cb(Vcur, "Vcur_normed", il);

            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, freq_factors, rope_dims, rope_type, n_ctx_orig,
                                 rope_base, rope_scale, ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Kcur, "Kcur_pos", il);

            cur = build_attn(inp_attn, layer.wo, nullptr,
                             Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
                             hparams.f_attention_scale, il);
        } else {
            // reuse KV cache of earlier layers
            cur = build_attn(inp_attn, layer.wo, nullptr,
                             Qcur, nullptr, nullptr, nullptr, nullptr, nullptr,
                             hparams.f_attention_scale, il);
        }

        // TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing
        if (prune_outputs) {
            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
        }

        cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_post_norm", il);

        ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL);
        cb(attn_out, "attn_out", il);

        // feed-forward network: a layer is MoE when it carries a router weight
        const bool is_moe_layer = layer.ffn_gate_inp != nullptr;
        if (is_moe_layer) {
            // MLP (shared exp)
            ggml_tensor * cur_mlp = build_norm(attn_out, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
            cb(cur_mlp, "ffn_norm_1", il);

            cur_mlp = build_ffn(cur_mlp,
                                layer.ffn_up,   nullptr, nullptr,
                                layer.ffn_gate, nullptr, nullptr,
                                layer.ffn_down, nullptr, nullptr,
                                nullptr,
                                LLM_FFN_GELU, LLM_FFN_PAR, il);
            cur_mlp = build_norm(cur_mlp, layer.ffn_post_norm_1, nullptr, LLM_NORM_RMS, il);
            cb(cur_mlp, "ffn_mlp", il);

            // Expert FFN
            ggml_tensor * cur_moe = build_norm(attn_out, layer.ffn_pre_norm_2, nullptr, LLM_NORM_RMS, il);
            cb(cur_moe, "ffn_norm_2", il);

            // custom MoE logits calculation (router operates on attn_out, not cur)
            ggml_tensor * router_in = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps);
            router_in = ggml_scale(ctx0, router_in, 1.0f / sqrtf((float) n_embd));
            router_in = ggml_mul(ctx0, router_in, layer.ffn_gate_inp_s);
            ggml_tensor * logits = build_lora_mm(layer.ffn_gate_inp, router_in); // [n_expert, n_tokens]
            cb(logits, "ffn_moe_logits", il);

            cur_moe = build_moe_ffn(cur_moe,
                                    nullptr, // gate_inp
                                    nullptr, // up_exps
                                    nullptr, // gate_exps
                                    layer.ffn_down_exps,
                                    nullptr, // exp_probs_b (not used for gemma4)
                                    n_expert, n_expert_used,
                                    LLM_FFN_GELU, true,
                                    1.0f,
                                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                                    il, logits,
                                    layer.ffn_gate_up_exps,
                                    nullptr, // up_exps_s
                                    nullptr, // gate_exps_s
                                    layer.ffn_down_exps_s);
            cur_moe = build_norm(cur_moe, layer.ffn_post_norm_2, nullptr, LLM_NORM_RMS, il);
            cb(cur_moe, "ffn_moe", il);

            cur = ggml_add(ctx0, cur_mlp, cur_moe);
            cb(cur, "ffn_moe_combined", il);
        } else {
            cur = build_norm(attn_out, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                            layer.ffn_up,   nullptr, nullptr,
                            layer.ffn_gate, nullptr, nullptr,
                            layer.ffn_down, nullptr, nullptr,
                            nullptr,
                            LLM_FFN_GELU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }

        // NOTE(review): this norm passes -1 as the layer index while every other per-layer
        // norm in this loop passes il - confirm whether that is intentional
        cur = build_norm(cur, layer.ffn_post_norm, nullptr, LLM_NORM_RMS, -1);
        cb(cur, "ffn_post_norm", il);

        // residual connection
        cur = ggml_add(ctx0, cur, attn_out);

        // per-layer embedding
        if (inp_per_layer) {
            ggml_tensor * pe_in = cur;
            cb(cur, "pe_in", il);

            cur = build_lora_mm(layer.per_layer_inp_gate, cur); // [n_embd_per_layer, n_tokens]
            cur = ggml_gelu(ctx0, cur);

            ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_per_layer, n_tokens]

            // TODO @ngxson : improve this
            if (prune_outputs) {
                inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids);
            }

            cur = ggml_mul(ctx0, cur, inp_this_layer);
            cur = build_lora_mm(layer.per_layer_proj, cur); // [n_embd, n_tokens]
            cur = build_norm(cur, layer.per_layer_post_norm, nullptr, LLM_NORM_RMS, il);
            cb(cur, "per_layer_embd_out", il);

            // residual connection
            cur = ggml_add(ctx0, pe_in, cur);
        }

        // layer_scalar
        if (layer.out_scale) {
            cur = ggml_mul(ctx0, cur, layer.out_scale);
            cb(cur, "out_scaled", il);
        }

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = build_norm(inpL, model.output_norm, nullptr, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    // soft-cap the logits: cap * tanh(logits / cap), skipped when the cap is 0
    if (hparams.f_final_logit_softcapping) {
        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
        cur = ggml_tanh(ctx0, cur);
        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
    }

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
|
||||||
|
|
||||||
|
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
||||||
|
ggml_tensor * llm_build_gemma4_iswa::view_2d_slice(ggml_tensor * x, int idx) {
|
||||||
|
GGML_ASSERT(idx < (int) x->ne[2]);
|
||||||
|
return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
|
||||||
|
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
|
||||||
|
}
|
||||||
|
|
||||||
|
// equivalent to get_per_layer_inputs() in python code
|
||||||
|
// output shape: [n_embd_per_layer, n_layer, n_tokens]
|
||||||
|
ggml_tensor * llm_build_gemma4_iswa::get_per_layer_inputs() {
|
||||||
|
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
||||||
|
ggml_tensor * inp_per_layer;
|
||||||
|
if (ubatch.token) {
|
||||||
|
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
||||||
|
ggml_set_input(inp->tokens);
|
||||||
|
res->t_inp_tokens = inp->tokens;
|
||||||
|
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
|
||||||
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
|
||||||
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
|
||||||
|
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
||||||
|
res->add_input(std::move(inp));
|
||||||
|
} else {
|
||||||
|
// Vision embedding path: use padding token (ID=0) embedding
|
||||||
|
// TODO: verify if this is the correct behavior in transformers implementation
|
||||||
|
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_per_layer * n_layer
|
||||||
|
|
||||||
|
// Extract and dequantize padding token embedding (row 0)
|
||||||
|
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
||||||
|
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
|
||||||
|
|
||||||
|
// Reshape to [n_embd_per_layer, n_layer, 1]
|
||||||
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
|
||||||
|
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
||||||
|
}
|
||||||
|
return inp_per_layer;
|
||||||
|
}
|
||||||
|
|
||||||
|
// equivalent to project_per_layer_inputs() in python code
|
||||||
|
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
||||||
|
// inputs_embeds shape: [n_embd, n_tokens]
|
||||||
|
// inp_per_layer shape: [n_embd_per_layer, n_layer, n_tokens] (from get_per_layer_inputs)
|
||||||
|
// output shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||||
|
ggml_tensor * llm_build_gemma4_iswa::project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
|
||||||
|
const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
|
||||||
|
const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
|
||||||
|
|
||||||
|
ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
|
||||||
|
per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
|
||||||
|
per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_per_layer, n_layer, n_tokens);
|
||||||
|
per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, nullptr, LLM_NORM_RMS,
|
||||||
|
-1); // [n_embd_per_layer, n_layer, n_tokens]
|
||||||
|
cb(per_layer_proj, "per_layer_proj", -1);
|
||||||
|
|
||||||
|
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
|
||||||
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
||||||
|
cb(inp_per_layer, "inp_per_layer", -1);
|
||||||
|
|
||||||
|
// permute to shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||||
|
inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
|
||||||
|
return inp_per_layer;
|
||||||
|
}
|
||||||
|
|
@ -266,6 +266,17 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||||
ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
|
ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct llm_build_gemma4_iswa : public llm_graph_context {
|
||||||
|
const llama_model & model;
|
||||||
|
|
||||||
|
const int64_t n_embd_per_layer;
|
||||||
|
|
||||||
|
llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params);
|
||||||
|
ggml_tensor * view_2d_slice(ggml_tensor * x, int idx);
|
||||||
|
ggml_tensor * get_per_layer_inputs();
|
||||||
|
ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer);
|
||||||
|
};
|
||||||
|
|
||||||
struct llm_build_gemma_embedding : public llm_graph_context {
|
struct llm_build_gemma_embedding : public llm_graph_context {
|
||||||
llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
|
llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -385,6 +385,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
||||||
if (arch == LLM_ARCH_CHAMELEON) {
|
if (arch == LLM_ARCH_CHAMELEON) {
|
||||||
continue; // Only half-implemented and to be removed in the future.
|
continue; // Only half-implemented and to be removed in the future.
|
||||||
}
|
}
|
||||||
|
if (arch == LLM_ARCH_GEMMA4) {
|
||||||
|
continue; // FIXME @ngxson
|
||||||
|
}
|
||||||
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
|
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
|
||||||
continue; // FIXME
|
continue; // FIXME
|
||||||
}
|
}
|
||||||
|
|
@ -451,6 +454,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
||||||
if (arch == LLM_ARCH_CHAMELEON) {
|
if (arch == LLM_ARCH_CHAMELEON) {
|
||||||
continue; // Only half-implemented and to be removed in the future.
|
continue; // Only half-implemented and to be removed in the future.
|
||||||
}
|
}
|
||||||
|
if (arch == LLM_ARCH_GEMMA4) {
|
||||||
|
continue; // FIXME @ngxson
|
||||||
|
}
|
||||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||||
continue; // FIXME CUDA backend crashes.
|
continue; // FIXME CUDA backend crashes.
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ add_library(mtmd
|
||||||
models/models.h
|
models/models.h
|
||||||
models/cogvlm.cpp
|
models/cogvlm.cpp
|
||||||
models/conformer.cpp
|
models/conformer.cpp
|
||||||
|
models/gemma4v.cpp
|
||||||
models/glm4v.cpp
|
models/glm4v.cpp
|
||||||
models/internvl.cpp
|
models/internvl.cpp
|
||||||
models/kimivl.cpp
|
models/kimivl.cpp
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ struct clip_graph {
|
||||||
const int n_layer;
|
const int n_layer;
|
||||||
const int n_mmproj_embd;
|
const int n_mmproj_embd;
|
||||||
const float eps;
|
const float eps;
|
||||||
const float kq_scale;
|
float kq_scale; // TODO: maybe move this to hparams
|
||||||
const clip_flash_attn_type flash_attn_type;
|
const clip_flash_attn_type flash_attn_type;
|
||||||
|
|
||||||
ggml_context_ptr ctx0_ptr;
|
ggml_context_ptr ctx0_ptr;
|
||||||
|
|
|
||||||
|
|
@ -88,8 +88,11 @@
|
||||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||||
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
||||||
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
||||||
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||||
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||||
|
#define TN_LS_OUT "%s.blk.%d.out_scale.%s" // layer out scale (gemma4)
|
||||||
|
#define TN_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s" // post-attn norm (gemma4)
|
||||||
|
#define TN_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s" // post-FFN norm (gemma4)
|
||||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||||
#define TN_LN_POST "%s.post_ln.%s"
|
#define TN_LN_POST "%s.post_ln.%s"
|
||||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||||
|
|
@ -213,6 +216,10 @@
|
||||||
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
||||||
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
||||||
|
|
||||||
|
// gemma4
|
||||||
|
#define TN_STD_BIAS "v.std_bias"
|
||||||
|
#define TN_STD_SCALE "v.std_scale"
|
||||||
|
|
||||||
|
|
||||||
// align x to upper multiple of n
|
// align x to upper multiple of n
|
||||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||||
|
|
@ -233,6 +240,8 @@ enum projector_type {
|
||||||
PROJECTOR_TYPE_GEMMA3,
|
PROJECTOR_TYPE_GEMMA3,
|
||||||
PROJECTOR_TYPE_GEMMA3NV,
|
PROJECTOR_TYPE_GEMMA3NV,
|
||||||
PROJECTOR_TYPE_GEMMA3NA,
|
PROJECTOR_TYPE_GEMMA3NA,
|
||||||
|
PROJECTOR_TYPE_GEMMA4V,
|
||||||
|
PROJECTOR_TYPE_GEMMA4A,
|
||||||
PROJECTOR_TYPE_PHI4,
|
PROJECTOR_TYPE_PHI4,
|
||||||
PROJECTOR_TYPE_IDEFICS3,
|
PROJECTOR_TYPE_IDEFICS3,
|
||||||
PROJECTOR_TYPE_PIXTRAL,
|
PROJECTOR_TYPE_PIXTRAL,
|
||||||
|
|
@ -272,6 +281,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||||
|
{ PROJECTOR_TYPE_GEMMA4V, "gemma4v"},
|
||||||
|
{ PROJECTOR_TYPE_GEMMA4A, "gemma4a"},
|
||||||
{ PROJECTOR_TYPE_PHI4, "phi4"},
|
{ PROJECTOR_TYPE_PHI4, "phi4"},
|
||||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||||
|
|
@ -476,6 +487,18 @@ static std::vector<std::string> string_split_str(std::string s, const std::strin
|
||||||
return tokens;
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    // substr clamps the length, so a too-short `str` yields a shorter view that cannot compare equal
    return str.substr(0, prefix.size()) == prefix;
}
|
||||||
|
|
||||||
|
// Returns true when `str` finishes with `suffix` (an empty suffix always matches).
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    // compare only the tail of `str` that has the same length as `suffix`
    return str.substr(str.size() - suffix.size()) == suffix;
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// gguf utils
|
// gguf utils
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -143,6 +143,10 @@ struct clip_hparams {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_layer {
|
struct clip_layer {
|
||||||
|
// layernorm 1 (or layer input norm, or pre-attention norm)
|
||||||
|
ggml_tensor * ln_1_w = nullptr;
|
||||||
|
ggml_tensor * ln_1_b = nullptr;
|
||||||
|
|
||||||
// attention
|
// attention
|
||||||
ggml_tensor * k_w = nullptr;
|
ggml_tensor * k_w = nullptr;
|
||||||
ggml_tensor * k_b = nullptr;
|
ggml_tensor * k_b = nullptr;
|
||||||
|
|
@ -159,9 +163,7 @@ struct clip_layer {
|
||||||
ggml_tensor * k_norm = nullptr;
|
ggml_tensor * k_norm = nullptr;
|
||||||
ggml_tensor * q_norm = nullptr;
|
ggml_tensor * q_norm = nullptr;
|
||||||
|
|
||||||
// layernorm 1
|
ggml_tensor * attn_post_norm_w = nullptr;
|
||||||
ggml_tensor * ln_1_w = nullptr;
|
|
||||||
ggml_tensor * ln_1_b = nullptr;
|
|
||||||
|
|
||||||
ggml_tensor * ff_up_w = nullptr;
|
ggml_tensor * ff_up_w = nullptr;
|
||||||
ggml_tensor * ff_up_b = nullptr;
|
ggml_tensor * ff_up_b = nullptr;
|
||||||
|
|
@ -170,13 +172,16 @@ struct clip_layer {
|
||||||
ggml_tensor * ff_down_w = nullptr;
|
ggml_tensor * ff_down_w = nullptr;
|
||||||
ggml_tensor * ff_down_b = nullptr;
|
ggml_tensor * ff_down_b = nullptr;
|
||||||
|
|
||||||
// layernorm 2
|
// layernorm 2 (or pre-FFN norm)
|
||||||
ggml_tensor * ln_2_w = nullptr;
|
ggml_tensor * ln_2_w = nullptr;
|
||||||
ggml_tensor * ln_2_b = nullptr;
|
ggml_tensor * ln_2_b = nullptr;
|
||||||
|
|
||||||
|
ggml_tensor * ff_post_norm_w = nullptr;
|
||||||
|
|
||||||
// layer scale (no bias)
|
// layer scale (no bias)
|
||||||
ggml_tensor * ls_1_w = nullptr;
|
ggml_tensor * ls_1_w = nullptr;
|
||||||
ggml_tensor * ls_2_w = nullptr;
|
ggml_tensor * ls_2_w = nullptr;
|
||||||
|
ggml_tensor * ls_out_w = nullptr; // gemma4
|
||||||
|
|
||||||
// qwen3vl deepstack merger
|
// qwen3vl deepstack merger
|
||||||
ggml_tensor * deepstack_norm_w = nullptr;
|
ggml_tensor * deepstack_norm_w = nullptr;
|
||||||
|
|
@ -437,6 +442,18 @@ struct clip_model {
|
||||||
ggml_tensor * pre_encode_out_w = nullptr;
|
ggml_tensor * pre_encode_out_w = nullptr;
|
||||||
ggml_tensor * pre_encode_out_b = nullptr;
|
ggml_tensor * pre_encode_out_b = nullptr;
|
||||||
|
|
||||||
|
// gemma4
|
||||||
|
ggml_tensor * std_bias = nullptr;
|
||||||
|
ggml_tensor * std_scale = nullptr;
|
||||||
|
// Gemma4ClippableLinear
// Four clamp bounds kept per entry of clamp_info_map.
// NOTE(review): presumably inp_* bound a linear layer's input and out_* its output;
// confirm against the loader and graph code.
struct clamp_info {
    float inp_max; // upper input bound
    float inp_min; // lower input bound
    float out_max; // upper output bound
    float out_min; // lower output bound
};
|
||||||
|
std::map<std::string, clamp_info> clamp_info_map;
|
||||||
|
|
||||||
bool audio_has_avgpool() const {
|
bool audio_has_avgpool() const {
|
||||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <float.h>
|
||||||
|
|
||||||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||||
|
|
||||||
|
|
@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
|
||||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (layer.q_norm) {
|
// if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
|
||||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
|
||||||
cb(Qcur, "Qcur_norm", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (layer.k_norm) {
|
if (!norm_per_head) {
|
||||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
if (layer.q_norm) {
|
||||||
cb(Kcur, "Kcur_norm", il);
|
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||||
|
cb(Qcur, "Qcur_norm", il);
|
||||||
|
}
|
||||||
|
if (layer.k_norm) {
|
||||||
|
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||||
|
cb(Kcur, "Kcur_norm", il);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||||
|
|
||||||
|
if (norm_per_head) {
|
||||||
|
if (layer.q_norm) {
|
||||||
|
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||||
|
cb(Qcur, "Qcur_norm_per_head", il);
|
||||||
|
}
|
||||||
|
if (layer.k_norm) {
|
||||||
|
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||||
|
cb(Kcur, "Kcur_norm_per_head", il);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
|
||||||
cb(Kcur, "Kcur_pos", il);
|
cb(Kcur, "Kcur_pos", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
|
||||||
|
Vcur = ggml_rms_norm(ctx0, Vcur, eps);
|
||||||
|
cb(Vcur, "Vcur_normed", il);
|
||||||
|
}
|
||||||
|
|
||||||
cur = build_attn(layer.o_w, layer.o_b,
|
cur = build_attn(layer.o_w, layer.o_b,
|
||||||
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
|
|
@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
|
||||||
cb(cur, "attn_out_scaled", il);
|
cb(cur, "attn_out_scaled", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (layer.attn_post_norm_w) {
|
||||||
|
cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
|
||||||
|
cb(cur, "attn_post_normed", il);
|
||||||
|
}
|
||||||
|
|
||||||
// re-add the layer input, e.g., residual
|
// re-add the layer input, e.g., residual
|
||||||
cur = ggml_add(ctx0, cur, inpL);
|
cur = ggml_add(ctx0, cur, inpL);
|
||||||
|
|
||||||
|
|
@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
|
||||||
|
|
||||||
cb(cur, "ffn_inp", il);
|
cb(cur, "ffn_inp", il);
|
||||||
|
|
||||||
// layernorm2
|
// layernorm2 (pre-ffn norm)
|
||||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
|
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
|
||||||
cb(cur, "ffn_inp_normed", il);
|
cb(cur, "ffn_inp_normed", il);
|
||||||
|
|
||||||
|
|
@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
|
||||||
|
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
|
if (layer.ff_post_norm_w) {
|
||||||
|
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
|
||||||
|
cb(cur, "ffn_post_normed", il);
|
||||||
|
}
|
||||||
|
|
||||||
if (layer.ls_2_w) {
|
if (layer.ls_2_w) {
|
||||||
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
|
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
|
||||||
cb(cur, "ffn_out_scaled", il);
|
cb(cur, "ffn_out_scaled", il);
|
||||||
|
|
@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
|
||||||
cur = ggml_add(ctx0, inpL, cur);
|
cur = ggml_add(ctx0, inpL, cur);
|
||||||
cb(cur, "layer_out", il);
|
cb(cur, "layer_out", il);
|
||||||
|
|
||||||
|
if (layer.ls_out_w) {
|
||||||
|
cur = ggml_mul(ctx0, cur, layer.ls_out_w);
|
||||||
|
cb(cur, "layer_out_scaled", il);
|
||||||
|
}
|
||||||
|
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
{
|
{
|
||||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
|
{
|
||||||
|
builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_PIXTRAL:
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||||
{
|
{
|
||||||
|
|
@ -1257,6 +1297,17 @@ struct clip_model_loader {
|
||||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||||
} break;
|
} break;
|
||||||
|
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
|
{
|
||||||
|
hparams.rope_theta = 100.0f;
|
||||||
|
hparams.n_merge = 3; // pooling_kernel_size
|
||||||
|
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||||
|
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||||
|
// @ngxson : the model performs quite poor with small images, we need to bump minimum image tokens to 40 to avoid that
|
||||||
|
hparams.set_limit_image_tokens(252, 280);
|
||||||
|
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||||
|
} break;
|
||||||
|
|
||||||
case PROJECTOR_TYPE_GEMMA3NV:
|
case PROJECTOR_TYPE_GEMMA3NV:
|
||||||
{
|
{
|
||||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||||
|
|
@ -1442,6 +1493,11 @@ struct clip_model_loader {
|
||||||
std::map<std::string, size_t> tensor_offset;
|
std::map<std::string, size_t> tensor_offset;
|
||||||
std::vector<ggml_tensor *> tensors_to_load;
|
std::vector<ggml_tensor *> tensors_to_load;
|
||||||
|
|
||||||
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
|
if (!fin) {
|
||||||
|
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||||
|
}
|
||||||
|
|
||||||
// TODO @ngxson : support both audio and video in the future
|
// TODO @ngxson : support both audio and video in the future
|
||||||
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
|
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
|
||||||
|
|
||||||
|
|
@ -1478,6 +1534,18 @@ struct clip_model_loader {
|
||||||
return cur;
|
return cur;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto get_scalar = [&](const std::string & name, float default_val) {
|
||||||
|
auto it = tensor_offset.find(name);
|
||||||
|
if (it == tensor_offset.end()) {
|
||||||
|
return default_val;
|
||||||
|
}
|
||||||
|
size_t offset = it->second;
|
||||||
|
fin.seekg(offset, std::ios::beg);
|
||||||
|
float value;
|
||||||
|
fin.read(reinterpret_cast<char*>(&value), sizeof(float));
|
||||||
|
return value;
|
||||||
|
};
|
||||||
|
|
||||||
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
|
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
|
||||||
|
|
||||||
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
|
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
|
||||||
|
|
@ -1512,8 +1580,11 @@ struct clip_model_loader {
|
||||||
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
|
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
|
||||||
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
|
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
|
||||||
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
|
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
|
||||||
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
|
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
|
||||||
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
|
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
|
||||||
|
layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
|
||||||
|
layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
|
||||||
|
layer.ff_post_norm_w = get_tensor(string_format(TN_FFN_POST_NORM, prefix, il, "weight"), false); // no bias
|
||||||
|
|
||||||
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
|
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
|
||||||
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
|
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
|
||||||
|
|
@ -1713,6 +1784,32 @@ struct clip_model_loader {
|
||||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
|
{
|
||||||
|
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||||
|
model.std_bias = get_tensor(TN_STD_BIAS, false);
|
||||||
|
model.std_scale = get_tensor(TN_STD_SCALE, false);
|
||||||
|
// load scalar for Gemma4ClippableLinear
|
||||||
|
for (auto * tensor : tensors_to_load) {
|
||||||
|
std::string name = tensor->name;
|
||||||
|
if (string_ends_with(name, ".weight")) {
|
||||||
|
std::string name_inp_max = name;
|
||||||
|
std::string name_inp_min = name;
|
||||||
|
std::string name_out_max = name;
|
||||||
|
std::string name_out_min = name;
|
||||||
|
string_replace_all(name_inp_max, ".weight", ".input_max");
|
||||||
|
string_replace_all(name_inp_min, ".weight", ".input_min");
|
||||||
|
string_replace_all(name_out_max, ".weight", ".output_max");
|
||||||
|
string_replace_all(name_out_min, ".weight", ".output_min");
|
||||||
|
model.clamp_info_map[name] = {
|
||||||
|
get_scalar(name_inp_max, FLT_MAX),
|
||||||
|
get_scalar(name_inp_min, -FLT_MAX),
|
||||||
|
get_scalar(name_out_max, FLT_MAX),
|
||||||
|
get_scalar(name_out_min, -FLT_MAX)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_GEMMA3NV:
|
case PROJECTOR_TYPE_GEMMA3NV:
|
||||||
{
|
{
|
||||||
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
||||||
|
|
@ -2042,11 +2139,6 @@ struct clip_model_loader {
|
||||||
{
|
{
|
||||||
std::vector<uint8_t> read_buf;
|
std::vector<uint8_t> read_buf;
|
||||||
|
|
||||||
auto fin = std::ifstream(fname, std::ios::binary);
|
|
||||||
if (!fin) {
|
|
||||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// alloc memory and offload data
|
// alloc memory and offload data
|
||||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
|
||||||
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
|
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
|
||||||
|
|
@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||||
|
|
||||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||||
// we can remove this check when we implement audio support for Gemma 3N
|
// we can remove this check when we implement audio support for Gemma 3N
|
||||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
|
||||||
|
|| ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (loader.has_audio && !skip_audio) {
|
if (loader.has_audio && !skip_audio) {
|
||||||
|
|
@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
n_patches = x_patch * y_patch;
|
n_patches = x_patch * y_patch;
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
case PROJECTOR_TYPE_IDEFICS3:
|
case PROJECTOR_TYPE_IDEFICS3:
|
||||||
case PROJECTOR_TYPE_INTERNVL:
|
case PROJECTOR_TYPE_INTERNVL:
|
||||||
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||||
|
|
@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
}
|
}
|
||||||
set_input_i32("patches", patches);
|
set_input_i32("patches", patches);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
|
{
|
||||||
|
// set (col, row) patch positions for learned positional embedding
|
||||||
|
const int n_cols = image_size_width / patch_size;
|
||||||
|
std::vector<int> pos_x(num_patches), pos_y(num_patches);
|
||||||
|
for (int i = 0; i < num_patches; i++) {
|
||||||
|
pos_x[i] = i % n_cols;
|
||||||
|
pos_y[i] = i / n_cols;
|
||||||
|
}
|
||||||
|
set_input_i32("pos_x", pos_x);
|
||||||
|
set_input_i32("pos_y", pos_y);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(pos_w == pos_h);
|
GGML_ASSERT(pos_w == pos_h);
|
||||||
|
|
@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
case PROJECTOR_TYPE_GEMMA3NV:
|
case PROJECTOR_TYPE_GEMMA3NV:
|
||||||
return ctx->model.mm_input_proj_w->ne[0];
|
return ctx->model.mm_input_proj_w->ne[0];
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
|
return ctx->model.mm_input_proj_w->ne[1];
|
||||||
case PROJECTOR_TYPE_IDEFICS3:
|
case PROJECTOR_TYPE_IDEFICS3:
|
||||||
return ctx->model.mm_fc_w->ne[1];
|
return ctx->model.mm_fc_w->ne[1];
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,151 @@
|
||||||
|
#include "models.h"
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
// Assembles the compute graph for the Gemma 4 vision path, in this order:
//   input rescale -> patch embedding -> (x, y) position-table lookup ->
//   build_vit with a per-half NeoX RoPE callback -> average pooling ->
//   optional std_bias/std_scale step -> build_mm projection -> RMS norm.
// Returns the graph `gf` after expanding it with the final tensor.
ggml_cgraph * clip_graph_gemma4v::build() {
    // rescale the raw input: 2 * (v - 0.5), which is the same as v * 2 - 1
    ggml_tensor * pixels = ggml_scale_bias(ctx0, build_inp_raw(), 2.0f, -1.0f);
    ggml_set_name(pixels, "inp_raw_scaled");

    // patch embedding: conv with kernel == stride == patch_size, then lay out as [n_embd, n_patches]
    // (no patch bias is added)
    ggml_tensor * embd = ggml_conv_2d(ctx0, model.patch_embeddings_0, pixels, patch_size, patch_size, 0, 0, 1, 1);
    embd = ggml_reshape_2d(ctx0, embd, n_patches, n_embd);
    embd = ggml_cont(ctx0, ggml_transpose(ctx0, embd));
    ggml_set_name(embd, "inp");

    // one I32 graph input per axis, holding a position index for every patch
    auto new_pos_input = [&](const char * name) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };
    ggml_tensor * pos_x = new_pos_input("pos_x");
    ggml_tensor * pos_y = new_pos_input("pos_y");

    // position embeddings: two lookup tables stored back to back (x table first, then y table)
    {
        auto * pos_tbl = model.position_embeddings;

        const int64_t n_entries = pos_tbl->ne[1];
        const size_t  row_bytes = ggml_row_size(pos_tbl->type, n_embd);

        ggml_tensor * tbl_x = ggml_view_2d(ctx0, pos_tbl, n_embd, n_entries, row_bytes, 0);
        ggml_tensor * tbl_y = ggml_view_2d(ctx0, pos_tbl, n_embd, n_entries, row_bytes, n_entries * row_bytes);

        // each lookup yields [n_embd, n_patches]
        ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
        ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);

        embd = ggml_add(ctx0, ggml_add(ctx0, embd, emb_x), emb_y);
        cb(embd, "pos_embd", -1);
    }

    // 2D RoPE with NeoX ordering: the first half of dim 0 is rotated by pos_x,
    // the second half by pos_y, and the halves are concatenated again along dim 0
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        const int64_t half_dim = cur->ne[0] / 2;
        const int64_t n_head   = cur->ne[1];
        const int64_t n_pos    = cur->ne[2];

        auto rope_half = [&](size_t byte_offset, ggml_tensor * positions) {
            ggml_tensor * half = ggml_view_3d(ctx0, cur,
                    half_dim, n_head, n_pos,
                    cur->nb[1],
                    cur->nb[2],
                    byte_offset);
            return ggml_rope_ext(
                    ctx0,
                    half,
                    positions,
                    nullptr,  // freq factors
                    half_dim, // n_dims
                    GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                    1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
        };

        ggml_tensor * rot_x = rope_half(0, pos_x);
        ggml_tensor * rot_y = rope_half(half_dim * ggml_element_size(cur), pos_y);

        return ggml_concat(ctx0, rot_x, rot_y, 0);
    };

    kq_scale = 1.0f;
    ggml_tensor * cur = build_vit(
            embd, n_patches,
            NORM_TYPE_RMS,
            hparams.ffn_op,
            nullptr, // position embeddings were already added above
            add_pos);

    // Gemma4VisionPooler
    {
        const int kernel_size = hparams.n_merge;
        GGML_ASSERT(kernel_size > 0);

        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
                kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);

        const int cols_out = n_patches_x / kernel_size;
        const int rows_out = n_patches_y / kernel_size;

        // [cols_out, rows_out, n_embd, 1] -> [n_embd, cols_out * rows_out]
        cur = ggml_reshape_3d(ctx0, cur, cols_out * rows_out, n_embd, 1);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
        cb(cur, "pooled", -1);
    }

    // hidden_states = (hidden_states - self.std_bias) * self.std_scale
    // (applied only when both tensors were loaded)
    if (model.std_bias && model.std_scale) {
        cur = ggml_mul(ctx0, ggml_sub(ctx0, cur, model.std_bias), model.std_scale);
        cb(cur, "std_scaled", -1);
    }

    // Gemma4MultimodalEmbedder
    cur = build_mm(model.mm_input_proj_w, cur);
    cb(cur, "projected", -1);

    // embedding_post_projection_norm
    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
    cb(cur, "projected_normed", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||||
|
|
||||||
|
// Gemma4ClippableLinear: matrix multiply of `w` and `x` with optional clamping.
//
// The clamp bounds are looked up in model.clamp_info_map by the weight tensor's name.
// With an entry present, `x` is clamped to [inp_min, inp_max] before the multiply and the
// product is clamped to [out_min, out_max]. Without an entry, a plain ggml_mul_mat is returned.
ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    const auto entry = model.clamp_info_map.find(w->name);

    if (entry != model.clamp_info_map.end()) {
        const auto & limits = entry->second;

        ggml_tensor * x_clamped = ggml_clamp(ctx0, x, limits.inp_min, limits.inp_max);
        ggml_tensor * product   = ggml_mul_mat(ctx0, w, x_clamped);

        return ggml_clamp(ctx0, product, limits.out_min, limits.out_max);
    }

    // no clamp info recorded for this weight
    return ggml_mul_mat(ctx0, w, x);
}
|
||||||
|
|
@ -12,6 +12,12 @@ struct clip_graph_siglip : clip_graph {
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct clip_graph_gemma4v : clip_graph {
|
||||||
|
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
|
ggml_cgraph * build() override;
|
||||||
|
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||||
|
};
|
||||||
|
|
||||||
struct clip_graph_pixtral : clip_graph {
|
struct clip_graph_pixtral : clip_graph {
|
||||||
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
|
|
|
||||||
|
|
@ -394,6 +394,13 @@ struct mtmd_context {
|
||||||
img_end = "<|IMAGE_END|>";
|
img_end = "<|IMAGE_END|>";
|
||||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
|
{
|
||||||
|
// <|image> ... (image embeddings) ... <image|>
|
||||||
|
img_beg = "<|image>";
|
||||||
|
img_end = "<image|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
{
|
||||||
img_end = "\n"; // prevent empty batch on llama-server
|
img_end = "\n"; // prevent empty batch on llama-server
|
||||||
|
|
@ -974,6 +981,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||||
switch (ctx->proj_type_v()) {
|
switch (ctx->proj_type_v()) {
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
|
case PROJECTOR_TYPE_GEMMA4V:
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue