From 5ba95754ee27433971f4f247326770019ad475c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 16 Dec 2025 11:17:11 +0200 Subject: [PATCH 01/44] security : add collaborator guidance (#18081) --- SECURITY.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/SECURITY.md b/SECURITY.md index 9c86ae91b5..ae496f4e3d 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/ Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report. A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. + +> [!IMPORTANT] +> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080 From 279cef27c2b297476ecde6e5df729c057691f1de Mon Sep 17 00:00:00 2001 From: Francisco Herrera Date: Tue, 16 Dec 2025 04:45:09 -0500 Subject: [PATCH 02/44] added note for old Intel hardware pre sycl (#18017) * added note for old Intel hardware pre sycl Older hardware used opencl * typo * use consistent terms --- docs/backend/SYCL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 02a72a9d51..f44458ed3b 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family: - Intel Built-in Arc GPU - Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)). 
+On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the performance is not optimal, and some GPUs may not support OpenCL nor have any GPGPU capabilities. + #### Verified devices | Intel GPU | Status | Verified Model | From c05aa69f32ee87b2019a1b39979d89302c2d6b7b Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Tue, 16 Dec 2025 04:05:23 -0600 Subject: [PATCH 03/44] common : add nemotron 3 parsing (#18077) * common : expose json-schema functionality to extract type info * common : fix peg parser negation during needs_more_input * common : add some defensive measures in constructed peg parser * common : add nemotron nano 3 support * common : add nemotron nano 3 tests * remove debug line --- common/chat-peg-parser.cpp | 14 +- common/chat.cpp | 140 ++++++++++++ common/json-schema-to-grammar.cpp | 135 +++++++++++- common/json-schema-to-grammar.h | 20 ++ common/peg-parser.cpp | 2 +- .../NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja | 204 ++++++++++++++++++ tests/test-chat.cpp | 157 ++++++++++++++ tests/test-json-schema-to-grammar.cpp | 75 +++++++ 8 files changed, 741 insertions(+), 6 deletions(-) create mode 100644 models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 74a7b6a46d..1bcba9cd86 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -4,9 +4,14 @@ using json = nlohmann::json; -static std::string_view trim_trailing_space(std::string_view sv) { +static std::string_view trim_trailing_space(std::string_view sv, int max = -1) { + int count = 0; while (!sv.empty() && std::isspace(static_cast(sv.back()))) { + if (max != -1 && count <= max) { + break; + } sv.remove_suffix(1); + count++; } return sv; } @@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { if (is_arg_string && current_tool) { // Serialize to JSON, but exclude the end quote - std::string dumped = json(node.text).dump(); + std::string 
dumped = json(trim_trailing_space(node.text)).dump(); current_tool->arguments += dumped.substr(0, dumped.size() - 1); needs_closing_quote = true; } @@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { if (is_arg_close && current_tool) { if (needs_closing_quote) { current_tool->arguments += "\""; + needs_closing_quote = false; } } @@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { } if (is_tool_close && current_tool) { + if (needs_closing_quote) { + current_tool->arguments += "\""; + needs_closing_quote = false; + } current_tool->arguments += "}"; } } diff --git a/common/chat.cpp b/common/chat.cpp index c371edaa5a..0a426f4478 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function & fn) { + if (!function.contains("parameters") || !function.at("parameters").is_object()) { + return; + } + const auto & params = function.at("parameters"); + if (!params.contains("properties") || !params.at("properties").is_object()) { + return; + } + const auto & props = params.at("properties"); + std::set required; + if (params.contains("required") && params.at("required").is_array()) { + params.at("required").get_to(required); + } + for (const auto & [name, prop] : props.items()) { + bool is_required = (required.find(name) != required.end()); + fn(name, prop, is_required); + } +} + static std::string apply( const common_chat_template & tmpl, const struct templates_params & inputs, @@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_ return data; } +static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + + data.prompt = apply(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED; + + // Handle thinking tags appropriately based on 
inputs.enable_thinking + if (string_ends_with(data.prompt, "\n")) { + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + + data.preserved_tokens = { + "", + "", + "", + "", + }; + + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); + auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; + auto include_grammar = true; + + auto parser = build_chat_peg_constructed_parser([&](auto & p) { + auto reasoning = p.eps(); + if (inputs.enable_thinking && extract_reasoning) { + auto reasoning_content = p.reasoning(p.until("")) + ("" | p.end()); + if (data.thinking_forced_open) { + reasoning = reasoning_content; + } + } + + // Response format parser + if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) { + return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema)); + } + + // Tool call parser + if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) { + auto tool_choice = p.choice(); + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + + auto schema_info = common_schema_info(); + schema_info.resolve_refs(parameters); + + auto tool_open = "\n"; + auto tool_close = p.literal("\n"); + auto args = p.sequence(); + auto arg_string = p.rule("xml-arg-string", p.until_one_of({ + "\n", + "\n" + })); + + foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) { + auto rule_name = "tool-" + name + "-arg-" + param_name; + + auto arg_open = "\n"; + auto arg_close = p.literal("\n"); + auto arg_value = p.eps(); + + if (schema_info.resolves_to_string(param_schema)) { + arg_value = p.tool_arg_string_value(arg_string) + "\n"; + } else { + arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema)); + } + + // Model may or 
may not close with + auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close))); + args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1); + }); + + tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close)); + }); + + auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0; + auto max_calls = inputs.parallel_tool_calls ? -1 : 1; + auto tool_call = p.rule("tool-call", "\n" + tool_choice + "" + p.space()); + auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls)); + + return reasoning << p.content(p.until("")) << tool_calls; + } + + // Content only parser + include_grammar = false; + return reasoning << p.content(p.rest()); + }); + + data.parser = parser.save(); + + if (include_grammar) { + data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; + + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + auto schema = function.at("parameters"); + builder.resolve_refs(schema); + }); + parser.build_grammar(builder, data.grammar_lazy); + }); + + data.grammar_triggers = { + {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""} + }; + } + + return data; +} + + static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; @@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja( src.find("") != std::string::npos && src.find("") != std::string::npos) { + return common_chat_params_init_nemotron_v3(tmpl, params); + } return common_chat_params_init_qwen3_coder_xml(tmpl, params); } diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index c3b4e5d9dc..2f67c74d79 100644 --- 
a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) { std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); } -class SchemaConverter { +class common_schema_converter { private: + friend class common_schema_info; friend std::string build_grammar(const std::function & cb, const common_grammar_options & options); std::function _fetch_json; bool _dotall; @@ -729,7 +730,7 @@ private: } public: - SchemaConverter( + common_schema_converter( const std::function & fetch_json, bool dotall) : _fetch_json(fetch_json), _dotall(dotall) @@ -990,6 +991,134 @@ public: } }; +// common_schema_info implementation (pimpl) + +common_schema_info::common_schema_info() + : impl_(std::make_unique( + [](const std::string &) { return json(); }, + false)) {} + +common_schema_info::~common_schema_info() = default; + +common_schema_info::common_schema_info(common_schema_info &&) noexcept = default; +common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default; + +void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) { + impl_->resolve_refs(schema, ""); +} + +// Determines if a JSON schema can resolve to a string type through any path. +// Some models emit raw string values rather than JSON-encoded strings for string parameters. +// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns +// true, allowing callers to handle the value as a raw string for simplicity. 
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) { + std::unordered_set visited_refs; + + std::function check = [&](const json & s) -> bool { + if (!s.is_object()) { + return false; + } + + // Handle $ref + if (s.contains("$ref")) { + const std::string & ref = s["$ref"]; + if (visited_refs.find(ref) != visited_refs.end()) { + // Circular reference, assume not a string to be safe + return false; + } + visited_refs.insert(ref); + auto it = impl_->_refs.find(ref); + if (it != impl_->_refs.end()) { + return check(it->second); + } + return false; + } + + // Check type field + if (s.contains("type")) { + const json & schema_type = s["type"]; + if (schema_type.is_string()) { + if (schema_type == "string") { + return true; + } + } else if (schema_type.is_array()) { + // Type can be an array like ["string", "null"] + for (const auto & t : schema_type) { + if (t == "string") { + return true; + } + } + } + } + + // Check oneOf/anyOf - if any alternative can be a string + if (s.contains("oneOf")) { + for (const auto & alt : s["oneOf"]) { + if (check(alt)) { + return true; + } + } + } + if (s.contains("anyOf")) { + for (const auto & alt : s["anyOf"]) { + if (check(alt)) { + return true; + } + } + } + + // Check allOf - all components must be compatible with string type + if (s.contains("allOf")) { + bool all_string = true; + for (const auto & component : s["allOf"]) { + if (!check(component)) { + all_string = false; + break; + } + } + if (all_string) { + return true; + } + } + + // Check const - if the constant value is a string + if (s.contains("const")) { + if (s["const"].is_string()) { + return true; + } + } + + // Check enum - if any enum value is a string + if (s.contains("enum")) { + for (const auto & val : s["enum"]) { + if (val.is_string()) { + return true; + } + } + } + + // String-specific keywords imply string type + if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) { + return true; + } + + // Check 
format - many formats imply string + if (s.contains("format")) { + const std::string & fmt = s["format"]; + if (fmt == "date" || fmt == "time" || fmt == "date-time" || + fmt == "uri" || fmt == "email" || fmt == "hostname" || + fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" || + fmt.find("uuid") == 0) { + return true; + } + } + + return false; + }; + + return check(schema); +} + std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { #ifdef LLAMA_USE_LLGUIDANCE if (!force_gbnf) { @@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { } std::string build_grammar(const std::function & cb, const common_grammar_options & options) { - SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall); + common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall); common_grammar_builder builder { /* .add_rule = */ [&](const std::string & name, const std::string & rule) { return converter._add_rule(name, rule); diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index c89ab7f997..240d642311 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -3,11 +3,31 @@ #include #include +#include #include std::string json_schema_to_grammar(const nlohmann::ordered_json & schema, bool force_gbnf = false); +class common_schema_converter; + +// Probes a JSON schema to extract information about its structure and type constraints. 
+class common_schema_info { + std::unique_ptr impl_; + + public: + common_schema_info(); + ~common_schema_info(); + + common_schema_info(const common_schema_info &) = delete; + common_schema_info & operator=(const common_schema_info &) = delete; + common_schema_info(common_schema_info &&) noexcept; + common_schema_info & operator=(common_schema_info &&) noexcept; + + void resolve_refs(nlohmann::ordered_json & schema); + bool resolves_to_string(const nlohmann::ordered_json & schema); +}; + struct common_grammar_builder { std::function add_rule; std::function add_schema; diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index dec99e1820..f2fc84500f 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -425,7 +425,7 @@ struct parser_executor { if (result.need_more_input()) { // Propagate - need to know what child would match before negating - return result; + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos); } // Child failed, so negation succeeds diff --git a/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja b/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja new file mode 100644 index 0000000000..a01e0861c6 --- /dev/null +++ b/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja @@ -0,0 +1,204 @@ +{% macro render_extra_keys(json_dict, handled_keys) %} + {%- if json_dict is mapping %} + {%- for json_key in json_dict if json_key not in handled_keys %} + {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %} + {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '' }} + {%- else %} + {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '' }} + {%- endif %} + {%- endfor %} + {%- endif %} +{% endmacro %} +{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %} +{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %} + +{%- 
set ns = namespace(last_user_idx = -1) %} +{%- set loop_messages = messages %} +{%- for m in loop_messages %} + {%- if m["role"] == "user" %} + {%- set ns.last_user_idx = loop.index0 %} + {%- endif %} +{%- endfor %} + +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = [] %} +{%- endif %} +{# Recompute last_user_idx relative to loop_messages after handling system #} +{%- set ns = namespace(last_user_idx = -1) %} +{%- for m in loop_messages %} + {%- if m["role"] == "user" %} + {%- set ns.last_user_idx = loop.index0 %} + {%- endif %} +{%- endfor %} +{%- if system_message is defined %} + {{- "<|im_start|>system\n" + system_message }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- "<|im_start|>system\n" }} + {%- endif %} +{%- endif %} +{%- if tools is iterable and tools | length > 0 %} + {%- if system_message is defined and system_message | length > 0 %} + {{- "\n\n" }} + {%- endif %} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {{- "" }} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- "\n\n" ~ tool.name ~ "" }} + {%- if tool.description is defined %} + {{- '\n' ~ (tool.description | trim) ~ '' }} + {%- endif %} + {{- '\n' }} + {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- '\n' }} + {{- '\n' ~ param_name ~ '' }} + {%- if param_fields.type is defined %} + {{- '\n' ~ (param_fields.type | string) ~ '' }} + {%- endif %} + {%- if param_fields.description is defined %} + {{- '\n' ~ (param_fields.description | trim) ~ '' }} + {%- endif %} + 
{%- if param_fields.enum is defined %} + {{- '\n' ~ (param_fields.enum | tojson | safe) ~ '' }} + {%- endif %} + {%- set handled_keys = ['name', 'type', 'description', 'enum'] %} + {{- render_extra_keys(param_fields, handled_keys) }} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {% set handled_keys = ['type', 'properties', 'required'] %} + {{- render_extra_keys(tool.parameters, handled_keys) }} + {%- if tool.parameters is defined and tool.parameters.required is defined %} + {{- '\n' ~ (tool.parameters.required | tojson | safe) ~ '' }} + {%- endif %} + {{- '\n' }} + {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %} + {{- render_extra_keys(tool, handled_keys) }} + {{- '\n' }} + {%- endfor %} + {{- "\n" }} + + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} +{%- endif %} + + +{%- if system_message is defined %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- '<|im_end|>\n' }} + {%- endif %} +{%- endif %} + +{%- for message in loop_messages %} + {%- if message.role == "assistant" %} + {# Add reasoning content in to content field for unified processing below. 
#} + {%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %} + {%- set content = "\n" ~ message.reasoning_content ~ "\n\n" ~ (message.content | default('', true)) %} + {%- else %} + {%- set content = message.content | default('', true) %} + {%- if content is string -%} + {# Allow downstream logic to to take care of broken thought, only handle coherent reasoning here. #} + {%- if '' not in content and '' not in content -%} + {%- set content = "" ~ content -%} + {%- endif -%} + {%- else -%} + {%- set content = content -%} + {%- endif -%} + {%- endif %} + {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %} + {# Assistant message has tool calls. #} + {{- '<|im_start|>assistant\n' }} + {%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %} + {%- if content is string and content | trim | length > 0 %} + {%- if include_content %} + {{- (content | trim) ~ '\n' -}} + {%- else %} + {%- set c = (content | string) %} + {%- if '' in c %} + {# Keep only content after the last closing think. Also generation prompt causes this. 
#} + {%- set c = c.split('')[-1] %} + {%- elif '' in c %} + {# If was opened but never closed, drop the trailing think segment #} + {%- set c = c.split('')[0] %} + {%- endif %} + {%- set c = "" ~ c | trim %} + {%- if c | length > 0 %} + {{- c ~ '\n' -}} + {%- endif %} + {%- endif %} + {%- else %} + {{- "" -}} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n' -}} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '\n' -}} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value ~ '\n\n' -}} + {%- endfor %} + {%- endif %} + {{- '\n\n' -}} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- else %} + {# Assistant message doesn't have tool calls. #} + {%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %} + {{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }} + {%- else %} + {%- set c = (content | default('', true) | string) %} + {%- if '' in c and '' in c %} + {%- set c = "" ~ c.split('')[-1] %} + {%- endif %} + {%- set c = c | trim %} + {%- if c | length > 0 %} + {{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>assistant\n<|im_end|>\n' }} + {%- endif %} + {%- endif %} + {%- endif %} + {%- elif message.role == "user" or message.role == "system" %} + {{- '<|im_start|>' + message.role + '\n' }} + {%- set content = message.content | string %} + {{- content }} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user\n' }} + {%- endif %} + {{- '\n' }} + {{- message.content }} + {{- '\n\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif 
loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }} + {%- endif %} +{%- endfor %} + +{%- if add_generation_prompt %} + {%- if enable_thinking %} + {{- '<|im_start|>assistant\n\n' }} + {%- else %} + {{- '<|im_start|>assistant\n' }} + {%- endif %} +{%- endif %} diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 007929f517..02af5251cc 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -3588,6 +3588,163 @@ static void test_template_output_peg_parsers() { t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})"; }); } + + { + // NVIDIA Nemotron-3 Nano + auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja"); + + // Test basic message + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "Hello, world!\nWhat's up?"; + t.expect = message_assist; + }); + + // Test basic message and reasoning with reasoning_format = none + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "I'm\nthinking\n\nHello, world!\nWhat's up?"; + t.expect.content = "I'm\nthinking\n\nHello, world!\nWhat's up?"; + }); + + // Test basic message and reasoning with reasoning_format = auto + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "I'm\nthinking\n\nHello, world!\nWhat's up?"; + t.params.enable_thinking = true; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + + t.expect = message_assist_thoughts; + }); + + // Test tool call + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + ""; + t.params.enable_thinking = false; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {special_function_tool}; + + t.expect = message_assist_call; + }); + + // Test tool call with reasoning + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "I'm\nthinking\n\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + ""; + t.params.reasoning_format = 
COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {special_function_tool}; + + t.expect = message_assist_call_thoughts; + }); + + // Test parallel tool calls + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "2\n" + "\n" + "\n" + ""; + t.params.enable_thinking = false; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.parallel_tool_calls = true; + t.params.tools = {special_function_tool, special_function_tool_with_optional_param}; + + t.expect.tool_calls = {{ + /* .name = */ "special_function", + /* .arguments = */ R"({"arg1": 1})", + /* .id = */ {}, + }, { + /* .name = */ "special_function_with_opt", + /* .arguments = */ R"({"arg1": 1, "arg2": 2})", + /* .id = */ {}, + }}; + }); + + // Test tool call with string parameter + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + ""; + t.params.enable_thinking = false; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {python_tool}; + + t.expect.tool_calls = {{ + /* .name = */ "python", + /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", + /* .id = */ {}, + }}; + }); + + // Test tool call with string parameter and no closing tag + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + ""; + t.params.enable_thinking = false; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {python_tool}; + + t.expect.tool_calls = {{ + /* .name = */ "python", + /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", + /* .id = */ {}, + }}; + }); + + // Test response format + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "I need to output the 
invoice details in JSON\n" + "\n" + R"({"amount": 123.45, "date": "2025-12-03"})"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.json_schema = invoice_schema; + + t.expect.reasoning_content = "I need to output the invoice details in JSON"; + t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + } + } static void test_msg_diffs_compute() { diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 6a4bd8fb4d..a8e9ff33a4 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1367,10 +1367,85 @@ static void test_all(const std::string & lang, std::function Date: Tue, 16 Dec 2025 11:17:20 +0100 Subject: [PATCH 04/44] model-conversion : use CONVERTED_EMBEDDING_MODEL for embedding_verify_logits (#18079) This commit updates the embedding model verification script to use the CONVERTED_EMBEDDING_MODEL environment variable instead of using the EMBEDDING_MODEL_PATH (the original embedding model path) as the basis for the converted model file name. The motivation for this is that currently, if the converted embedding model file name differs from the original embedding model directory/name, the verification script will look for the wrong .bin files that were generated when running the models. 
--- .../scripts/embedding/compare-embeddings-logits.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh index c48af3075c..984d03e95d 100755 --- a/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh +++ b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh @@ -34,8 +34,11 @@ done MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}" MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" +CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}" +CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}" + if [ -t 0 ]; then - CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin" + CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin" else # Process piped JSON data and convert to binary (matching logits.cpp format) TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) From 9963b81f6392da8066958c177db77ad4b4a8f284 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Dec 2025 11:17:40 +0100 Subject: [PATCH 05/44] model-conversion : add note about verifying previous models (#18082) This commit adds a note to the README in the model-conversion examples, advising developers to verify that previous versions of models pass logits verification before adding new models from the same family. --- examples/model-conversion/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md index 05d95d588b..8163b306b4 100644 --- a/examples/model-conversion/README.md +++ b/examples/model-conversion/README.md @@ -10,6 +10,13 @@ and in some cases perplexity checked of the quantized model. And finally the model/models need to the ggml-org on Hugging Face. This tool/example tries to help with this process. 
+> 📝 **Note:** When adding a new model from an existing family, verify the +> previous version passes logits verification first. Existing models can have +> subtle numerical differences that don't affect generation quality but cause +> logits mismatches. Identifying these upfront whether they exist in llama.cpp, +> the conversion script, or in an upstream implementation, can save significant +> debugging time. + ### Overview The idea is that the makefile targets and scripts here can be used in the development/conversion process assisting with things like: From 3d86c6c2b546ae5ce9cdee3ca0ac4a8181e0a073 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 16 Dec 2025 11:25:26 +0100 Subject: [PATCH 06/44] model: support GLM4V vision encoder (#18042) * convert ok * no deepstack * less new tensors * cgraph ok * add mrope for text model * faster patch merger * add GGML_ROPE_TYPE_MRNORM * add support for metal * move glm4v do dedicated graph * convert: add norm_embd * clip: add debugging fn * working correctly * fix style * use bicubic * fix mrope metal * improve cpu * convert to neox ordering on conversion * revert backend changes * force stop if using old weight * support moe variant * fix conversion * fix convert (2) * Update tools/mtmd/clip-graph.h Co-authored-by: Georgi Gerganov * process mrope_section on TextModel base class * resolve conflict merge --------- Co-authored-by: Georgi Gerganov --- convert_hf_to_gguf.py | 111 ++++++++++++++++++++---------- gguf-py/gguf/constants.py | 7 ++ gguf-py/gguf/tensor_mapping.py | 15 +++++ src/llama-hparams.cpp | 4 ++ src/llama-hparams.h | 2 + src/llama-model.cpp | 15 +++-- src/models/glm4-moe.cpp | 37 +++++++--- src/models/glm4.cpp | 31 +++++++-- tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-graph.h | 4 +- tools/mtmd/clip-impl.h | 11 ++- tools/mtmd/clip-model.h | 13 +++- tools/mtmd/clip.cpp | 107 +++++++++++++++++++++++------ tools/mtmd/clip.h | 2 +- tools/mtmd/models/glm4v.cpp | 120 +++++++++++++++++++++++++++++++++ 
tools/mtmd/models/models.h | 5 ++ tools/mtmd/mtmd.cpp | 6 +- 17 files changed, 412 insertions(+), 79 deletions(-) create mode 100644 tools/mtmd/models/glm4v.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 01b15e4b9a..bd16ba312f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -862,6 +862,14 @@ class TextModel(ModelBase): logger.warning(f"Unknown RoPE type: {rope_type}") logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + if "mrope_section" in self.rope_parameters: + mrope_section = self.rope_parameters["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"gguf: mrope sections: {mrope_section[:4]}") + if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") @@ -3739,9 +3747,6 @@ class Qwen2VLModel(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() - mrope_section = self.hparams["rope_scaling"]["mrope_section"] - mrope_section += [0] * max(0, 4 - len(mrope_section)) - self.gguf_writer.add_rope_dimension_sections(mrope_section) def set_vocab(self): try: @@ -4377,6 +4382,30 @@ class Qwen3VLVisionModel(MmprojModel): return super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") +class Glm4VVisionModel(Qwen3VLVisionModel): + def set_gguf_parameters(self): + MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters + assert self.hparams_vision is not None + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + 
rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.") + if name.startswith("visual.merger."): + return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Qwen3VLForConditionalGeneration") class Qwen3VLTextModel(Qwen3Model): model_arch = gguf.MODEL_ARCH.QWEN3VL @@ -4385,20 +4414,6 @@ class Qwen3VLTextModel(Qwen3Model): super().set_gguf_parameters() # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - text_config = self.hparams.get("text_config", {}) - # rope_scaling is deprecated in V5, use rope_parameters instead - rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {} - - if rope_scaling.get("mrope_section"): - # mrope_section contains [time, height, width] dimensions - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - - logger.info(f"MRoPE sections: {mrope_section[:4]}") - vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -4417,22 +4432,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel): def set_gguf_parameters(self): super().set_gguf_parameters() - - # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - text_config = self.hparams.get("text_config", {}) - # rope_scaling is deprecated in V5, use rope_parameters instead - rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {} - - if 
rope_scaling.get("mrope_section"): - # mrope_section contains [time, height, width] dimensions - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - - logger.info(f"MRoPE sections: {mrope_section[:4]}") - vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -7795,6 +7794,15 @@ class JaisModel(TextModel): @ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) + if "mrope_section" in self.rope_parameters: + self.use_mrope = True + logger.info("Q/K weight will need to be permuted for M-RoPE") def set_vocab(self): from transformers import AutoTokenizer @@ -7816,17 +7824,49 @@ class Glm4Model(TextModel): super().set_gguf_parameters() if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) + + @staticmethod + def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: + orig_shape = weights.shape + if len(orig_shape) == 1: + weights = weights.unsqueeze(1) # [out_dim, 1] + if len(weights.shape) != 2: + raise ValueError("Only 1D and 2D tensors are supported.") + n_effective_heads = weights.shape[0] // head_dim + if n_head_kv is not None and 
n_effective_heads != n_head: + if n_effective_heads != n_head_kv: + raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") + rotary_dim = int(head_dim * partial_rotary_factor) + if rotary_dim % 2 != 0: + raise ValueError("rotary_dim must be even.") + reshaped = weights.reshape(n_effective_heads, head_dim, -1) + rot_part = reshaped[:, :rotary_dim, :] + non_rot_part = reshaped[:, rotary_dim:, :] + permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) + combined = torch.cat((permuted_rot, non_rot_part), dim=1) + result = combined.reshape(weights.shape) + return result if len(orig_shape) != 1 else result.squeeze(1) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v return [] elif name.startswith("model.language_model."): name = name.replace("language_model.", "") # for Glm4v + if self.use_mrope: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + head_dim = n_embd // n_head + # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) return super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4MoeForCausalLM") +@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") class Glm4MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.GLM4_MOE @@ -7893,6 +7933,7 @@ class Glm4MoeModel(TextModel): _experts: list[dict[str, Tensor]] | None = None + # note: unlike GLM4V non-MoE, we don't need to 
permute Q/K here since GLM4V_MOE uses Neox ordering already def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5ca4efd043..fed0e94dcf 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -643,6 +643,7 @@ class MODEL_TENSOR(IntEnum): V_MMPROJ_PEG = auto() V_ENC_EMBD_CLS = auto() V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_NORM = auto() V_ENC_EMBD_POS = auto() V_ENC_INPUT_NORM = auto() V_ENC_ATTN_QKV = auto() @@ -661,6 +662,7 @@ class MODEL_TENSOR(IntEnum): V_LAYER_SCALE_2 = auto() V_PRE_NORM = auto() V_POST_NORM = auto() + V_MM_POST_NORM = auto() V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 V_MM_SOFT_EMB_NORM = auto() # gemma3 @@ -1016,6 +1018,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd", MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", @@ -1034,6 +1037,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2", MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", @@ -1094,6 +1098,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_MMPROJ_PEG, MODEL_TENSOR.V_ENC_EMBD_CLS, MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_NORM, MODEL_TENSOR.V_ENC_EMBD_POS, MODEL_TENSOR.V_ENC_INPUT_NORM, MODEL_TENSOR.V_ENC_ATTN_QKV, @@ -1112,6 +1117,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_LAYER_SCALE_2, MODEL_TENSOR.V_PRE_NORM, 
MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_POST_NORM, MODEL_TENSOR.V_MM_INP_PROJ, MODEL_TENSOR.V_MM_INP_NORM, MODEL_TENSOR.V_MM_SOFT_EMB_NORM, @@ -3357,6 +3363,7 @@ class VisionProjectorType: LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" + GLM4V = "glm4v" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 610227231f..e04ff11164 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1212,6 +1212,7 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM "model.vision.linear_proj.linear_proj", # cogvlm + "visual.merger.proj", # glm4v ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1245,6 +1246,10 @@ class TensorNameMap: "model.vision.patch_embedding.proj", # cogvlm ), + MODEL_TENSOR.V_ENC_EMBD_NORM: ( + "visual.post_conv_layernorm", # glm4v + ), + MODEL_TENSOR.V_ENC_EMBD_POS: ( "vision_tower.vision_model.embeddings.position_embedding", "model.vision_tower.embeddings.position_embeddings", # Intern-S1 @@ -1254,6 +1259,7 @@ class TensorNameMap: "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl "model.vision.patch_embedding.position_embedding", # cogvlm + "visual.embeddings.position_embedding", # glm4v ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1409,6 +1415,11 @@ class TensorNameMap: "vision_model.layernorm_post", # llama4 "visual.merger.ln_q", # qwen2vl "vision_tower.encoder.final_layernorm", # kimi-vl + "visual.post_layernorm", # glm4v + ), + + MODEL_TENSOR.V_MM_POST_NORM: ( + "visual.merger.post_projection_norm", # glm4v ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1478,6 +1489,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_PATCH_MERGER: ( "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf "patch_merger.merging_layer", # mistral + "visual.downsample", # glm4v ), MODEL_TENSOR.V_DS_NORM: ( @@ -1498,14 +1510,17 @@ class TensorNameMap: MODEL_TENSOR.V_MM_UP: ( 
"model.vision.linear_proj.dense_h_to_4h", # cogvlm + "visual.merger.up_proj", # glm4v ), MODEL_TENSOR.V_MM_DOWN: ( "model.vision.linear_proj.dense_4h_to_h", # cogvlm + "visual.merger.down_proj", # glm4v ), MODEL_TENSOR.V_MM_GATE: ( "model.vision.linear_proj.gate_proj", # cogvlm + "visual.merger.gate_proj", # glm4v ), MODEL_TENSOR.V_TOK_BOI: ( diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 83d6d6ee3c..fe1fa4341d 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -231,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama return false; } + +bool llama_hparams::use_mrope() const { + return rope_sections[0] > 0 && rope_sections[1] > 0; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index cecb476e91..f6e95b5d2a 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -270,6 +270,8 @@ struct llama_hparams { // TODO: think of a better place for this function // TODO: pack the SWA params in a struct? static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1); + + bool use_mrope() const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 15547403b4..ae8207ee1a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1689,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_GLM4: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); switch (hparams.n_layer) { case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; @@ -1698,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_GLM4_MOE: { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); - 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); // MoE parameters ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); @@ -7792,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: - case LLM_ARCH_GLM4: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_HYBRID: @@ -7854,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_LFM2: case LLM_ARCH_LFM2MOE: case LLM_ARCH_SMALLTHINKER: - case LLM_ARCH_GLM4_MOE: case LLM_ARCH_SEED_OSS: case LLM_ARCH_GROVEMOE: case LLM_ARCH_APERTUS: @@ -7871,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3VLMOE: return LLAMA_ROPE_TYPE_IMROPE; + case LLM_ARCH_GLM4: + return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM; + case LLM_ARCH_GLM4_MOE: + return model->hparams.use_mrope() ? 
LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ABORT("unknown architecture"); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 33ee707046..003f70f739 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + ggml_tensor * cur; ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); + bool use_mrope = hparams.use_mrope(); + if (ubatch.embd && !use_mrope) { + // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results + GGML_ABORT("This GGUF does not support multimodal. Please reconvert it."); + } + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); } - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + if (use_mrope) { + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } else { + // Normal RoPE + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, + 
rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index f789b28248..204aa3932a 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + ggml_tensor * cur; ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); + bool use_mrope = hparams.use_mrope(); + if (ubatch.embd && !use_mrope) { + // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results + GGML_ABORT("This GGUF does not support multimodal. 
Please reconvert it."); + } + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); } - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); + if (use_mrope) { + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } else { + // Normal RoPE + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, + rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 3ee42036fd..e7f3067a16 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(mtmd clip-graph.h models/models.h models/cogvlm.cpp + models/glm4v.cpp models/internvl.cpp models/kimivl.cpp models/llama4.cpp diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 17f90e8aa8..2b1915779f 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -9,6 +9,8 @@ #include #include +#define DEFAULT_INTERPOLATION_MODE 
(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS) + struct clip_graph { const clip_model & model; const clip_hparams & hparams; @@ -49,7 +51,7 @@ struct clip_graph { void cb(ggml_tensor * cur0, const char * name, int il) const; // siglip2 naflex - ggml_tensor * resize_position_embeddings(); + ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE); // build vision transformer (ViT) cgraph // this function should cover most of the models diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 9ef0d301e4..d75233cc0a 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -68,6 +68,7 @@ #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_NORM_EMBD "v.norm_embd.%s" #define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" @@ -86,6 +87,10 @@ #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" #define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MM_UP "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_DOWN "mm.down.%s" +#define TN_MM_POST_NORM "mm.post_norm.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" @@ -95,7 +100,7 @@ #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 -#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1 +#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral #define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) #define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings 
are not in text model) @@ -165,6 +170,7 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_UNKNOWN, }; @@ -192,6 +198,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_GLM4V, "glm4v"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -495,6 +502,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { } } +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value); + // // API used internally with mtmd // diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 2f7dbb458e..f5c41ff138 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -158,6 +158,8 @@ struct clip_model { ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) ggml_tensor * patch_bias = nullptr; ggml_tensor * position_embeddings = nullptr; + ggml_tensor * norm_embd_w = nullptr; + ggml_tensor * norm_embd_b = nullptr; ggml_tensor * pre_ln_w = nullptr; ggml_tensor * pre_ln_b = nullptr; @@ -172,6 +174,14 @@ struct clip_model { ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) ggml_tensor * mm_fc_w; ggml_tensor * mm_fc_b; + ggml_tensor * mm_ffn_up_w = nullptr; + ggml_tensor * mm_ffn_up_b = nullptr; + ggml_tensor * mm_ffn_gate_w = nullptr; + ggml_tensor * mm_ffn_gate_b = nullptr; + ggml_tensor * mm_ffn_down_w = nullptr; + ggml_tensor * mm_ffn_down_b = nullptr; + ggml_tensor * mm_post_norm_w = nullptr; + ggml_tensor * mm_post_norm_b = nullptr; // LLaVA projection ggml_tensor * mm_input_norm_w = nullptr; @@ -253,9 +263,10 @@ struct clip_model { ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr; - // pixtral + // pixtral, glm4v ggml_tensor * 
token_embd_img_break = nullptr; ggml_tensor * mm_patch_merger_w = nullptr; + ggml_tensor * mm_patch_merger_b = nullptr; // ultravox / whisper encoder ggml_tensor * conv1d_1_w = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fee49e465c..35e3aef0ab 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } // siglip2 naflex -ggml_tensor * clip_graph::resize_position_embeddings() { +ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; + const uint32_t mode = interpolation_mode; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); GGML_ASSERT(pos_embd); @@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm( ? ggml_rms_norm(ctx0, cur, norm_eps) : ggml_norm(ctx0, cur, norm_eps); - if (mw || mb) { - cb(cur, "norm", il); - } - if (mw) { cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } + cb(cur, "norm_w", il); } if (mb) { cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_b", il); } return cur; @@ -842,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GLM4V: + { + builder = std::make_unique<clip_graph_glm4v>(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1155,6 +1154,14 @@ struct clip_model_loader { LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); } } break; + case PROJECTOR_TYPE_GLM4V: + { + hparams.rope_theta = 10000.0f; + hparams.n_merge = 2; // default value for GLM4-V + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on
warmup + } break; case PROJECTOR_TYPE_LLAMA4: { hparams.rope_theta = 10000.0f; @@ -1282,6 +1289,9 @@ struct clip_model_loader { model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); + model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false); + model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false); + model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); // layers @@ -1470,6 +1480,20 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_GLM4V: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); + model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); + model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false); + model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight")); + model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false); + model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight")); + model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias")); + } break; case PROJECTOR_TYPE_GEMMA3: { model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); @@ -1498,8 +1522,8 @@ struct clip_model_loader { // [IMG_BREAK] token embedding model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); // for mistral small 3.1 - model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); - model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, 
false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); } break; case PROJECTOR_TYPE_LIGHTONOCR: { @@ -1507,8 +1531,8 @@ struct clip_model_loader { model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); - model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); - model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); } break; case PROJECTOR_TYPE_ULTRAVOX: { @@ -1873,6 +1897,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params if (ctx_params.warmup) { loader.warmup(*ctx_vision); } + + // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } if (loader.has_audio) { @@ -2582,6 +2608,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); clip_image_u8 resized; @@ -2824,16 +2851,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) { int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; const int n_total = clip_n_output_tokens(ctx, img); - if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - return img->nx / (params.patch_size * 2); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + return (img->nx / params.patch_size) / 2; + default: + break; 
} return n_total; } int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; - if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - return img->ny / (params.patch_size * 2); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + return (img->ny / params.patch_size) / 2; + default: + break; } return 1; } @@ -2890,6 +2931,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { // dynamic size (2 conv, so double patch size) int x_patch = img->nx / (params.patch_size * 2); @@ -3137,6 +3179,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: { const int merge_ratio = hparams.n_merge; const int pw = image_size_width / patch_size; @@ -3363,7 +3406,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + if (vec != nullptr) { + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + } return true; } @@ -3411,6 +3456,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_GLM4V: + return ctx->model.mm_ffn_down_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } @@ -3427,10 +3474,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; } -bool 
clip_is_qwen2vl(const struct clip_ctx * ctx) { +bool clip_is_mrope(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL; + || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL + || ctx->proj_type() == PROJECTOR_TYPE_GLM4V; } bool clip_is_llava(const struct clip_ctx * ctx) { @@ -3491,3 +3539,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } + +// +// API for debugging +// + +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { + clip_image_f32 img; + img.nx = w; + img.ny = h; + img.buf.resize(h * w * 3); + for (int i = 0; i < h * w * 3; i++) { + img.buf[i] = static_cast(fill_value); + } + bool cur_debug_graph = ctx->debug_graph; + ctx->debug_graph = true; + clip_image_encode(ctx, 1, &img, nullptr); + ctx->debug_graph = cur_debug_graph; + GGML_ASSERT(img.buf.empty() && "expected, always stop here"); +} diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 5aae27a700..68a0d6e857 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct int clip_is_minicpmv(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx); -bool clip_is_qwen2vl(const struct clip_ctx * ctx); +bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp new file mode 100644 index 0000000000..f39b6922eb --- /dev/null +++ b/tools/mtmd/models/glm4v.cpp @@ -0,0 +1,120 @@ +#include "models.h" + +ggml_cgraph * clip_graph_glm4v::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + 
GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + + norm_type norm_t = NORM_TYPE_RMS; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + + // pos-conv norm + inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + cb(learned_pos_embd, "learned_pos_embd", -1); 
+ + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return ggml_rope_multi( + ctx0, cur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, + 32768, hparams.rope_theta, 1, 0, 1, 32, 1); + }; + + ggml_tensor * cur = build_vit( + inp, n_patches, + norm_t, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1); + + // GLM4V projector + // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 + + // patch merger (downsample) + { + int n_merge = hparams.n_merge; + GGML_ASSERT(n_merge > 0); + + int n_token_out = n_patches / n_merge / n_merge; + cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out] + cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out] + + cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); + } + + // FC projector + { + cur = ggml_mul_mat(ctx0, model.projection, cur); + // default LayerNorm (post_projection_norm) + cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "after_fc_proj", -1); + } + + // FFN projector + { + cur = build_ffn(cur, + model.mm_ffn_up_w, model.mm_ffn_up_b, + model.mm_ffn_gate_w, model.mm_ffn_gate_b, + model.mm_ffn_down_w, model.mm_ffn_down_b, + hparams.ffn_op, -1); + cb(cur, "after_ffn_proj", -1); + // cb(ggml_sum(ctx0, cur), "merged_sum", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 4b35da259c..0496d6b22f 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -56,3 
+56,8 @@ struct clip_graph_whisper_enc : clip_graph { clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_glm4v : clip_graph { + clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index c63f299cd9..2638fe4fc5 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -217,7 +217,7 @@ struct mtmd_context { void init_vision() { GGML_ASSERT(ctx_v != nullptr); - use_mrope = clip_is_qwen2vl(ctx_v); + use_mrope = clip_is_mrope(ctx_v); projector_type proj = clip_get_projector_type(ctx_v); int minicpmv_version = clip_is_minicpmv(ctx_v); @@ -309,6 +309,10 @@ struct mtmd_context { img_beg = "<|image_start|>"; img_end = "<|image_end|>"; + } else if (proj == PROJECTOR_TYPE_GLM4V) { + img_beg = "<|begin_of_image|>"; + img_end = "<|end_of_image|>"; + } } From 5f5f9b46376ac14d7f95b0d968c182f522602880 Mon Sep 17 00:00:00 2001 From: 2114L3 <2114L3@users.noreply.github.com> Date: Tue, 16 Dec 2025 20:50:43 +1000 Subject: [PATCH 07/44] server: Update README.md incorrect argument (#18073) n-gpu-layer is incorrect argument is n-gpu-layers with the 's' --- tools/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index 073bcd2ccd..ef4990faf1 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1430,7 +1430,7 @@ Model presets allow advanced users to define custom configurations using an `.in llama-server --models-preset ./my-models.ini ``` -Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`. +Each section in the file defines a new preset. 
Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layers 123` is written as `n-gpu-layers = 123`. Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys. @@ -1445,7 +1445,7 @@ version = 1 ; string value chat-template = chatml ; numeric value -n-gpu-layer = 123 +n-gpu-layers = 123 ; flag value (for certain flags, you need to use the "no-" prefix for negation) jinja = true ; shorthand argument (for example, context size) From fb644247de14c616b10deb5e6b17e6f4230f0601 Mon Sep 17 00:00:00 2001 From: Andrew Aladjev Date: Tue, 16 Dec 2025 13:52:23 +0300 Subject: [PATCH 08/44] CLI: fixed adding cli and completion into docker containers, improved docs (#18003) Co-authored-by: Andrew Aladjev --- .devops/cann.Dockerfile | 2 +- .devops/llama-cli-cann.Dockerfile | 5 +++-- .devops/llama-cpp-cuda.srpm.spec | 2 ++ .devops/llama-cpp.srpm.spec | 2 ++ docs/docker.md | 26 +++++++++++++++----------- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 83182c9700..db221b0b81 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"] # ENTRYPOINT ["/app/llama-server"] ### Target: light -# Lightweight image containing only llama-cli +# Lightweight image containing only llama-cli and llama-completion # ============================================================================== FROM base AS light diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index ef43d78cd2..6581187f32 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH RUN echo "Building with static libs" && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ cmake -B build 
-DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target llama-cli && \ + cmake --build build --config Release --target llama-completion # TODO: use image with NNRT FROM ascendai/cann:$ASCEND_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion / ENV LC_ALL=C.utf8 diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index 3bbf4a4def..4d42a906b1 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -37,6 +37,7 @@ make -j GGML_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli +cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple @@ -68,6 +69,7 @@ rm -rf %{_builddir}/* %files %{_bindir}/llama-cuda-cli +%{_bindir}/llama-cuda-completion %{_bindir}/llama-cuda-server %{_bindir}/llama-cuda-simple /usr/lib/systemd/system/llamacuda.service diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 45902dcf89..0a4f43058d 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -39,6 +39,7 @@ make -j %install mkdir -p %{buildroot}%{_bindir}/ cp -p llama-cli %{buildroot}%{_bindir}/llama-cli +cp -p llama-completion %{buildroot}%{_bindir}/llama-completion cp -p llama-server %{buildroot}%{_bindir}/llama-server cp -p llama-simple %{buildroot}%{_bindir}/llama-simple @@ -70,6 +71,7 @@ rm -rf %{_builddir}/* %files %{_bindir}/llama-cli +%{_bindir}/llama-completion %{_bindir}/llama-server %{_bindir}/llama-simple /usr/lib/systemd/system/llama.service diff --git a/docs/docker.md b/docs/docker.md index b9e5015396..a3b263497c 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ 
-7,9 +7,9 @@ ## Images We have three Docker images available for this project: -1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) -2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) -3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) +1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) +2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) +3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) Additionally, there the following images, similar to the above: @@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o On completion, you are ready to play! 
```bash -docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512 ``` or with a light image: ```bash -docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf +docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512 ``` or with a server image: @@ -59,6 +61,8 @@ or with a server image: docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 ``` +In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container. + ## Docker With CUDA Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. @@ -80,9 +84,9 @@ The defaults are: The resulting images, are essentially the same as the non-CUDA images: -1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. 
`local/llama.cpp:light-cuda`: This image only includes the main executable file. -3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. +1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables. +3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable. ## Usage @@ -114,9 +118,9 @@ The defaults are: The resulting images, are essentially the same as the non-MUSA images: -1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-musa`: This image only includes the main executable file. -3. `local/llama.cpp:server-musa`: This image only includes the server executable file. +1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables. +3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable. ## Usage From a5251ca11d2317d93a7b6da4217483f4e83beb3d Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Tue, 16 Dec 2025 11:59:53 +0100 Subject: [PATCH 09/44] Optimization: Qwen3 next autoregressive pass (#17996) * It's Qwen3 Next, the lean mean token generation machine! 
* Apply patches from thread * Remove recurrent version, only keep chunked and autoregressive * Remove unnecessary conts and asserts * Remove more extra conts and asserts * Cleanup masking --- src/models/models.h | 22 +-- src/models/qwen3next.cpp | 333 +++++++++------------------------------ 2 files changed, 85 insertions(+), 270 deletions(-) diff --git a/src/models/models.h b/src/models/models.h index 6494f54501..ffb36acc61 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -441,23 +441,13 @@ private: ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il); ggml_tensor * build_layer_ffn( ggml_tensor * cur, int il); - ggml_tensor * build_delta_net_recurrent( - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - int il); - ggml_tensor * build_delta_net_chunking( ggml_tensor * q, ggml_tensor * k, @@ -467,8 +457,18 @@ private: ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il); + ggml_tensor * build_delta_net_autoregressive( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il); + ggml_tensor * build_norm_gated( ggml_tensor * input, ggml_tensor * weights, diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index c8f1b5ec90..775b3135d3 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * causal_mask = - ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), + ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), GGML_TRI_TYPE_LOWER); - 
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f)); + ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); + ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity); ggml_build_forward_expand(gf, causal_mask); ggml_build_forward_expand(gf, identity); + ggml_build_forward_expand(gf, diag_mask); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr // Determine layer type and build appropriate attention mechanism if (hparams.is_recurrent(il)) { // Linear attention layer (gated delta net) - cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il); + cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il); } else { // Full attention layer cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il); @@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il) { - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); - GGML_ASSERT(ggml_is_contiguous(g)); - GGML_ASSERT(ggml_is_contiguous(beta)); - GGML_ASSERT(ggml_is_contiguous(state)); - const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; const int64_t n_tokens = q->ne[2]; @@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - // TODO: can this ever be false? 
- const bool use_qk_l2norm = true; + const float eps_norm = hparams.f_norm_rms_eps; - if (use_qk_l2norm) { - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - } + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); const float scale = 1.0f / sqrtf(S_v); @@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( beta = ggml_sigmoid(ctx0, beta); - ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); - cb(q, "q_in", il); cb(k, "k_in", il); cb(v, "v_in", il); @@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( cb(v_beta, "v_beta", il); cb(k_beta, "k_beta", il); - ggml_tensor * chunked_mask = - ggml_view_4d(ctx0, causal_mask, chunk_size, - chunk_size, causal_mask->ne[2], causal_mask->ne[3], - causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0); + q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); + k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); + k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); + v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); + v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); - ggml_tensor * chunked_diag_mask = - ggml_view_4d(ctx0, causal_diag_mask, chunk_size, - chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3], - causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0); - - ggml_tensor * chunked_identity = - ggml_view_4d(ctx0, identity, chunk_size, - chunk_size, identity->ne[2], identity->ne[3], - identity->nb[1], identity->nb[2], identity->nb[3], 0); - - q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); - k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); - k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); - v = ggml_cont_4d(ctx0, v, S_v, 
chunk_size, n_chunks, H_v * n_seqs); - v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); - - g = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs); - beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); + g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); cb(g_cumsum, "g_cumsum", il); - ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); - ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); + ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); ggml_tensor * gcs_j_broadcast = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); @@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( cb(decay_mask, "decay_mask", il); - decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); - ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask)); + ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); cb(attn, "attn_pre_solve", il); - ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower); + ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); 
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); - attn = ggml_mul(ctx0, lin_solve, chunked_mask); - attn = ggml_add(ctx0, attn, chunked_identity); + attn = ggml_mul(ctx0, lin_solve, causal_mask); + attn = ggml_add(ctx0, attn, identity); cb(attn, "attn_solved", il); @@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) attn = ggml_mul_mat(ctx0, k_chunk, q_chunk); attn = ggml_mul(ctx0, attn, decay_mask_chunk); - attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask)); + attn = ggml_mul(ctx0, attn, diag_mask); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); @@ -361,23 +335,14 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking( return ggml_concat(ctx0, flat_output, flat_state, 0); } -ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent( +ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * beta, ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, int il) { - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); - GGML_ASSERT(ggml_is_contiguous(g)); - GGML_ASSERT(ggml_is_contiguous(beta)); - GGML_ASSERT(ggml_is_contiguous(state)); - const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; const int64_t n_tokens = q->ne[2]; @@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent( const int64_t S_v = v->ne[0]; const int64_t H_v = v->ne[1]; + GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing GGML_ASSERT(v->ne[2] == n_tokens); GGML_ASSERT(k->ne[2] == n_tokens); GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); @@ -397,215 +363,65 @@ ggml_tensor * 
llm_build_qwen3next::build_delta_net_recurrent( GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - // TODO: can this ever be false? - const bool use_qk_l2norm = true; + const float eps_norm = hparams.f_norm_rms_eps; - if (use_qk_l2norm) { - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - } + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); const float scale = 1.0f / sqrtf(S_v); - q = ggml_scale(ctx0, q, scale); - + q = ggml_scale(ctx0, q, scale); beta = ggml_sigmoid(ctx0, beta); - ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); - cb(q, "q_in", il); cb(k, "k_in", il); cb(v, "v_in", il); cb(beta, "beta_in", il); cb(g, "g_in", il); - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); - - beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(g, "g_perm", il); - cb(state, "state_in", il); + ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); + ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); - GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == 
n_seqs); + // Apply exponential to g_t + g_t = ggml_exp(ctx0, g_t); - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + // Apply the gated delta rule for the single timestep + // last_recurrent_state = last_recurrent_state * g_t + state = ggml_mul(ctx0, state, g_t); - ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); + // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) + ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs); + ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed); + // we need to sum over dim=-2, so we transpose, sum, then transpose again + kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem)))); - cb(k_beta, "k_beta", il); - cb(v_beta, "v_beta", il); - cb(g_cumsum, "g_cumsum", il); + // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v) + ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + // delta = (v_t - kv_mem) * beta_t + ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs] + ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t); - ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs] - ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs] + // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta + ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta); + state = ggml_add(ctx0, state, k_t_delta); - // Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs] - // ggml_tensor * gcs_i_broadcast = - // ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v, - // n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs] - // Don't need this, this one will get auto-broadcast - 
ggml_tensor * gcs_j_broadcast = - ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs] - - ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); - - // Apply lower triangular mask to ensure attention is causal (only past tokens influence current) - decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); - // Apply exponential to get the decay mask values - decay_mask = ggml_exp(ctx0, decay_mask); - // Apply lower triangular mask again to ensure only lower triangular values remain - decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); - - cb(decay_mask, "decay_mask", il); - - // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0) - ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); - - cb(kmulkbeta, "kmulkbeta", il); - - ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); - ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); - - cb(attn, "attn_pre_rec", il); - - // for i in range(1, chunk_size): - // row = attn[..., i, :i].clone() - // sub = attn[..., :i, :i].clone() - // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) - // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) - // - // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) - ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); - attn = ggml_mul(ctx0, lin_solve, causal_mask); - attn = ggml_add(ctx0, attn, identity); - - // value = attn @ v_beta - v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); - - cb(v, "value_beta", il); - - // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) - ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum)); - 
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); - - cb(gexp, "g_cum_exp", il); - - ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); - - cb(kbeta_gexp, "kbeta_gexp", il); - - ggml_tensor * k_cumdecay = - ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); - - cb(k_cumdecay, "k_cumdecay", il); - - // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) - attn = ggml_mul_mat(ctx0, k, q); - attn = ggml_mul(ctx0, attn, decay_mask); - attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask)); - - cb(attn, "attn_decay_key", il); - - ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - - // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay); - - cb(v_prime, "v_prime", il); - - // v_new = v_i - v_prime - ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime); - - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); - - cb(v_new, "v_new", il); - - // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state - ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); - - cb(attn_inter, "attn_inter", il); - - // core_attn_out[:, :, i] = attn_inter + attn @ v_new - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn); - - cb(v_attn, "v_attn", il); - - ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn); - - cb(core_attn_out, "core_attn_out", il); - - // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1) - // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp() - // key_gdiff = key * g_diff.unsqueeze(-1) - // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new - // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew - - ggml_tensor * g_cum_last = - ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, 
g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3], - g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3], - g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1))); - - cb(g_cum_last, "g_cum_last", il); - - ggml_tensor * gexp_last = - ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]); - - cb(gexp_last, "gexp_last", il); - - ggml_tensor * g_cum_last_3d = - ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]); - - cb(g_cum_last_3d, "g_cum_last_3d", il); - - ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]); - - cb(g_cumsum_3d, "g_cumsum_3d", il); - - ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d)); - - cb(g_diff, "g_diff", il); - - ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); - - cb(g_diff_exp, "g_diff_exp", il); - - ggml_tensor * key_gdiff = ggml_mul(ctx0, k, - ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1], - g_diff_exp->ne[2] * g_diff_exp->ne[3])); - - cb(key_gdiff, "key_gdiff", il); - - ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff))); - - cb(kgdmulvnew, "kgdmulvnew", il); - - state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew); + // Compute the attention output + // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) + ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t + ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed); + // again, since it's over dim = -2, transpose, sum, transpose back + ggml_tensor * core_attn_out = + ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q)))); + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); cb(state, "new_state", il); - // flatten output - ggml_tensor * flat_output 
= - ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); - - ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs); + // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise + ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs); + ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs); return ggml_concat(ctx0, flat_output, flat_state, 0); } @@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il) { const auto * mctx_cur = inp->mctx; @@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(mixed_ba, "linear_attn_mixed_ba", il); int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); - ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); + ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads] int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; - ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); + ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); // Split mixed_ba into b and a (beta and alpha parameters) int64_t split_sizes_ba[2] = { @@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); - GGML_ASSERT(ggml_nelements(beta) 
+ ggml_nelements(alpha) == ggml_nelements(mixed_ba)); - ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); cb(alpha_softplus, "a_softplus", il); @@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float)); cb(z, "z", il); - GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) == - ggml_nelements(mixed_qkvz)); - // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs] ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); @@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens - ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? 
- build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) : - build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il); + // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens + ggml_tensor * attn_out; + if (n_seq_tokens == 1) { + attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); + } else { + attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); + } cb(attn_out, "attn_out", il); // The tensors were concatenated 1d, so we need to extract them 1d as well From 7b1db3d3b770d0affbf3aadee033e1614280085f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 16 Dec 2025 12:01:27 +0100 Subject: [PATCH 10/44] arg: clarify auto kvu/np being set on server (#17997) * arg: clarify auto kvu/np being set on server * improve docs * use invalid_argument --- common/arg.cpp | 50 +++++++++++++++++++++++++-------- examples/gen-docs/gen-docs.cpp | 9 +++--- tools/completion/completion.cpp | 3 -- tools/mtmd/mtmd-cli.cpp | 2 -- tools/server/README.md | 13 ++++----- tools/server/server.cpp | 9 ++---- 6 files changed, 51 insertions(+), 35 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index acf4c8f8a8..f2aec895ba 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) { } common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + // per-example default params + // we define here to make sure it's included in llama-gen-docs + if (ex == LLAMA_EXAMPLE_COMPLETION) { + params.use_jinja = false; // disable jinja by default + + } else if (ex == LLAMA_EXAMPLE_MTMD) { + params.use_jinja = false; // disable jinja by default + params.sampling.temp = 0.2; // lower temp by default for better quality + + 
} else if (ex == LLAMA_EXAMPLE_SERVER) { + params.n_parallel = -1; // auto by default + } + params.use_color = tty_can_use_colors(); // load dynamic backends @@ -1107,7 +1120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_SWA_FULL")); add_opt(common_arg( {"--ctx-checkpoints", "--swa-checkpoints"}, "N", - string_format("max number of context checkpoints to create per slot (default: %d)\n" + string_format("max number of context checkpoints to create per slot (default: %d)" "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints), [](common_params & params, int value) { params.n_ctx_checkpoints = value; @@ -1115,7 +1128,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--cache-ram", "-cram"}, "N", - string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n" + string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)" "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib), [](common_params & params, int value) { params.cache_ram_mib = value; @@ -1123,12 +1136,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--kv-unified", "-kvu"}, - string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n" - "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? 
"true" : "false"), + "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)", [](common_params & params) { params.kv_unified = true; } - ).set_env("LLAMA_ARG_KV_UNIFIED")); + ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, @@ -1888,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n"); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(common_arg( - {"-np", "--parallel"}, "N", - string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](common_params & params, int value) { - params.n_parallel = value; - } - ).set_env("LLAMA_ARG_N_PARALLEL")); + if (ex == LLAMA_EXAMPLE_SERVER) { + // this is to make sure this option appears in the server-specific section of the help message + add_opt(common_arg( + {"-np", "--parallel"}, "N", + string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel), + [](common_params & params, int value) { + if (value == 0) { + throw std::invalid_argument("error: invalid value for n_parallel\n"); + } + params.n_parallel = value; + } + ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER})); + } else { + add_opt(common_arg( + {"-np", "--parallel"}, "N", + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](common_params & params, int value) { + params.n_parallel = value; + } + ).set_env("LLAMA_ARG_N_PARALLEL")); + } add_opt(common_arg( {"-ns", "--sequences"}, "N", string_format("number of sequences to decode (default: %d)", params.n_sequences), diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index e9f7bf9313..dc76c4cf53 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -48,7 +48,7 @@ static void 
write_table(std::ofstream & file, std::vector & opts) } } -static void export_md(std::string fname, llama_example ex) { +static void export_md(std::string fname, llama_example ex, std::string name) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); common_params params; @@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) { write_table(file, common_options); file << "\n\n**Sampling params**\n\n"; write_table(file, sparam_options); - file << "\n\n**Example-specific params**\n\n"; + file << "\n\n**" << name << "-specific params**\n\n"; write_table(file, specific_options); } int main(int, char **) { - export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION); - export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + // TODO: add CLI + export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool"); + export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server"); return 0; } diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 85480f3369..29770515f5 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -87,9 +87,6 @@ int main(int argc, char ** argv) { common_params params; g_params = ¶ms; - // disable jinja by default - params.use_jinja = false; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) { return 1; } diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 332d2049e5..3ee1c2eccf 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -270,8 +270,6 @@ int main(int argc, char ** argv) { ggml_time_init(); common_params params; - params.use_jinja = false; // disable jinja by default - params.sampling.temp = 0.2; // lower temp by default for better quality if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) { return 1; diff --git a/tools/server/README.md b/tools/server/README.md index ef4990faf1..9a2b9b1f36 100644 --- a/tools/server/README.md +++ 
b/tools/server/README.md @@ -52,7 +52,6 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | -| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)
(env: LLAMA_ARG_KV_UNIFIED) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | | `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | | `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | @@ -67,11 +66,10 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | -| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_HOST) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | -| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | @@ -150,19 +148,20 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | -**Example-specific params** +**Server-specific params** | Argument | Explanation | | -------- | ----------- | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | -| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | +| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | +| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode
| | `-sp, --special` | special tokens output enabled (default: false) | | `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | -| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)
(env: LLAMA_ARG_N_PARALLEL) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d5bef3df44..235ae4e8c0 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -73,13 +73,8 @@ int main(int argc, char ** argv, char ** envp) { return 1; } - // TODO: should we have a separate n_parallel parameter for the server? - // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 - // TODO: this is a common configuration that is suitable for most local use cases - // however, overriding the parameters is a bit confusing - figure out something more intuitive - if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { - LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - + if (params.n_parallel < 0) { + LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__); params.n_parallel = 4; params.kv_unified = true; } From 7f2b2f3c778f430edc57d5728641317e9ac5a505 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 16 Dec 2025 13:22:30 +0100 Subject: [PATCH 11/44] arch: refactor LLM_TENSOR_NAMES (#18051) * arch: refactor LLM_TENSOR_NAMES * update docs * typo * fix LLM_ARCH_NEMOTRON_H_MOE * show more meaningful error message on missing tensor * fix and tested LLM_ARCH_NEMOTRON_H_MOE --- docs/development/HOWTO-add-model.md | 2 +- src/llama-arch.cpp | 4169 ++++++++++++--------------- src/llama-arch.h | 10 +- 3 files changed, 1897 insertions(+), 2284 deletions(-) diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 9d1452e3f0..b6870f6e25 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files: 1. Define a new `llm_arch` enum value in `src/llama-arch.h`. 2. In `src/llama-arch.cpp`: - Add the architecture name to the `LLM_ARCH_NAMES` map. 
- - Add the tensor mappings to the `LLM_TENSOR_NAMES` map. + - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`) 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`. 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`. diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 05b12a6072..8caf80afcf 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -3,6 +3,7 @@ #include "llama-impl.h" #include +#include static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize @@ -302,2286 +303,1884 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, }; -static const std::map> LLM_TENSOR_NAMES = { - { - LLM_ARCH_CLIP, - {}, - }, - { - LLM_ARCH_LLAMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_ARCEE, - { - { 
LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_AFMOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - }, - }, - { - LLM_ARCH_LLAMA4, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, 
"output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_DECI, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { 
LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_BAICHUAN, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_FALCON, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_GROK, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { 
LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - }, - }, - { - LLM_ARCH_GPT2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, - }, - { - LLM_ARCH_GPTJ, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - }, - }, - { - LLM_ARCH_GPTNEOX, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_MPT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output"}, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, - { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_ATTN_Q_NORM, 
"blk.%d.attn_q_norm"}, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, - }, - }, - { - LLM_ARCH_STARCODER, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, - }, - { - LLM_ARCH_REFACT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_BERT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_TOKEN_TYPES, "token_types" }, - { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_CLS, "cls" }, - { LLM_TENSOR_CLS_OUT, "cls.output" }, - }, - }, - { - LLM_ARCH_NOMIC_BERT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { 
LLM_TENSOR_TOKEN_TYPES, "token_types" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_NOMIC_BERT_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_TOKEN_TYPES, "token_types" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_NEO_BERT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, - { LLM_TENSOR_CLS, "cls" }, - { LLM_TENSOR_CLS_OUT, "cls.output" }, - }, - }, - { - LLM_ARCH_JINA_BERT_V2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_TOKEN_TYPES, "token_types" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, 
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_CLS, "cls" }, - }, - }, - { - LLM_ARCH_JINA_BERT_V3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_TOKEN_TYPES, "token_types" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - }, - }, - { - LLM_ARCH_BLOOM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, - }, - { - LLM_ARCH_STABLELM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" 
}, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - }, - }, - { - LLM_ARCH_QWEN, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_QWEN2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_QWEN2VL, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_QWEN2MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, 
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_QWEN3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_CLS_OUT, "cls.output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_QWEN3MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - 
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_QWEN3NEXT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" }, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - }, - }, - { - LLM_ARCH_QWEN3VL, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, 
"blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_QWEN3VLMOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_PHI2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_PHI3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, - { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, 
"blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_PHIMOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, - { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_PLAMO, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_PLAMO2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { 
LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" }, - { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" }, - { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - }, - }, - { - LLM_ARCH_CODESHELL, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_ORION, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { 
LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_INTERNLM2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_MINICPM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, - { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - }, - }, - { - LLM_ARCH_MINICPM3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, - { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q_A_NORM, 
"blk.%d.attn_q_a_norm" }, - { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, - { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, - { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, - { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, - }, - { - LLM_ARCH_GEMMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_GEMMA2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - }, - }, - { - LLM_ARCH_GEMMA3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { 
LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - }, - }, - { - LLM_ARCH_GEMMA3N, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" }, - { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" }, - { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" }, - { LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" }, - { LLM_TENSOR_ALTUP_PROJ, "altup_proj" }, - { LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" }, - { LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" }, - { LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" }, - { LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" }, - { LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" }, - { LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" }, - { LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" }, - { 
LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" }, - { LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" }, - { LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" }, - { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" }, - }, - }, - { - LLM_ARCH_GEMMA_EMBEDDING, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_DENSE_2_OUT, "dense_2" }, - { LLM_TENSOR_DENSE_3_OUT, "dense_3" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - }, - }, - { - LLM_ARCH_STARCODER2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_MAMBA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, 
"blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - }, - }, - { - LLM_ARCH_MAMBA2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - }, - }, - { - LLM_ARCH_JAMBA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" }, - { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - 
LLM_ARCH_FALCON_H1, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_XVERSE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_COMMAND_R, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" 
}, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - }, - }, - { - LLM_ARCH_COHERE2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_DBRX, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_OLMO, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_OLMO2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { 
LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_OLMOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_OPENELM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_ARCTIC, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { 
LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_DEEPSEEK, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_DEEPSEEK2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, - { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, - { 
LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, - { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, - { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, - { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, - { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - }, - }, - { - LLM_ARCH_PLM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, - { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, - { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_CHATGLM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, 
"blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, - }, - { - LLM_ARCH_GLM4, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, - }, - }, - { - LLM_ARCH_GLM4_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - // NextN/MTP tensors - preserved 
but unused (in final layer, dynamic layer number) - { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, - { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, - { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, - { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, - { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, - { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, - }, - }, - { - LLM_ARCH_BITNET, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" }, - }, - }, - { - LLM_ARCH_T5, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" }, - { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" }, - { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" }, - { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" }, - { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" }, - { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" }, - { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" }, - { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" }, - { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" }, - { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" }, - { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" }, - { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" }, - { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" }, - { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" }, - { 
LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" }, - { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" }, - { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" }, - { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, - { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, - { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, - { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, - { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, - { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, - { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, - { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, - { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, - { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, - { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_T5ENCODER, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, - { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, - { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, - { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, - { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, - { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, - { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, - { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, - { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, - { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, - { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_JAIS, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, - }, - { - LLM_ARCH_NEMOTRON, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - 
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_NEMOTRON_H, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - // mamba(2) ssm layers - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - // attention layers - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - // dense FFN - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_NEMOTRON_H_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - // mamba(2) ssm layers - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - // attention layers - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { 
LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - // dense FFN - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - // MoE FFN (for MoE layers) - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_EXP_PROBS_B,"blk.%d.exp_probs_b" }, - // MoE shared expert layer - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_EXAONE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_EXAONE4, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_POST_NORM, 
"blk.%d.post_ffw_norm" }, - } - }, - { - LLM_ARCH_RWKV6, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, - { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, - { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" }, - { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" }, - { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" }, - { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" }, - { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" }, - { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" }, - { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, - { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" }, - { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" }, - { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" }, - { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" }, - { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, - { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, - { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, - { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" }, - { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, - { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, - { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" }, - { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" }, - { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" }, - { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" }, - { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, - }, - }, - { - LLM_ARCH_RWKV6QWEN2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" 
}, - { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, - { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, - { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" }, - { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, - { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" }, - { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" }, - { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" }, - { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" }, - { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, - { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, - { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, - { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" }, - { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_RWKV7, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" }, - { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, - { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, - { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" }, - { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" }, - { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" }, - { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" }, - { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" }, - { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" }, - { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" }, - { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" }, - { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" }, - { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" }, - { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" 
}, - { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, - { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, - { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, - { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, - { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, - { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, - { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" }, - { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" }, - { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" }, - }, - }, - { - LLM_ARCH_ARWKV7, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" }, - { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, - { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, - { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" }, - { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" }, - { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" }, - { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" }, - { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" }, - { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" }, - { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" }, - { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" }, - { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" }, - { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" }, - { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" }, - { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, - { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, - { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, - { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, - { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, - { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, 
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_GRANITE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_GRANITE_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_GRANITE_HYBRID, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - // mamba(2) ssm layers - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, - { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, - { LLM_TENSOR_SSM_OUT, 
"blk.%d.ssm_out" }, - // attention layers - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - // dense FFN - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - // moe FFN - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - // shared expert - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_CHAMELEON, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - }, - }, - { - LLM_ARCH_WAVTOKENIZER_DEC, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_CONV1D, "conv1d" }, - { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" }, - { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" }, - { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" }, - { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" }, - { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" }, - { 
LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" }, - { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" }, - { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" }, - { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" }, - { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" }, - { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" }, - { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" }, - { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" }, - { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" }, - { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, - }, - }, - { - LLM_ARCH_BAILINGMOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, - { - LLM_ARCH_BAILINGMOE2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" 
}, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, - { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, - { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, - { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, - { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, - { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, - }, - }, - { - LLM_ARCH_DOTS1, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, 
"blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - } - }, - { - LLM_ARCH_ERNIE4_5, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_ERNIE4_5_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - }, - }, - { - LLM_ARCH_HUNYUAN_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { 
LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, - { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_HUNYUAN_DENSE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - - }, - }, - { - LLM_ARCH_SMOLLM3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_OPENAI_MOE, - { - { 
LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_LFM2, - { - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" }, - { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" }, - { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name - { LLM_TENSOR_OUTPUT, "output" }, - } - }, - { - LLM_ARCH_LFM2MOE, - { - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_GATE, 
"blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" }, - { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" }, - { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - } - }, - { - LLM_ARCH_SMALLTHINKER, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" } - }, - }, - { - LLM_ARCH_APERTUS, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K_NORM, 
"blk.%d.attn_k_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_DREAM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_LLADA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_LLADA_MOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { 
LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_SEED_OSS, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_GROVEMOE, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" }, - { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" }, - { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" }, - }, - }, - { - LLM_ARCH_MINIMAX_M2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - 
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, - }, - }, - { - LLM_ARCH_PANGU_EMBED, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, - }, - { - LLM_ARCH_COGVLM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" }, - { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" }, - { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, - { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, - { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, - }, - }, - { - LLM_ARCH_RND1, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, - { 
LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_MISTRAL3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - }, - }, - { - LLM_ARCH_UNKNOWN, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - }, - }, +static const std::map LLM_TENSOR_NAMES = { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + 
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_CLS, "cls" }, + { LLM_TENSOR_CLS_OUT, "cls.output" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, + { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" }, + { LLM_TENSOR_SSM_IN, 
"blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" }, + { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" }, + { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" }, + { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" }, + { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" }, + { LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" }, + { LLM_TENSOR_ALTUP_PROJ, "altup_proj" }, + { LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" }, + { LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" }, + { LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" }, + { LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" }, + { LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" }, + { LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" }, + { LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" }, + { LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" }, + { LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" }, + { LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" }, + { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" }, + { LLM_TENSOR_DENSE_2_OUT, "dense_2" }, + { LLM_TENSOR_DENSE_3_OUT, "dense_3" }, + { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, + { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, + { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, + { LLM_TENSOR_NEXTN_EH_PROJ, 
"blk.%d.nextn.eh_proj" }, + { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, + { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, + { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, + { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, + { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, + { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" }, + { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" }, + { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" }, + { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" }, + { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" }, + { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" }, + { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" }, + { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" }, + { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" }, + { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" }, + { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" }, + { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" }, + { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" }, + { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" }, + { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" }, + { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" }, + { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" }, + { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" }, + { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" 
}, + { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, + { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" }, + { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" }, + { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" }, + { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" }, + { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" }, + { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" }, + { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, + { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" }, + { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" }, + { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" }, + { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" }, + { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, + { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, + { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, + { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" }, + { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, + { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, + { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" }, + { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" }, + { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" }, + { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" }, + { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, + { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" }, + { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" }, + { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" }, + { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" }, + { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" }, + { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" }, + { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" }, + { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" }, + { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" }, + { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" }, + { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" }, + { 
LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" }, + { LLM_TENSOR_CONV1D, "conv1d" }, + { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" }, + { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" }, + { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" }, + { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" }, + { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" }, + { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" }, + { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" }, + { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" }, + { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" }, + { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" }, + { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" }, + { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" }, + { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" }, + { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" }, + { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, + { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" }, + { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" }, + { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" }, + { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, + { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" }, + { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" }, + { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" }, + { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" }, + { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" }, + { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, + { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, + { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, }; +static std::set llm_get_tensor_names(llm_arch arch) { + switch (arch) { + case LLM_ARCH_CLIP: + return {}; + case LLM_ARCH_LLAMA: + case LLM_ARCH_DECI: + case LLM_ARCH_MISTRAL3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + 
LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXP, + LLM_TENSOR_FFN_DOWN_EXP, + LLM_TENSOR_FFN_UP_EXP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_ARCEE: + case LLM_ARCH_STARCODER2: + case LLM_ARCH_NEMOTRON: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_AFMOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_GATE, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; + case LLM_ARCH_LLAMA4: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXP, + LLM_TENSOR_FFN_DOWN_EXP, + LLM_TENSOR_FFN_UP_EXP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case 
LLM_ARCH_BAICHUAN: + case LLM_ARCH_ORION: + case LLM_ARCH_XVERSE: + case LLM_ARCH_EXAONE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_FALCON: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_GROK: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXP, + LLM_TENSOR_FFN_DOWN_EXP, + LLM_TENSOR_FFN_UP_EXP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_ATTN_OUT_NORM, + }; + case LLM_ARCH_GPT2: + case LLM_ARCH_STARCODER: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + }; + case LLM_ARCH_GPTNEOX: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_MPT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + 
LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_ACT, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + }; + case LLM_ARCH_REFACT: + case LLM_ARCH_QWEN2: + case LLM_ARCH_QWEN2VL: + case LLM_ARCH_INTERNLM2: + case LLM_ARCH_GRANITE: + case LLM_ARCH_ERNIE4_5: + case LLM_ARCH_SMOLLM3: + case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: + case LLM_ARCH_PANGU_EMBED: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_BERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_CLS, + LLM_TENSOR_CLS_OUT, + }; + case LLM_ARCH_NOMIC_BERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, + LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_NOMIC_BERT_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, + LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_NEO_BERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + 
LLM_TENSOR_FFN_UP, + LLM_TENSOR_ENC_OUTPUT_NORM, + LLM_TENSOR_CLS, + LLM_TENSOR_CLS_OUT, + }; + case LLM_ARCH_JINA_BERT_V2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_CLS, + }; + case LLM_ARCH_JINA_BERT_V3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, + LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_LAYER_OUT_NORM, + }; + case LLM_ARCH_BLOOM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + }; + case LLM_ARCH_STABLELM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + }; + case LLM_ARCH_QWEN: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_QWEN2MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + 
LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case LLM_ARCH_QWEN3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_CLS_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_QWEN3MOE: + case LLM_ARCH_QWEN3VLMOE: + case LLM_ARCH_OLMOE: + case LLM_ARCH_LLADA_MOE: + case LLM_ARCH_RND1: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_QWEN3NEXT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_SSM_A_NOSCAN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_BETA_ALPHA, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + }; + case LLM_ARCH_QWEN3VL: + case LLM_ARCH_CHAMELEON: + case LLM_ARCH_HUNYUAN_DENSE: + return { + LLM_TENSOR_TOKEN_EMBD, + 
LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_PHI2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_PHI3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_PHIMOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_PLAMO: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_PLAMO2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + 
LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_X, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_OUT, + LLM_TENSOR_SSM_DT_NORM, + LLM_TENSOR_SSM_B_NORM, + LLM_TENSOR_SSM_C_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_POST_NORM, + }; + case LLM_ARCH_CODESHELL: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_MINICPM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXP, + LLM_TENSOR_FFN_DOWN_EXP, + LLM_TENSOR_FFN_UP_EXP, + }; + case LLM_ARCH_MINICPM3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q_A_NORM, + LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_A, + LLM_TENSOR_ATTN_Q_B, + LLM_TENSOR_ATTN_KV_A_MQA, + LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + }; + case LLM_ARCH_GEMMA: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + 
LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_GEMMA2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_POST_NORM, + }; + case LLM_ARCH_GEMMA3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_POST_NORM, + }; + case LLM_ARCH_GEMMA3N: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_PER_LAYER_TOKEN_EMBD, + LLM_TENSOR_PER_LAYER_MODEL_PROJ, + LLM_TENSOR_PER_LAYER_PROJ_NORM, + LLM_TENSOR_ALTUP_UNEMBD_PROJ, + LLM_TENSOR_ALTUP_PROJ, + LLM_TENSOR_PER_LAYER_INP_GATE, + LLM_TENSOR_PER_LAYER_PROJ, + LLM_TENSOR_PER_LAYER_POST_NORM, + LLM_TENSOR_ALTUP_CORRECT_COEF, + LLM_TENSOR_ALTUP_CORRECT_SCALE, + LLM_TENSOR_ALTUP_PREDICT_COEF, + LLM_TENSOR_ALTUP_ROUTER, + LLM_TENSOR_ALTUP_ROUTER_NORM, + LLM_TENSOR_LAUREL_L, + LLM_TENSOR_LAUREL_R, + LLM_TENSOR_LAUREL_POST_NORM, + }; + case LLM_ARCH_GEMMA_EMBEDDING: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_DENSE_2_OUT, + LLM_TENSOR_DENSE_3_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + 
LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_POST_NORM, + }; + case LLM_ARCH_MAMBA: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_X, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_OUT, + }; + case LLM_ARCH_MAMBA2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + }; + case LLM_ARCH_JAMBA: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_X, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_DT_NORM, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_B_NORM, + LLM_TENSOR_SSM_C_NORM, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_OUT, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_FALCON_H1: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_COMMAND_R: + return { + 
LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + }; + case LLM_ARCH_COHERE2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_DBRX: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_OLMO: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_OLMO2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_OPENELM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_ARCTIC: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, 
+ LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM_EXPS, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_DEEPSEEK: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case LLM_ARCH_DEEPSEEK2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q_A_NORM, + LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_A, + LLM_TENSOR_ATTN_Q_B, + LLM_TENSOR_ATTN_KV_A_MQA, + LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; + case LLM_ARCH_PLM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_KV_A_MQA, + LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_CHATGLM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + 
LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + }; + case LLM_ARCH_GLM4: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_POST_NORM, + }; + case LLM_ARCH_GLM4_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_EXP_PROBS_B, + LLM_TENSOR_NEXTN_EH_PROJ, + LLM_TENSOR_NEXTN_EMBED_TOKENS, + LLM_TENSOR_NEXTN_ENORM, + LLM_TENSOR_NEXTN_HNORM, + LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, + LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + }; + case LLM_ARCH_BITNET: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_SUB_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_SUB_NORM, + }; + case LLM_ARCH_T5: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_DEC_OUTPUT_NORM, + LLM_TENSOR_DEC_ATTN_NORM, + LLM_TENSOR_DEC_ATTN_Q, + LLM_TENSOR_DEC_ATTN_K, + LLM_TENSOR_DEC_ATTN_V, + LLM_TENSOR_DEC_ATTN_OUT, + LLM_TENSOR_DEC_ATTN_REL_B, + 
LLM_TENSOR_DEC_CROSS_ATTN_NORM, + LLM_TENSOR_DEC_CROSS_ATTN_Q, + LLM_TENSOR_DEC_CROSS_ATTN_K, + LLM_TENSOR_DEC_CROSS_ATTN_V, + LLM_TENSOR_DEC_CROSS_ATTN_OUT, + LLM_TENSOR_DEC_CROSS_ATTN_REL_B, + LLM_TENSOR_DEC_FFN_NORM, + LLM_TENSOR_DEC_FFN_GATE, + LLM_TENSOR_DEC_FFN_DOWN, + LLM_TENSOR_DEC_FFN_UP, + LLM_TENSOR_ENC_OUTPUT_NORM, + LLM_TENSOR_ENC_ATTN_NORM, + LLM_TENSOR_ENC_ATTN_Q, + LLM_TENSOR_ENC_ATTN_K, + LLM_TENSOR_ENC_ATTN_V, + LLM_TENSOR_ENC_ATTN_OUT, + LLM_TENSOR_ENC_ATTN_REL_B, + LLM_TENSOR_ENC_FFN_NORM, + LLM_TENSOR_ENC_FFN_GATE, + LLM_TENSOR_ENC_FFN_DOWN, + LLM_TENSOR_ENC_FFN_UP, + }; + case LLM_ARCH_T5ENCODER: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ENC_OUTPUT_NORM, + LLM_TENSOR_ENC_ATTN_NORM, + LLM_TENSOR_ENC_ATTN_Q, + LLM_TENSOR_ENC_ATTN_K, + LLM_TENSOR_ENC_ATTN_V, + LLM_TENSOR_ENC_ATTN_OUT, + LLM_TENSOR_ENC_ATTN_REL_B, + LLM_TENSOR_ENC_FFN_NORM, + LLM_TENSOR_ENC_FFN_GATE, + LLM_TENSOR_ENC_FFN_DOWN, + LLM_TENSOR_ENC_FFN_UP, + }; + case LLM_ARCH_JAIS: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + }; + case LLM_ARCH_NEMOTRON_H: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_NEMOTRON_H_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + // mamba(2) ssm layers + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + // 
attention layers + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + // dense FFN + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + // MoE FFN (for MoE layers) + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_EXP_PROBS_B, + // MoE shared expert layer + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case LLM_ARCH_EXAONE4: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_POST_NORM, + }; + case LLM_ARCH_RWKV6: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_TIME_MIX_W1, + LLM_TENSOR_TIME_MIX_W2, + LLM_TENSOR_TIME_MIX_LERP_X, + LLM_TENSOR_TIME_MIX_LERP_W, + LLM_TENSOR_TIME_MIX_LERP_K, + LLM_TENSOR_TIME_MIX_LERP_V, + LLM_TENSOR_TIME_MIX_LERP_R, + LLM_TENSOR_TIME_MIX_LERP_G, + LLM_TENSOR_TIME_MIX_LERP_FUSED, + LLM_TENSOR_TIME_MIX_FIRST, + LLM_TENSOR_TIME_MIX_DECAY, + LLM_TENSOR_TIME_MIX_DECAY_W1, + LLM_TENSOR_TIME_MIX_DECAY_W2, + LLM_TENSOR_TIME_MIX_KEY, + LLM_TENSOR_TIME_MIX_VALUE, + LLM_TENSOR_TIME_MIX_RECEPTANCE, + LLM_TENSOR_TIME_MIX_GATE, + LLM_TENSOR_TIME_MIX_LN, + LLM_TENSOR_TIME_MIX_OUTPUT, + LLM_TENSOR_CHANNEL_MIX_LERP_K, + LLM_TENSOR_CHANNEL_MIX_LERP_R, + LLM_TENSOR_CHANNEL_MIX_KEY, + LLM_TENSOR_CHANNEL_MIX_VALUE, + LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, + }; + case LLM_ARCH_RWKV6QWEN2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_TIME_MIX_W1, + LLM_TENSOR_TIME_MIX_W2, + LLM_TENSOR_TIME_MIX_LERP_X, + LLM_TENSOR_TIME_MIX_LERP_FUSED, + LLM_TENSOR_TIME_MIX_FIRST, + 
LLM_TENSOR_TIME_MIX_DECAY, + LLM_TENSOR_TIME_MIX_DECAY_W1, + LLM_TENSOR_TIME_MIX_DECAY_W2, + LLM_TENSOR_TIME_MIX_KEY, + LLM_TENSOR_TIME_MIX_VALUE, + LLM_TENSOR_TIME_MIX_RECEPTANCE, + LLM_TENSOR_TIME_MIX_GATE, + LLM_TENSOR_TIME_MIX_OUTPUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_RWKV7: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_TIME_MIX_W0, + LLM_TENSOR_TIME_MIX_W1, + LLM_TENSOR_TIME_MIX_W2, + LLM_TENSOR_TIME_MIX_A0, + LLM_TENSOR_TIME_MIX_A1, + LLM_TENSOR_TIME_MIX_A2, + LLM_TENSOR_TIME_MIX_V0, + LLM_TENSOR_TIME_MIX_V1, + LLM_TENSOR_TIME_MIX_V2, + LLM_TENSOR_TIME_MIX_G1, + LLM_TENSOR_TIME_MIX_G2, + LLM_TENSOR_TIME_MIX_K_K, + LLM_TENSOR_TIME_MIX_K_A, + LLM_TENSOR_TIME_MIX_R_K, + LLM_TENSOR_TIME_MIX_LERP_FUSED, + LLM_TENSOR_TIME_MIX_KEY, + LLM_TENSOR_TIME_MIX_VALUE, + LLM_TENSOR_TIME_MIX_RECEPTANCE, + LLM_TENSOR_TIME_MIX_LN, + LLM_TENSOR_TIME_MIX_OUTPUT, + LLM_TENSOR_CHANNEL_MIX_LERP_K, + LLM_TENSOR_CHANNEL_MIX_KEY, + LLM_TENSOR_CHANNEL_MIX_VALUE, + }; + case LLM_ARCH_ARWKV7: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_TIME_MIX_W0, + LLM_TENSOR_TIME_MIX_W1, + LLM_TENSOR_TIME_MIX_W2, + LLM_TENSOR_TIME_MIX_A0, + LLM_TENSOR_TIME_MIX_A1, + LLM_TENSOR_TIME_MIX_A2, + LLM_TENSOR_TIME_MIX_V0, + LLM_TENSOR_TIME_MIX_V1, + LLM_TENSOR_TIME_MIX_V2, + LLM_TENSOR_TIME_MIX_G1, + LLM_TENSOR_TIME_MIX_G2, + LLM_TENSOR_TIME_MIX_K_K, + LLM_TENSOR_TIME_MIX_K_A, + LLM_TENSOR_TIME_MIX_R_K, + LLM_TENSOR_TIME_MIX_LERP_FUSED, + LLM_TENSOR_TIME_MIX_KEY, + LLM_TENSOR_TIME_MIX_VALUE, + LLM_TENSOR_TIME_MIX_RECEPTANCE, + LLM_TENSOR_TIME_MIX_LN, + LLM_TENSOR_TIME_MIX_OUTPUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_GRANITE_MOE: 
+ return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case LLM_ARCH_GRANITE_HYBRID: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case LLM_ARCH_WAVTOKENIZER_DEC: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_CONV1D, + LLM_TENSOR_CONVNEXT_DW, + LLM_TENSOR_CONVNEXT_NORM, + LLM_TENSOR_CONVNEXT_PW1, + LLM_TENSOR_CONVNEXT_PW2, + LLM_TENSOR_CONVNEXT_GAMMA, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_POS_NET_CONV1, + LLM_TENSOR_POS_NET_CONV2, + LLM_TENSOR_POS_NET_NORM, + LLM_TENSOR_POS_NET_NORM1, + LLM_TENSOR_POS_NET_NORM2, + LLM_TENSOR_POS_NET_ATTN_NORM, + LLM_TENSOR_POS_NET_ATTN_Q, + LLM_TENSOR_POS_NET_ATTN_K, + LLM_TENSOR_POS_NET_ATTN_V, + LLM_TENSOR_POS_NET_ATTN_OUT, + }; + case LLM_ARCH_BAILINGMOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, 
+ LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + }; + case LLM_ARCH_BAILINGMOE2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_EXP_PROBS_B, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_NEXTN_EH_PROJ, + LLM_TENSOR_NEXTN_EMBED_TOKENS, + LLM_TENSOR_NEXTN_ENORM, + LLM_TENSOR_NEXTN_HNORM, + LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, + LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_LAYER_OUT_NORM, + }; + case LLM_ARCH_DOTS1: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; + case LLM_ARCH_ERNIE4_5_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + 
LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; + case LLM_ARCH_HUNYUAN_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_OPENAI_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_SINKS, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_LFM2: + return { + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_SHORTCONV_CONV, + LLM_TENSOR_SHORTCONV_INPROJ, + LLM_TENSOR_SHORTCONV_OUTPROJ, + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM_LFM2, + LLM_TENSOR_OUTPUT, + }; + case LLM_ARCH_LFM2MOE: + return { + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_SHORTCONV_CONV, + LLM_TENSOR_SHORTCONV_INPROJ, + LLM_TENSOR_SHORTCONV_OUTPROJ, + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, 
+ LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; + case LLM_ARCH_SMALLTHINKER: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + }; + case LLM_ARCH_APERTUS: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_SEED_OSS: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; + case LLM_ARCH_GROVEMOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_CHEXPS, + LLM_TENSOR_FFN_DOWN_CHEXPS, + LLM_TENSOR_FFN_UP_CHEXPS, + }; + case LLM_ARCH_MINIMAX_M2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE_INP, + 
LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; + case LLM_ARCH_COGVLM: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_VISEXP_ATTN_QKV, + LLM_TENSOR_VISEXP_ATTN_OUT, + LLM_TENSOR_VISEXP_FFN_GATE, + LLM_TENSOR_VISEXP_FFN_DOWN, + LLM_TENSOR_VISEXP_FFN_UP, + }; + case LLM_ARCH_GPTJ: + case LLM_ARCH_UNKNOWN: + return { + LLM_TENSOR_TOKEN_EMBD, + }; + default: + GGML_ABORT("unknown architecture for tensor mapping"); + } +} + // declare information about the model weight tensors: // - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight // - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator @@ -2603,6 +2202,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output {LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_OUTPUT_NORM_LFM2, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_ROPE_FREQS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}}, @@ -2791,13 +2391,20 @@ std::string LLM_KV::operator()(llm_kv kv) const { return name; } +LLM_TN_IMPL::LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid) + : arch(arch), tensor(tensor), suffix(suffix), bid(bid), xid(xid), + model_tensors(llm_get_tensor_names(arch)) {} + std::string LLM_TN_IMPL::str() const { - if (LLM_TENSOR_NAMES.at(arch).find(tensor) 
== LLM_TENSOR_NAMES.at(arch).end()) { - return "__missing__"; + if (LLM_TENSOR_NAMES.find(tensor) == LLM_TENSOR_NAMES.end()) { + GGML_ABORT("unknown tensor name for tensor id %d", static_cast<int>(tensor)); } - std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid); + if (model_tensors.find(tensor) == model_tensors.end()) { + return LLM_TENSOR_NAMES.at(tensor); + } + std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid); if (suffix != nullptr) { name += "."; name += suffix; diff --git a/src/llama-arch.h b/src/llama-arch.h index 455658f5dc..6cbf9b1f89 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -3,6 +3,7 @@ #include "ggml.h" // ggml_op #include <string> +#include <set> // // gguf constants (sync with gguf.py) @@ -316,6 +317,7 @@ enum llm_tensor { LLM_TENSOR_DENSE_3_OUT, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ROPE_FACTORS_LONG, LLM_TENSOR_ROPE_FACTORS_SHORT, @@ -526,6 +528,10 @@ struct LLM_TN_IMPL { const int bid; const int xid; + const std::set<llm_tensor> model_tensors; + + LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid); + std::string str() const; operator std::string() const { @@ -547,11 +553,11 @@ struct LLM_TN { llm_arch arch; LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const { - return { arch, tensor, suffix, bid, xid }; + return LLM_TN_IMPL(arch, tensor, suffix, bid, xid); } LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const { - return { arch, tensor, nullptr, bid, xid }; + return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid); } }; From 79dbae034afdfaa8e17989ea1b9b20094c8d0a36 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Dec 2025 13:25:09 +0100 Subject: [PATCH 12/44] model-conversion : remove -fa option in model card template [no ci] (#18088) This commit updates the causal model card template and removes the -fa option as it is
no longer required (fa is auto detected). --- examples/model-conversion/scripts/causal/modelcard.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model-conversion/scripts/causal/modelcard.template b/examples/model-conversion/scripts/causal/modelcard.template index 87800a1b93..cfa8e6b433 100644 --- a/examples/model-conversion/scripts/causal/modelcard.template +++ b/examples/model-conversion/scripts/causal/modelcard.template @@ -7,7 +7,7 @@ base_model: Recommended way to run this model: ```sh -llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa +llama-server -hf {namespace}/{model_name}-GGUF -c 0 ``` Then, access http://localhost:8080 From 59977eba7b0a3603d0017717d3beec7bde018f3c Mon Sep 17 00:00:00 2001 From: yifant-code Date: Tue, 16 Dec 2025 07:27:36 -0500 Subject: [PATCH 13/44] server: fix crash when batch > ubatch with embeddings (#17912) * server: fix crash when batch > ubatch with embeddings (#12836) Fixes #12836 where the server crashes with GGML_ASSERT failure when running with embeddings enabled and n_batch > n_ubatch. Root cause: Embeddings use non-causal attention which requires all tokens to be processed within a single ubatch. When n_batch > n_ubatch, the server attempts to split processing, causing assertion failure. Solution: - Add parameter validation in main() after common_params_parse() - When embeddings enabled and n_batch > n_ubatch: * Log warnings explaining the issue * Automatically set n_batch = n_ubatch * Prevent server crash This follows the approach suggested by @ggerganov in issue #12836. Note: This supersedes stalled PR #12940 which attempted a runtime fix in the old examples/server/server.cpp location. This implementation validates at startup in tools/server/server.cpp (current location). 
Testing: - Build: Compiles successfully - Validation triggers: Warns when -b > -ub with --embedding - Auto-correction works: Adjusts n_batch = n_ubatch - No false positives: Valid params don't trigger warnings - Verified on macOS M3 Pro with embedding model * Update tools/server/server.cpp --------- Co-authored-by: ytian218 Co-authored-by: Georgi Gerganov --- tools/server/server.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 235ae4e8c0..8538427f73 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -73,8 +73,18 @@ int main(int argc, char ** argv, char ** envp) { return 1; } + // validate batch size for embeddings + // embeddings require all tokens to be processed in a single ubatch + // see https://github.com/ggml-org/llama.cpp/issues/12836 + if (params.embedding && params.n_batch > params.n_ubatch) { + LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch); + LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch); + params.n_batch = params.n_ubatch; + } + if (params.n_parallel < 0) { LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__); + params.n_parallel = 4; params.kv_unified = true; } From ec98e20021f7611db3bbcf6bb6629fed6e1ce4f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 16 Dec 2025 14:24:00 +0100 Subject: [PATCH 14/44] llama: fix early stop in params_fit if ctx is set (#18070) --- src/llama.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7ed34b80ae..f69964b6d5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -241,6 +241,13 @@ static void llama_params_fit_impl( global_surplus += memory_reduction; LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in 
total\n", __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); + if (global_surplus >= 0) { + if (nd == 1) { + LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); + return; + } + LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__); + } } else { LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", __func__, hp_nct, n_ctx_min); @@ -249,10 +256,6 @@ static void llama_params_fit_impl( LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); } } - if (global_surplus >= 0) { - LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__); - return; - } } if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { From ef83fb8601229ff650d952985be47e82d644bfaa Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 16 Dec 2025 19:07:43 +0100 Subject: [PATCH 15/44] model: fix LFM2 missing tensors (#18105) --- src/llama-model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ae8207ee1a..c9a3c5dfa2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6236,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); if (output == NULL) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); From 4164596c76cb4f62322fd1ff9d157cee5ba97b23 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 17 Dec 2025 00:03:19 +0100 Subject: [PATCH 16/44] llama-fit-params: QoL impr. for prints/errors (#18089) --- tools/fit-params/fit-params.cpp | 36 ++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp index fbf7a2eb37..2c113c453e 100644 --- a/tools/fit-params/fit-params.cpp +++ b/tools/fit-params/fit-params.cpp @@ -4,7 +4,11 @@ #include "common.h" #include "log.h" -#include +#include +#include +#include + +using namespace std::chrono_literals; #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -22,13 +26,17 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); auto mparams = common_model_params_to_llama(params); auto cparams = common_context_params_to_llama(params); - llama_params_fit(params.model.path.c_str(), &mparams, &cparams, + const bool success = llama_params_fit(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx, params.verbosity >= 4 ? 
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + if (!success) { + LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__); + exit(1); + } - LOG_INF("Printing fitted CLI arguments to stdout...\n"); - std::cout << "-c " << cparams.n_ctx; - std::cout << " -ngl " << mparams.n_gpu_layers; + LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__); + std::this_thread::sleep_for(10ms); // to avoid a race between stderr and stdout + printf("-c %" PRIu32 " -ngl %" PRIu32, cparams.n_ctx, mparams.n_gpu_layers); size_t nd = llama_max_devices(); while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) { @@ -37,26 +45,22 @@ int main(int argc, char ** argv) { if (nd > 1) { for (size_t id = 0; id < nd; id++) { if (id == 0) { - std::cout << " -ts "; + printf(" -ts "); } - if (id > 0) { - std::cout << ","; - } - std::cout << mparams.tensor_split[id]; + printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id])); } } const size_t ntbo = llama_max_tensor_buft_overrides(); + bool any_tbo = false; for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) { if (itbo == 0) { - std::cout << " -ot "; + printf(" -ot \""); } - if (itbo > 0) { - std::cout << ","; - } - std::cout << mparams.tensor_buft_overrides[itbo].pattern << "=" << ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft); + printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft)); + any_tbo = true; } - std::cout << "\n"; + printf("%s\n", any_tbo ? 
"\"" : ""); return 0; } From 0e49a7b8b47aaa8995953ee7e3c0b5c39ead19fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 17 Dec 2025 00:47:37 +0100 Subject: [PATCH 17/44] llama-fit-params: fix underflow for dense models (#18095) --- src/llama.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f69964b6d5..1d609bbe01 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -481,8 +481,13 @@ static void llama_params_fit_impl( } else { LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); } - uint32_t n_unassigned = hp_ngl; for (int id = nd - 1; id >= 0; id--) { + uint32_t n_unassigned = hp_ngl; + for (size_t jd = id + 1; jd < nd; ++jd) { + assert(n_unassigned >= ngl_per_device[jd].n_layer); + n_unassigned -= ngl_per_device[jd].n_layer; + } + std::vector ngl_per_device_high = ngl_per_device; ngl_per_device_high[id].n_layer = n_unassigned; if (hp_nex > 0) { @@ -491,7 +496,9 @@ static void llama_params_fit_impl( if (ngl_per_device_high[id].n_layer > 0) { std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); if (mem_high[id] > targets[id]) { + assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; + LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta); while (delta > 1) { uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); step_size = std::max(step_size, uint32_t(1)); @@ -505,20 +512,19 @@ static void llama_params_fit_impl( const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); if (mem_test[id] <= targets[id]) { - ngl_per_device = ngl_per_device_test; - mem = mem_test; - n_unassigned -= ngl_per_device[id].n_layer; + ngl_per_device = ngl_per_device_test; + mem = mem_test; 
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); } else { ngl_per_device_high = ngl_per_device_test; mem_high = mem_test; - LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); + LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); } delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; } } else { - ngl_per_device = ngl_per_device_high; - n_unassigned -= ngl_per_device[id].n_layer; + assert(ngl_per_device_high[id].n_layer == n_unassigned); + ngl_per_device = ngl_per_device_high; LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); } } From 9dcac6cf9f1972ec0f324ea2d28672993d31fac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 17 Dec 2025 00:49:34 +0100 Subject: [PATCH 18/44] llama-fit-params: lower ctx size for multi GPU (#18101) --- src/llama.cpp | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 1d609bbe01..85f2ee729c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -180,11 +180,12 @@ static void llama_params_fit_impl( } } - int64_t sum_total = 0; - int64_t sum_projected_free = 0; - int64_t min_projected_free = INT64_MAX; - int64_t sum_projected_used = 0; - int64_t sum_projected_ctx = 0; + int64_t sum_total = 0; + int64_t sum_projected_free = 0; + int64_t min_projected_free = INT64_MAX; + int64_t sum_projected_used = 0; + int64_t sum_projected_model = 0; + int64_t sum_projected_ctx = 0; if (nd > 1) { LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); @@ -195,11 +196,12 @@ static void llama_params_fit_impl( const int64_t projected_used = dmd.mb.total(); const int64_t projected_free = dmd.free - projected_used; - sum_total += dmd.total; - 
sum_projected_used += projected_used; - sum_projected_free += projected_free; - min_projected_free = std::min(min_projected_free, projected_free); - sum_projected_ctx += dmd.mb.context; + sum_total += dmd.total; + sum_projected_used += projected_used; + sum_projected_free += projected_free; + min_projected_free = std::min(min_projected_free, projected_free); + sum_projected_model += dmd.mb.model; + sum_projected_ctx += dmd.mb.context; if (nd > 1) { LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n", @@ -234,10 +236,24 @@ static void llama_params_fit_impl( if (cparams->n_ctx == 0) { if (hp_nct > n_ctx_min) { const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct; - const uint32_t ctx_reduction = std::min( - uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min); + + int64_t memory_reduction = -global_surplus; + if (nd > 1) { + // for multiple devices we need to be more conservative in terms of how much context we think can fit: + // - for dense models only whole layers can be assigned to devices + // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer + // - on average we expect a waste of 0.5 layers/tensors per device + // - use slightly more than the expected average for nd devices to be safe + const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl); + memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 
2 : 6); + } + + uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min); cparams->n_ctx = hp_nct - ctx_reduction; - const int64_t memory_reduction = ctx_reduction * bytes_per_ctx; + cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend + + ctx_reduction = hp_nct - cparams->n_ctx; + memory_reduction = ctx_reduction * bytes_per_ctx; global_surplus += memory_reduction; LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); From d0794e89d9444ec970650834007bdf5c8fd4a05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 17 Dec 2025 00:50:12 +0100 Subject: [PATCH 19/44] llama-fit-params: force disable mlock (#18103) --- src/llama.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 85f2ee729c..c8b5febe70 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -71,8 +71,9 @@ static std::vector llama_get_device_memory_data( }, &ud); llama_model_params mparams_copy = *mparams; - mparams_copy.no_alloc = true; - mparams_copy.use_mmap = false; + mparams_copy.no_alloc = true; + mparams_copy.use_mmap = false; + mparams_copy.use_mlock = false; llama_model * model = llama_model_load_from_file(path_model, mparams_copy); if (model == nullptr) { From 2973a65ecb6c884ca609de6eb5f1b6dc08631aaf Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Tue, 16 Dec 2025 20:27:03 -0500 Subject: [PATCH 20/44] gguf-py : allow converting multi-tensor models from read-only locations (#18100) --- gguf-py/gguf/utility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index c9401a1c0a..7907e706d5 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -288,7 +288,7 @@ class LocalTensor: 
data_range: LocalTensorRange def mmap_bytes(self) -> np.ndarray: - return np.memmap(self.data_range.filename, offset=self.data_range.offset, shape=self.data_range.size) + return np.memmap(self.data_range.filename, mode='r', offset=self.data_range.offset, shape=self.data_range.size) class SafetensorsLocal: From 58062860afb88e555857c1266d3a17e1b65b5eb9 Mon Sep 17 00:00:00 2001 From: Aadeshveer Singh Date: Wed, 17 Dec 2025 09:17:01 +0530 Subject: [PATCH 21/44] ggml : use WARP_SIZE/2 for argmax reduction offset (#18092) --- ggml/src/ggml-cuda/argmax.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index 5340eedc08..51967c667c 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -21,7 +21,7 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest } #pragma unroll - for (int offset = 16; offset > 0; offset >>= 1) { + for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) { const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); if (val > maxval) { @@ -50,7 +50,7 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest argmax = shared_argmax[lane_id]; } #pragma unroll - for (int offset = 16; offset > 0; offset >>= 1) { + for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) { const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); if (val > maxval) { From 4b2a4778f81f222c12271ce3b1997990b3071faf Mon Sep 17 00:00:00 2001 From: TrevorS Date: Tue, 16 Dec 2025 22:33:02 -0800 Subject: [PATCH 22/44] arg: allow -kvu flag for llama-perplexity (#18117) The -kvu (--kv-unified) flag is required for hellaswag and winogrande benchmarks which use coupled sequences. 
Without unified KV cache, these benchmarks fail with: split_equal: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag) This change adds LLAMA_EXAMPLE_PERPLEXITY to the allowed examples for the -kvu argument, enabling its use with llama-perplexity. --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index f2aec895ba..a4ffd5586c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1140,7 +1140,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.kv_unified = true; } - ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, From 5c0d18881e0e9794c96b2602736b758bac9d9388 Mon Sep 17 00:00:00 2001 From: Naco Siren Date: Wed, 17 Dec 2025 00:14:47 -0800 Subject: [PATCH 23/44] llama.android : Rewrite Android binding (w/o cpu_features dep) (#17413) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * UI: implement basic UI components * util: implement performance monitor; wrap it with a viewmodel * util: implement user preferences utility * UI: implement core flow's screens * UI: add a new MainActivity; update manifest * [WIP] DI: implement simple local vm factory provider * UI: disable triggering drawer via gesture; enable alert dialog on back navigation inside conversation and benchmark * UI: allow drawer's gesture control only on Home and Settings screens; enable alert dialog on back navigation inside conversation and benchmark * UI: split a nested parent settings screen into separate child settings screens * UI: polish system prompt setup UI * Deps: bump Kotlin plugin; introduce KSP; apply in :app subproject * DB: setup Room database * data: introduce 
repo for System Prompt; flow data from Room to VM * bugfix: properly handle user's quitting conversation screen while tokens in generation * UI: rename `ModeSelection` to `ModelLoading` for better clarity * UI: update app name to be more Arm * UI: polish conversation screen * data: code polish * UI: code polish * bugfix: handle user quitting on model loading * UI: locks user in alert dialog when model is unloading * vm: replace token metrics stubs with actual implementation * UI: refactor top app bars * nit: combine temperatureMetrics and useFahrenheit * DI: introduce Hilt plugin + processor + lib dependencies * DI: make app Hilt injectable * DI: make viewmodels Hilt injectable * DI: replace manual DI with Hilt DI * UI: optimize AppContent's composing * bugfix: wait for model to load before navigating to benchmark screen; use NavigationActions instead of raw navController * UI: navigation with more natural animated transitions * DI: Optimize AppModule * Feature: Introduce ModelRepository and ModelsManagementViewModel; update AppModule * UI: polish UI for ModelsManagementScreen; inject ModelsManagementViewModel * DI: abstract the protocol of SystemPromptRepository; update AppModule * data: [WIP] prepare for ModelRepository refactor & impl * data: introduce Model entity and DAO; update DI module * UI: replace Models Management screen's stubbing with instrumentation * UI: polish sort order menu * data: import local model with file picker * bugfix: use List instead of Collection for ModelDao's deletion * data: add a util file for extracting file name & size and model metadata * UI: enrich ModelManagementState; extract filename to show correct importing UI * UI: implement multiple models deletion; update Models Management screen * UI: handle back navigation when user is in multi-selection mode * util: extract file size formatting into ModelUtils * UI: add a confirmation step when user picks a file; refactor model import overlay into AlertDialog * UI: extract a shared
ModelCard component * UI: replace model selection screen's data stubbing; add empty view * nit: tidy SystemPromptViewModel * Util: split FileUtils from ModelUtils; extract copy methods into FileUtils * data: pass through getModelById from ModelDao into ModelRepository * core: extract conversation and benchmark logics into InferenceManager; add logs and missing state updates in stub InferenceEngine * vm: split mono MainViewModel into separate individual ViewModels * vm: merge SystemPromptViewModel into ModelLoadingViewModel * core: break down InferenceManager due to Interface Segregation Principle * UI: show model card in Model Loading screen * UI: show model card in Conversation screen * UI: unify Model Card components * core: swap in LLamaAndroid and mark stub engine for testing only * data: allow canceling the ongoing model import * UI: update UI ongoing model import's cancellation * LLama: update engine state after handling the cancellation of sendUserPrompt * VM: handle the cancellation of ongoing token generation * LLama: refactor loadModel by splitting the system prompt setting into a separate method * feature: check for available space before copying local model * UI: centralize the AppScaffold and modularize its configs * UI: refactor BottomBarConfig.ModelsManagement APIs * UI: combine TopBarConfig and BottomBarConfig into each route's ScaffoldConfig * UI: replace ugly optional as casts in AppScaffold with extension functions * UI: fix the typo `totalGb` in `StorageMetrics` * UI: remove code duplication in sort menu * LLama: add ModelUnloadingState to engine State; add missing state checks in stub engine; fix instrumentation engine's error messages * UI: refactor back handling by removing centralized BackHandlerSetup and UnloadModelConfirmationDialog from AppContent * UI: implement BenchmarkScreen's individual back handling * LLama: add a new Initializing state; add two extension properties; rename LibraryLoaded state to Initialized * UI: Introduce an
abstract ViewModel to handle additional model unloading logics * UI: expose a single facade ModelUnloadDialogHandler; move UnloadModelState into ModelUnloadingViewModel.kt * UI: migrate ModelLoadingScreen onto ModelLoadingViewModel; update & refine ModelLoadingScreen * UI: migrate ConversationViewModel onto ModelLoadingViewModel; update & refine ConversationScreen * nit: extract app name into a constant value; remove unused onBackPressed callbacks * UI: update AppContent to pass in correct navigation callbacks * nit: polish ModelLoadingScreen UI * core: throw Exception instead of returning null if model fails to load * navigation: sink model loading state management from AppContent down into ModelLoadingScreen; pass ModelLoadingMetrics to Benchmark and Conversation screens * gguf: add GGUF metadata data holder and its corresponding extractor implementation * DB: introduce Kotlin serialization extension's library and plugin; add Room runtime library * GGUF: make GgufMetadata serializable in order to be compatible with Room * nit: refactor data.local package structure * nit: rename lastUsed field to dateLastUsed; add dateAdded field * UI: refactor ModelCard UI to show GGUF metadata * UI: update ModelSelectionScreen with a preselect mechanism * UI: polish model card * nit: allow deselect model on Model Selection screen * nit: revert accidental committing of debug code * UI: polish ModelLoading screen * util: extract formatting helper functions from FileUtils into a new FormatUtils * UI: polish model cards on Benchmark and Conversation screens to show model loading metrics * UI: show a Snack bar to warn user that system prompt is not always supported * UI: handle back press on Model Selection screen * UI: finally support theme modes; remove hardcoded color schemes, default to dynamic color scheme implementation * feature: support searching on Model Selection screen * nit: move scaffold related UI components into a separate package * UI: extract InfoView out into a 
separate file for reusability * data: move Model related actions (query, filter, sort) into ModelInfo file * UI: animate FAB on model preselection states * feature: support filtering in Model Management screen * ui: show empty models info in Model Management screen * ui: add filter off icon to "Clear filters" menu item * [WIP] ui: polish Benchmark screen; implement its bottom app bar * ui: polish Benchmark screen; implement its bottom app bar's rerun and share * nit: disable mode selection's radio buttons when loading model * feature: implement Conversation screen's bottom app bar * pkg: restructure BottomAppBars into separate files in a child package * pkg: restructure TopBarApps into separate files in a child package * pkg: restructure system metrics into a separate file * UI: polish Conversation screen * data: update system prompt presets * UI: allow hide or show model card on Conversation & Benchmark screens; fix message arrangement * data: update & enhance system prompt presets * deps: introduce Retrofit2 * data: implement HuggingFace data model, data source with Retrofit API * data: update Model data repository to support fetching HuggingFace models * [WIP] UI: replace the HuggingFace stub in Model Management screen with actual API call * UI: map language codes into country Emojis * ui: add "clear results" action to Benchmark screen * nit: print current pp & tg in llama-bench * UI: disable landscape mode; prevent duplicated benchmark running * llama: migrate C/CXX flags into CMakeList * [WIP] llama: ABI split builds five .so artifacts. However, all .so are performing on SVE level * [WIP] llama: ABI split where five tiers are built sequentially. * [WIP] llama: disable OpenMP in ABI split since most SoCs are big.LITTLE * [WIP] llama: enable KleidiAI and disable tier 4 due to `+sve+sve2` bug caused by `ggml_add_cpu_backend_variant_impl` as explained below ```CMake if (NOT SME_ENABLED MATCHES -1) ... 
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2") ... ``` * core: add Google's cpu_features as a submodule * core: implement cpu_detector native lib * core: swap out hardcoded LlamaAndroid library loading * core: add back OpenMP due to huge perf loss on TG128 * misc: reorg the pkg structure * misc: rename LlamaAndroid related class to InferenceEngine prefixes * [WIP] lib: move GgufMetadata into the lib submodule * lib: expose GgufMetadataReader as interface only * lib: replace the naive & plain SharedPreferences with DataStore implementation * lib: hide the internal implementations, only expose a facade and interfaces * lib: expose Arm features * di: add a stub TierDetection; provide both actual impl and stub in AppModule * UI: add visualizer UI for Arm features * misc: UI polish * lib: refactored InferenceEngineLoader; added a `NONE` Llama Tier * UI: support `NONE` Llama Tier in general settings * lib: optimize engine loader; always perform a fresh detection when cache is null * remote: add HuggingFaceModelDetails data class * remote: refine HuggingFaceModel data class * nit: remove `trendingScore` field from HuggingFace model entities, weird... * remote: refactor HuggingFaceApiService; implement download feature in HuggingFaceRemoteDataSource * remote: fix the incorrect parse of HuggingFace's inconsistent & weird JSON response * UI: scaffold Models Management screen and view model * UI: implement a dialog UI to show fetched HuggingFace models. * UI: use a broadcast receiver to listen for download complete events and show local import dialog. * data: handle network exceptions elegantly * pkg: restructure `data`'s packages * data: extract local file info, copy and cleanup logics into LocalFileDataSource * nit: minor UI patch; add missing comments * bugfix: tapping "Home" in navigation drawer should simply close it without any navigation action. 
* UI: improve autoscroll during token generation * lib: tested on JFrog Artifactory for Maven publishing * UI: show RAM warning if model too large * UI: polish model management screen's error dialog * util: add more items into the mapping table of ISO 639-1 language code to ISO 3166-1 country code * llm: properly propagate error to UI upon failing to load selected model * UI: avoid duplicated calculation of token metrics * lib: read & validate the magic number from the picked source file before executing the import * UI: add "Learn More" hyperlinks to Error dialog upon model import failures * lib: refactor the GgufMetadataReader to take InputStream instead of absolute path as argument * lib: fix the `SIMD` typo in Tier description * core: verify model file path is readable * lib: add UnsupportedArchitectureException for triaged error message * util: split FormatUtils into multiple utils for better readability * UI: change benchmark screen from raw markdown to table view * bugfix: reset preselection upon running the preselected model * misc: linter issue * bugfix: fix the malfunctioning monitoring switch * UI: update Arm features indicator; fix the broken hyperlinks * UI: add quick action buttons to benchmark screen's result card * UI: hide share fab after clearing all benchmark results * UI: fix the model unload dialog message; elevate the model card and hide it by default on Conversation screen; * UI: hide the stubbing actions in Conversation screen * UI: add show/hide stats control to conversation screen's assistant message bubble; fix placeholder * UI: add a info button to explain token metrics * misc: remove the redundant `Companion` added due to refactoring * UI: show corresponding system metrics detailed info upon tapping RAM / storage / temperature indicator * UI: add info button to System Prompt switch; expand the model card by default * UI: disable tag & language chips; add section headers to explain what they are * misc: replace top bar indicator's spacer 
with padding * UI: merge the Model Selection and Model Management into a unified Models screen * UI: split the ModelsManagementViewModel from a unified ModelsViewModel due to huge complexity * UI: add model loading in progress view; polish the empty model info view * UI: polish the bottom bars and info view when no models found; show loading in progress while fetching models * build: [BREAKING] bump the versions of libraries and plugins * UI: fix the breaking build * UI: add Tooltip on Import FAB for user onboarding * UI: adds AppPreferences to track user onboarding status * UI: tracks user's first success on importing a model * data: add hand crafted rules to filter the models fetched from HuggingFace API * UI: update app name & about; polish top bars' indicators & buttons * UI: polish Hugging Face download dialog UI * UX: implement onboarding tooltips for model import and onboarding * misc: use sentence case for CTA button labels * [WIP] UI: add Arm color palette from Philip.Watson3 * UI: address Rojin's UX feedbacks * UI: address Rojin's UX feedbacks - part 2 * UI: update Arm color palette from Philip.Watson3 * data: make sure fetch preselected models in the same order of their IDs * UI: fix UI issues in the generic settings screen and navigation drawer * nit: address Rojin's feedbacks on model import message again * nit: append `®` to all `Arm` labels * UI: extract a reusable InfoAlertDialog * core: support GGML_CPU_ALL_VARIANTS on Android! 
* core: restructure Kleidi-Llama library * core: organizing cmake arguments * data: sort preselected models according to device's available RAM * app: update adaptive + themed + legacy icons and app name * UI: fix the font size auto scaling for ArmFeaturesVisualizer * core: further improve the performance on native methods * UI: minor color palette changes; emphasize the bottom bar FABs; fix Settings Screen menu item label * UI: make more room for assistant message bubble's width * UI: better usage of tertiary colors to highlight model cards but not for warnings * UI: fix the layout issue on large font sizes * lib: support x86-64 by dynamically setting Arm related definitions * lib: replace the factory pattern for deprecated tiered lib loading with single instance pattern * llama: update the library name in JNI and CMake project * llama: update the library's package name and namespace * llama: update the app's package name and namespace * app: bump ksp version * app: remove deprecated SystemUIController from accompanist by migrating to EdgeToEdge * app: extract AppContent from MainActivity to a separate file in ui package * lib: add File version for GGUF Magic number verification * lib: perform engine state check inclusively instead of exclusively * lib: change `LlamaTier` to `ArmCpuTier` * lib: remove kleidi-llama related namings * cleanup: remove Arm AI Chat/Playground app source code; replace with the basic sample app from https://github.com/hanyin-arm/Arm-AI-Chat-Sample Note: the full Google Play version of AI Chat app will be open sourced in another repo soon, therefore didn't go through the trouble of pruning the history using `git filter-repo` here.
* [WIP] doc: update main and Android README docs; add self to code owners * lib: revert System.load back to System.loadLibrary * jni: introduce a logging util to filter different logging levels on different build types * lib: enable app optimization * doc: replace stub Google Play app URL with the actual link add screenshots; add my GitHub ID to maintainer list * Remove cpu_features * Fix linters issues in editorconfig-checker job https://github.com/ggml-org/llama.cpp/actions/runs/19548770247/job/55974800633?pr=17413 * Remove unnecessary Android CMake flag * purge include/cpu_features directory --------- Co-authored-by: Han Yin --- CODEOWNERS | 2 +- README.md | 1 + docs/android.md | 20 + examples/llama.android/app/build.gradle.kts | 52 +- examples/llama.android/app/proguard-rules.pro | 8 + .../app/src/main/AndroidManifest.xml | 13 +- .../java/com/example/llama/Downloadable.kt | 119 ---- .../java/com/example/llama/MainActivity.kt | 359 +++++++---- .../java/com/example/llama/MainViewModel.kt | 105 ---- .../java/com/example/llama/MessageAdapter.kt | 51 ++ .../java/com/example/llama/ui/theme/Color.kt | 11 - .../java/com/example/llama/ui/theme/Theme.kt | 70 --- .../java/com/example/llama/ui/theme/Type.kt | 34 - .../res/drawable/bg_assistant_message.xml | 4 + .../src/main/res/drawable/bg_user_message.xml | 4 + .../res/drawable/outline_folder_open_24.xml | 10 + .../src/main/res/drawable/outline_send_24.xml | 11 + .../app/src/main/res/layout/activity_main.xml | 76 +++ .../res/layout/item_message_assistant.xml | 15 + .../src/main/res/layout/item_message_user.xml | 15 + .../app/src/main/res/values/strings.xml | 2 +- .../app/src/main/res/values/themes.xml | 7 +- examples/llama.android/build.gradle.kts | 6 +- examples/llama.android/gradle.properties | 1 + .../llama.android/gradle/libs.versions.toml | 53 ++ .../gradle/wrapper/gradle-wrapper.properties | 4 +- .../llama.android/{llama => lib}/.gitignore | 0 examples/llama.android/lib/build.gradle.kts | 78 +++ 
examples/llama.android/lib/consumer-rules.pro | 8 + .../{llama => lib}/proguard-rules.pro | 0 .../llama/cpp/ExampleInstrumentedTest.kt | 0 .../src/main/AndroidManifest.xml | 0 .../lib/src/main/cpp/CMakeLists.txt | 56 ++ .../lib/src/main/cpp/ai_chat.cpp | 565 +++++++++++++++++ .../llama.android/lib/src/main/cpp/logging.h | 61 ++ .../src/main/java/com/arm/aichat/AiChat.kt | 14 + .../java/com/arm/aichat/InferenceEngine.kt | 89 +++ .../main/java/com/arm/aichat/gguf/FileType.kt | 61 ++ .../java/com/arm/aichat/gguf/GgufMetadata.kt | 132 ++++ .../com/arm/aichat/gguf/GgufMetadataReader.kt | 77 +++ .../aichat/internal/InferenceEngineImpl.kt | 309 +++++++++ .../internal/gguf/GgufMetadataReaderImpl.kt | 590 ++++++++++++++++++ .../java/android/llama/cpp/ExampleUnitTest.kt | 0 examples/llama.android/llama/build.gradle.kts | 71 --- .../llama.android/llama/consumer-rules.pro | 0 .../llama/src/main/cpp/CMakeLists.txt | 53 -- .../llama/src/main/cpp/llama-android.cpp | 452 -------------- .../java/android/llama/cpp/LLamaAndroid.kt | 180 ------ examples/llama.android/settings.gradle.kts | 6 +- ggml/src/CMakeLists.txt | 3 + 50 files changed, 2588 insertions(+), 1270 deletions(-) delete mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt delete mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt delete mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt delete mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt delete mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt create mode 100644 examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml create mode 100644 examples/llama.android/app/src/main/res/drawable/bg_user_message.xml create mode 100644 
examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml create mode 100644 examples/llama.android/app/src/main/res/drawable/outline_send_24.xml create mode 100644 examples/llama.android/app/src/main/res/layout/activity_main.xml create mode 100644 examples/llama.android/app/src/main/res/layout/item_message_assistant.xml create mode 100644 examples/llama.android/app/src/main/res/layout/item_message_user.xml create mode 100644 examples/llama.android/gradle/libs.versions.toml rename examples/llama.android/{llama => lib}/.gitignore (100%) create mode 100644 examples/llama.android/lib/build.gradle.kts create mode 100644 examples/llama.android/lib/consumer-rules.pro rename examples/llama.android/{llama => lib}/proguard-rules.pro (100%) rename examples/llama.android/{llama => lib}/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt (100%) rename examples/llama.android/{llama => lib}/src/main/AndroidManifest.xml (100%) create mode 100644 examples/llama.android/lib/src/main/cpp/CMakeLists.txt create mode 100644 examples/llama.android/lib/src/main/cpp/ai_chat.cpp create mode 100644 examples/llama.android/lib/src/main/cpp/logging.h create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/AiChat.kt create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/InferenceEngine.kt create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/gguf/FileType.kt create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/gguf/GgufMetadata.kt create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/gguf/GgufMetadataReader.kt create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/internal/InferenceEngineImpl.kt create mode 100644 examples/llama.android/lib/src/main/java/com/arm/aichat/internal/gguf/GgufMetadataReaderImpl.kt rename examples/llama.android/{llama => lib}/src/test/java/android/llama/cpp/ExampleUnitTest.kt (100%) delete mode 100644 
examples/llama.android/llama/build.gradle.kts delete mode 100644 examples/llama.android/llama/consumer-rules.pro delete mode 100644 examples/llama.android/llama/src/main/cpp/CMakeLists.txt delete mode 100644 examples/llama.android/llama/src/main/cpp/llama-android.cpp delete mode 100644 examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt diff --git a/CODEOWNERS b/CODEOWNERS index 8a0c98c968..750096d9a1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -32,7 +32,7 @@ /examples/export-docs/ @ggerganov /examples/gen-docs/ @ggerganov /examples/gguf/ @ggerganov -/examples/llama.android/ @ggerganov +/examples/llama.android/ @ggerganov @hanyin-arm @naco-siren /examples/llama.swiftui/ @ggerganov /examples/llama.vim @ggerganov /examples/lookahead/ @ggerganov diff --git a/README.md b/README.md index 5f2076d0a3..ed956bb02e 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi) - Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma) +- Android: [llama.android](/examples/llama.android) diff --git a/docs/android.md b/docs/android.md index d2a835653f..28b966ffc7 100644 --- a/docs/android.md +++ b/docs/android.md @@ -1,6 +1,26 @@ # Android +## Build with Android Studio + +Import the `examples/llama.android` directory into Android Studio, then perform a Gradle sync and build the project. +![Project imported into Android Studio](./android/imported-into-android-studio.png) + +This Android binding supports hardware acceleration up to `SME2` for **Arm** and `AMX` for **x86-64** CPUs on Android and ChromeOS devices. +It automatically detects the host's hardware to load compatible kernels. 
As a result, it runs seamlessly on both the latest premium devices and older devices that may lack modern CPU features or have limited RAM, without requiring any manual configuration. + +A minimal Android app frontend is included to showcase the binding’s core functionalities: +1. **Parse GGUF metadata** via `GgufMetadataReader` from either a `ContentResolver` provided `Uri` or a local `File`. +2. **Obtain a `TierDetection` or `InferenceEngine`** instance through the high-level facade APIs. +3. **Send a raw user prompt** for automatic template formatting, prefill, and decoding. Then collect the generated tokens in a Kotlin `Flow`. + +For a production-ready experience that leverages advanced features such as system prompts and benchmarks, check out [Arm AI Chat](https://play.google.com/store/apps/details?id=com.arm.aichat) on Google Play. +This project is made possible through a collaborative effort by Arm's **CT-ML**, **CE-ML** and **STE** groups: + +| ![Home screen](./android/arm-ai-chat-home-screen.png) | ![System prompt](./android/system-prompt-setup.png) | !["Haiku"](./android/chat-with-system-prompt-haiku.png) | +|:------------------------------------------------------:|:----------------------------------------------------:|:--------------------------------------------------------:| +| Home screen | System prompt | "Haiku" | + ## Build on Android using Termux [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. 
diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index 8d1b37195e..3524fe39c4 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -1,16 +1,18 @@ plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") + alias(libs.plugins.android.application) + alias(libs.plugins.jetbrains.kotlin.android) } android { namespace = "com.example.llama" - compileSdk = 34 + compileSdk = 36 defaultConfig { - applicationId = "com.example.llama" + applicationId = "com.example.llama.aichat" + minSdk = 33 - targetSdk = 34 + targetSdk = 36 + versionCode = 1 versionName = "1.0" @@ -21,8 +23,17 @@ android { } buildTypes { + debug { + isMinifyEnabled = true + isShrinkResources = true + proguardFiles( + getDefaultProguardFile("proguard-android.txt"), + "proguard-rules.pro" + ) + } release { - isMinifyEnabled = false + isMinifyEnabled = true + isShrinkResources = true proguardFiles( getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro" @@ -36,30 +47,15 @@ android { kotlinOptions { jvmTarget = "1.8" } - buildFeatures { - compose = true - } - composeOptions { - kotlinCompilerExtensionVersion = "1.5.1" - } } dependencies { + implementation(libs.bundles.androidx) + implementation(libs.material) - implementation("androidx.core:core-ktx:1.12.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") - implementation("androidx.activity:activity-compose:1.8.2") - implementation(platform("androidx.compose:compose-bom:2023.08.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation(project(":llama")) - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - 
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") + implementation(project(":lib")) + + testImplementation(libs.junit) + androidTestImplementation(libs.androidx.junit) + androidTestImplementation(libs.androidx.espresso.core) } diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro index f1b424510d..358020d2d2 100644 --- a/examples/llama.android/app/proguard-rules.pro +++ b/examples/llama.android/app/proguard-rules.pro @@ -19,3 +19,11 @@ # If you keep the line number information, uncomment this to # hide the original source file name. #-renamesourcefileattribute SourceFile + +-keep class com.arm.aichat.* { *; } +-keep class com.arm.aichat.gguf.* { *; } + +-assumenosideeffects class android.util.Log { + public static int v(...); + public static int d(...); +} diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml index 41a358a299..8f7c606b41 100644 --- a/examples/llama.android/app/src/main/AndroidManifest.xml +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -1,24 +1,21 @@ - - - + + android:exported="true"> diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt deleted file mode 100644 index 78c231ae55..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt +++ /dev/null @@ -1,119 +0,0 @@ -package com.example.llama - -import android.app.DownloadManager -import android.net.Uri -import android.util.Log -import androidx.compose.material3.Button -import androidx.compose.material3.Text -import 
androidx.compose.runtime.Composable -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableDoubleStateOf -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.remember -import androidx.compose.runtime.rememberCoroutineScope -import androidx.compose.runtime.setValue -import androidx.core.database.getLongOrNull -import androidx.core.net.toUri -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import java.io.File - -data class Downloadable(val name: String, val source: Uri, val destination: File) { - companion object { - @JvmStatic - private val tag: String? = this::class.qualifiedName - - sealed interface State - data object Ready: State - data class Downloading(val id: Long): State - data class Downloaded(val downloadable: Downloadable): State - data class Error(val message: String): State - - @JvmStatic - @Composable - fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { - var status: State by remember { - mutableStateOf( - if (item.destination.exists()) Downloaded(item) - else Ready - ) - } - var progress by remember { mutableDoubleStateOf(0.0) } - - val coroutineScope = rememberCoroutineScope() - - suspend fun waitForDownload(result: Downloading, item: Downloadable): State { - while (true) { - val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) - - if (cursor == null) { - Log.e(tag, "dm.query() returned null") - return Error("dm.query() returned null") - } - - if (!cursor.moveToFirst() || cursor.count < 1) { - cursor.close() - Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") - return Ready - } - - val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) - val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) - val sofar = cursor.getLongOrNull(pix) ?: 0 - val total = cursor.getLongOrNull(tix) ?: 1 - cursor.close() - - if (sofar == total) { - return Downloaded(item) - 
} - - progress = (sofar * 1.0) / total - - delay(1000L) - } - } - - fun onClick() { - when (val s = status) { - is Downloaded -> { - viewModel.load(item.destination.path) - } - - is Downloading -> { - coroutineScope.launch { - status = waitForDownload(s, item) - } - } - - else -> { - item.destination.delete() - - val request = DownloadManager.Request(item.source).apply { - setTitle("Downloading model") - setDescription("Downloading model: ${item.name}") - setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) - setDestinationUri(item.destination.toUri()) - } - - viewModel.log("Saving ${item.name} to ${item.destination.path}") - Log.i(tag, "Saving ${item.name} to ${item.destination.path}") - - val id = dm.enqueue(request) - status = Downloading(id) - onClick() - } - } - } - - Button(onClick = { onClick() }, enabled = status !is Downloading) { - when (status) { - is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") - is Downloaded -> Text("Load ${item.name}") - is Ready -> Text("Download ${item.name}") - is Error -> Text("Download ${item.name}") - } - } - } - - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt index 9da04f7d3c..52c5dc2154 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -1,154 +1,257 @@ package com.example.llama -import android.app.ActivityManager -import android.app.DownloadManager -import android.content.ClipData -import android.content.ClipboardManager import android.net.Uri import android.os.Bundle -import android.os.StrictMode -import android.os.StrictMode.VmPolicy -import android.text.format.Formatter -import androidx.activity.ComponentActivity -import androidx.activity.compose.setContent -import androidx.activity.viewModels -import androidx.compose.foundation.layout.Box -import 
androidx.compose.foundation.layout.Column -import androidx.compose.foundation.layout.Row -import androidx.compose.foundation.layout.fillMaxSize -import androidx.compose.foundation.layout.padding -import androidx.compose.foundation.lazy.LazyColumn -import androidx.compose.foundation.lazy.items -import androidx.compose.foundation.lazy.rememberLazyListState -import androidx.compose.material3.Button -import androidx.compose.material3.LocalContentColor -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.OutlinedTextField -import androidx.compose.material3.Surface -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.ui.Modifier -import androidx.compose.ui.unit.dp -import androidx.core.content.getSystemService -import com.example.llama.ui.theme.LlamaAndroidTheme +import android.util.Log +import android.widget.EditText +import android.widget.TextView +import android.widget.Toast +import androidx.activity.enableEdgeToEdge +import androidx.activity.result.contract.ActivityResultContracts +import androidx.appcompat.app.AppCompatActivity +import androidx.lifecycle.lifecycleScope +import androidx.recyclerview.widget.LinearLayoutManager +import androidx.recyclerview.widget.RecyclerView +import com.arm.aichat.AiChat +import com.arm.aichat.InferenceEngine +import com.arm.aichat.gguf.GgufMetadata +import com.arm.aichat.gguf.GgufMetadataReader +import com.google.android.material.floatingactionbutton.FloatingActionButton +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.flow.onCompletion +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext import java.io.File +import java.io.FileOutputStream +import java.io.InputStream +import java.util.UUID -class MainActivity( - activityManager: ActivityManager? = null, - downloadManager: DownloadManager? = null, - clipboardManager: ClipboardManager? = null, -): ComponentActivity() { - private val tag: String? 
= this::class.simpleName +class MainActivity : AppCompatActivity() { - private val activityManager by lazy { activityManager ?: getSystemService()!! } - private val downloadManager by lazy { downloadManager ?: getSystemService()!! } - private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + // Android views + private lateinit var ggufTv: TextView + private lateinit var messagesRv: RecyclerView + private lateinit var userInputEt: EditText + private lateinit var userActionFab: FloatingActionButton - private val viewModel: MainViewModel by viewModels() + // Arm AI Chat inference engine + private lateinit var engine: InferenceEngine - // Get a MemoryInfo object for the device's current memory status. - private fun availableMemory(): ActivityManager.MemoryInfo { - return ActivityManager.MemoryInfo().also { memoryInfo -> - activityManager.getMemoryInfo(memoryInfo) - } - } + // Conversation states + private var isModelReady = false + private val messages = mutableListOf() + private val lastAssistantMsg = StringBuilder() + private val messageAdapter = MessageAdapter(messages) override fun onCreate(savedInstanceState: Bundle?) 
{ super.onCreate(savedInstanceState) + enableEdgeToEdge() + setContentView(R.layout.activity_main) - StrictMode.setVmPolicy( - VmPolicy.Builder(StrictMode.getVmPolicy()) - .detectLeakedClosableObjects() - .build() - ) + // Find views + ggufTv = findViewById(R.id.gguf) + messagesRv = findViewById(R.id.messages) + messagesRv.layoutManager = LinearLayoutManager(this) + messagesRv.adapter = messageAdapter + userInputEt = findViewById(R.id.user_input) + userActionFab = findViewById(R.id.fab) - val free = Formatter.formatFileSize(this, availableMemory().availMem) - val total = Formatter.formatFileSize(this, availableMemory().totalMem) - - viewModel.log("Current memory: $free / $total") - viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") - - val extFilesDir = getExternalFilesDir(null) - - val models = listOf( - Downloadable( - "Phi-2 7B (Q4_0, 1.6 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), - File(extFilesDir, "phi-2-q4_0.gguf"), - ), - Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), - ), - Downloadable( - "Phi 2 DPO (Q3_K_M, 1.48 GiB)", - Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), - File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") - ), - ) - - setContent { - LlamaAndroidTheme { - // A surface container using the 'background' color from the theme - Surface( - modifier = Modifier.fillMaxSize(), - color = MaterialTheme.colorScheme.background - ) { - MainCompose( - viewModel, - clipboardManager, - downloadManager, - models, - ) - } + // Arm AI Chat initialization + lifecycleScope.launch(Dispatchers.Default) { + engine = AiChat.getInferenceEngine(applicationContext) + } + // Upon CTA button tapped + userActionFab.setOnClickListener { + if (isModelReady) { + // If 
model is ready, validate input and send to engine + handleUserInput() + } else { + // Otherwise, prompt user to select a GGUF metadata on the device + getContent.launch(arrayOf("*/*")) } } } -} -@Composable -fun MainCompose( - viewModel: MainViewModel, - clipboard: ClipboardManager, - dm: DownloadManager, - models: List -) { - Column { - val scrollState = rememberLazyListState() + private val getContent = registerForActivityResult( + ActivityResultContracts.OpenDocument() + ) { uri -> + Log.i(TAG, "Selected file uri:\n $uri") + uri?.let { handleSelectedModel(it) } + } - Box(modifier = Modifier.weight(1f)) { - LazyColumn(state = scrollState) { - items(viewModel.messages) { - Text( - it, - style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), - modifier = Modifier.padding(16.dp) - ) + /** + * Handles the file Uri from [getContent] result + */ + private fun handleSelectedModel(uri: Uri) { + // Update UI states + userActionFab.isEnabled = false + userInputEt.hint = "Parsing GGUF..." 
+ ggufTv.text = "Parsing metadata from selected file \n$uri" + + lifecycleScope.launch(Dispatchers.IO) { + // Parse GGUF metadata + Log.i(TAG, "Parsing GGUF metadata...") + contentResolver.openInputStream(uri)?.use { + GgufMetadataReader.create().readStructuredMetadata(it) + }?.let { metadata -> + // Update UI to show GGUF metadata to user + Log.i(TAG, "GGUF parsed: \n$metadata") + withContext(Dispatchers.Main) { + ggufTv.text = metadata.toString() } - } - } - OutlinedTextField( - value = viewModel.message, - onValueChange = { viewModel.updateMessage(it) }, - label = { Text("Message") }, - ) - Row { - Button({ viewModel.send() }) { Text("Send") } - Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } - Button({ viewModel.clear() }) { Text("Clear") } - Button({ - viewModel.messages.joinToString("\n").let { - clipboard.setPrimaryClip(ClipData.newPlainText("", it)) - } - }) { Text("Copy") } - } - Column { - for (model in models) { - Downloadable.Button(viewModel, dm, model) + // Ensure the model file is available + val modelName = metadata.filename() + FILE_EXTENSION_GGUF + contentResolver.openInputStream(uri)?.use { input -> + ensureModelFile(modelName, input) + }?.let { modelFile -> + loadModel(modelName, modelFile) + + withContext(Dispatchers.Main) { + isModelReady = true + userInputEt.hint = "Type and send a message!" + userInputEt.isEnabled = true + userActionFab.setImageResource(R.drawable.outline_send_24) + userActionFab.isEnabled = true + } + } } } } + + /** + * Prepare the model file within app's private storage + */ + private suspend fun ensureModelFile(modelName: String, input: InputStream) = + withContext(Dispatchers.IO) { + File(ensureModelsDirectory(), modelName).also { file -> + // Copy the file into local storage if not yet done + if (!file.exists()) { + Log.i(TAG, "Start copying file to $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Copying file..." 
+ } + + FileOutputStream(file).use { input.copyTo(it) } + Log.i(TAG, "Finished copying file to $modelName") + } else { + Log.i(TAG, "File already exists $modelName") + } + } + } + + /** + * Load the model file from the app private storage + */ + private suspend fun loadModel(modelName: String, modelFile: File) = + withContext(Dispatchers.IO) { + Log.i(TAG, "Loading model $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Loading model..." + } + engine.loadModel(modelFile.path) + } + + /** + * Validate and send the user message into [InferenceEngine] + */ + private fun handleUserInput() { + userInputEt.text.toString().also { userSsg -> + if (userSsg.isEmpty()) { + Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show() + } else { + userInputEt.text = null + userActionFab.isEnabled = false + + // Update message states + messages.add(Message(UUID.randomUUID().toString(), userSsg, true)) + lastAssistantMsg.clear() + messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false)) + + lifecycleScope.launch(Dispatchers.Default) { + engine.sendUserPrompt(userSsg) + .onCompletion { + withContext(Dispatchers.Main) { + userActionFab.isEnabled = true + } + }.collect { token -> + val messageCount = messages.size + check(messageCount > 0 && !messages[messageCount - 1].isUser) + + messages.removeAt(messageCount - 1).copy( + content = lastAssistantMsg.append(token).toString() + ).let { messages.add(it) } + + withContext(Dispatchers.Main) { + messageAdapter.notifyItemChanged(messages.size - 1) + } + } + } + } + } + } + + /** + * Run a benchmark with the model file + */ + private suspend fun runBenchmark(modelName: String, modelFile: File) = + withContext(Dispatchers.Default) { + Log.i(TAG, "Starts benchmarking $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Running benchmark..." 
+ } + engine.bench( + pp=BENCH_PROMPT_PROCESSING_TOKENS, + tg=BENCH_TOKEN_GENERATION_TOKENS, + pl=BENCH_SEQUENCE, + nr=BENCH_REPETITION + ).let { result -> + messages.add(Message(UUID.randomUUID().toString(), result, false)) + withContext(Dispatchers.Main) { + messageAdapter.notifyItemChanged(messages.size - 1) + } + } + } + + /** + * Create the `models` directory if not exist. + */ + private fun ensureModelsDirectory() = + File(filesDir, DIRECTORY_MODELS).also { + if (it.exists() && !it.isDirectory) { it.delete() } + if (!it.exists()) { it.mkdir() } + } + + companion object { + private val TAG = MainActivity::class.java.simpleName + + private const val DIRECTORY_MODELS = "models" + private const val FILE_EXTENSION_GGUF = ".gguf" + + private const val BENCH_PROMPT_PROCESSING_TOKENS = 512 + private const val BENCH_TOKEN_GENERATION_TOKENS = 128 + private const val BENCH_SEQUENCE = 1 + private const val BENCH_REPETITION = 3 + } +} + +fun GgufMetadata.filename() = when { + basic.name != null -> { + basic.name?.let { name -> + basic.sizeLabel?.let { size -> + "$name-$size" + } ?: name + } + } + architecture?.architecture != null -> { + architecture?.architecture?.let { arch -> + basic.uuid?.let { uuid -> + "$arch-$uuid" + } ?: "$arch-${System.currentTimeMillis()}" + } + } + else -> { + "model-${System.currentTimeMillis().toHexString()}" + } } diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt deleted file mode 100644 index 45ac29938f..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ /dev/null @@ -1,105 +0,0 @@ -package com.example.llama - -import android.llama.cpp.LLamaAndroid -import android.util.Log -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.setValue -import androidx.lifecycle.ViewModel -import androidx.lifecycle.viewModelScope 
-import kotlinx.coroutines.flow.catch -import kotlinx.coroutines.launch - -class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { - companion object { - @JvmStatic - private val NanosPerSecond = 1_000_000_000.0 - } - - private val tag: String? = this::class.simpleName - - var messages by mutableStateOf(listOf("Initializing...")) - private set - - var message by mutableStateOf("") - private set - - override fun onCleared() { - super.onCleared() - - viewModelScope.launch { - try { - llamaAndroid.unload() - } catch (exc: IllegalStateException) { - messages += exc.message!! - } - } - } - - fun send() { - val text = message - message = "" - - // Add to messages console. - messages += text - messages += "" - - viewModelScope.launch { - llamaAndroid.send(text) - .catch { - Log.e(tag, "send() failed", it) - messages += it.message!! - } - .collect { messages = messages.dropLast(1) + (messages.last() + it) } - } - } - - fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { - viewModelScope.launch { - try { - val start = System.nanoTime() - val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) - val end = System.nanoTime() - - messages += warmupResult - - val warmup = (end - start).toDouble() / NanosPerSecond - messages += "Warm up time: $warmup seconds, please wait..." - - if (warmup > 5.0) { - messages += "Warm up took too long, aborting benchmark" - return@launch - } - - messages += llamaAndroid.bench(512, 128, 1, 3) - } catch (exc: IllegalStateException) { - Log.e(tag, "bench() failed", exc) - messages += exc.message!! - } - } - } - - fun load(pathToModel: String) { - viewModelScope.launch { - try { - llamaAndroid.load(pathToModel) - messages += "Loaded $pathToModel" - } catch (exc: IllegalStateException) { - Log.e(tag, "load() failed", exc) - messages += exc.message!! 
- } - } - } - - fun updateMessage(newMessage: String) { - message = newMessage - } - - fun clear() { - messages = listOf() - } - - fun log(message: String) { - messages += message - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt new file mode 100644 index 0000000000..0439f96441 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt @@ -0,0 +1,51 @@ +package com.example.llama + +import android.view.LayoutInflater +import android.view.View +import android.view.ViewGroup +import android.widget.TextView +import androidx.recyclerview.widget.RecyclerView + +data class Message( + val id: String, + val content: String, + val isUser: Boolean +) + +class MessageAdapter( + private val messages: List +) : RecyclerView.Adapter() { + + companion object { + private const val VIEW_TYPE_USER = 1 + private const val VIEW_TYPE_ASSISTANT = 2 + } + + override fun getItemViewType(position: Int): Int { + return if (messages[position].isUser) VIEW_TYPE_USER else VIEW_TYPE_ASSISTANT + } + + override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): RecyclerView.ViewHolder { + val layoutInflater = LayoutInflater.from(parent.context) + return if (viewType == VIEW_TYPE_USER) { + val view = layoutInflater.inflate(R.layout.item_message_user, parent, false) + UserMessageViewHolder(view) + } else { + val view = layoutInflater.inflate(R.layout.item_message_assistant, parent, false) + AssistantMessageViewHolder(view) + } + } + + override fun onBindViewHolder(holder: RecyclerView.ViewHolder, position: Int) { + val message = messages[position] + if (holder is UserMessageViewHolder || holder is AssistantMessageViewHolder) { + val textView = holder.itemView.findViewById(R.id.msg_content) + textView.text = message.content + } + } + + override fun getItemCount(): Int = messages.size + + class UserMessageViewHolder(view: View) : 
RecyclerView.ViewHolder(view) + class AssistantMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt deleted file mode 100644 index 40c30e8d97..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt +++ /dev/null @@ -1,11 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.ui.graphics.Color - -val Purple80 = Color(0xFFD0BCFF) -val PurpleGrey80 = Color(0xFFCCC2DC) -val Pink80 = Color(0xFFEFB8C8) - -val Purple40 = Color(0xFF6650a4) -val PurpleGrey40 = Color(0xFF625b71) -val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt deleted file mode 100644 index e742220a8d..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt +++ /dev/null @@ -1,70 +0,0 @@ -package com.example.llama.ui.theme - -import android.app.Activity -import android.os.Build -import androidx.compose.foundation.isSystemInDarkTheme -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.darkColorScheme -import androidx.compose.material3.dynamicDarkColorScheme -import androidx.compose.material3.dynamicLightColorScheme -import androidx.compose.material3.lightColorScheme -import androidx.compose.runtime.Composable -import androidx.compose.runtime.SideEffect -import androidx.compose.ui.graphics.toArgb -import androidx.compose.ui.platform.LocalContext -import androidx.compose.ui.platform.LocalView -import androidx.core.view.WindowCompat - -private val DarkColorScheme = darkColorScheme( - primary = Purple80, - secondary = PurpleGrey80, - tertiary = Pink80 -) - -private val LightColorScheme = lightColorScheme( - primary = Purple40, - secondary = PurpleGrey40, - 
tertiary = Pink40 - - /* Other default colors to override - background = Color(0xFFFFFBFE), - surface = Color(0xFFFFFBFE), - onPrimary = Color.White, - onSecondary = Color.White, - onTertiary = Color.White, - onBackground = Color(0xFF1C1B1F), - onSurface = Color(0xFF1C1B1F), - */ -) - -@Composable -fun LlamaAndroidTheme( - darkTheme: Boolean = isSystemInDarkTheme(), - // Dynamic color is available on Android 12+ - dynamicColor: Boolean = true, - content: @Composable () -> Unit -) { - val colorScheme = when { - dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { - val context = LocalContext.current - if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) - } - - darkTheme -> DarkColorScheme - else -> LightColorScheme - } - val view = LocalView.current - if (!view.isInEditMode) { - SideEffect { - val window = (view.context as Activity).window - window.statusBarColor = colorScheme.primary.toArgb() - WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme - } - } - - MaterialTheme( - colorScheme = colorScheme, - typography = Typography, - content = content - ) -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt deleted file mode 100644 index 0b87946ca3..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt +++ /dev/null @@ -1,34 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.material3.Typography -import androidx.compose.ui.text.TextStyle -import androidx.compose.ui.text.font.FontFamily -import androidx.compose.ui.text.font.FontWeight -import androidx.compose.ui.unit.sp - -// Set of Material typography styles to start with -val Typography = Typography( - bodyLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 16.sp, - lineHeight = 24.sp, - letterSpacing = 0.5.sp - ) - 
/* Other default text styles to override - titleLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 22.sp, - lineHeight = 28.sp, - letterSpacing = 0.sp - ), - labelSmall = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Medium, - fontSize = 11.sp, - lineHeight = 16.sp, - letterSpacing = 0.5.sp - ) - */ -) diff --git a/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml b/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml new file mode 100644 index 0000000000..f90c3db458 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml @@ -0,0 +1,4 @@ + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml b/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml new file mode 100644 index 0000000000..3ca7daefec --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml @@ -0,0 +1,4 @@ + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml b/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml new file mode 100644 index 0000000000..f58b501e3b --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml @@ -0,0 +1,10 @@ + + + diff --git a/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml b/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml new file mode 100644 index 0000000000..712adc00c4 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml @@ -0,0 +1,11 @@ + + + diff --git a/examples/llama.android/app/src/main/res/layout/activity_main.xml b/examples/llama.android/app/src/main/res/layout/activity_main.xml new file mode 100644 index 0000000000..bf6ef35925 --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/activity_main.xml @@ -0,0 +1,76 @@ + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml b/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml new file mode 100644 index 0000000000..b7fb500393 --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml @@ -0,0 +1,15 @@ + + + + + diff --git a/examples/llama.android/app/src/main/res/layout/item_message_user.xml b/examples/llama.android/app/src/main/res/layout/item_message_user.xml new file mode 100644 index 0000000000..fe871f12fa --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/item_message_user.xml @@ -0,0 +1,15 @@ + + + + + diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml index 7a9d314e29..36059fc799 100644 --- a/examples/llama.android/app/src/main/res/values/strings.xml +++ b/examples/llama.android/app/src/main/res/values/strings.xml @@ -1,3 +1,3 @@ - LlamaAndroid + AI Chat basic sample diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml index 8a24fda566..2e4fdad72e 100644 --- a/examples/llama.android/app/src/main/res/values/themes.xml +++ b/examples/llama.android/app/src/main/res/values/themes.xml @@ -1,5 +1,10 @@ - + +