model : add KORMo model (#18032)

* vocab: add KORMo Tokenizer

* model: add KORMoForCausalLM

* vocab: change pretokenizer to qwen2

* lint: fix unintended line removal

* model: make qwen2 bias tensor optional

* model: use qwen2 architecture for KORMo
HelloKS 2025-12-16 02:51:43 +09:00 committed by GitHub
parent 4529c660c8
commit 9d52f17ae3
6 changed files with 25 additions and 9 deletions

convert_hf_to_gguf.py

@@ -1203,6 +1203,9 @@ class TextModel(ModelBase):
         if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
             # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
             res = "minimax-m2"
+        if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
+            # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
+            res = "kormo"
         if res is None:
             logger.warning("\n")
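The chkhsh guard is a fingerprint of the tokenizer's pre-tokenization behavior: convert_hf_to_gguf.py encodes a fixed multilingual test string (chktxt) and hashes the resulting token ids, so any silent change in splitting behavior is caught at conversion time. A minimal sketch of the check; the chktxt below is a stand-in, so only the real test string from the script reproduces the registered hash:

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("KORMo-Team/KORMo-tokenizer")

# stand-in only; the real chktxt in convert_hf_to_gguf.py is a long string
# covering many scripts, emoji, and whitespace edge cases
chktxt = "Hello 한국어 KORMo 123"

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # with the real chktxt: 4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665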
@@ -3398,7 +3401,7 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()

-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2
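Because KORMo's checkpoint layout matches Qwen2 (per the commit note "use qwen2 architecture for KORMo"), no new converter class is needed; the extra string just routes HF configs whose architectures field says KORMoForCausalLM to the existing Qwen2Model. A simplified sketch of the registry idiom behind the decorator, with illustrative names rather than the script's exact internals:

_model_classes: dict[str, type] = {}

def register(*names: str):
    # record the decorated converter class under every HF `architectures`
    # string it claims, so a lookup by config.json architecture finds it
    def wrapper(cls: type) -> type:
        for name in names:
            _model_classes[name] = cls
        return cls
    return wrapper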

convert_hf_to_gguf_update.py

@@ -143,6 +143,7 @@ models = [
     {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
     {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
+    {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

gguf-py/gguf/tensor_mapping.py

@@ -154,7 +154,8 @@ class TensorNameMap:
             "model.layers.{bid}.operator_norm",           # lfm2
             "model.transformer.blocks.{bid}.attn_norm",   # llada
             "layers.{bid}.input_layernorm",               # qwen3-embedding
-            "model.layers.{bid}.attention_layernorm"      # apertus
+            "model.layers.{bid}.attention_layernorm",     # apertus
+            "model.layers.{bid}.pre_attention_layernorm", # kormo
         ),

         # Attention norm 2

@@ -342,6 +343,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.ff_norm",     # llada
             "layers.{bid}.post_attention_layernorm",      # qwen3-embedding
             "model.layers.{bid}.feedforward_layernorm",   # apertus
+            "model.layers.{bid}.pre_mlp_layernorm",       # kormo
         ),

         # Pre feed-forward norm
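With these two entries, KORMo's per-layer norms resolve to the standard GGUF tensor names during conversion. A quick check using gguf-py's name-map helpers (the block count is an arbitrary example; the expected outputs assume the usual qwen2 mappings):

import gguf

# KORMo converts through the qwen2 architecture, so use its name map
name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.QWEN2, 32)

print(name_map.get_name("model.layers.0.pre_attention_layernorm.weight", try_suffixes=(".weight",)))
# expected: blk.0.attn_norm.weight
print(name_map.get_name("model.layers.0.pre_mlp_layernorm.weight", try_suffixes=(".weight",)))
# expected: blk.0.ffn_norm.weight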

src/llama-model.cpp

@@ -3388,9 +3388,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     // optional bias tensors
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},     0);
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

src/llama-vocab.cpp

@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 clean_spaces = false;
             } else if (
                 tokenizer_pre == "qwen2" ||
-                tokenizer_pre == "deepseek-r1-qwen") {
+                tokenizer_pre == "deepseek-r1-qwen" ||
+                tokenizer_pre == "kormo") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                 clean_spaces = false;
             } else if (
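Reusing LLAMA_VOCAB_PRE_TYPE_QWEN2 means KORMo text is split by the same pre-tokenizer regex as Qwen2 before BPE merges are applied, which is exactly what the chkhsh match above verified. For intuition, a sketch of that split in Python using the third-party regex package (the pattern is transcribed from llama.cpp's qwen2 case; treat it as illustrative, not normative):

import regex  # third-party 'regex' package; stdlib re lacks \p{...} classes

# transcribed from the qwen2 pre-tokenizer in llama.cpp (illustrative)
QWEN2_PAT = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"
    r"|[^\r\n\p{L}\p{N}]?\p{L}+"
    r"|\p{N}"
    r"| ?[^\s\p{L}\p{N}]+[\r\n]*"
    r"|\s*[\r\n]+"
    r"|\s+(?!\S)"
    r"|\s+"
)

# Korean stays in letter runs; digits are split one code point at a time
print(regex.findall(QWEN2_PAT, "안녕하세요 KORMo 123"))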

src/models/qwen2.cpp

@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params)
        {
            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
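Together with the TENSOR_NOT_REQUIRED change above, this closes the loop: when a bias tensor is absent from the GGUF file the loader leaves the pointer null, and the graph builder now skips the corresponding ggml_add instead of dereferencing it; Qwen2 checkpoints, which do ship Q/K/V biases, build exactly the same graph as before. A quick way to confirm which case a converted file falls into, using gguf-py's reader (kormo.gguf is a hypothetical output of convert_hf_to_gguf.py):

from gguf import GGUFReader

reader = GGUFReader("kormo.gguf")  # hypothetical converted KORMo model
names = {t.name for t in reader.tensors}

# KORMo ships no attention biases, so this prints False;
# on a converted Qwen2 model the same check prints True.
print("blk.0.attn_q.bias" in names)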