mtmd: successfully runs DeepSeek-OCR LM in llama-cli
This commit is contained in:
parent eab28ed318
commit 76305878d5
@@ -7112,13 +7112,16 @@ class DeepseekV2Model(TextModel):
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
+        if "kv_lora_rank" in hparams and hparams["kv_lora_rank"] is not None:
+            self.gguf_writer.add_kv_lora_rank(kv_lora_rank)

         # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-        self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(kv_lora_rank)
-        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+        if not is_ocr:
+            self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
+            self.gguf_writer.add_value_length(kv_lora_rank)
+            self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+            self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
@@ -7133,8 +7136,6 @@ class DeepseekV2Model(TextModel):
         else:
             raise ValueError(f"Unsupported scoring_func value: {scoring_func}")

-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
-
         rope_scaling = self.hparams.get("rope_scaling") or {}
         if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
@@ -813,7 +813,7 @@ class GGUFWriter:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

     def add_layer_norm_rms_eps(self, value: float) -> None:
-        self.add_float64(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)

     def add_group_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
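Note: the RMS-norm epsilon goes back to being written as a 32-bit float. Presumably this matches the value type the C++ loader expects for that key (my reading, not stated in the commit); precision is not the concern, as this standalone sketch shows for a typical epsilon of 1e-6:

#include <cstdio>

int main() {
    // a typical RMS-norm epsilon as found in model configs
    const double eps_f64 = 1e-6;
    const float  eps_f32 = (float) eps_f64;

    // float32 carries ~7 significant decimal digits, far more than a norm
    // epsilon needs, so the float64 -> float32 round-trip error is negligible
    printf("f64:  %.17g\n", eps_f64);
    printf("f32:  %.17g\n", (double) eps_f32);
    printf("diff: %.3g\n", eps_f64 - (double) eps_f32);
    return 0;
}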
@@ -1562,12 +1562,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_DEEPSEEK2:
             {
                 bool is_lite = (hparams.n_layer == 27);
+                bool is_ocr = (name.find("ocr") != std::string::npos || name.find("OCR") != std::string::npos);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
-                if (!is_lite) {
+                if (!is_lite && !is_ocr) {
                     ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                 }
-                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+                if (!is_ocr) {
+                    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+                }
                 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
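Note: the OCR variant is detected purely from the model name — any DEEPSEEK2 model whose name contains "ocr" or "OCR" takes the new path, and the MLA LoRA-rank keys are then not required. A tiny standalone sketch of that heuristic (the example names are made up; only the detection logic mirrors the hunk above). A mixed-case name such as "Ocr" would slip past this substring check:

#include <cstdio>
#include <string>

// same heuristic as the hunk above: treat the model as DeepSeek-OCR when the
// name contains the literal substring "ocr" or "OCR"
static bool detect_ocr(const std::string & name) {
    return name.find("ocr") != std::string::npos || name.find("OCR") != std::string::npos;
}

int main() {
    const char * names[] = {
        "DeepSeek-OCR",          // hypothetical name, matches
        "deepseek-ocr-f16",      // hypothetical name, matches
        "DeepSeek-V2-Lite-Chat", // hypothetical name, does not match
    };
    for (const char * n : names) {
        printf("%-24s -> is_ocr = %s\n", n, detect_ocr(n) ? "true" : "false");
    }
    return 0;
}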
@@ -1583,6 +1587,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

                 switch (hparams.n_layer) {
+                    case 12: type = LLM_TYPE_3B; break;
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
                     case 61: type = LLM_TYPE_671B; break;
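Note: the new case 12 labels the 12-layer decoder introduced by this commit as a 3B model alongside the existing sizes. A standalone mirror of the same n_layer to size-label mapping; which checkpoint each count corresponds to is my reading and not part of the commit:

#include <cstdio>
#include <initializer_list>

// n_layer -> size label, pulled out as a helper purely for illustration
static const char * deepseek2_type_from_layers(int n_layer) {
    switch (n_layer) {
        case 12: return "3B";   // presumably the DeepSeek-OCR language model
        case 27: return "16B";  // DeepSeek-V2-Lite (the is_lite case)
        case 60: return "236B"; // DeepSeek-V2
        case 61: return "671B"; // DeepSeek-V3
        default: return "unknown";
    }
}

int main() {
    for (int n_layer : {12, 27, 60, 61, 40}) {
        printf("n_layer=%d -> %s\n", n_layer, deepseek2_type_from_layers(n_layer));
    }
    return 0;
}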
@@ -4578,10 +4583,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     if (is_ocr) {
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0); // TODO
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0); // TODO
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                         if (i < (int) hparams.n_layer_dense_lead) {
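Note: the OCR branch now creates plain square projection weights instead of the MLA-shaped ones. With standard multi-head attention every head has size n_embd / n_head, so n_head * n_embd_head collapses back to n_embd and the fused Q/K/V/O weights are all {n_embd, n_embd}. A minimal sketch of that arithmetic, using hypothetical numbers rather than the real DeepSeek-OCR hyperparameters:

#include <cassert>
#include <cstdio>

int main() {
    // hypothetical example values, NOT the real DeepSeek-OCR hyperparameters
    const int n_embd = 1280;
    const int n_head = 10;

    // standard MHA: all heads have the same size, so the fused projections
    // are square n_embd x n_embd matrices
    const int n_embd_head = n_embd / n_head;
    assert(n_embd_head * n_head == n_embd);

    printf("wq/wk/wv shape: {%d, %d}  (n_embd x n_head*n_embd_head)\n", n_embd, n_head * n_embd_head);
    printf("wo shape      : {%d, %d}\n", n_head * n_embd_head, n_embd);
    return 0;
}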
@@ -46,6 +46,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

         // self_attention
         if (is_ocr) {
+            const int n_embed_head = hparams.n_embd / hparams.n_head();
+            GGML_ASSERT(n_embed_head == n_embd_head_k && n_embed_head == n_embd_head_v);
+
             ggml_tensor * Qcur = NULL;
             ggml_tensor * Kcur = NULL;
             ggml_tensor * Vcur = NULL;
@@ -57,13 +60,13 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             cb(Kcur, "k", il);
             cb(Vcur, "v", il);

-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head, n_tokens);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embed_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embed_head, n_head, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embed_head, n_head, n_tokens);

-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
+            GGML_ASSERT(fabs(freq_base - 10000.0) < 1e-4);
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, rope_type, 0, freq_base, 1, 0, 1, 0, 0);
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, rope_type, 0, freq_base, 1, 0, 1, 0, 0);
             cb(Qcur, "q_pe", il);
             cb(Kcur, "k_pe", il);

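Note: the replacement ggml_rope_ext calls pass freq_scale = 1, ext_factor = 0, attn_factor = 1 and zero beta values (matching the parameter order visible in the removed lines), i.e. plain rotary embeddings with no YaRN scaling, and the assert pins freq_base to 10000. Below is a minimal reference sketch of that standard rotation for a single head vector; it uses the adjacent-pair convention purely for illustration and is not ggml's implementation (ggml's pairing depends on rope_type):

#include <cmath>
#include <cstdio>
#include <vector>

// plain RoPE, base 10000, no frequency or attention scaling: rotate each
// consecutive pair (x[2i], x[2i+1]) of one head vector by an angle that
// depends on the token position and the pair index
static void rope_inplace(std::vector<float> & x, int pos, float freq_base = 10000.0f) {
    const int n_dims = (int) x.size();
    for (int i = 0; i < n_dims / 2; ++i) {
        const float theta = pos * std::pow(freq_base, -2.0f * i / n_dims);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = x[2*i + 0];
        const float x1 = x[2*i + 1];
        x[2*i + 0] = x0 * c - x1 * s;
        x[2*i + 1] = x0 * s + x1 * c;
    }
}

int main() {
    std::vector<float> head(8, 1.0f); // one 8-dim head, all ones, purely illustrative
    rope_inplace(head, /*pos=*/3);
    for (float v : head) {
        printf("%.4f ", v);
    }
    printf("\n");
    return 0;
}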