merge resolved

- fixed issues in convert
- tested several deepseek models
Saba Fallah 2026-02-02 12:07:35 +01:00
parent ded92076a8
commit a94c241751
6 changed files with 5 additions and 47 deletions

View File

@@ -7519,9 +7519,6 @@ class DeepseekV2Model(TextModel):
first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512
routed_scaling_factor = hparams.get("routed_scaling_factor", 1.0)
norm_topk_prob = hparams.get("norm_topk_prob", False)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
@@ -7534,7 +7531,6 @@ class DeepseekV2Model(TextModel):
self.gguf_writer.add_value_length(kv_lora_rank)
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
# MoE parameters (required by C++ code for DEEPSEEK2 arch)
# For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
@@ -7554,11 +7550,6 @@ class DeepseekV2Model(TextModel):
if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
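Taken together, the hunks above leave the convert path reading each hyperparameter once and writing each GGUF key once, instead of the duplicated writes the merge had introduced. A minimal sketch of that pattern, assuming has_moe is derived from n_routed_experts, with hparams and writer standing in for the real TextModel attributes:

def write_deepseek2_metadata(hparams: dict, writer) -> None:
    # Non-MoE checkpoints (e.g. Youtu) treat every block as dense.
    has_moe = hparams.get("n_routed_experts") is not None
    first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
    writer.add_leading_dense_block_count(first_k_dense_replace)

    # kv_lora_rank can be present but set to null in config.json, so test for
    # None explicitly instead of relying on hparams.get("kv_lora_rank", 512).
    kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512
    writer.add_value_length(kv_lora_rank)

    # Optional flag: read it at the point of use and emit it only when it is
    # present and truthy, rather than caching it in a duplicate local.
    if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
        writer.add_expert_weights_norm(norm_topk_prob)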

View File

@@ -1312,7 +1312,7 @@ class TensorNameMap:
"vision_model.positional_embedding_vlm", # llama 4
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
),
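For context, TensorNameMap's job here is many-to-one: each model family exports the same tensor under its own name, and all of them resolve to one canonical GGUF tensor. A stripped-down sketch of that lookup; the canonical key below is illustrative, and the real gguf-py class additionally handles block indices and suffixes:

# Each canonical GGUF tensor name maps to the per-family checkpoint names.
MAPPINGS: dict[str, tuple[str, ...]] = {
    "v.position_embd": (
        "vision_model.positional_embedding_vlm",            # llama 4
        "vision_tower.patch_embed.pos_emb",                 # kimi-vl
        "visual.pos_embed",                                 # qwen3vl
        "model.vision.patch_embedding.position_embedding",  # cogvlm
        "visual.embeddings.position_embedding",             # glm4v
    ),
}

def canonical_name(hf_name: str) -> str | None:
    # Reverse lookup: find which canonical tensor a checkpoint name belongs to.
    for gguf_name, candidates in MAPPINGS.items():
        if hf_name in candidates:
            return gguf_name
    return None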

View File

@@ -4917,7 +4917,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_DEEPSEEK2OCR:
{
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
const bool is_ocr = (arch == LLM_ARCH_DEEPSEEK2OCR);
const bool is_mla = hparams.is_mla();

View File

@@ -3,8 +3,7 @@
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
bool is_ocr = (model.arch == LLM_ARCH_DEEPSEEK2OCR);
bool is_ocr = model.arch == LLM_ARCH_DEEPSEEK2OCR;
const bool is_mla = hparams.is_mla();
@@ -83,7 +82,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
cb(Qcur, "q_pe", il);
cb(Kcur, "k_pe", il);
cur = build_attn(inp_attn,
cur = build_attn(inp_attn_kv,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
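Both C++ hunks above branch on the same variant flags, so a schematic restatement may help. The layer counts come straight from the comment in the diff, and the arch tag is an illustrative placeholder for LLM_ARCH_DEEPSEEK2OCR:

def deepseek2_variant_flags(arch: str, n_layer: int) -> dict[str, bool]:
    return {
        # lite variants: DeepSeek-V2-Lite has 27 layers, GigaChat3-10B-A1.8B has 26
        "is_lite": n_layer in (27, 26),
        # the OCR variant is keyed off its own architecture enum
        "is_ocr": arch == "deepseek2-ocr",
    }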

View File

@@ -4180,37 +4180,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return false;
}
// print debug nodes
if (ctx->debug_graph) {
LOG_INF("\n\n---\n\n");
LOG_INF("\n\nDebug graph:\n\n");
for (ggml_tensor * t : ctx->debug_print_tensors) {
std::vector<uint8_t> data(ggml_nbytes(t));
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
print_tensor_info(t);
print_tensor_shape(t);
print_tensor_sum(t, data.data(), 3);
std::string tname_s = std::string(t->name);
bool is_stored = false;
std::vector<std::string> patterns = {
/* Add tensor names here to dump (e.g. "sam_output") */
};
for (auto & p : patterns) {
if (tname_s == p) {
save_tensor_to_file(t, data.data());
is_stored = true;
break;
}
}
if (!is_stored) {
print_tensor_data(t, data.data(), 3);
}
}
}
// the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
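The deleted block walked ctx->debug_print_tensors, saving any tensor whose name matched a hand-edited pattern list to disk and printing the first few values of the rest. A Python analogue of that filter, assuming torch-like tensors that carry a name attribute:

PATTERNS: set[str] = set()  # add tensor names here to dump to disk, e.g. "sam_output"

def debug_dump(tensors) -> None:
    for t in tensors:
        data = t.detach().cpu().numpy()   # pull the tensor contents to host memory
        print(t.name, tuple(data.shape), float(data.sum()))
        if t.name in PATTERNS:
            data.tofile(f"{t.name}.bin")  # raw dump, like save_tensor_to_file
        else:
            print(data.flatten()[:3])     # first values, like print_tensor_data(t, data, 3)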

View File

@@ -83,9 +83,9 @@ def read_expected_output(file_path: str) -> str:
def main():
ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-q8_0_test.gguf",
help="Path to llama.cpp GGUF model")
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16_test.gguf",
help="Path to mmproj GGUF file")
ap.add_argument("--image", default="test-1.jpeg",
help="Path to test image")
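With the defaults above switched to the quantized test artifacts, a bare run of the script now compares against those unless the flags are overridden. A quick self-contained check of how argparse resolves them, reusing only the values the diff shows:

import argparse

ap = argparse.ArgumentParser()
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-q8_0_test.gguf")
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16_test.gguf")
ap.add_argument("--image", default="test-1.jpeg")

args = ap.parse_args([])  # empty argv -> the defaults apply
assert args.llama_model.endswith("q8_0_test.gguf")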