From 47f0fee6c964d6c946b7db755ee847beb788cd3b Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Thu, 11 Dec 2025 17:00:11 +0100
Subject: [PATCH] testing deepseek-ocr

Quick and dirty test script comparing results of Qwen2.5-VL vs
DeepSeek-OCR.
---
 tools/mtmd/tests/test-deepseek-ocr.py   | 206 ++++++++++++++++++++++++
 tools/mtmd/tests/tests-requirements.txt |  14 ++
 2 files changed, 220 insertions(+)
 create mode 100644 tools/mtmd/tests/test-deepseek-ocr.py
 create mode 100644 tools/mtmd/tests/tests-requirements.txt

diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
new file mode 100644
index 0000000000..f3f68837ef
--- /dev/null
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""
+Quick-and-dirty test script comparing llama.cpp mtmd-cli output of DeepSeek-OCR
+against a Qwen2.5-VL reference run, scored by embedding similarity.
+"""
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import util
+
+
+def run_mtmd_deepseek_ocr(
+    model_path: str,
+    mmproj_path: str,
+    image_path: str,
+    bin_path: str
+) -> str:
+    """
+    Run DeepSeek-OCR inference using llama.cpp mtmd-cli.
+    """
+    cmd = [
+        bin_path,
+        "-m", model_path,
+        "--mmproj", mmproj_path,
+        "--image", image_path,
+        # "-p", "<|grounding|>Convert the document to markdown.",
+        "-p", "Free OCR.",
+        "--chat-template", "deepseek-ocr",
+        "--temp", "0",
+        "-n", "1024",
+        # "--verbose"
+    ]
+
+    print(f"Running llama.cpp command: {' '.join(cmd)}")
+
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=False,
+        timeout=300
+    )
+
+    if result.returncode != 0:
+        stderr = result.stderr.decode('utf-8', errors='replace')
+        print(f"llama.cpp stderr: {stderr}")
+        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
+
+    output = result.stdout.decode('utf-8', errors='replace').strip()
+    print(f"llama.cpp output length: {len(output)} chars")
+    return output
+
+
+def run_mtmd_qwen_vl(
+    model_path: str,
+    mmproj_path: str,
+    image_path: str,
+    prompt: str,
+    bin_path: str
+) -> str:
+    """
+    Run inference using llama.cpp mtmd-cli with the Qwen2.5-VL model.
+    """
+    cmd = [
+        bin_path,
+        "-m", model_path,
+        "--mmproj", mmproj_path,
+        "--image", image_path,
+        "-p", prompt,
+        "--temp", "0"
+    ]
+
+    print(f"Running llama.cpp command: {' '.join(cmd)}")
+
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        timeout=300
+    )
+
+    if result.returncode != 0:
+        print(f"llama.cpp stderr: {result.stderr}")
+        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
+
+    output = result.stdout.strip()
+    print(f"llama.cpp output length: {len(output)} chars")
+    return output
+
+
+def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
+    """
+    Compute cosine similarity between two texts using an embedding model.
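+
+    Falls back to all-MiniLM-L6-v2 if the requested model cannot be loaded.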
+    """
+    print(f"Loading embedding model: {model_name}")
+
+    # Use sentence-transformers for easier embedding extraction
+    # For Gemma embedding, we use the sentence-transformers wrapper
+    try:
+        embed_model = SentenceTransformer(model_name, trust_remote_code=True)
+    except Exception:
+        # Fall back to a commonly available model if the Gemma embedding model is unavailable
+        print(f"Could not load {model_name}, falling back to all-MiniLM-L6-v2")
+        embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+    print("Computing embeddings...")
+    embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
+
+    similarity = util.cos_sim(embeddings[0], embeddings[1])[0][0]
+    return float(similarity)
+
+
+def main():
+    ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output against a Qwen2.5-VL reference")
+    ap.add_argument("--hf-model", default="Dogacel/DeepSeek-OCR-Metal-MPS",
+                    help="HuggingFace model ID (currently unused)")
+    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
+                    help="Path to llama.cpp GGUF model")
+    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
+                    help="Path to mmproj GGUF file")
+    ap.add_argument("--image", default="test-1.jpeg",
+                    help="Path to test image")
+    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
+                    help="Path to llama-mtmd-cli binary")
+    ap.add_argument("--embedding-model", default="google/embeddinggemma-300m",
+                    help="Embedding model for similarity computation")
+    ap.add_argument("--threshold", type=float, default=0.7,
+                    help="Minimum similarity threshold for pass")
+    args = ap.parse_args()
+
+    # Resolve relative paths, then validate them
+    # image: relative to tools/mtmd
+    mtmd_dir = Path(__file__).parent.parent
+    args.image = str(mtmd_dir / args.image)
+    # GGUF model: relative to the repo root
+    args.llama_model = str(mtmd_dir.parent.parent / args.llama_model)
+    # mmproj and binary: relative to the repo root
+    args.mmproj = str(mtmd_dir.parent.parent / args.mmproj)
+    args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin)
+    if not Path(args.image).exists():
+        print(f"Error: Image not found: {args.image}")
+        sys.exit(1)
+    if not Path(args.llama_model).exists():
+        print(f"Error: Model not found: {args.llama_model}")
+        sys.exit(1)
+    if not Path(args.mmproj).exists():
+        print(f"Error: mmproj not found: {args.mmproj}")
+        sys.exit(1)
+
+    print("=" * 60)
+    print("DeepSeek-OCR vs Qwen2.5-VL Comparison (llama.cpp mtmd-cli)")
+    print("=" * 60)
+
+    # Qwen2.5-VL reference paths are hard-coded below; adjust to your setup
+    print("\n[1/3] Running Qwen2.5-VL reference...")
+    qwen_vl_out = run_mtmd_qwen_vl(
+        model_path=str(mtmd_dir.parent.parent / "gguf_models/qwen/Qwen2.5-VL-7B-Instruct-f16.gguf"),
+        mmproj_path=str(mtmd_dir.parent.parent / "gguf_models/qwen/mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf"),
+        image_path=args.image,
+        prompt="What do you see in this picture?",
+        bin_path=args.llama_bin
+    )
+
+    # Run llama.cpp inference
+    print("\n[2/3] Running DeepSeek-OCR via llama.cpp...")
+    llama_output = run_mtmd_deepseek_ocr(
+        args.llama_model,
+        args.mmproj,
+        args.image,
+        args.llama_bin
+    )
+
+    # Compute similarity
+    print("\n[3/3] Computing embedding similarity...")
+    similarity = compute_embedding_similarity(
+        qwen_vl_out,
+        llama_output,
+        args.embedding_model
+    )
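+    # The two outputs come from different models, so treat the score as a
+    # rough sanity check, not an exact-match test.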
+
+    # Results
+    print("\n" + "=" * 60)
+    print("RESULTS")
+    print("=" * 60)
+    print(f"\nQwen2.5-VL output:\n{'-' * 40}")
+    print(qwen_vl_out)
+    print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
+    print(llama_output)
+    print(f"\n{'=' * 60}")
+    print(f"Cosine Similarity: {similarity:.4f}")
+    print(f"Threshold: {args.threshold}")
+    print(f"Result: {'PASS' if similarity >= args.threshold else 'FAIL'}")
+    print("=" * 60)
+
+    sys.exit(0 if similarity >= args.threshold else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/mtmd/tests/tests-requirements.txt b/tools/mtmd/tests/tests-requirements.txt
new file mode 100644
index 0000000000..c8616c48b0
--- /dev/null
+++ b/tools/mtmd/tests/tests-requirements.txt
@@ -0,0 +1,14 @@
+sentence-transformers
+transformers==4.46.3
+tokenizers==0.20.3
+torch==2.9.1
+torchvision==0.24.1
+torchaudio==2.9.1
+matplotlib
+PyMuPDF
+img2pdf
+einops
+easydict
+addict
+Pillow
+numpy