check with fixed expected results

2025-12-14 16:32:36 +01:00 · 2025-12-14 16:32:36 +01:00 · dc2066e535
parent 47f0fee6c9
commit dc2066e535
4 changed files with 182 additions and 80 deletions
--- a/tools/mtmd/tests/test-1-extracted.md
+++ b/tools/mtmd/tests/test-1-extracted.md
@ -0,0 +1,85 @@
+<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|>
+# MEN WALK ON MOON
+ASTRONAUTS LAND ON PLAIN;
+COLLECT ROCKS, PLANT FLAG
+
+<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|>
+Voice From Moon:
+Eagle Has Landed'
+
+<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|>
+EAGLE (the lunar surface, Houston, Truesquily)
+Base here, The Eagle has landed.
+
+<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|>
+BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot.
+
+<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|>
+TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here.
+
+<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|>
+TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.)
+
+<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|>
+TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue.
+
+<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|>
+TRAVELLING MADE: Eagle, and service mobility.
+
+<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|>
+How do you read me?
+
+<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|>
+TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over.
+
+<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|>
+COLUMBIA: Yes, I heard the whole thing.
+
+<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|>
+BOOTHROOM: Well, it's a good show.
+
+<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|>
+COLUMBIA: Fantastic.
+
+<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|>
+TRAVELLING MADE: I'll read that.
+
+<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|>
+APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec-
+
+<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|>
+tion of lunar descent.
+
+<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|>
+
+
+<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|>
+A Powdery Surface
+Is Closely Explored
+
+<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|>
+BY JOHN NOBLE WILFORD
+
+<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|>
+HOUSTON, Monday, July 21—New hires landed and walked on the moon.
+
+<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|>
+Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time.
+
+<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|>
+Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here.
+
+<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|>
+"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager.
+
+<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|>
+About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater.
+
+<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|>
+"That's one small step for man, one giant leap for mankind."
+
+<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|>
+His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth.
+
+<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|>
+Testable Slope Test Soil
--- a/tools/mtmd/tests/test-1-extracted.txt
+++ b/tools/mtmd/tests/test-1-extracted.txt
@ -0,0 +1,42 @@
+MEN WALK ON MOON
+ASTRONAUTS LAND ON PLAIN;
+COLLECT ROCKS, PLANT FLAG
+
+Voice From Moon:
+'Eagle Has Landed'
+
+A Powder Surface
+Is Closely Explored
+
+By JOHN NOBLE WILFORD
+NOVEMBER, Monday, July 21—New York Herald and
+wished on the moon.
+
+Two American astronauts of Apollo 11, steered their
+frigate Eagle toward the moon's surface and smoothly to
+the lunar landing yesterday at 4:17:40 P.M., Eastern day-
+light time.
+
+Neil A. Armstrong, the 38-year-old civilian commander,
+landed on the soft sand of the moon's surface here.
+
+"Beautiful, Triumph!" he said. "The Eagle has landed."
+
+The first man to reach the moon—Neil Armstrong and
+his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon,
+brought their ship to rest on a level, rock-strewn plain near
+the moon's surface. The two men and two of the three
+astronauts on board, Armstrong, Conrad and Edwin E.
+Aldrin, 38, of Houston, stepped slowly down the ladder
+and descended as he pointed his first full-flaming footpad
+at the lunar crater.
+
+"That's one small step for man, one giant leap for
+mankind."
+
+His first step on the moon came at 10:56:20 P.M., as
+a television camera rolled the earth's thousandth line every
+second to an aerial and studied audiences of hundreds of
+millions of people on earth.
+
+Textile Slope Test Soil
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@ -17,7 +17,8 @@ def run_mtmd_deepseek_ocr(
        model_path: str,
        mmproj_path: str,
        image_path: str,
-        bin_path: str
+        bin_path: str,
+        prompt: str = "Free OCR."
 ) -> str:
    """
    Run inference using llama.cpp mtmd-cli.
@ -28,7 +29,7 @@ def run_mtmd_deepseek_ocr(
        "--mmproj", mmproj_path,
        "--image", image_path,
        # "-p", "<|grounding|>Convert the document to markdown.",
-        "-p", "Free OCR.",
+        "-p", prompt,
        "--chat-template", "deepseek-ocr",
        "--temp", "0",
        "-n", "1024",
@ -54,43 +55,6 @@ def run_mtmd_deepseek_ocr(
    return output


-def run_mtmd_qwen_vl(
-        model_path: str,
-        mmproj_path: str,
-        image_path: str,
-        prompt: str,
-        bin_path: str
-) -> str:
-    """
-    Run inference using llama.cpp mtmd-cli with Qwen2.5-VL model.
-    """
-    cmd = [
-        bin_path,
-        "-m", model_path,
-        "--mmproj", mmproj_path,
-        "--image", image_path,
-        "-p", prompt,
-        "--temp", "0"
-    ]
-
-    print(f"Running llama.cpp command: {' '.join(cmd)}")
-
-    result = subprocess.run(
-        cmd,
-        capture_output=True,
-        text=True,
-        timeout=300
-    )
-
-    if result.returncode != 0:
-        print(f"llama.cpp stderr: {result.stderr}")
-        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
-
-    output = result.stdout.strip()
-    print(f"llama.cpp output length: {len(output)} chars")
-    return output
-
-
 def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
    """
    Compute cosine similarity between two texts using embedding model.
@ -98,13 +62,7 @@ def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> flo
    print(f"Loading embedding model: {model_name}")

    # Use sentence-transformers for easier embedding extraction
-    # For Gemma embedding, we use the sentence-transformers wrapper
-    try:
-        embed_model = SentenceTransformer(model_name, trust_remote_code=True)
-    except Exception:
-        # Fallback to a commonly available model if Gemma embedding not available
-        print(f"Could not load {model_name}, falling back to all-MiniLM-L6-v2")
-        embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+    embed_model = SentenceTransformer(model_name)

    print("Computing embeddings...")
    embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
@ -113,10 +71,18 @@ def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> flo
    return float(similarity)


+def read_expected_output(file_path: str) -> str:
+    """
+    Read the expected OCR output from file_path (joined onto this script's directory, decoded as UTF-8) and return it with leading/trailing whitespace stripped.
+    """
+    cur_path = Path(__file__).parent
+    expected_path = str(cur_path / file_path)
+    with open(expected_path, "r", encoding="utf-8") as f:
+        return f.read().strip()
+
+
 def main():
    ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
-    ap.add_argument("--hf-model", default="Dogacel/DeepSeek-OCR-Metal-MPS",
-                    help="HuggingFace model ID")
    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
                    help="Path to llama.cpp GGUF model")
    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
@ -125,7 +91,7 @@ def main():
                    help="Path to test image")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                    help="Path to llama-mtmd-cli binary")
-    ap.add_argument("--embedding-model", default="google/embeddinggemma-300m",
+    ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
                    help="Embedding model for similarity computation")
    ap.add_argument("--threshold", type=float, default=0.7,
                    help="Minimum similarity threshold for pass")
@ -156,28 +122,37 @@ def main():

    # Default paths based on your command

-    qwen_vl_out = run_mtmd_qwen_vl(
-        model_path=str(mtmd_dir.parent.parent / "gguf_models/qwen/Qwen2.5-VL-7B-Instruct-f16.gguf"),
-        mmproj_path=str(mtmd_dir.parent.parent / "gguf_models/qwen/mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf"),
-        image_path=args.image,
-        prompt="tell me what do you see in this picture?",
-        bin_path=args.llama_bin
-    )
-
    # Run llama.cpp inference
    print("\n[2/3] Running llama.cpp implementation...")
-    llama_output = run_mtmd_deepseek_ocr(
+    llama_free_ocr = run_mtmd_deepseek_ocr(
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin
    )

+    llama_md_ocr = run_mtmd_deepseek_ocr(
+        args.llama_model,
+        args.mmproj,
+        args.image,
+        args.llama_bin,
+        prompt="<|grounding|>Convert the document to markdown."
+    )
+
+    expected_free_ocr = read_expected_output("test-1-extracted.txt")
+    expected_md_ocr = read_expected_output("test-1-extracted.md")
+
    # Compute similarity
    print("\n[3/3] Computing embedding similarity...")
-    similarity = compute_embedding_similarity(
-        qwen_vl_out,
-        llama_output,
+    free_ocr_similarity = compute_embedding_similarity(
+        expected_free_ocr,
+        llama_free_ocr,
+        args.embedding_model
+    )
+    
+    md_ocr_similarity = compute_embedding_similarity(
+        expected_md_ocr,
+        llama_md_ocr,
        args.embedding_model
    )

@ -185,17 +160,28 @@ def main():
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
-    print(f"\nQwen2.5-VL output:\n{'-' * 40}")
-    print(qwen_vl_out)
+    print(f"\nReference Model output:\n{'-' * 40}")
+    print(expected_free_ocr)
    print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
-    print(llama_output)
+    print(llama_free_ocr)
    print(f"\n{'=' * 60}")
-    print(f"Cosine Similarity: {similarity:.4f}")
+    print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
    print(f"Threshold: {args.threshold}")
-    print(f"Result: {'PASS' if similarity >= args.threshold else 'FAIL'}")
+    print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
    print("=" * 60)

-    sys.exit(0 if similarity >= args.threshold else 1)
+    # Markdown OCR results
+    print(f"\nReference Model Markdown output:\n{'-' * 40}")
+    print(expected_md_ocr)
+    print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
+    print(llama_md_ocr)
+    print(f"\n{'=' * 60}")
+    print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
+    print(f"Threshold: {args.threshold}")
+    print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
+    print("=" * 60)
+    
+    


 if __name__ == "__main__":
--- a/tools/mtmd/tests/tests-requirements.txt
+++ b/tools/mtmd/tests/tests-requirements.txt
@ -1,14 +1,3 @@
 sentence-transformers
-transformers>=4.46.3
-tokenizers==0.20.3
-torch==2.9.1
-torchvision==0.24.1
-torchaudio==2.9.1
-matplotlib
-PyMuPDF
-img2pdf
-einops
-easydict
-addict 
-Pillow
-numpy
+transformers
+tokenizers