diff --git a/tools/mtmd/tests/test-1-extracted.md b/tools/mtmd/tests/test-1-extracted.md new file mode 100644 index 0000000000..af730bc0b8 --- /dev/null +++ b/tools/mtmd/tests/test-1-extracted.md @@ -0,0 +1,85 @@ +<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|> +# MEN WALK ON MOON +ASTRONAUTS LAND ON PLAIN; +COLLECT ROCKS, PLANT FLAG + +<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|> +Voice From Moon: +Eagle Has Landed' + +<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|> +EAGLE (the lunar surface, Houston, Truesquily) +Base here, The Eagle has landed. + +<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|> +BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot. + +<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|> +TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here. + +<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|> +TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.) + +<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|> +TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue. + +<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|> +TRAVELLING MADE: Eagle, and service mobility. + +<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|> +How do you read me? + +<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|> +TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over. + +<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|> +COLUMBIA: Yes, I heard the whole thing. + +<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|> +BOOTHROOM: Well, it's a good show. + +<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|> +COLUMBIA: Fantastic. + +<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|> +TRAVELLING MADE: I'll read that. 
+ +<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|> +APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec- + +<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|> +tion of lunar descent. + +<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|> + + +<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|> +A Powdery Surface +Is Closely Explored + +<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|> +BY JOHN NOBLE WILFORD + +<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|> +HOUSTON, Monday, July 21—New hires landed and walked on the moon. + +<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|> +Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time. + +<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|> +Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here. + +<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|> +"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager. + +<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|> +About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater. + +<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|> +"That's one small step for man, one giant leap for mankind." + +<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|> +His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth. 
+ +<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|> +Testable Slope Test Soil \ No newline at end of file diff --git a/tools/mtmd/tests/test-1-extracted.txt b/tools/mtmd/tests/test-1-extracted.txt new file mode 100644 index 0000000000..4c0b5078e1 --- /dev/null +++ b/tools/mtmd/tests/test-1-extracted.txt @@ -0,0 +1,42 @@ +MEN WALK ON MOON +ASTRONAUTS LAND ON PLAIN; +COLLECT ROCKS, PLANT FLAG + +Voice From Moon: +'Eagle Has Landed' + +A Powder Surface +Is Closely Explored + +By JOHN NOBLE WILFORD +NOVEMBER, Monday, July 21—New York Herald and +wished on the moon. + +Two American astronauts of Apollo 11, steered their +frigate Eagle toward the moon's surface and smoothly to +the lunar landing yesterday at 4:17:40 P.M., Eastern day- +light time. + +Neil A. Armstrong, the 38-year-old civilian commander, +landed on the soft sand of the moon's surface here. + +"Beautiful, Triumph!" he said. "The Eagle has landed." + +The first man to reach the moon—Neil Armstrong and +his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon, +brought their ship to rest on a level, rock-strewn plain near +the moon's surface. The two men and two of the three +astronauts on board, Armstrong, Conrad and Edwin E. +Aldrin, 38, of Houston, stepped slowly down the ladder +and descended as he pointed his first full-flaming footpad +at the lunar crater. + +"That's one small step for man, one giant leap for +mankind." + +His first step on the moon came at 10:56:20 P.M., as +a television camera rolled the earth's thousandth line every +second to an aerial and studied audiences of hundreds of +millions of people on earth. 
#!/usr/bin/env python3
"""
Test script to compare llama.cpp mtmd-cli output with the reference output
for the DeepSeek-OCR model using embedding similarity.

The script runs llama-mtmd-cli twice on the test image (plain "Free OCR."
prompt and the grounded markdown-conversion prompt), compares each result with
the stored expected text next to this script (test-1-extracted.txt and
test-1-extracted.md) and exits with a non-zero status if either comparison
falls below the similarity threshold.
"""

import argparse
import functools
import importlib.util
import shlex
import shutil
import subprocess
import sys
from pathlib import Path


def run_mtmd_deepseek_ocr(
    model_path: str,
    mmproj_path: str,
    image_path: str,
    bin_path: str,
    prompt: str = "Free OCR."
) -> str:
    """
    Run inference using llama.cpp mtmd-cli and return its stdout.

    Args:
        model_path: Path to the GGUF language model.
        mmproj_path: Path to the GGUF multimodal projector.
        image_path: Path to the image to transcribe.
        bin_path: Path to the llama-mtmd-cli executable.
        prompt: Prompt passed with ``-p``.

    Returns:
        The stripped standard output of the process, decoded as UTF-8 with
        undecodable bytes replaced.

    Raises:
        RuntimeError: If the process exits with a non-zero return code.
        subprocess.TimeoutExpired: If the process runs longer than 300 s.
    """
    cmd = [
        bin_path,
        "-m", model_path,
        "--mmproj", mmproj_path,
        "--image", image_path,
        "-p", prompt,
        "--chat-template", "deepseek-ocr",
        "--temp", "0",
        "-n", "1024",
    ]

    # Quote every argument so the echoed command can be pasted into a shell;
    # the prompt contains spaces and shell metacharacters such as '<' and '|'.
    printable_cmd = " ".join(shlex.quote(part) for part in cmd)
    print(f"Running llama.cpp command: {printable_cmd}")

    # Capture raw bytes and decode manually so that invalid UTF-8 in the model
    # output is replaced instead of raising.
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=False,
        timeout=300
    )

    if result.returncode != 0:
        stderr = result.stderr.decode('utf-8', errors='replace')
        print(f"llama.cpp stderr: {stderr}")
        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")

    output = result.stdout.decode('utf-8', errors='replace').strip()
    print(f"llama.cpp output length: {len(output)} chars")
    return output


@functools.lru_cache(maxsize=1)
def _load_embedding_model(model_name: str):
    """Load the sentence-transformers model once and reuse it across calls."""
    # Imported lazily so that argument/path errors are reported without
    # paying for (or requiring) the heavy third-party import.
    from sentence_transformers import SentenceTransformer

    print(f"Loading embedding model: {model_name}")
    return SentenceTransformer(model_name)


def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
    """
    Compute cosine similarity between two texts using embedding model.

    Args:
        text1: First text.
        text2: Second text.
        model_name: Name or path of the sentence-transformers model.

    Returns:
        The cosine similarity of the two text embeddings as a Python float.
    """
    from sentence_transformers import util

    embed_model = _load_embedding_model(model_name)

    print("Computing embeddings...")
    embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)

    # util.cos_sim is the documented entry point; it returns a 1x1 matrix
    # for two single vectors.
    similarity = util.cos_sim(embeddings[0], embeddings[1])[0][0]
    return float(similarity)


def read_expected_output(file_path: str) -> str:
    """
    Read expected OCR output from file.

    Args:
        file_path: Path relative to the directory containing this script.

    Returns:
        The file content with surrounding whitespace stripped.
    """
    expected_path = Path(__file__).parent / file_path
    with open(expected_path, "r", encoding="utf-8") as f:
        return f.read().strip()


def _report(expected: str, actual: str, similarity: float,
            threshold: float, variant: str = "") -> bool:
    """Print one comparison (texts, score, verdict) and return True on PASS."""
    output_label = f"{variant} output" if variant else "output"
    similarity_label = (
        f"Cosine Similarity ({variant})" if variant else "Cosine Similarity"
    )
    passed = similarity >= threshold

    print(f"\nReference Model {output_label}:\n{'-' * 40}")
    print(expected)
    print(f"\nDeepSeek-OCR {output_label}:\n{'-' * 40}")
    print(actual)
    print(f"\n{'=' * 60}")
    print(f"{similarity_label}: {similarity:.4f}")
    print(f"Threshold: {threshold}")
    print(f"Result: {'PASS' if passed else 'FAIL'}")
    print("=" * 60)
    return passed


def main(argv=None) -> None:
    """
    Run both OCR prompts through llama.cpp and score them against references.

    Args:
        argv: Optional argument list; when None, ``sys.argv[1:]`` is parsed.

    Exits with status 1 if a required input is missing or if either
    similarity score is below the threshold.
    """
    ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
                    help="Path to llama.cpp GGUF model")
    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
                    help="Path to mmproj GGUF file")
    ap.add_argument("--image", default="test-1.jpeg",
                    help="Path to test image")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                    help="Path to llama-mtmd-cli binary")
    ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
                    help="Embedding model for similarity computation")
    ap.add_argument("--threshold", type=float, default=0.7,
                    help="Minimum similarity threshold for pass")
    args = ap.parse_args(argv)

    # Resolve paths: the image is relative to tools/mtmd (the parent of this
    # script's directory); everything else is relative to the project root two
    # levels above that. Absolute arguments are kept as given by pathlib.
    mtmd_dir = Path(__file__).parent.parent
    project_dir = mtmd_dir.parent.parent
    args.image = str(mtmd_dir / args.image)
    args.llama_model = str(project_dir / args.llama_model)
    args.mmproj = str(project_dir / args.mmproj)
    args.llama_bin = str(project_dir / args.llama_bin)

    for label, path in (
        ("Image", args.image),
        ("Model", args.llama_model),
        ("mmproj", args.mmproj),
    ):
        if not Path(path).exists():
            print(f"Error: {label} not found: {path}")
            sys.exit(1)

    # shutil.which also honours executable suffixes on Windows and checks that
    # the file is actually executable.
    if shutil.which(args.llama_bin) is None:
        print(f"Error: llama-mtmd-cli binary not found: {args.llama_bin}")
        sys.exit(1)

    # Fail before the (slow) inference run if the scoring dependency is absent.
    if importlib.util.find_spec("sentence_transformers") is None:
        print("Error: sentence-transformers is not installed "
              "(pip install -r tests-requirements.txt)")
        sys.exit(1)

    print("=" * 60)
    print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison")
    print("=" * 60)

    # Run llama.cpp inference
    print("\n[2/3] Running llama.cpp implementation...")
    llama_free_ocr = run_mtmd_deepseek_ocr(
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin
    )

    llama_md_ocr = run_mtmd_deepseek_ocr(
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin,
        prompt="<|grounding|>Convert the document to markdown."
    )

    expected_free_ocr = read_expected_output("test-1-extracted.txt")
    expected_md_ocr = read_expected_output("test-1-extracted.md")

    # Compute similarity
    print("\n[3/3] Computing embedding similarity...")
    free_ocr_similarity = compute_embedding_similarity(
        expected_free_ocr,
        llama_free_ocr,
        args.embedding_model
    )

    md_ocr_similarity = compute_embedding_similarity(
        expected_md_ocr,
        llama_md_ocr,
        args.embedding_model
    )

    # Results
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    free_ok = _report(
        expected_free_ocr, llama_free_ocr, free_ocr_similarity, args.threshold
    )

    # Markdown OCR results
    md_ok = _report(
        expected_md_ocr, llama_md_ocr, md_ocr_similarity, args.threshold,
        variant="Markdown"
    )

    # Propagate the verdict so CI and shell callers can detect a regression.
    if not (free_ok and md_ok):
        sys.exit(1)


if __name__ == "__main__":
    main()