check with fixed expected results

This commit is contained in:
Saba Fallah 2025-12-14 16:32:36 +01:00
parent 47f0fee6c9
commit dc2066e535
4 changed files with 182 additions and 80 deletions

View File

@ -0,0 +1,85 @@
<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|>
# MEN WALK ON MOON
ASTRONAUTS LAND ON PLAIN;
COLLECT ROCKS, PLANT FLAG
<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|>
Voice From Moon:
Eagle Has Landed'
<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|>
EAGLE (the lunar surface, Houston, Truesquily)
Base here, The Eagle has landed.
<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|>
BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot.
<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|>
TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here.
<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|>
TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.)
<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|>
TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue.
<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|>
TRAVELLING MADE: Eagle, and service mobility.
<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|>
How do you read me?
<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|>
TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over.
<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|>
COLUMBIA: Yes, I heard the whole thing.
<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|>
BOOTHROOM: Well, it's a good show.
<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|>
COLUMBIA: Fantastic.
<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|>
TRAVELLING MADE: I'll read that.
<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|>
APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec-
<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|>
tion of lunar descent.
<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|>
<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|>
A Powdery Surface
Is Closely Explored
<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|>
BY JOHN NOBLE WILFORD
<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|>
HOUSTON, Monday, July 21—New hires landed and walked on the moon.
<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|>
Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time.
<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|>
Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here.
<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|>
"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager.
<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|>
About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater.
<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|>
"That's one small step for man, one giant leap for mankind."
<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|>
His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth.
<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|>
Testable Slope Test Soil

View File

@ -0,0 +1,42 @@
MEN WALK ON MOON
ASTRONAUTS LAND ON PLAIN;
COLLECT ROCKS, PLANT FLAG
Voice From Moon:
'Eagle Has Landed'
A Powder Surface
Is Closely Explored
By JOHN NOBLE WILFORD
NOVEMBER, Monday, July 21—New York Herald and
wished on the moon.
Two American astronauts of Apollo 11, steered their
frigate Eagle toward the moon's surface and smoothly to
the lunar landing yesterday at 4:17:40 P.M., Eastern day-
light time.
Neil A. Armstrong, the 38-year-old civilian commander,
landed on the soft sand of the moon's surface here.
"Beautiful, Triumph!" he said. "The Eagle has landed."
The first man to reach the moon—Neil Armstrong and
his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon,
brought their ship to rest on a level, rock-strewn plain near
the moon's surface. The two men and two of the three
astronauts on board, Armstrong, Conrad and Edwin E.
Aldrin, 38, of Houston, stepped slowly down the ladder
and descended as he pointed his first full-flaming footpad
at the lunar crater.
"That's one small step for man, one giant leap for
mankind."
His first step on the moon came at 10:56:20 P.M., as
a television camera rolled the earth's thousandth line every
second to an aerial and studied audiences of hundreds of
millions of people on earth.
Textile Slope Test Soil

View File

@ -17,7 +17,8 @@ def run_mtmd_deepseek_ocr(
model_path: str,
mmproj_path: str,
image_path: str,
bin_path: str
bin_path: str,
prompt: str = "Free OCR."
) -> str:
"""
Run inference using llama.cpp mtmd-cli.
@ -28,7 +29,7 @@ def run_mtmd_deepseek_ocr(
"--mmproj", mmproj_path,
"--image", image_path,
# "-p", "<|grounding|>Convert the document to markdown.",
"-p", "Free OCR.",
"-p", prompt,
"--chat-template", "deepseek-ocr",
"--temp", "0",
"-n", "1024",
@ -54,43 +55,6 @@ def run_mtmd_deepseek_ocr(
return output
def run_mtmd_qwen_vl(
    model_path: str,
    mmproj_path: str,
    image_path: str,
    prompt: str,
    bin_path: str
) -> str:
    """
    Run inference using llama.cpp mtmd-cli with Qwen2.5-VL model.

    Invokes the llama-mtmd-cli binary at *bin_path* with the given GGUF
    model, multimodal projector, image, and prompt, at temperature 0.

    Returns the stripped stdout of the process.
    Raises RuntimeError if the binary exits with a non-zero status.
    """
    args = [
        bin_path,
        "-m", model_path,
        "--mmproj", mmproj_path,
        "--image", image_path,
        "-p", prompt,
        "--temp", "0",
    ]
    print(f"Running llama.cpp command: {' '.join(args)}")

    # 300 s timeout guards against a hung inference process.
    proc = subprocess.run(
        args,
        capture_output=True,
        text=True,
        timeout=300,
    )
    if proc.returncode != 0:
        print(f"llama.cpp stderr: {proc.stderr}")
        raise RuntimeError(f"llama-mtmd-cli failed with code {proc.returncode}")

    text = proc.stdout.strip()
    print(f"llama.cpp output length: {len(text)} chars")
    return text
def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
"""
Compute cosine similarity between two texts using embedding model.
@ -98,13 +62,7 @@ def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> flo
print(f"Loading embedding model: {model_name}")
# Use sentence-transformers for easier embedding extraction
# For Gemma embedding, we use the sentence-transformers wrapper
try:
embed_model = SentenceTransformer(model_name, trust_remote_code=True)
except Exception:
# Fallback to a commonly available model if Gemma embedding not available
print(f"Could not load {model_name}, falling back to all-MiniLM-L6-v2")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embed_model = SentenceTransformer(model_name)
print("Computing embeddings...")
embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
@ -113,10 +71,18 @@ def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> flo
return float(similarity)
def read_expected_output(file_path: str) -> str:
    """
    Read expected OCR output from file.

    *file_path* is resolved relative to this script's directory, so the
    fixture files travel with the test. Returns the file contents with
    surrounding whitespace stripped.
    """
    target = Path(__file__).parent / file_path
    return target.read_text(encoding="utf-8").strip()
def main():
ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
ap.add_argument("--hf-model", default="Dogacel/DeepSeek-OCR-Metal-MPS",
help="HuggingFace model ID")
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
help="Path to llama.cpp GGUF model")
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
@ -125,7 +91,7 @@ def main():
help="Path to test image")
ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
help="Path to llama-mtmd-cli binary")
ap.add_argument("--embedding-model", default="google/embeddinggemma-300m",
ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
help="Embedding model for similarity computation")
ap.add_argument("--threshold", type=float, default=0.7,
help="Minimum similarity threshold for pass")
@ -156,28 +122,37 @@ def main():
# Default paths based on your command
qwen_vl_out = run_mtmd_qwen_vl(
model_path=str(mtmd_dir.parent.parent / "gguf_models/qwen/Qwen2.5-VL-7B-Instruct-f16.gguf"),
mmproj_path=str(mtmd_dir.parent.parent / "gguf_models/qwen/mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf"),
image_path=args.image,
prompt="tell me what do you see in this picture?",
bin_path=args.llama_bin
)
# Run llama.cpp inference
print("\n[2/3] Running llama.cpp implementation...")
llama_output = run_mtmd_deepseek_ocr(
llama_free_ocr = run_mtmd_deepseek_ocr(
args.llama_model,
args.mmproj,
args.image,
args.llama_bin
)
llama_md_ocr = run_mtmd_deepseek_ocr(
args.llama_model,
args.mmproj,
args.image,
args.llama_bin,
prompt="<|grounding|>Convert the document to markdown."
)
expected_free_ocr = read_expected_output("test-1-extracted.txt")
expected_md_ocr = read_expected_output("test-1-extracted.md")
# Compute similarity
print("\n[3/3] Computing embedding similarity...")
similarity = compute_embedding_similarity(
qwen_vl_out,
llama_output,
free_ocr_similarity = compute_embedding_similarity(
expected_free_ocr,
llama_free_ocr,
args.embedding_model
)
md_ocr_similarity = compute_embedding_similarity(
expected_md_ocr,
llama_md_ocr,
args.embedding_model
)
@ -185,17 +160,28 @@ def main():
print("\n" + "=" * 60)
print("RESULTS")
print("=" * 60)
print(f"\nQwen2.5-VL output:\n{'-' * 40}")
print(qwen_vl_out)
print(f"\nReference Model output:\n{'-' * 40}")
print(expected_free_ocr)
print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
print(llama_output)
print(llama_free_ocr)
print(f"\n{'=' * 60}")
print(f"Cosine Similarity: {similarity:.4f}")
print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
print(f"Threshold: {args.threshold}")
print(f"Result: {'PASS' if similarity >= args.threshold else 'FAIL'}")
print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
print("=" * 60)
sys.exit(0 if similarity >= args.threshold else 1)
# Markdown OCR results
print(f"\nReference Model Markdown output:\n{'-' * 40}")
print(expected_md_ocr)
print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
print(llama_md_ocr)
print(f"\n{'=' * 60}")
print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
print(f"Threshold: {args.threshold}")
print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
print("=" * 60)
if __name__ == "__main__":

View File

@ -1,14 +1,3 @@
sentence-transformers
transformers>=4.46.3
tokenizers==0.20.3
torch==2.9.1
torchvision==0.24.1
torchaudio==2.9.1
matplotlib
PyMuPDF
img2pdf
einops
easydict
addict
Pillow
numpy
transformers
tokenizers