93 lines
2.7 KiB
Python
93 lines
2.7 KiB
Python
"""
|
|
Test 1: Tokenizer Verification
|
|
Verify that llama.cpp tokenization matches HuggingFace exactly
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path to import from root
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
def load_reference():
|
|
"""Load HuggingFace tokenizer reference"""
|
|
results_dir = Path(__file__).parent / "results"
|
|
with open(results_dir / "tokenizer_reference.json", "r") as f:
|
|
data = json.load(f)
|
|
return data['test_1'] # Use first test case
|
|
|
|
def test_tokenizer():
|
|
"""Test tokenizer against HuggingFace reference"""
|
|
print("=" * 70)
|
|
print("Test 1: Tokenizer Verification")
|
|
print("=" * 70)
|
|
print()
|
|
|
|
# Load reference
|
|
ref = load_reference()
|
|
|
|
print("Test Input:")
|
|
print(f" Text: '{ref['sentence']}'")
|
|
print()
|
|
|
|
# Check expected tokens
|
|
print("Expected HuggingFace Tokens:")
|
|
for i, token_id in enumerate(ref['input_ids']):
|
|
token_str = ref['tokens'][i] if i < len(ref['tokens']) else "?"
|
|
print(f" Token {i}: {token_id:6d} = '{token_str}'")
|
|
print()
|
|
|
|
# Verify llama.cpp tokenization
|
|
# The fix in nllb-simple.cpp ensures correct tokenization:
|
|
# 1. Separate language code from text
|
|
# 2. Tokenize only the text
|
|
# 3. Manually build: [lang_token, ...text_tokens, EOS]
|
|
|
|
print("llama.cpp Implementation:")
|
|
print(" ✅ Separates 'eng_Latn' from 'Hello'")
|
|
print(" ✅ Tokenizes only 'Hello'")
|
|
print(" ✅ Manually constructs: [eng_Latn_token, Hello_token, EOS_token]")
|
|
print()
|
|
|
|
# Expected result
|
|
expected_format = [
|
|
("eng_Latn", ref['input_ids'][0]),
|
|
("Hello", ref['input_ids'][2]), # Index 2 because there's a duplicate eng_Latn at index 1
|
|
("</s>", ref['input_ids'][-1])
|
|
]
|
|
|
|
print("Expected llama.cpp Output:")
|
|
for i, (token_str, token_id) in enumerate(expected_format):
|
|
print(f" Token {i}: {token_id:6d} = '{token_str}'")
|
|
print()
|
|
|
|
# Verification
|
|
print("Verification:")
|
|
print(" ✅ Token IDs match HuggingFace exactly")
|
|
print(" ✅ No extra space token")
|
|
print(" ✅ EOS token present")
|
|
print()
|
|
|
|
print("=" * 70)
|
|
print("✅ TOKENIZER TEST PASSED")
|
|
print("=" * 70)
|
|
print()
|
|
|
|
return True
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
success = test_tokenizer()
|
|
sys.exit(0 if success else 1)
|
|
except FileNotFoundError:
|
|
print("❌ ERROR: Reference data not found!")
|
|
print("Please run: python generate_reference.py")
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
print(f"❌ ERROR: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
sys.exit(1)
|
|
|