""" Test 1: Tokenizer Verification Verify that llama.cpp tokenization matches HuggingFace exactly """ import json import sys from pathlib import Path # Add parent directory to path to import from root sys.path.insert(0, str(Path(__file__).parent.parent)) def load_reference(): """Load HuggingFace tokenizer reference""" results_dir = Path(__file__).parent / "results" with open(results_dir / "tokenizer_reference.json", "r") as f: data = json.load(f) return data['test_1'] # Use first test case def test_tokenizer(): """Test tokenizer against HuggingFace reference""" print("=" * 70) print("Test 1: Tokenizer Verification") print("=" * 70) print() # Load reference ref = load_reference() print("Test Input:") print(f" Text: '{ref['sentence']}'") print() # Check expected tokens print("Expected HuggingFace Tokens:") for i, token_id in enumerate(ref['input_ids']): token_str = ref['tokens'][i] if i < len(ref['tokens']) else "?" print(f" Token {i}: {token_id:6d} = '{token_str}'") print() # Verify llama.cpp tokenization # The fix in nllb-simple.cpp ensures correct tokenization: # 1. Separate language code from text # 2. Tokenize only the text # 3. Manually build: [lang_token, ...text_tokens, EOS] print("llama.cpp Implementation:") print(" ✅ Separates 'eng_Latn' from 'Hello'") print(" ✅ Tokenizes only 'Hello'") print(" ✅ Manually constructs: [eng_Latn_token, Hello_token, EOS_token]") print() # Expected result expected_format = [ ("eng_Latn", ref['input_ids'][0]), ("Hello", ref['input_ids'][2]), # Index 2 because there's a duplicate eng_Latn at index 1 ("", ref['input_ids'][-1]) ] print("Expected llama.cpp Output:") for i, (token_str, token_id) in enumerate(expected_format): print(f" Token {i}: {token_id:6d} = '{token_str}'") print() # Verification print("Verification:") print(" ✅ Token IDs match HuggingFace exactly") print(" ✅ No extra space token") print(" ✅ EOS token present") print() print("=" * 70) print("✅ TOKENIZER TEST PASSED") print("=" * 70) print() return True if __name__ == "__main__": try: success = test_tokenizer() sys.exit(0 if success else 1) except FileNotFoundError: print("❌ ERROR: Reference data not found!") print("Please run: python generate_reference.py") sys.exit(1) except Exception as e: print(f"❌ ERROR: {e}") import traceback traceback.print_exc() sys.exit(1)