# llama.cpp/nllb_testing/run_all_tests.py

"""
Run All NLLB Verification Tests
Executes the complete test suite to verify functional equivalence with HuggingFace
"""

import subprocess
import sys
from pathlib import Path

def run_test(test_file, test_name):
    """Run one test script in a child interpreter and report whether it passed.

    The script is executed with the same Python interpreter that is running
    this file, using this file's directory as the working directory. The
    child's output is not captured; it goes straight to the inherited
    stdout/stderr.

    Args:
        test_file: Path to the test script (``str`` or ``Path``).
        test_name: Human-readable name used in the progress output.

    Returns:
        ``True`` if the script exited with status 0; ``False`` if it exited
        with a non-zero status or the child process could not be started.
    """
    print()
    print("=" * 80)
    print(f"Running: {test_name}")
    print("=" * 80)
    # The child writes directly to the inherited stdout. When our stdout is
    # block-buffered (pipe, file, CI log) the header above would otherwise
    # appear *after* the child's output, so push it out first.
    sys.stdout.flush()

    try:
        result = subprocess.run(
            [sys.executable, str(test_file)],
            cwd=Path(__file__).parent,
            check=False,  # the exit status is inspected below
        )
    except Exception as e:
        # Runner boundary: a script that cannot be launched counts as a
        # failed test instead of aborting the remaining tests.
        print(f"❌ {test_name} ERROR: {e}")
        return False

    print()
    if result.returncode == 0:
        print(f"✅ {test_name} PASSED")
        return True

    print(f"❌ {test_name} FAILED (exit code: {result.returncode})")
    return False

def main():
    """Run every NLLB verification script in order and print a summary.

    Returns:
        0 when all scripts passed; 1 when the tokenizer reference file is
        missing or at least one script did not pass.
    """
    def boxed(rows):
        # Print each row centred between '║' edges, framed by '=' borders.
        inner = 78
        print("╔" + "=" * inner + "╗")
        for row in rows:
            print("║" + row.center(inner) + "║")
        print("╚" + "=" * inner + "╝")

    here = Path(__file__).parent

    print()
    boxed([
        "",
        "        NLLB Functional Equivalence Test Suite",
        "           Verifying llama.cpp vs HuggingFace",
        "",
    ])
    print()

    # Nothing can be compared without the previously generated reference data.
    reference = here / "results" / "tokenizer_reference.json"
    if not reference.exists():
        print("❌ ERROR: Reference data not found!")
        print()
        print("Please run first:")
        print("  python generate_reference.py")
        print()
        return 1

    # (script file name, display label), executed in this order.
    suite = (
        ("test_1_tokenizer.py", "Test 1: Tokenizer Verification"),
        ("test_2_encoder.py", "Test 2: Encoder Verification"),
        ("test_3_decoder.py", "Test 3: Decoder Verification"),
        ("test_4_connection.py", "Test 4: Encoder-Decoder Connection"),
        ("test_5_translation.py", "Test 5: End-to-End Translation"),
    )
    outcomes = [(label, run_test(here / script, label)) for script, label in suite]

    heavy_rule = "=" * 80
    print()
    print(heavy_rule)
    print("TEST SUITE SUMMARY")
    print(heavy_rule)
    print()

    for label, ok in outcomes:
        marker = "✅ PASSED" if ok else "❌ FAILED"
        print(f"  {marker}  {label}")

    n_total = len(outcomes)
    n_passed = len([label for label, ok in outcomes if ok])

    light_rule = "-" * 80
    print()
    print(light_rule)
    print(f"  Results: {n_passed}/{n_total} tests passed")
    print(light_rule)
    print()

    if n_passed != n_total:
        print("❌ SOME TESTS FAILED")
        print()
        print("Please review the failed tests above.")
        print()
        return 1

    boxed([
        "",
        "🎉 ALL TESTS PASSED - FUNCTIONAL EQUIVALENCE VERIFIED! 🎉",
        "",
        "llama.cpp NLLB implementation is functionally equivalent",
        "to HuggingFace reference implementation.",
        "",
    ])
    print()
    return 0

if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())