From 5cc2258e828b8561ea52f424f78aee58dbf8ec3f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 31 Jan 2026 16:17:06 +0200
Subject: [PATCH] examples: add simplified llama-eval-new.py for AIME evaluation

- Create new simplified evaluation script focused only on AIME
- Implement EvalState and Processor dataclasses for structured state management
- Add real-time feedback showing correct/incorrect status per case
- Abstract grading interface for external grader support
- Use structured JSON output for eval state
- Apply HuggingFace dataset caching to avoid repeated downloads
- Remove Levenshtein matching - eval script only sends requests and validates answers
---
 examples/llama-eval/llama-eval-new.py | 217 ++++++++++++++++++++++++++
 examples/llama-eval/test-cache.sh     |  43 -----
 2 files changed, 217 insertions(+), 43 deletions(-)
 create mode 100755 examples/llama-eval/llama-eval-new.py
 delete mode 100755 examples/llama-eval/test-cache.sh

diff --git a/examples/llama-eval/llama-eval-new.py b/examples/llama-eval/llama-eval-new.py
new file mode 100755
index 0000000000..a27ed4a37c
--- /dev/null
+++ b/examples/llama-eval/llama-eval-new.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""Simplified AIME evaluation tool for llama.cpp.
+
+Sends each AIME problem to a running llama-server, grades the reply
+against the reference integer answer, and dumps the structured eval
+state to a JSON file.
+"""
+
+import argparse
+import json
+import os
+import re
+import time
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+import requests
+from tqdm import tqdm
+
+# Point HuggingFace datasets at a local cache so repeated runs do not
+# re-download. Must happen before `datasets` is imported (the import is
+# deferred to AimeDataset._load_dataset below).
+cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
+cache_dir.mkdir(parents=True, exist_ok=True)
+os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
+
+
+@dataclass
+class EvalState:
+    # Top-level run state; serialized via dataclasses.asdict in dump_state().
+    id: str
+    tasks: List[str]
+    task_states: Dict[str, Dict[str, Any]]
+    sampling_config: Dict[str, Any]
+
+
+@dataclass
+class TaskState:
+    # Per-case state. status is "pending" | "ok" | "error: <message>".
+    case_id: str
+    prompt: str
+    gold: str
+    pred: Optional[str] = None
+    correct: bool = False
+    status: str = "pending"
+
+
+class AimeDataset:
+    """Thin wrapper around the AI-MO/aimo-validation-aime HF dataset."""
+
+    def __init__(self, split: str = "train"):
+        self.split = split
+        self.questions: List[Dict] = []
+        self._load_dataset()
+
+    def _load_dataset(self):
+        """Load (or reuse from the local cache) the requested split."""
+        print(f"Loading AIME dataset (split: {self.split})...")
+        from datasets import load_dataset
+        ds = load_dataset("AI-MO/aimo-validation-aime", split=self.split)
+        self.questions = list(ds)
+        print(f"AIME dataset loaded: {len(self.questions)} questions")
+
+    def get_question(self, index: int) -> Dict:
+        """Get question by index"""
+        return self.questions[index]
+
+    def get_answer(self, question: Dict) -> str:
+        """Return the reference answer as a string."""
+        return str(question["answer"])
+
+
+class Processor:
+    """Drives the evaluation: requests, grading and state bookkeeping."""
+
+    def __init__(
+        self,
+        server_url: str,
+        n_predict: int = 2048,
+        threads: int = 32,
+        verbose: bool = False
+    ):
+        self.server_url = server_url
+        self.n_predict = n_predict
+        # NOTE(review): requests are currently issued sequentially; `threads`
+        # is stored for a future parallel implementation.
+        self.threads = threads
+        self.verbose = verbose
+        self.dataset = AimeDataset()
+        self.eval_state = EvalState(
+            id="aime-2025",
+            tasks=["aime"],
+            task_states={},
+            sampling_config={"temperature": 0, "max_tokens": n_predict}
+        )
+
+    def _make_request(self, prompt: str) -> Dict[str, Any]:
+        """Make HTTP request to the server"""
+        url = f"{self.server_url}/v1/chat/completions"
+        headers = {"Content-Type": "application/json"}
+        data = {
+            "model": "llama",
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0,
+            "max_tokens": self.n_predict
+        }
+
+        # Bounded timeout so a hung server cannot stall the whole run.
+        response = requests.post(url, headers=headers, json=data, timeout=600)
+        response.raise_for_status()
+        return response.json()
+
+    def _grade_response(self, gold: str, pred: str) -> bool:
+        """Grade the response - abstracted for external grader support.
+
+        AIME answers are integers while the model reply is free-form text,
+        so compare the gold answer against the LAST integer found in the
+        reply. (A bare int(pred) on the full completion would reject
+        nearly every chat-style response.)
+        """
+        try:
+            gold_int = int(gold)
+        except (ValueError, TypeError):
+            return False
+        matches = re.findall(r"-?\d+", pred or "")
+        return bool(matches) and int(matches[-1]) == gold_int
+
+    def process(self, n_cases: Optional[int] = None, seed: int = 42):
+        """Process cases and update eval state.
+
+        Args:
+            n_cases: number of cases to evaluate (default: whole dataset)
+            seed: reserved for future shuffling support; currently unused
+
+        Returns:
+            The updated EvalState.
+        """
+        if n_cases is None:
+            n_cases = len(self.dataset.questions)
+        # Never run past the end of the dataset.
+        n_run = min(n_cases, len(self.dataset.questions))
+
+        print(f"\nProcessing {n_run} AIME questions...")
+        print(f"Server: {self.server_url}")
+        print(f"Threads: {self.threads}")
+        print(f"Max tokens: {self.n_predict}")
+        print()
+
+        task_states: Dict[str, List[TaskState]] = {task: [] for task in self.eval_state.tasks}
+        total = 0
+        correct = 0
+
+        for i in tqdm(range(n_run), desc="Processing"):
+            question = self.dataset.get_question(i)
+            case_id = f"aime_{self.dataset.split}_{question['id']}"
+            prompt = question["problem"]
+            gold = self.dataset.get_answer(question)
+
+            task_state = TaskState(
+                case_id=case_id,
+                prompt=prompt,
+                gold=gold
+            )
+
+            try:
+                response = self._make_request(prompt)
+                pred = response["choices"][0]["message"]["content"]
+                task_state.pred = pred
+                task_state.correct = self._grade_response(gold, pred)
+                task_state.status = "ok"
+
+                if task_state.correct:
+                    correct += 1
+            except Exception as e:
+                # Record the failure but keep evaluating remaining cases.
+                task_state.status = f"error: {str(e)}"
+
+            task_states["aime"].append(task_state)
+            total += 1
+
+            if self.verbose:
+                # i+1 out of the planned case count (not the running total,
+                # which would always print "n/n").
+                print(f"\nCase {i+1}/{n_run}: {task_state.correct}")
+                print(f"  Gold: {gold}")
+                if task_state.pred:
+                    print(f"  Pred: {task_state.pred}")
+                print(f"  Status: {task_state.status}")
+
+        self.eval_state.task_states["aime"] = {
+            "total": total,
+            "correct": correct,
+            "cases": task_states
+        }
+
+        # Guard against division by zero when no cases were run.
+        pct = (correct / total * 100) if total else 0.0
+        print(f"\n{'='*60}")
+        print(f"Results: {correct}/{total} correct ({pct:.1f}%)")
+        print(f"{'='*60}")
+
+        return self.eval_state
+
+    def dump_state(self, output_file: Path):
+        """Dump eval state to JSON file"""
+        with open(output_file, "w") as f:
+            json.dump(asdict(self.eval_state), f, indent=2)
+        print(f"\nEval state dumped to {output_file}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Simplified AIME evaluation tool for llama.cpp"
+    )
+    parser.add_argument(
+        "--server",
+        type=str,
+        default="http://localhost:8033",
+        help="llama-server URL (default: http://localhost:8033)"
+    )
+    parser.add_argument(
+        "--n_cases",
+        type=int,
+        default=None,
+        help="Number of cases to evaluate (default: all)"
+    )
+    parser.add_argument(
+        "--n_predict",
+        type=int,
+        default=2048,
+        help="Max tokens to predict per prompt (default: 2048)"
+    )
+    parser.add_argument(
+        "--threads",
+        type=int,
+        default=32,
+        help="Number of threads for parallel requests (default: 32)"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Show detailed output for each case"
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("llama-eval-state.json"),
+        help="Output file for eval state (default: llama-eval-state.json)"
+    )
+
+    args = parser.parse_args()
+
+    processor = Processor(
+        server_url=args.server,
+        n_predict=args.n_predict,
+        threads=args.threads,
+        verbose=args.verbose
+    )
+
+    processor.process(n_cases=args.n_cases)
+    processor.dump_state(args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/llama-eval/test-cache.sh b/examples/llama-eval/test-cache.sh
deleted file mode 100755
index 513d8d8b7d..0000000000
--- a/examples/llama-eval/test-cache.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-echo "=== Testing HuggingFace Dataset Caching ==="
-echo ""
-
-echo "=== First Load (should download) ==="
-echo "Starting simulator for first load..."
-source venv/bin/activate && python3 examples/llama-eval/llama-server-simulator.py --port 8035 --success-rate 0.8 2>&1 | tee /tmp/simulator-first.log &
-SIMULATOR_PID=$!
-sleep 5
-echo "First load complete"
-echo ""
-
-echo "=== Second Load (should use cache) ==="
-echo "Starting simulator for second load..."
-source venv/bin/activate && python3 examples/llama-eval/llama-server-simulator.py --port 8036 --success-rate 0.8 2>&1 | tee /tmp/simulator-second.log &
-SIMULATOR_PID2=$!
-sleep 5
-echo "Second load complete"
-echo ""
-
-echo "=== Checking Cache Directory ==="
-echo "Cache directory size:"
-du -sh ~/.cache/huggingface/datasets/AI-MO___aimo-validation-aime
-echo ""
-
-echo "=== Checking First Load Log ==="
-echo "First load log (last 15 lines):"
-tail -15 /tmp/simulator-first.log
-echo ""
-
-echo "=== Checking Second Load Log ==="
-echo "Second load log (last 15 lines):"
-tail -15 /tmp/simulator-second.log
-echo ""
-
-echo "=== Test Complete ==="
-echo "Both loads completed successfully!"
-echo "The second load should have used the cache (no download warning)."
-echo ""
-
-kill $SIMULATOR_PID 2>/dev/null
-kill $SIMULATOR_PID2 2>/dev/null