From 99e3c3d02c007ce1d516097195230ae4366cebe3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Feb 2026 11:07:54 +0200 Subject: [PATCH] datasets : add aime2025 --- examples/llama-eval/IMPLEMENTATION.md | 7 ++++ examples/llama-eval/README.md | 9 ++++- examples/llama-eval/llama-eval.py | 51 ++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/examples/llama-eval/IMPLEMENTATION.md b/examples/llama-eval/IMPLEMENTATION.md index 9ca7972882..9ce2bdc3f9 100644 --- a/examples/llama-eval/IMPLEMENTATION.md +++ b/examples/llama-eval/IMPLEMENTATION.md @@ -39,6 +39,7 @@ class EvalState: ### Datasets - `AimeDataset`: 90 AIME 2025 questions +- `Aime2025Dataset`: 15 AIME 2025 questions (variant I by default; variant II selectable) - `Gsm8kDataset`: 7473 math word problems - `GpqaDataset`: 198 GPQA Diamond questions with shuffling @@ -56,6 +57,12 @@ class EvalState: - **cli**: External script with `--answer` and `--expected` args - **llm**: LLM-based extraction with few-shot examples and configurable server/model +### Dataset Requirements +- **AIME**: Supports regex, CLI, or LLM grader +- **AIME2025**: Supports regex, CLI, or LLM grader +- **GSM8K**: Supports regex, CLI, or LLM grader +- **GPQA**: Requires LLM grader + ## Output Format ### Progress Table diff --git a/examples/llama-eval/README.md b/examples/llama-eval/README.md index 8ad3ee2823..4409f9c90b 100644 --- a/examples/llama-eval/README.md +++ b/examples/llama-eval/README.md @@ -30,7 +30,7 @@ python llama-eval.py \ - `--model`: Model name for evaluation (default: llama) - `--judge-model`: Model name for LLM judge (default: same as main model) - `--judge-server`: Server URL for LLM judge (default: same as main server) -- `--dataset`: Dataset type (aime, gsm8k, gpqa) +- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa) - `--n_cases`: Number of cases to evaluate (default: all) - `--n_predict`: Max tokens to predict per prompt (default: -1, infinite) - `--temperature`: Sampling temperature (default: not passed) 
@@ -51,6 +51,11 @@ python llama-eval.py \ - Answers in boxed format: `\boxed{answer}` - Requires regex grader or LLM grader +### AIME2025 +- 15 questions from the 2025 AIME I competition (variant II selectable via `Aime2025Dataset(variant="II")`) +- Answers in boxed format: `\boxed{answer}` +- Supports regex, CLI, or LLM grader + ### GSM8K - 7473 math word problems - Answers are numeric values @@ -66,6 +71,7 @@ python llama-eval.py \ ### Regex Grader Built-in patterns for different datasets: - AIME: `\boxed{(\d+)}|\b(\d+)\b` +- AIME2025: `\boxed{(\d+)}|\b(\d+)\b` - GSM8K: `\b(\d+)\b` - GPQA: Letter extraction (A, B, C, D) @@ -81,6 +87,7 @@ Uses LLM to extract and compare answers: - Configurable server and model - Includes few-shot examples from sample answers - Case-insensitive comparison +- Required for GPQA dataset ## Output diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index ecf1ded244..299816b6e2 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -50,6 +50,9 @@ SAMPLE_ANSWERS = { TEMPLATE_REGISTRY = { "aime": """{question} Please reason step by step, and put your final answer within \\boxed{{}}. +""", +"aime2025": """{question} +Please reason step by step, and put your final answer within \\boxed{{}}. """, "gsm8k": """{question} Please reason step by step, and provide your final answer. 
@@ -133,6 +136,49 @@ class AimeDataset: question=question["problem"] if "problem" in question else question["question"] ) +class Aime2025Dataset: + def __init__(self, variant: str = "I"): + self.variant = variant + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME2025 dataset (variant: {self.variant})...") + from datasets import load_dataset + + config_name = f"AIME2025-{self.variant}" + cache_path = cache_dir / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_dir)) + else: + ds = load_dataset("opencompass/AIME2025", config_name, split="test") + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime2025" + self.questions.append(question) + + print(f"AIME2025 dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_answer(self, question: Dict) -> str: + answer = question["answer"] + if isinstance(answer, str): + normalized = normalize_number(answer) + return str(normalized) if normalized is not None else answer + return str(answer) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["aime2025"].format( + question=question["question"] + ) + class Gsm8kDataset: def __init__(self, split: str = "train"): self.split = split @@ -342,6 +388,7 @@ Response: {pred} === Please provide only the extracted answer, nothing else. 
If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" + url = f"{self.judge_server_url}/v1/chat/completions" headers = {"Content-Type": "application/json"} data = { @@ -418,6 +465,8 @@ class Processor: # Initialize appropriate dataset if dataset_type == "aime": self.dataset = AimeDataset() + elif dataset_type == "aime2025": + self.dataset = Aime2025Dataset(variant="I") elif dataset_type == "gsm8k": self.dataset = Gsm8kDataset() elif dataset_type == "gpqa": @@ -593,7 +642,7 @@ def main(): "--dataset", type=str, default="aime", - choices=["aime", "gsm8k", "gpqa"], + choices=["aime", "aime2025", "gsm8k", "gpqa"], help="Dataset type (default: aime)" ) parser.add_argument(