datasets : add aime2025
This commit is contained in:
parent
c6315655b7
commit
99e3c3d02c
|
|
@ -39,6 +39,7 @@ class EvalState:
|
|||
|
||||
### Datasets
|
||||
- `AimeDataset`: 90 AIME questions
|
||||
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
|
||||
- `Gsm8kDataset`: 7473 math word problems
|
||||
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling
|
||||
|
||||
|
|
@ -56,6 +57,12 @@ class EvalState:
|
|||
- **cli**: External script with `--answer` and `--expected` args
|
||||
- **llm**: LLM-based extraction with few-shot examples and configurable server/model
|
||||
|
||||
### Dataset Requirements
|
||||
- **AIME**: Supports regex, CLI, or LLM grader
|
||||
- **AIME2025**: Supports regex, CLI, or LLM grader
|
||||
- **GSM8K**: Supports regex, CLI, or LLM grader
|
||||
- **GPQA**: Requires LLM grader
|
||||
|
||||
## Output Format
|
||||
|
||||
### Progress Table
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ python llama-eval.py \
|
|||
- `--model`: Model name for evaluation (default: llama)
|
||||
- `--judge-model`: Model name for LLM judge (default: same as main model)
|
||||
- `--judge-server`: Server URL for LLM judge (default: same as main server)
|
||||
- `--dataset`: Dataset type (aime, gsm8k, gpqa)
|
||||
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
|
||||
- `--n_cases`: Number of cases to evaluate (default: all)
|
||||
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
|
||||
- `--temperature`: Sampling temperature (default: not passed)
|
||||
|
|
@ -51,6 +51,11 @@ python llama-eval.py \
|
|||
- Answers in boxed format: `\boxed{answer}`
|
||||
- Requires regex grader or LLM grader
|
||||
|
||||
### AIME2025
|
||||
- 30 questions from 2025 AIME I & II competitions
|
||||
- Answers in boxed format: `\boxed{answer}`
|
||||
- Supports regex, CLI, or LLM grader
|
||||
|
||||
### GSM8K
|
||||
- 7473 math word problems
|
||||
- Answers are numeric values
|
||||
|
|
@ -66,6 +71,7 @@ python llama-eval.py \
|
|||
### Regex Grader
|
||||
Built-in patterns for different datasets:
|
||||
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
|
||||
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
|
||||
- GSM8K: `\b(\d+)\b`
|
||||
- GPQA: Letter extraction (A, B, C, D)
|
||||
|
||||
|
|
@ -81,6 +87,7 @@ Uses LLM to extract and compare answers:
|
|||
- Configurable server and model
|
||||
- Includes few-shot examples from sample answers
|
||||
- Case-insensitive comparison
|
||||
- Required for GPQA dataset
|
||||
|
||||
## Output
|
||||
|
||||
|
|
|
|||
|
|
@ -50,6 +50,9 @@ SAMPLE_ANSWERS = {
|
|||
TEMPLATE_REGISTRY = {
|
||||
"aime": """{question}
|
||||
Please reason step by step, and put your final answer within \\boxed{{}}.
|
||||
""",
|
||||
"aime2025": """{question}
|
||||
Please reason step by step, and put your final answer within \\boxed{{}}.
|
||||
""",
|
||||
"gsm8k": """{question}
|
||||
Please reason step by step, and provide your final answer.
|
||||
|
|
@ -133,6 +136,49 @@ class AimeDataset:
|
|||
question=question["problem"] if "problem" in question else question["question"]
|
||||
)
|
||||
|
||||
class Aime2025Dataset:
    """AIME 2025 questions loaded from the `opencompass/AIME2025` HF dataset.

    Mirrors the AimeDataset interface (get_question / get_answer / get_prompt)
    so the Processor can use either interchangeably.
    """

    def __init__(self, variant: str = "I"):
        # variant selects the competition paper: "I" -> AIME2025-I,
        # "II" -> AIME2025-II (15 questions each).
        self.variant = variant
        self.questions: List[Dict] = []
        self._load_dataset()

    def _load_dataset(self):
        """Load the test split for self.variant, reusing the local cache if present."""
        print(f"Loading AIME2025 dataset (variant: {self.variant})...")
        from datasets import load_dataset

        config_name = f"AIME2025-{self.variant}"
        # Layout that datasets writes under the project cache root.
        cache_path = cache_dir / "opencompass___AIME2025" / "default" / "0.0.0"
        if cache_path.exists():
            print(f"Using cached dataset from {cache_path}")
            # BUGFIX: pass the cache *root*, not the versioned leaf directory.
            # With cache_dir=str(cache_path), datasets would nest a fresh
            # opencompass___AIME2025/... tree under the leaf and re-download
            # instead of reusing the cached copy.
            ds = load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_dir))
        else:
            ds = load_dataset("opencompass/AIME2025", config_name, split="test")

        # Tag every row so graders/templates can branch on the dataset type.
        self.questions = []
        for row in ds:
            question = dict(row)
            question["dataset_type"] = "aime2025"
            self.questions.append(question)

        print(f"AIME2025 dataset loaded: {len(self.questions)} questions")

    def get_question(self, index: int) -> Dict:
        """Get question by index"""
        return self.questions[index]

    def get_answer(self, question: Dict) -> str:
        """Return the reference answer as a string.

        String answers are normalized (e.g. "042" -> "42") when parseable;
        otherwise the raw string is returned unchanged.
        """
        answer = question["answer"]
        if isinstance(answer, str):
            normalized = normalize_number(answer)
            return str(normalized) if normalized is not None else answer
        return str(answer)

    def get_prompt(self, question: Dict) -> str:
        """Get formatted prompt for the question"""
        # opencompass/AIME2025 rows expose the problem text under "question".
        return TEMPLATE_REGISTRY["aime2025"].format(
            question=question["question"]
        )
|
||||
|
||||
class Gsm8kDataset:
|
||||
def __init__(self, split: str = "train"):
|
||||
self.split = split
|
||||
|
|
@ -342,6 +388,7 @@ Response: {pred}
|
|||
===
|
||||
|
||||
Please provide only the extracted answer, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'."""
|
||||
|
||||
url = f"{self.judge_server_url}/v1/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {
|
||||
|
|
@ -418,6 +465,8 @@ class Processor:
|
|||
# Initialize appropriate dataset
|
||||
if dataset_type == "aime":
|
||||
self.dataset = AimeDataset()
|
||||
elif dataset_type == "aime2025":
|
||||
self.dataset = Aime2025Dataset(variant="I")
|
||||
elif dataset_type == "gsm8k":
|
||||
self.dataset = Gsm8kDataset()
|
||||
elif dataset_type == "gpqa":
|
||||
|
|
@ -593,7 +642,7 @@ def main():
|
|||
"--dataset",
|
||||
type=str,
|
||||
default="aime",
|
||||
choices=["aime", "gsm8k", "gpqa"],
|
||||
choices=["aime", "aime2025", "gsm8k", "gpqa"],
|
||||
help="Dataset type (default: aime)"
|
||||
)
|
||||
parser.add_argument(
|
||||
|
|
|
|||
Loading…
Reference in New Issue