From 99e3c3d02c007ce1d516097195230ae4366cebe3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Feb 2026 11:07:54 +0200 Subject: [PATCH] datasets : add aime2025 --- examples/llama-eval/IMPLEMENTATION.md | 7 ++++ examples/llama-eval/README.md | 9 ++++- examples/llama-eval/llama-eval.py | 51 ++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/examples/llama-eval/IMPLEMENTATION.md b/examples/llama-eval/IMPLEMENTATION.md index 9ca7972882..9ce2bdc3f9 100644 --- a/examples/llama-eval/IMPLEMENTATION.md +++ b/examples/llama-eval/IMPLEMENTATION.md @@ -39,6 +39,7 @@ class EvalState: ### Datasets - `AimeDataset`: 90 AIME 2025 questions +- `Aime2025Dataset`: 15 AIME 2025 questions (variant I by default; variant II selectable) - `Gsm8kDataset`: 7473 math word problems - `GpqaDataset`: 198 GPQA Diamond questions with shuffling @@ -56,6 +57,12 @@ class EvalState: - **cli**: External script with `--answer` and `--expected` args - **llm**: LLM-based extraction with few-shot examples and configurable server/model +### Dataset Requirements +- **AIME**: Supports regex, CLI, or LLM grader +- **AIME2025**: Supports regex, CLI, or LLM grader +- **GSM8K**: Supports regex, CLI, or LLM grader +- **GPQA**: Requires LLM grader + ## Output Format ### Progress Table diff --git a/examples/llama-eval/README.md b/examples/llama-eval/README.md index 8ad3ee2823..4409f9c90b 100644 --- a/examples/llama-eval/README.md +++ b/examples/llama-eval/README.md @@ -30,7 +30,7 @@ python llama-eval.py \ - `--model`: Model name for evaluation (default: llama) - `--judge-model`: Model name for LLM judge (default: same as main model) - `--judge-server`: Server URL for LLM judge (default: same as main server) -- `--dataset`: Dataset type (aime, gsm8k, gpqa) +- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa) - `--n_cases`: Number of cases to evaluate (default: all) - `--n_predict`: Max tokens to predict per prompt (default: -1, infinite) - `--temperature`: Sampling temperature (default: not passed) 
@@ -51,6 +51,11 @@ python llama-eval.py \ - Answers in boxed format: `\boxed{answer}` - Requires regex grader or LLM grader +### AIME2025 +- 15 questions from the 2025 AIME I competition (variant II selectable via `Aime2025Dataset(variant="II")`) +- Answers in boxed format: `\boxed{answer}` +- Supports regex, CLI, or LLM grader + ### GSM8K - 7473 math word problems - Answers are numeric values @@ -66,6 +71,7 @@ python llama-eval.py \ ### Regex Grader Built-in patterns for different datasets: - AIME: `\boxed{(\d+)}|\b(\d+)\b` +- AIME2025: `\boxed{(\d+)}|\b(\d+)\b` - GSM8K: `\b(\d+)\b` - GPQA: Letter extraction (A, B, C, D) @@ -81,6 +87,7 @@ Uses LLM to extract and compare answers: - Configurable server and model - Includes few-shot examples from sample answers - Case-insensitive comparison +- Required for GPQA dataset ## Output diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index ecf1ded244..299816b6e2 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -50,6 +50,9 @@ SAMPLE_ANSWERS = { TEMPLATE_REGISTRY = { "aime": """{question} Please reason step by step, and put your final answer within \\boxed{{}}. +""", +"aime2025": """{question} +Please reason step by step, and put your final answer within \\boxed{{}}. """, "gsm8k": """{question} Please reason step by step, and provide your final answer. 
@@ -133,6 +136,49 @@ class AimeDataset: question=question["problem"] if "problem" in question else question["question"] ) +class Aime2025Dataset: + def __init__(self, variant: str = "I"): + self.variant = variant + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME2025 dataset (variant: {self.variant})...") + from datasets import load_dataset + + config_name = f"AIME2025-{self.variant}" + cache_path = cache_dir / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_dir)) + else: + ds = load_dataset("opencompass/AIME2025", config_name, split="test") + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime2025" + self.questions.append(question) + + print(f"AIME2025 dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_answer(self, question: Dict) -> str: + answer = question["answer"] + if isinstance(answer, str): + normalized = normalize_number(answer) + return str(normalized) if normalized is not None else answer + return str(answer) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["aime2025"].format( + question=question["question"] + ) + class Gsm8kDataset: def __init__(self, split: str = "train"): self.split = split @@ -342,6 +388,7 @@ Response: {pred} === Please provide only the extracted answer, nothing else. 
If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" + url = f"{self.judge_server_url}/v1/chat/completions" headers = {"Content-Type": "application/json"} data = { @@ -418,6 +465,8 @@ class Processor: # Initialize appropriate dataset if dataset_type == "aime": self.dataset = AimeDataset() + elif dataset_type == "aime2025": + self.dataset = Aime2025Dataset(variant="I") elif dataset_type == "gsm8k": self.dataset = Gsm8kDataset() elif dataset_type == "gpqa": @@ -593,7 +642,7 @@ def main(): "--dataset", type=str, default="aime", - choices=["aime", "gsm8k", "gpqa"], + choices=["aime", "aime2025", "gsm8k", "gpqa"], help="Dataset type (default: aime)" ) parser.add_argument(