datasets : add aime2025

This commit is contained in:
Georgi Gerganov 2026-02-16 11:07:54 +02:00
parent c6315655b7
commit 99e3c3d02c
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
3 changed files with 65 additions and 2 deletions

View File

@ -39,6 +39,7 @@ class EvalState:
### Datasets
- `AimeDataset`: 90 AIME questions
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
- `Gsm8kDataset`: 7473 math word problems
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling
@ -56,6 +57,12 @@ class EvalState:
- **cli**: External script with `--answer` and `--expected` args
- **llm**: LLM-based extraction with few-shot examples and configurable server/model
### Dataset Requirements
- **AIME**: Supports regex, CLI, or LLM grader
- **AIME2025**: Supports regex, CLI, or LLM grader
- **GSM8K**: Supports regex, CLI, or LLM grader
- **GPQA**: Requires LLM grader
## Output Format
### Progress Table

View File

@ -30,7 +30,7 @@ python llama-eval.py \
- `--model`: Model name for evaluation (default: llama)
- `--judge-model`: Model name for LLM judge (default: same as main model)
- `--judge-server`: Server URL for LLM judge (default: same as main server)
- `--dataset`: Dataset type (aime, gsm8k, gpqa)
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
- `--n_cases`: Number of cases to evaluate (default: all)
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
- `--temperature`: Sampling temperature (default: not passed)
@ -51,6 +51,11 @@ python llama-eval.py \
- Answers in boxed format: `\boxed{answer}`
- Supports regex, CLI, or LLM grader
### AIME2025
- 30 questions from 2025 AIME I & II competitions
- Answers in boxed format: `\boxed{answer}`
- Supports regex, CLI, or LLM grader
### GSM8K
- 7473 math word problems
- Answers are numeric values
@ -66,6 +71,7 @@ python llama-eval.py \
### Regex Grader
Built-in patterns for different datasets:
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
- GSM8K: `\b(\d+)\b`
- GPQA: Letter extraction (A, B, C, D)
@ -81,6 +87,7 @@ Uses LLM to extract and compare answers:
- Configurable server and model
- Includes few-shot examples from sample answers
- Case-insensitive comparison
- Required for GPQA dataset
## Output

View File

@ -50,6 +50,9 @@ SAMPLE_ANSWERS = {
TEMPLATE_REGISTRY = {
"aime": """{question}
Please reason step by step, and put your final answer within \\boxed{{}}.
""",
"aime2025": """{question}
Please reason step by step, and put your final answer within \\boxed{{}}.
""",
"gsm8k": """{question}
Please reason step by step, and provide your final answer.
@ -133,6 +136,49 @@ class AimeDataset:
question=question["problem"] if "problem" in question else question["question"]
)
class Aime2025Dataset:
    """Loader for the AIME 2025 competition questions (opencompass/AIME2025).

    Each exam variant ("I" or "II") contains 15 questions. Pass
    variant="all" to load both exams — the full 30-question set that the
    project documentation describes.
    """

    def __init__(self, variant: str = "I"):
        # NOTE(review): Processor instantiates this with variant="I", which
        # yields only 15 questions even though the docs advertise 30 —
        # confirm whether the caller should use variant="all".
        self.variant = variant
        self.questions: List[Dict] = []
        self._load_dataset()

    def _load_dataset(self):
        """Download (or reuse from the local HF cache) the requested exam(s)."""
        print(f"Loading AIME2025 dataset (variant: {self.variant})...")
        from datasets import load_dataset

        variants = ["I", "II"] if self.variant == "all" else [self.variant]
        self.questions = []
        for v in variants:
            config_name = f"AIME2025-{v}"
            # Pass the cache *root* to load_dataset: the library creates its
            # own "opencompass___AIME2025/<config>/<version>" subtree beneath
            # cache_dir. The previous code passed that deep subtree itself as
            # cache_dir, so the lookup always missed and re-downloaded into a
            # doubly-nested path; passing the root makes caching work.
            ds = load_dataset(
                "opencompass/AIME2025",
                config_name,
                split="test",
                cache_dir=str(cache_dir),
            )
            for row in ds:
                question = dict(row)
                # Tag each row so downstream grading can dispatch on dataset.
                question["dataset_type"] = "aime2025"
                self.questions.append(question)
        print(f"AIME2025 dataset loaded: {len(self.questions)} questions")

    def get_question(self, index: int) -> Dict:
        """Get question by index."""
        return self.questions[index]

    def get_answer(self, question: Dict) -> str:
        """Return the gold answer as a string, numerically normalized when possible."""
        answer = question["answer"]
        if isinstance(answer, str):
            # Normalize e.g. "070" / "70.0" to a canonical integer string so
            # regex-grader comparisons are exact; fall back to the raw text.
            normalized = normalize_number(answer)
            return str(normalized) if normalized is not None else answer
        return str(answer)

    def get_prompt(self, question: Dict) -> str:
        """Get formatted prompt for the question."""
        return TEMPLATE_REGISTRY["aime2025"].format(
            question=question["question"]
        )
class Gsm8kDataset:
def __init__(self, split: str = "train"):
self.split = split
@ -342,6 +388,7 @@ Response: {pred}
===
Please provide only the extracted answer, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'."""
url = f"{self.judge_server_url}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
@ -418,6 +465,8 @@ class Processor:
# Initialize appropriate dataset
if dataset_type == "aime":
self.dataset = AimeDataset()
elif dataset_type == "aime2025":
self.dataset = Aime2025Dataset(variant="I")
elif dataset_type == "gsm8k":
self.dataset = Gsm8kDataset()
elif dataset_type == "gpqa":
@ -593,7 +642,7 @@ def main():
"--dataset",
type=str,
default="aime",
choices=["aime", "gsm8k", "gpqa"],
choices=["aime", "aime2025", "gsm8k", "gpqa"],
help="Dataset type (default: aime)"
)
parser.add_argument(