datasets : add aime2025
This commit is contained in:
parent
c6315655b7
commit
99e3c3d02c
|
|
@ -39,6 +39,7 @@ class EvalState:
|
|||
|
||||
### Datasets
|
||||
- `AimeDataset`: 90 AIME questions
|
||||
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
|
||||
- `Gsm8kDataset`: 7473 math word problems
|
||||
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling
|
||||
|
||||
|
|
@ -56,6 +57,12 @@ class EvalState:
|
|||
- **cli**: External script with `--answer` and `--expected` args
|
||||
- **llm**: LLM-based extraction with few-shot examples and configurable server/model
|
||||
|
||||
### Dataset Requirements
|
||||
- **AIME**: Supports regex, CLI, or LLM grader
|
||||
- **AIME2025**: Supports regex, CLI, or LLM grader
|
||||
- **GSM8K**: Supports regex, CLI, or LLM grader
|
||||
- **GPQA**: Requires LLM grader
|
||||
|
||||
## Output Format
|
||||
|
||||
### Progress Table
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ python llama-eval.py \
|
|||
- `--model`: Model name for evaluation (default: llama)
|
||||
- `--judge-model`: Model name for LLM judge (default: same as main model)
|
||||
- `--judge-server`: Server URL for LLM judge (default: same as main server)
|
||||
- `--dataset`: Dataset type (aime, gsm8k, gpqa)
|
||||
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
|
||||
- `--n_cases`: Number of cases to evaluate (default: all)
|
||||
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
|
||||
- `--temperature`: Sampling temperature (default: not passed)
|
||||
|
|
@ -51,6 +51,11 @@ python llama-eval.py \
|
|||
- Answers in boxed format: `\boxed{answer}`
|
||||
- Requires regex grader or LLM grader
|
||||
|
||||
### AIME2025
|
||||
- 30 questions from 2025 AIME I & II competitions
|
||||
- Answers in boxed format: `\boxed{answer}`
|
||||
- Supports regex, CLI, or LLM grader
|
||||
|
||||
### GSM8K
|
||||
- 7473 math word problems
|
||||
- Answers are numeric values
|
||||
|
|
@ -66,6 +71,7 @@ python llama-eval.py \
|
|||
### Regex Grader
|
||||
Built-in patterns for different datasets:
|
||||
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
|
||||
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
|
||||
- GSM8K: `\b(\d+)\b`
|
||||
- GPQA: Letter extraction (A, B, C, D)
|
||||
|
||||
|
|
@ -81,6 +87,7 @@ Uses LLM to extract and compare answers:
|
|||
- Configurable server and model
|
||||
- Includes few-shot examples from sample answers
|
||||
- Case-insensitive comparison
|
||||
- Required for GPQA dataset
|
||||
|
||||
## Output
|
||||
|
||||
|
|
|
|||
|
|
@ -50,6 +50,9 @@ SAMPLE_ANSWERS = {
|
|||
TEMPLATE_REGISTRY = {
|
||||
"aime": """{question}
|
||||
Please reason step by step, and put your final answer within \\boxed{{}}.
|
||||
""",
|
||||
"aime2025": """{question}
|
||||
Please reason step by step, and put your final answer within \\boxed{{}}.
|
||||
""",
|
||||
"gsm8k": """{question}
|
||||
Please reason step by step, and provide your final answer.
|
||||
|
|
@ -133,6 +136,49 @@ class AimeDataset:
|
|||
question=question["problem"] if "problem" in question else question["question"]
|
||||
)
|
||||
|
||||
class Aime2025Dataset:
    """AIME 2025 questions loaded from the `opencompass/AIME2025` HF dataset.

    Mirrors the AimeDataset interface (get_question / get_answer / get_prompt)
    so the Processor can use either interchangeably.
    """

    def __init__(self, variant: str = "I"):
        # variant selects the competition paper: "I" -> AIME2025-I,
        # "II" -> AIME2025-II (15 questions each).
        self.variant = variant
        self.questions: List[Dict] = []
        self._load_dataset()

    def _load_dataset(self):
        """Load the test split for self.variant, reusing the local cache if present."""
        print(f"Loading AIME2025 dataset (variant: {self.variant})...")
        from datasets import load_dataset

        config_name = f"AIME2025-{self.variant}"
        # Layout that datasets writes under the project cache root.
        cache_path = cache_dir / "opencompass___AIME2025" / "default" / "0.0.0"
        if cache_path.exists():
            print(f"Using cached dataset from {cache_path}")
            # BUGFIX: pass the cache *root*, not the versioned leaf directory.
            # With cache_dir=str(cache_path), datasets would nest a fresh
            # opencompass___AIME2025/... tree under the leaf and re-download
            # instead of reusing the cached copy.
            ds = load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_dir))
        else:
            ds = load_dataset("opencompass/AIME2025", config_name, split="test")

        # Tag every row so graders/templates can branch on the dataset type.
        self.questions = []
        for row in ds:
            question = dict(row)
            question["dataset_type"] = "aime2025"
            self.questions.append(question)

        print(f"AIME2025 dataset loaded: {len(self.questions)} questions")

    def get_question(self, index: int) -> Dict:
        """Get question by index"""
        return self.questions[index]

    def get_answer(self, question: Dict) -> str:
        """Return the reference answer as a string.

        String answers are normalized (e.g. "042" -> "42") when parseable;
        otherwise the raw string is returned unchanged.
        """
        answer = question["answer"]
        if isinstance(answer, str):
            normalized = normalize_number(answer)
            return str(normalized) if normalized is not None else answer
        return str(answer)

    def get_prompt(self, question: Dict) -> str:
        """Get formatted prompt for the question"""
        # opencompass/AIME2025 rows expose the problem text under "question".
        return TEMPLATE_REGISTRY["aime2025"].format(
            question=question["question"]
        )
|
||||
|
||||
class Gsm8kDataset:
|
||||
def __init__(self, split: str = "train"):
|
||||
self.split = split
|
||||
|
|
@ -342,6 +388,7 @@ Response: {pred}
|
|||
===
|
||||
|
||||
Please provide only the extracted answer, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'."""
|
||||
|
||||
url = f"{self.judge_server_url}/v1/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {
|
||||
|
|
@ -418,6 +465,8 @@ class Processor:
|
|||
# Initialize appropriate dataset
|
||||
if dataset_type == "aime":
|
||||
self.dataset = AimeDataset()
|
||||
elif dataset_type == "aime2025":
|
||||
self.dataset = Aime2025Dataset(variant="I")
|
||||
elif dataset_type == "gsm8k":
|
||||
self.dataset = Gsm8kDataset()
|
||||
elif dataset_type == "gpqa":
|
||||
|
|
@ -593,7 +642,7 @@ def main():
|
|||
"--dataset",
|
||||
type=str,
|
||||
default="aime",
|
||||
choices=["aime", "gsm8k", "gpqa"],
|
||||
choices=["aime", "aime2025", "gsm8k", "gpqa"],
|
||||
help="Dataset type (default: aime)"
|
||||
)
|
||||
parser.add_argument(
|
||||
|
|
|
|||
Loading…
Reference in New Issue