eval : add prompts

This commit is contained in:
Georgi Gerganov 2026-01-31 22:37:57 +02:00
parent 812ae13ec1
commit fb1481d60d
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
1 changed files with 30 additions and 3 deletions

View File

@ -27,6 +27,13 @@ GRADER_PATTERNS = {
"winogrande": r'[A-D]',
}
TEMPLATE_REGISTRY = {
"aime": """
{question}
Please reason step by step, and put your final answer within \\boxed{{}}.
""",
}
@dataclass
class EvalState:
id: str
@ -43,6 +50,12 @@ class TaskState:
correct: bool = False
status: str = "pending"
def normalize_number(s: str) -> Optional[int]:
match = re.match(r"\d+", s) # match digits from the start
if not match:
return None
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
self.split = split
@ -60,7 +73,12 @@ class AimeDataset:
else:
ds = load_dataset("AI-MO/aimo-validation-aime", split=self.split)
self.questions = list(ds)
self.questions = []
for row in ds:
question = dict(row)
question["dataset_type"] = "aime"
self.questions.append(question)
print(f"AIME dataset loaded: {len(self.questions)} questions")
def get_question(self, index: int) -> Dict:
@ -68,7 +86,11 @@ class AimeDataset:
return self.questions[index]
def get_answer(self, question: Dict) -> str:
return str(question["answer"])
answer = question["answer"]
if isinstance(answer, str):
normalized = normalize_number(answer)
return str(normalized) if normalized is not None else answer
return str(answer)
class Grader:
def __init__(
@ -177,9 +199,14 @@ class Processor:
"""Process a single case (thread-safe)"""
question = self.dataset.get_question(i)
case_id = f"aime_{self.dataset.split}_{question['id']}"
prompt = question["problem"]
gold = self.dataset.get_answer(question)
# Apply template if available
if question["dataset_type"] in TEMPLATE_REGISTRY:
prompt = TEMPLATE_REGISTRY[question["dataset_type"]].format(question=question["problem"])
else:
prompt = question["problem"]
task_state = TaskState(
case_id=case_id,
prompt=prompt,