eval : add prompts

2026-01-31 22:37:57 +02:00 · 2026-01-31 22:37:57 +02:00 · fb1481d60d
parent 812ae13ec1
commit fb1481d60d
1 changed file with 30 additions and 3 deletions
--- a/examples/llama-eval/llama-eval-new.py
+++ b/examples/llama-eval/llama-eval-new.py
@@ -27,6 +27,13 @@ GRADER_PATTERNS = {
    "winogrande": r'[A-D]',
 }

+TEMPLATE_REGISTRY = {
+    "aime": """
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+""",
+}
+
 @dataclass
 class EvalState:
    id: str
@@ -43,6 +50,12 @@ class TaskState:
    correct: bool = False
    status: str = "pending"

+def normalize_number(s: str) -> Optional[int]:
+    match = re.match(r"\d+", s)  # match digits from the start
+    if not match:
+        return None
+    return int(match.group(0))
+
 class AimeDataset:
    def __init__(self, split: str = "train"):
        self.split = split
@@ -60,7 +73,12 @@ class AimeDataset:
        else:
            ds = load_dataset("AI-MO/aimo-validation-aime", split=self.split)

-        self.questions = list(ds)
+        self.questions = []
+        for row in ds:
+            question = dict(row)
+            question["dataset_type"] = "aime"
+            self.questions.append(question)
+
        print(f"AIME dataset loaded: {len(self.questions)} questions")

    def get_question(self, index: int) -> Dict:
@@ -68,7 +86,11 @@ class AimeDataset:
        return self.questions[index]

    def get_answer(self, question: Dict) -> str:
-        return str(question["answer"])
+        answer = question["answer"]
+        if isinstance(answer, str):
+            normalized = normalize_number(answer)
+            return str(normalized) if normalized is not None else answer
+        return str(answer)

 class Grader:
    def __init__(
@@ -177,9 +199,14 @@ class Processor:
        """Process a single case (thread-safe)"""
        question = self.dataset.get_question(i)
        case_id = f"aime_{self.dataset.split}_{question['id']}"
-        prompt = question["problem"]
        gold = self.dataset.get_answer(question)

+        # Apply template if available
+        if question["dataset_type"] in TEMPLATE_REGISTRY:
+            prompt = TEMPLATE_REGISTRY[question["dataset_type"]].format(question=question["problem"])
+        else:
+            prompt = question["problem"]
+
        task_state = TaskState(
            case_id=case_id,
            prompt=prompt,