diff --git a/examples/llama-eval/README.md b/examples/llama-eval/README.md new file mode 100644 index 0000000000..82ba6c46f2 --- /dev/null +++ b/examples/llama-eval/README.md @@ -0,0 +1,5 @@ +# llama-eval + +Simple evaluation tool for llama.cpp with support for multiple datasets. + +TODO: add usage diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py new file mode 100755 index 0000000000..6af1459e25 --- /dev/null +++ b/examples/llama-eval/llama-eval.py @@ -0,0 +1,1240 @@ +#!/usr/bin/env python3 +# type: ignore + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from abc import ABC, abstractmethod +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, asdict, field +from pathlib import Path +from typing import Dict, List, Optional, Any, Tuple +import requests +from tqdm import tqdm +import random + +cache_dir = Path.home() / ".cache" / "huggingface" / "datasets" +cache_dir.mkdir(parents=True, exist_ok=True) +os.environ["HF_DATASETS_CACHE"] = str(cache_dir) +os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1" + +GRADER_PATTERNS = { + "aime": r'\boxed{(\d+)}|\b(\d+)\b', + "aime2025": r'\boxed{(\d+)}|\b(\d+)\b', + "gsm8k": r'\b(\d+)\b', +} + +SAMPLE_ANSWERS = { + "aime": [ + "42", + "-123", + "999" + ], + "aime2025": [ + "42", + "-123", + "999" + ], + "gsm8k": [ + "42", + "-123", + "999" + ], + "gpqa": [ + "A", + "D", + "C" + ], +} + +TEMPLATE_REGISTRY = { + "aime": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}. + +{question} + +Remember to put your answer inside \\boxed{{}}. +""", + "aime2025": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}. + +{question} + +Remember to put your answer inside \\boxed{{}}. +""", + "gsm8k": """{question} +Please reason step by step, and put your final numeric answer within \\boxed{{}} without any extra characters. +""", + "gpqa": """Answer the following multiple choice question. The last line of your response should be in the following format: 'Answer: A/B/C/D' (e.g. 'Answer: A'). + +{Question} + +A) {A} +B) {B} +C) {C} +D) {D} +""", +} + + +class BaseDataset(ABC): + @abstractmethod + def get_question(self, index: int) -> Dict: + pass + + @abstractmethod + def get_question_text(self, question: Dict) -> str: + pass + + @abstractmethod + def get_answer(self, question: Dict) -> str: + pass + + @abstractmethod + def get_prompt(self, question: Dict) -> str: + pass + + def __len__(self) -> int: + return len(self.questions) + + +@dataclass +class TaskState: + task_id: str + prompt: str + expected: str + question_text: str = "" + response: Optional[str] = None + answer: Optional[str] = None + grader_log: Dict[str, Any] = field(default_factory=dict) + correct: bool = False + status: str = "pending" + tokens: Optional[int] = None + reasoning_content: Optional[str] = None + + +class EvalState: + def __init__( + self, + dataset_type: str, + sampling_config: Dict[str, Any], + output_file: Path = Path("llama-eval-state.json") + ): + self.dataset_type = dataset_type + self.sampling_config = sampling_config + self.output_file = output_file + self.dataset: Optional[BaseDataset] = None + self.tasks: List[Tuple[int, str]] = [] + self.all_tasks: List[Tuple[int, str]] = [] + self.task_states: Dict[str, Any] = {} + self.total = 0 + self.correct = 0 + self.processed = 0 + self.total_time: float = 0.0 + + def load_dataset(self, seed: int = 1234): + if self.dataset_type == "aime": + self.dataset = AimeDataset() + elif self.dataset_type == "aime2025": + self.dataset = Aime2025Dataset() + elif self.dataset_type == "gsm8k": + self.dataset = Gsm8kDataset() + elif self.dataset_type == "gpqa": + self.dataset = GpqaDataset(variant="diamond", seed=seed) + else: + raise ValueError(f"Unknown dataset type: {self.dataset_type}") + + def setup_tasks(self, n_cases: Optional[int] = None, seed: int = 1234): + if self.dataset is None: + raise ValueError("Dataset not loaded. Call load_dataset() first.") + + if n_cases is None: + n_cases = len(self.dataset) + + dataset_size = len(self.dataset) + rng = random.Random(seed) + + self.tasks = [] + for chunk_idx in range((n_cases + dataset_size - 1) // dataset_size): + chunk_size = min(dataset_size, n_cases - chunk_idx * dataset_size) + indices = list(range(dataset_size)) + rng.shuffle(indices) + chunk_indices = indices[:chunk_size] + + for i in chunk_indices: + task_id = f"{self.dataset_type}_{chunk_idx:03d}_{i:03d}" + self.tasks.append((i, task_id)) + + self.all_tasks = list(self.tasks) + + def get_case(self, index: int) -> Tuple[str, str, str]: + if self.dataset is None: + raise ValueError("Dataset not loaded.") + question = self.dataset.get_question(index) + question_text = self.dataset.get_question_text(question) + prompt = self.dataset.get_prompt(question) + expected = self.dataset.get_answer(question) + return question_text, prompt, expected + + def add_result( + self, + task_id: str, + prompt: str, + expected: str, + response: Optional[str], + answer: Optional[str], + grader_log: Dict[str, Any], + correct: bool, + status: str, + tokens: Optional[int] = None, + reasoning_content: Optional[str] = None + ): + if "cases" not in self.task_states: + self.task_states["cases"] = {} + + self.task_states["cases"][task_id] = { + "task_id": task_id, + "prompt": prompt, + "expected": expected, + "response": response, + "answer": answer, + "grader_log": grader_log, + "correct": correct, + "status": status, + "tokens": tokens, + "reasoning_content": reasoning_content + } + + self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False)) + + def print_progress(self, task_state: TaskState, total_tasks: int, correct_count: int = 0): + answer_display = task_state.answer if task_state.answer else "N/A" + tokens_display = str(task_state.tokens) if task_state.tokens is not None else "N/A" + success_ratio = correct_count / self.processed if self.processed > 0 else 0.0 + first_line = task_state.question_text.split('\n')[0] + truncated_question = first_line[:43] + if len(first_line) > 43: + truncated_question += "..." + else: + truncated_question = truncated_question.ljust(43) + "..." + print(f"{self.processed:3}/{total_tasks:3} {task_state.task_id:<20} {self.dataset_type.upper()} {truncated_question:<40} {task_state.expected:<10} {answer_display:<10} {tokens_display:<6} {'✓' if task_state.correct else '✗'} [{correct_count:3}/{self.processed:3}, {success_ratio:.3f}]") + + def print_summary(self): + if self.total == 0: + print(f"\n{'='*60}") + print(f"Results: 0/0 correct (0.0%)") + print(f"{'='*60}") + else: + print(f"\n{'='*60}") + print(f"Results: {self.correct}/{self.total} correct ({self.correct/self.total*100:.1f}%)") + print(f"{'='*60}") + + def dump(self): + tasks_to_save = self.all_tasks if self.all_tasks else self.tasks + all_cases = {} + for i, task_id in tasks_to_save: + question_text, prompt, expected = self.get_case(i) + if task_id in self.task_states.get("cases", {}): + all_cases[task_id] = self.task_states["cases"][task_id] + else: + all_cases[task_id] = { + "task_id": task_id, + "prompt": prompt, + "expected": expected, + "question_text": question_text, + "response": None, + "answer": None, + "grader_log": {}, + "correct": False, + "status": "pending", + "tokens": None, + "reasoning_content": None + } + + data = { + "id": self.dataset_type, + "tasks": [tid for _, tid in tasks_to_save], + "task_states": { + "total": self.total, + "correct": self.correct, + "total_time": self.total_time, + "cases": all_cases, + }, + "sampling_config": self.sampling_config + } + with open(self.output_file, "w") as f: + json.dump(data, f, indent=2) + + self.dump_html(tasks_to_save, all_cases) + + def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, Any]): + html_file = Path(str(self.output_file) + ".html") + + cases = all_cases + completed = {tid: c for tid, c in cases.items() if c.get("status") == "ok"} + correct_count = sum(1 for c in completed.values() if c.get("correct", False)) + incorrect_count = len(completed) - correct_count + pending_count = len(tasks_to_save) - len(completed) + accuracy = correct_count / len(completed) * 100 if completed else 0.0 + + sampling_parts = [] + for k, v in self.sampling_config.items(): + if v is not None: + sampling_parts.append(f"{k}={v}") + sampling_str = ", ".join(sampling_parts) if sampling_parts else "default" + + rows = [] + for i, task_id in tasks_to_save: + case = cases.get(task_id, {}) + status = case.get("status", "pending") + expected = case.get("expected", "") + answer = case.get("answer", "") if status == "ok" else "" + is_correct = case.get("correct", False) if status == "ok" else False + response = case.get("response", "") or "" + prompt = case.get("prompt", "") or "" + grader_log = case.get("grader_log", {}) + + if status == "ok": + status_class = "correct" if is_correct else "incorrect" + status_text = "✓ Correct" if is_correct else "✗ Incorrect" + elif status == "pending": + status_class = "pending" + status_text = "Pending" + else: + status_class = "error" + status_text = f"Error: {status}" + + tokens = case.get("tokens") + tokens_str = str(tokens) if tokens is not None else "" + reasoning_content = case.get("reasoning_content", "") or "" + + response_escaped = self._escape_html(response) + prompt_escaped = self._escape_html(prompt) + reasoning_escaped = self._escape_html(reasoning_content) + grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) + + rows.append(f""" + {task_id} + {status_text} + {self._escape_html(expected)} + {self._escape_html(answer)} + {tokens_str} + + + +
+

Prompt

+
{prompt_escaped}
+

Reasoning ▶

+ +

Response

+
{response_escaped}
+

Grader Log

+
{grader_log_str}
+
+ + """) + + rows_html = "\n".join(rows) + + html_content = f""" + + + + + Eval State - {self.dataset_type} + + + +

Eval State: {self.dataset_type.upper()}

+
+ + + + + + + + + + +
Dataset{self.dataset_type}
Total Tasks{len(tasks_to_save)}
Completed{len(completed)}
Correct{correct_count}
Incorrect{incorrect_count}
Pending{pending_count}
Accuracy{accuracy:.1f}%
Total Time{self.total_time:.1f}s
Sampling{sampling_str}
+
+ + + + + + + + + + + + {rows_html} + +
Task IDStatusGoldExtractedTokens
+ + +""" + + with open(html_file, "w") as f: + f.write(html_content) + + def _escape_html(self, s: str) -> str: + return (s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'")) + + @classmethod + def load(cls, path: Path) -> "EvalState": + with open(path, "r") as f: + data = json.load(f) + + eval_state = cls( + dataset_type=data["id"], + sampling_config=data["sampling_config"], + output_file=path + ) + eval_state.load_dataset() + + eval_state.tasks = [] + eval_state.all_tasks = [] + for task_id in data.get("tasks", []): + parts = task_id.rsplit("_", 2) + if len(parts) >= 3: + idx = int(parts[-1]) + else: + idx = 0 + eval_state.tasks.append((idx, task_id)) + eval_state.all_tasks.append((idx, task_id)) + + eval_state.task_states = data.get("task_states", {}) + + cases = eval_state.task_states.get("cases", {}) + eval_state.total = eval_state.task_states.get("total", 0) + eval_state.correct = eval_state.task_states.get("correct", 0) + eval_state.total_time = eval_state.task_states.get("total_time", 0.0) + + if eval_state.total == 0: + eval_state.total = len(cases) + eval_state.correct = sum(1 for c in cases.values() if c.get("correct", False)) + + return eval_state + + def is_complete(self) -> bool: + if not self.all_tasks: + return False + cases = self.task_states.get("cases", {}) + completed = {tid for tid in self.task_states.get("cases", {}).keys() if cases.get(tid, {}).get("status") == "ok"} + return len(completed) == len(self.all_tasks) + + def get_pending_tasks(self) -> List[Tuple[int, str]]: + cases = self.task_states.get("cases", {}) + pending = [] + for i, task_id in self.all_tasks: + status = cases.get(task_id, {}).get("status", "pending") + if status != "ok": + pending.append((i, task_id)) + return pending + + def print_all_tasks(self): + cases = self.task_states.get("cases", {}) + tasks_to_show = self.all_tasks if self.all_tasks else self.tasks + print() + print("Tasks:") + print(" Task ID Dataset Prompt (first 40 chars) Expected Answer Tokens Status") + for i, task_id in tasks_to_show: + question, prompt, expected = self.get_case(i) + case = cases.get(task_id, {}) + status = case.get("status", "pending") + answer = case.get("answer", "N/A") if status == "ok" else "N/A" + tokens = case.get("tokens") + tokens_str = str(tokens) if tokens is not None else "N/A" + is_correct = case.get("correct", False) if status == "ok" else False + symbol = "✓ " if is_correct else ("✗ " if status == "ok" else "") + first_line = question.split('\n')[0] + question_trunc = first_line[:43] + if len(first_line) > 43: + question_trunc += "..." + else: + question_trunc = question_trunc.ljust(43) + "..." + print(f" {task_id:<20} {self.dataset_type.upper()} {question_trunc:<40} {expected:<10} {answer:<10} {tokens_str:<6} {symbol}{status}") + print() + + def print_existing_summary(self): + cases = self.task_states.get("cases", {}) + completed_cases = {tid: c for tid, c in cases.items() if c.get("status") == "ok"} + correct = sum(1 for c in completed_cases.values() if c.get("correct", False)) + total = len(completed_cases) + if total == 0: + print(f"{'='*60}") + print(f"Results: 0/0 correct (0.0%)") + print(f"{'='*60}") + else: + print(f"{'='*60}") + print(f"Results: {correct}/{total} correct ({correct/total*100:.1f}%)") + print(f"{'='*60}") + +def normalize_number(s: str) -> Optional[int]: + match = re.match(r"\d+", s) # match digits from the start + if not match: + return None + return int(match.group(0)) + +class AimeDataset(BaseDataset): + def __init__(self, split: str = "train"): + self.split = split + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME dataset (split: {self.split})...") + from datasets import load_dataset + + cache_path = cache_dir / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + else: + ds = load_dataset("AI-MO/aimo-validation-aime", split=self.split) + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime" + self.questions.append(question) + + print(f"AIME dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["problem"] if "problem" in question else question["question"] + + def get_answer(self, question: Dict) -> str: + answer = question["answer"] + if isinstance(answer, str): + normalized = normalize_number(answer) + return str(normalized) if normalized is not None else answer + return str(answer) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY[question["dataset_type"]].format( + question=self.get_question_text(question), + ) + +class Aime2025Dataset(BaseDataset): + def __init__(self): + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME2025 dataset...") + from datasets import load_dataset + + config_name = "AIME2025-I" + cache_path = cache_dir / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path)) + else: + ds = load_dataset("opencompass/AIME2025", config_name, split="test") + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime2025" + self.questions.append(question) + + print(f"AIME2025 dataset loaded: {len(self.questions)} questions") + + print(f"Loading AIME2025 dataset (part 2)...") + config_name_2 = "AIME2025-II" + cache_path_2 = cache_dir / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path_2.exists(): + print(f"Using cached dataset from {cache_path_2}") + ds_2 = load_dataset("opencompass/AIME2025", config_name_2, split="test", cache_dir=str(cache_path_2)) + else: + ds_2 = load_dataset("opencompass/AIME2025", config_name_2, split="test") + + for row in ds_2: + question = dict(row) + question["dataset_type"] = "aime2025" + self.questions.append(question) + + print(f"AIME2025 dataset loaded: {len(self.questions)} questions (total)") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["question"] + + def get_answer(self, question: Dict) -> str: + answer = question["answer"] + if isinstance(answer, str): + normalized = normalize_number(answer) + return str(normalized) if normalized is not None else answer + return str(answer) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["aime2025"].format( + question=self.get_question_text(question), + ) + +class Gsm8kDataset(BaseDataset): + def __init__(self, split: str = "test"): + self.split = split + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading GSM8K dataset (split: {self.split})...") + from datasets import load_dataset + + cache_path = cache_dir / "openai___gsm8k" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("openai/gsm8k", "main", split=self.split, cache_dir=str(cache_path)) + else: + ds = load_dataset("openai/gsm8k", "main", split=self.split) + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "gsm8k" + + # Extract numeric answer from the answer field (already has #### prefix) + gold = question["answer"] + # Split by #### and take the last part + parts = gold.split("####") + if len(parts) > 1: + gold = parts[-1].strip() + # Extract the first number from the remaining text + normalized = normalize_number(gold) + question["gold"] = str(normalized) if normalized is not None else gold + + self.questions.append(question) + + print(f"GSM8K dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["problem"] if "problem" in question else question["question"] + + def get_answer(self, question: Dict) -> str: + # GSM8K has pre-extracted gold field, AIME uses answer field + if "gold" in question: + return question["gold"] + answer = question["answer"] + if isinstance(answer, str): + normalized = normalize_number(answer) + return str(normalized) if normalized is not None else answer + return str(answer) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY[question["dataset_type"]].format( + question=self.get_question_text(question), + ) + +class GpqaDataset(BaseDataset): + def __init__(self, variant: str = "diamond", seed: int = 1234): + self.variant = variant + self.seed = seed + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading GPQA dataset (variant: {self.variant})...") + import pandas as pd + + url = f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{self.variant}.csv" + df = pd.read_csv(url) + + rng = random.Random(self.seed) + + self.questions = [] + for _, row in df.iterrows(): + question = row.to_dict() + question["dataset_type"] = "gpqa" + + # Shuffle the answer options + correct_answer = question["Correct Answer"] + incorrect_answers = [ + question["Incorrect Answer 1"], + question["Incorrect Answer 2"], + question["Incorrect Answer 3"] + ] + + # Create list of (answer, is_correct) tuples + options = [(ans, ans == correct_answer) for ans in incorrect_answers] + options.append((correct_answer, True)) + + # Shuffle the options + rng.shuffle(options) + + # Extract shuffled answers and determine correct letter + shuffled_answers = [ans for ans, _ in options] + correct_letter = chr(ord('A') + options.index((correct_answer, True))) + + # Store shuffled answers and correct letter + question["shuffled_answers"] = shuffled_answers + question["correct_letter"] = correct_letter + + self.questions.append(question) + + print(f"GPQA dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["Question"] + + def get_answer(self, question: Dict) -> str: + # GPQA returns the correct letter (A, B, C, or D) + return question["correct_letter"] + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["gpqa"].format( + Question=self.get_question_text(question), + A=question["shuffled_answers"][0], + B=question["shuffled_answers"][1], + C=question["shuffled_answers"][2], + D=question["shuffled_answers"][3] + ) + +class Grader: + def __init__( + self, + grader_type: str = "llm", + grader_script: Optional[str] = None, + judge_model_name: Optional[str] = None, + judge_server_url: str = "", + dataset_type: str = "aime" + ): + self.grader_type = grader_type + self.grader_script = grader_script + self.judge_model_name = judge_model_name + self.judge_server_url = judge_server_url + self.dataset_type = dataset_type + self.pattern = self._get_pattern() + + def _get_pattern(self) -> Optional[str]: + if self.grader_type == "regex": + return GRADER_PATTERNS.get(self.dataset_type) # Use dataset_type as key + return None + + def _extract_answer_regex(self, pred: str) -> Optional[str]: + """Extract answer using regex pattern""" + if not self.pattern: + return None + + # For AIME datasets, prioritize boxed answers + if self.dataset_type in ["aime", "aime2025"]: + boxed_pattern = r'\\boxed{([^}]+)}' + boxed_matches = re.findall(boxed_pattern, pred, re.IGNORECASE) + if boxed_matches: + # Return the last boxed answer found (most likely the final answer) + return boxed_matches[-1].strip() + + # For other datasets, search for numbers from the end of the text + # This prioritizes numbers that appear later in the response + matches = re.findall(self.pattern, pred, re.IGNORECASE) + if not matches: + return None + + # Process matches from end to start + for match in reversed(matches): + if isinstance(match, tuple): + match = match[0] if match[0] else match[1] + answer = match.strip() + if answer: + return answer + return None + + def _grade_regex(self, gold: str, pred: str) -> Tuple[bool, Optional[str]]: + """Grade using regex pattern matching""" + answer = self._extract_answer_regex(pred) + if answer is None: + return False, None + is_correct = answer.strip() == gold.strip() + return is_correct, answer + + def _grade_cli(self, gold: str, pred: str) -> Tuple[bool, Optional[str]]: + """Grade using external CLI script""" + if not self.grader_script: + raise ValueError("CLI grader requires --grader-script") + + script_path = Path(self.grader_script) + if not script_path.exists(): + raise FileNotFoundError(f"Grader script not found: {self.grader_script}") + + try: + result = subprocess.run( + [str(script_path), "--answer", pred, "--expected", gold], + capture_output=True, + text=True, + timeout=30 + ) + is_correct = result.returncode == 0 + answer = pred if is_correct else None + return is_correct, answer + except subprocess.TimeoutExpired: + return False, None + except Exception as e: + return False, None + + def _grade_llm(self, gold: str, pred: str, problem: str) -> Tuple[bool, Optional[str]]: + """Grade using LLM-based extraction with few-shot examples""" + sample_answers = SAMPLE_ANSWERS.get(self.dataset_type, []) + sample_examples = "\n".join([ + f"Example {i+1}: {ans}" for i, ans in enumerate(sample_answers) + ]) + + system_prompt = f"""You are an answer extraction system. Your task is to extract the answer from the model's response. + +Here are some examples of extracted answers to demonstrate what you are supposed to output: + +{sample_examples} + +When extracting the answer, provide only the extracted answer itself, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" + + user_prompt = f"""Extract the answer from the following response: + +"{pred}" + +Please provide only the extracted answer, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" + + url = f"{self.judge_server_url}/v1/chat/completions" + headers = {"Content-Type": "application/json"} + data = { + "model": self.judge_model_name, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + "temperature": 0, + } + #print(json.dumps(data, indent=2)) + + try: + response = requests.post(url, headers=headers, json=data) + response.raise_for_status() + answer = response.json()["choices"][0]["message"]["content"].strip() + is_correct = answer.strip().lower() == gold.strip().lower() + return is_correct, answer + except Exception as e: + return False, None + + def _truncate_response(self, response: str, max_lines: int = 6) -> str: + """Keep only last N lines of response""" + lines = response.split('\n') + return '\n'.join(lines[-max_lines:]) if len(lines) > max_lines else response + + def grade(self, gold: str, pred: str, problem: str = "") -> Tuple[bool, Optional[str]]: + """Grade the response""" + if self.grader_type == "regex": + return self._grade_regex(gold, pred) + elif self.grader_type == "cli": + return self._grade_cli(gold, pred) + elif self.grader_type == "llm": + return self._grade_llm(gold, pred, problem) + else: + raise ValueError(f"Unknown grader type: {self.grader_type}") + +class Processor: + def __init__( + self, + server_url: str, + grader: Grader, + model_name: Optional[str] = None, + threads: int = 32, + n_predict: int = -1 + ): + self.server_url = server_url + self.grader = grader + self.model_name = model_name + self.threads = threads + self.n_predict = n_predict + + def _make_request(self, eval_state: EvalState, prompt: str) -> Tuple[Dict[str, Any], int, str]: + url = f"{self.server_url}/v1/chat/completions" + headers = {"Content-Type": "application/json"} + data = { + "model": self.model_name if self.model_name else "llama", + "messages": [{"role": "user", "content": prompt}], + "n_predict": self.n_predict + } + if eval_state.sampling_config.get("temperature") is not None: + data["temperature"] = eval_state.sampling_config["temperature"] + if eval_state.sampling_config.get("top_k") is not None: + data["top_k"] = eval_state.sampling_config["top_k"] + if eval_state.sampling_config.get("top_p") is not None: + data["top_p"] = eval_state.sampling_config["top_p"] + if eval_state.sampling_config.get("min_p") is not None: + data["min_p"] = eval_state.sampling_config["min_p"] + + response = requests.post(url, headers=headers, json=data) + response.raise_for_status() + result = response.json() + tokens = result.get("usage", {}).get("completion_tokens", 0) + finish_reason = result.get("choices", [{}])[0].get("finish_reason", "stop") + return result, tokens, finish_reason + + def _process_single_case(self, eval_state: EvalState, i: int, task_id: str) -> TaskState: + question_text, prompt, expected = eval_state.get_case(i) + + task_state = TaskState( + task_id=task_id, + prompt=prompt, + expected=expected, + question_text=question_text + ) + + try: + response, tokens, finish_reason = self._make_request(eval_state, prompt) + result = response["choices"][0]["message"]["content"] + reasoning_content = response["choices"][0].get("message", {}).get("reasoning_content") + task_state.response = result + task_state.tokens = tokens + task_state.reasoning_content = reasoning_content + + if finish_reason != "stop": + task_state.status = f"error: finish_reason={finish_reason}" + eval_state.add_result(task_id, prompt, expected, result, None, {"finish_reason": finish_reason}, False, task_state.status, tokens, reasoning_content) + eval_state.dump() + return task_state + + result_truncated = self.grader._truncate_response(result, max_lines=10) + is_correct, answer = self.grader.grade(expected, result_truncated, prompt) + + grader_log = { + "pred": result_truncated, + "grader_type": self.grader.grader_type + } + if self.grader.grader_type == "regex" and self.grader.pattern: + grader_log["pattern"] = self.grader.pattern + + task_state.correct = is_correct + task_state.answer = answer + task_state.grader_log = grader_log + task_state.status = "ok" + + eval_state.add_result(task_id, prompt, expected, result, answer, grader_log, is_correct, "ok", tokens, reasoning_content) + + eval_state.dump() + + except Exception as e: + task_state.status = f"error: {str(e)}" + + return task_state + + def evaluate(self, eval_state: EvalState, verbose: bool = False, resume: bool = False): + total_tasks = len(eval_state.tasks) + eval_state.total = len(eval_state.all_tasks) if eval_state.all_tasks else total_tasks + eval_state.processed = 0 + start_time = time.time() + + print(f"\nProcessing {len(eval_state.tasks)} {eval_state.dataset_type.upper()} tasks ...") + print(f"Server: {self.server_url} (model: {self.model_name})") + print(f"Grader: {self.grader.grader_type}") + print(f"Threads: {self.threads}") + print(f"Sampling: temp={eval_state.sampling_config.get('temperature', 'skip')}, top-k={eval_state.sampling_config.get('top_k', 'skip')}, top-p={eval_state.sampling_config.get('top_p', 'skip')}, min-p={eval_state.sampling_config.get('min_p', 'skip')}") + print() + + correct_count = 0 + + with ThreadPoolExecutor(max_workers=self.threads) as executor: + futures = { + executor.submit(self._process_single_case, eval_state, i, task_id): (i, task_id) + for i, task_id in eval_state.tasks + } + + session_time = 0.0 + for future in as_completed(futures): + task_state = future.result() + eval_state.processed += 1 + if task_state.correct: + correct_count += 1 + elapsed = time.time() - start_time + eval_state.total_time += elapsed + session_time += elapsed + start_time = time.time() + eval_state.print_progress(task_state, total_tasks, correct_count) + + if verbose: + print(f"\nCase {eval_state.processed}: {task_state.correct}") + print(f" Expected: {task_state.expected}") + if task_state.response: + print(f" Response: {task_state.response}") + if task_state.answer: + print(f" Answer: {task_state.answer}") + print(f" Status: {task_state.status}") + + print(f"\nSession time: {session_time:.1f}s | Total accumulated time: {eval_state.total_time:.1f}s") + eval_state.print_summary() + eval_state.dump() + +def main(): + parser = argparse.ArgumentParser( + description="Simplified evaluation tool for llama.cpp" + ) + parser.add_argument( + "--server", + type=str, + default="http://localhost:8033", + help="llama-server URL (default: http://localhost:8033)" + ) + parser.add_argument( + "--dataset", + type=str, + default="aime", + choices=["aime", "aime2025", "gsm8k", "gpqa"], + help="Dataset type (default: aime)" + ) + parser.add_argument( + "--n_cases", + type=int, + default=None, + help="Number of cases to evaluate (default: all)" + ) + parser.add_argument( + "--seed", + type=int, + default=1234, + help="Random seed for shuffling (default: 1234)" + ) + parser.add_argument( + "--n_predict", + type=int, + default=-1, + help="Max tokens to predict per prompt (default: -1, infinite)" + ) + parser.add_argument( + "--temperature", + type=float, + default=None, + help="Sampling temperature (default: not passed)" + ) + parser.add_argument( + "--top-k", + type=int, + default=None, + help="Top K sampling (default: not passed)" + ) + parser.add_argument( + "--top-p", + type=float, + default=None, + help="Top P sampling (default: not passed)" + ) + parser.add_argument( + "--min-p", + type=float, + default=None, + help="Min P sampling (default: not passed)" + ) + parser.add_argument( + "--threads", + type=int, + default=32, + help="Number of threads for parallel requests (default: 32)" + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="Model name to append as query parameter (e.g., gpt-oss-20b-hf)" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Show detailed output for each case" + ) + parser.add_argument( + "--output", + type=Path, + default=Path("llama-eval-state.json"), + help="Output file for eval state (default: llama-eval-state.json)" + ) + parser.add_argument( + "--grader-type", + type=str, + default="llm", + choices=["regex", "cli", "llm"], + help="Grader type: regex, cli, or llm (default: llm)" + ) + parser.add_argument( + "--grader-script", + type=str, + default=None, + help="CLI grader script path (required for --grader-type cli)" + ) + parser.add_argument( + "--judge-server", + type=str, + default="", + help="Server URL for LLM judge (default: same as main server)" + ) + parser.add_argument( + "--judge-model", + type=str, + default="", + help="Model name for LLM judge (default: same as main model)" + ) + parser.add_argument( + "--resume", + action="store_true", + help="Resume from existing eval state" + ) + + args = parser.parse_args() + + if args.dataset == "gpqa" and args.grader_type != "llm": + print("Error: GPQA dataset requires --grader-type llm") + parser.print_help() + sys.exit(1) + + if args.output.exists(): + print(f"Loading existing eval state from {args.output}") + eval_state = EvalState.load(args.output) + + eval_state.print_all_tasks() + eval_state.print_existing_summary() + + if eval_state.is_complete(): + return + + print() + + if not args.resume: + print(f"Evaluation incomplete. Run with --resume to continue.") + return + + pending_tasks = eval_state.get_pending_tasks() + print(f"Resuming from {len(pending_tasks)} pending tasks") + + existing_cases = eval_state.task_states.get("cases", {}) + + eval_state.tasks = pending_tasks + eval_state.task_states["cases"] = existing_cases + + judge_server_url = args.judge_server if args.judge_server else args.server + judge_model_name = args.judge_model if args.judge_model else args.model + grader = Grader( + grader_type=args.grader_type, + grader_script=args.grader_script, + judge_model_name=judge_model_name, + judge_server_url=judge_server_url, + dataset_type=eval_state.dataset_type + ) + resume = True + else: + if args.resume: + print("Error: No existing eval state found to resume") + sys.exit(1) + + judge_server_url = args.judge_server if args.judge_server else args.server + judge_model_name = args.judge_model if args.judge_model else args.model + + grader = Grader( + grader_type=args.grader_type, + grader_script=args.grader_script, + judge_model_name=judge_model_name, + judge_server_url=judge_server_url, + dataset_type=args.dataset + ) + + if args.grader_type == "llm" and not args.judge_server: + print("Warning: Using same server for LLM judge (no --judge-server specified)") + + sampling_config = {} + if args.temperature is not None: + sampling_config["temperature"] = args.temperature + if args.top_k is not None: + sampling_config["top_k"] = args.top_k + if args.top_p is not None: + sampling_config["top_p"] = args.top_p + if args.min_p is not None: + sampling_config["min_p"] = args.min_p + + eval_state = EvalState( + dataset_type=args.dataset, + sampling_config=sampling_config, + output_file=args.output + ) + eval_state.load_dataset(seed=args.seed) + eval_state.setup_tasks(n_cases=args.n_cases, seed=args.seed) + eval_state.dump() + resume = False + + eval_state.print_all_tasks() + + processor = Processor( + server_url=args.server, + grader=grader, + model_name=args.model, + threads=args.threads, + n_predict=args.n_predict + ) + + processor.evaluate(eval_state, verbose=args.verbose, resume=resume) + print(f"\nEval state dumped to {args.output}") + +if __name__ == "__main__": + main() diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py new file mode 100755 index 0000000000..210683953e --- /dev/null +++ b/examples/llama-eval/llama-server-simulator.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 + +import argparse +import json +import random +import re +import time +import sys +import os +from typing import Dict, List, Optional +from dataclasses import dataclass, asdict +from pathlib import Path + +import datasets +from flask import Flask, request, jsonify + +# Set cache directory for HuggingFace datasets +cache_dir = Path.home() / ".cache" / "huggingface" / "datasets" +cache_dir.mkdir(parents=True, exist_ok=True) +os.environ["HF_DATASETS_CACHE"] = str(cache_dir) + +def dice(s1: str, s2: str) -> float: + """Calculate Dice coefficient between two strings based on bigram overlap.""" + if not s1 and not s2: + return 1.0 + + def _bigrams(s: str): + return [s[i : i + 2] for i in range(len(s) - 1)] + + bigrams1 = _bigrams(s1) + bigrams2 = _bigrams(s2) + + if not bigrams1 and not bigrams2: + return 1.0 + + from collections import Counter + + freq1 = Counter(bigrams1) + freq2 = Counter(bigrams2) + + intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1) + dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2)) + return dice_coeff + +def debug_log(message: str): + """Log debug messages to both stdout and a file""" + print(message, file=sys.stderr) + with open("/tmp/simulator-debug.log", "a") as f: + f.write(message + "\n") + +app = Flask(__name__) + +@dataclass +class EvalState: + id: str + tasks: List[str] + task_states: Dict[str, Dict] + sampling_config: Dict + +def normalize_number(s: str) -> Optional[int]: + match = re.match(r"\d+", s) # match digits from the start + if not match: + return None + return int(match.group(0)) + +class AimeDataset: + def __init__(self, split: str = "train"): + self.split = split + self.questions: List[Dict] = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME dataset (split: {self.split})...") + + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + + self.questions = list(ds) + print(f"AIME dataset loaded: {len(self.questions)} questions") + + def find_question(self, request_text: str) -> Optional[Dict]: + best_match = None + best_distance = -1 + best_index = -1 + + for i, question in enumerate(self.questions): + question_text = question["problem"] + request_lower = request_text.lower() + question_lower = question_text.lower() + + # Exact match + if question_lower == request_lower: + debug_log(f"DEBUG: Found exact match at index {i}") + return question + + # Remove LaTeX formatting for more flexible matching + question_no_latex = re.sub(r'\$[^$]+\$', '', question_text) + if question_no_latex.lower() == request_lower: + debug_log(f"DEBUG: Found match (no LaTeX) at index {i}") + return question + + # Calculate Levenshtein distance for partial matches + # Only consider if request is at least 50% of question length + if len(request_lower) >= len(question_lower) * 0.5: + distance = dice(question_lower, request_lower) + + if distance > best_distance: + best_distance = distance + best_match = question + best_index = i + + if best_match and best_distance > 0.3: # Threshold for partial match + debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}") + return best_match + + debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...") + return None + + def get_answer(self, question: Dict) -> str: + answer = question["answer"] + if isinstance(answer, str): + normalized = normalize_number(answer) + return str(normalized) if normalized is not None else answer + return str(answer) + +class Simulator: + def __init__( + self, + port: int = 8033, + host: str = "localhost", + success_rate: float = 0.8, + dataset_split: str = "train" + ): + self.port = port + self.host = host + self.success_rate = success_rate + self.dataset = AimeDataset(dataset_split) + self.eval_state = EvalState( + id="aime-2025", + tasks=["aime"], + task_states={}, + sampling_config={"temperature": 0, "max_tokens": 2048} + ) + + def _generate_response( + self, + question: Dict, + should_be_correct: bool + ) -> Dict: + expected_answer = self.dataset.get_answer(question) + + if should_be_correct: + response_text = expected_answer + else: + response_text = self._generate_wrong_answer(question) + + return { + "id": f"chatcmpl-{int(time.time())}", + "object": "chat.completion", + "created": int(time.time()), + "model": "llama", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": response_text + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150 + } + } + + def _generate_wrong_answer(self, question: Dict) -> str: + expected_answer = self.dataset.get_answer(question) + + if expected_answer.isdigit(): + wrong_answer = str(int(expected_answer) + 1) + else: + wrong_answer = expected_answer + " (wrong)" + + return wrong_answer + + def _process_request(self, request_data: Dict) -> Dict: + messages = request_data.get("messages", []) + if not messages: + return {"error": "No messages in request"} + + request_text = messages[0].get("content", "") + debug_log(f"DEBUG: Received request with content: {request_text[:150]}...") + + question = self.dataset.find_question(request_text) + if not question: + debug_log(f"DEBUG: find_question returned None") + return {"error": "No matching question found"} + + should_be_correct = random.random() < self.success_rate + + response = self._generate_response(question, should_be_correct) + + task_id = "aime" + self.eval_state.task_states[task_id] = { + "correct": should_be_correct, + "expected": self.dataset.get_answer(question), + "predicted": response["choices"][0]["message"]["content"] + } + + return response + +@app.route('/v1/chat/completions', methods=['POST']) +def chat_completions(): + try: + request_data = request.get_json() + + if not request_data: + return jsonify({"error": "Invalid JSON"}), 400 + + response = simulator._process_request(request_data) + + return jsonify(response) + + except Exception as e: + print(f"Error processing request: {e}") + return jsonify({"error": str(e)}), 500 + +def main(): + parser = argparse.ArgumentParser( + description="llama-server simulator for testing eval scripts" + ) + parser.add_argument( + "--port", + type=int, + default=8033, + help="Server port (default: 8033)" + ) + parser.add_argument( + "--host", + type=str, + default="localhost", + help="Server host (default: localhost)" + ) + parser.add_argument( + "--success-rate", + type=float, + default=0.8, + help="Success rate 0-1 (default: 0.8)" + ) + parser.add_argument( + "--dataset-split", + type=str, + default="train", + help="AIME dataset split to use (default: train)" + ) + + args = parser.parse_args() + + global simulator + simulator = Simulator( + port=args.port, + host=args.host, + success_rate=args.success_rate, + dataset_split=args.dataset_split + ) + + print("\n=== llama-server-simulator ===") + print(f"Server running on http://{args.host}:{args.port}") + print(f"Success rate: {args.success_rate}") + print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions") + print("\nPress Ctrl+C to stop\n") + + app.run(host=args.host, port=args.port, debug=False) + +if __name__ == "__main__": + main() diff --git a/examples/llama-eval/test-simulator.sh b/examples/llama-eval/test-simulator.sh new file mode 100755 index 0000000000..f3ddf3e95d --- /dev/null +++ b/examples/llama-eval/test-simulator.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +set -e + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "=== llama-server-simulator Test Script ===" +echo "" + +PORT=8033 +SUCCESS_RATE=0.8 +TEST_PORT=8034 + +echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..." +source "$SCRIPT_DIR/venv/bin/activate" +python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 & +SIMULATOR_PID=$! + +echo "Waiting for simulator to start..." +sleep 5 + +# Helper function to make a request and extract the answer +make_request() { + local question="$1" + curl -s -X POST http://localhost:$PORT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"llama\", + \"messages\": [ + {\"role\": \"user\", \"content\": \"$question\"} + ], + \"temperature\": 0, + \"max_tokens\": 2048 + }" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))" +} + +# Test question (repeated in multiple tests) +TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)." + +echo "" +echo "=== Test 1: Correct Answer ===" +echo "Sending request with known question..." +answer=$(make_request "$TEST_QUESTION") +echo "Answer: $answer" +echo "Expected: 116" +echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")" + +echo "" +echo "=== Test 2: Wrong Answer ===" +echo "Sending request with known question (success rate 0.0)..." +answer=$(make_request "$TEST_QUESTION") +echo "Answer: $answer" +echo "Expected: 116" +echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")" + +echo "" +echo "=== Test 3: No Matching Question ===" +echo "Sending request with non-matching text..." +response=$(make_request "What is the capital of France?") +echo "Response: $response" +echo "Expected: No matching question found" +echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")" + +echo "" +echo "=== Test 4: Success Rate Verification ===" +echo "Sending 10 requests to test success rate..." +correct_count=0 +for i in {1..10}; do + answer=$(make_request "$TEST_QUESTION") + if [ "$answer" == "116" ]; then + correct_count=$((correct_count + 1)) + fi + echo " Request $i: Answer = $answer" +done +echo "Correct answers: $correct_count/10" +echo "Expected: ~8/10 (80% success rate)" +echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%" + +echo "" +echo "=== Test Complete ===" +echo "Stopping simulator..." +kill $SIMULATOR_PID 2>/dev/null +wait $SIMULATOR_PID 2>/dev/null || true + +echo "Simulator stopped."