From 1db8428f00fd24e346a7700bcd0aa45b50ba3df6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 15 Feb 2026 22:16:54 +0200
Subject: [PATCH] remove old files

---
 examples/llama-eval/llama-eval.py  | 703 -----------------------------
 examples/llama-eval/test-grader.py |  26 --
 2 files changed, 729 deletions(-)
 delete mode 100644 examples/llama-eval/llama-eval.py
 delete mode 100755 examples/llama-eval/test-grader.py

diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py
deleted file mode 100644
index 78bfc0c2e4..0000000000
--- a/examples/llama-eval/llama-eval.py
+++ /dev/null
@@ -1,703 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-import argparse
-import os
-from time import time
-from typing import Union, Any, Iterator, Mapping, Set, cast
-
-import datasets
-import logging
-import requests
-from tqdm.contrib.concurrent import thread_map
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from pathlib import Path
-import json
-import threading
-
-logging.basicConfig(level=logging.INFO, format='%(message)s')
-logger = logging.getLogger("llama-eval")
-
-MATH_TEMPLATE = """
-{question}
-Do not include any explanation. Put your final answer within \\boxed{{}}.
-"""
-
-
-def format_multiple_choice(prompt: str, choices: list[str]):
-    lines = [prompt]
-
-    labels = [chr(ord("A") + i) for i in range(len(choices))]
-    for l, c in zip(labels, choices):
-        lines.append(f"({l}): {c.strip()}")
-    lines.append(
-        "Do not include any explanation. Answer with the corresponding option letter only"
-    )
-    lines.append(", ".join(labels))
-    lines.append("Put your final answer within \\boxed{}.")
-
-    return "\n".join(lines), labels
-
-
-def extract_boxed_text(text: str) -> str:
-    pattern = r"boxed{(.*?)}|framebox{(.*?)}"
-    matches = re.findall(pattern, text, re.DOTALL)
-    logger.debug(matches)
-    if matches:
-        for match in matches[::-1]:
-            for group in match:
-                if group != "":
-                    return group.split(",")[-1].strip()
-    logger.debug("Could not extract boxed text. Maybe expand the context window.")
-
-    return ""
-
-
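-# Illustrative behavior of extract_boxed_text() (made-up inputs, not from the
-# test suite): the last \boxed{...} occurrence wins, and only the last
-# comma-separated item inside it is kept:
-#   extract_boxed_text("the answer is \\boxed{42}")          -> "42"
-#   extract_boxed_text("\\boxed{A, B} ... later \\boxed{C}") -> "C"
-#   extract_boxed_text("no boxed answer at all")             -> ""
-
-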
Maybe expand context window") - - return "" - - -@dataclass(frozen=True) -class Case: - task: str - kind: str - case_id: str - prompt: str - gold: str - meta_data: dict[str, Any] - - -class TaskSpec(ABC): - name: str - kind: str - - @abstractmethod - def load(self, limit, seed) -> datasets.Dataset: - pass - - @abstractmethod - def iter_cases(self, limit: int, seed: int) -> Iterator[Case]: - pass - - @staticmethod - @abstractmethod - def grade(case: Case, response: dict) -> dict[str, Any]: - pass - - -class MCTaskSpec(TaskSpec): - @staticmethod - def grade(case: Case, response: dict) -> dict[str, Any]: - logger.debug(f"response {response}") - result = { - "task": case.task, - "case_id": case.case_id, - "correct": 0, - "pred": None, - "gold": case.gold, - "status": "ok", - } - - try: - extracted_answer = extract_boxed_text(response["choices"][0]["text"]) - except Exception as e: - result["status"] = "error" - logger.warning("ERROR: extract_boxed_text") - - return result - - if not extracted_answer: - result["status"] = "invalid" - logger.warning("INVALID: extract_boxed_text") - return result - - logger.debug(f"extracted_answer {extracted_answer}") - logger.debug(f"data['answer'] {case.gold}") - result["pred"] = extracted_answer - result["correct"] = 1 if extracted_answer == case.gold else 0 - - return result - - -class MathTaskSpec(TaskSpec): - - @staticmethod - def grade(case: Case, response: dict) -> dict[str, Any]: - logger.debug(f"response {response}") - result = { - "task": case.task, - "case_id": case.case_id, - "correct": 0, - "gold": case.gold, - "status": "ok", - "pred": None, - } - - try: - extracted_answer = extract_boxed_text(response["choices"][0]["text"]) - except: - result["status"] = "error" - logger.warning("ERROR: extract_boxed_text") - return result - - source_answer = case.gold - try: # All AIME answers are integers, so we convert the extracted answer to an integer - extracted_answer = int(extracted_answer) - source_answer = int(case.gold) - except (ValueError, TypeError): - result["status"] = "invalid" - return result - - logger.debug(f"extracted_answer {extracted_answer}") - logger.debug(f"data['answer'] {case.gold}") - result["pred"] = extracted_answer - result["correct"] = 1 if extracted_answer == source_answer else 0 - - return result - - -class ARC_Task(MCTaskSpec): - - def __init__(self): - self.name = "arc" - self.kind = "mc" - self.config = "ARC-Challenge" - self.split = "test" - - def load(self, limit, seed) -> datasets.Dataset: - ds = datasets.load_dataset("allenai/ai2_arc", self.config, split=self.split) - ds = ds.add_column("_row_id", list(range(len(ds)))) - if limit: - ds = ds.shuffle(seed=seed) - ds = ds.select(range(min(limit, len(ds)))) - return ds - - def iter_cases(self, limit: int, seed: int) -> Iterator[Case]: - ds = self.load(limit, seed) - - for doc in ds: - doc = cast(Mapping[str, Any], doc) - - prompt, labels = format_multiple_choice( - doc["question"], doc["choices"]["text"] - ) - yield Case( - task=self.name, - kind=self.kind, - case_id=f"ARC-Challenge_{self.config}_{self.split}_{doc['_row_id']}", - prompt=prompt, - gold=doc["answerKey"], - meta_data={"labels": labels}, - ) - - -class WinoGrande_Task(MCTaskSpec): - - def __init__(self): - self.name = "winogrande" - self.kind = "mc" - self.config = "winogrande_debiased" - self.split = "validation" - - def load(self, limit, seed) -> datasets.Dataset: - ds = datasets.load_dataset("winogrande", self.config, split=self.split) - - ds = ds.add_column("_row_id", list(range(len(ds)))) - if limit: - ds = 
-class MMLU_Task(MCTaskSpec):
-
-    def __init__(self):
-        self.name = "mmlu"
-        self.kind = "mc"
-        self.config = "all"
-        self.split = "test"
-
-    def load(self, limit, seed) -> datasets.Dataset:
-        ds = datasets.load_dataset("cais/mmlu", self.config, split=self.split)
-        ds = ds.add_column("_row_id", list(range(len(ds))))
-        if limit:
-            ds = ds.shuffle(seed=seed)
-            ds = ds.select(range(min(limit, len(ds))))
-        return ds
-
-    def iter_cases(self, limit: int, seed: int) -> Iterator[Case]:
-        ds = self.load(limit, seed)
-
-        for doc in ds:
-            doc = cast(Mapping[str, Any], doc)
-
-            prompt, labels = format_multiple_choice(doc["question"], doc["choices"])
-            yield Case(
-                task=self.name,
-                kind=self.kind,
-                case_id=f"mmlu_{self.config}_{self.split}_{doc['subject']}_{doc['_row_id']}",
-                prompt=prompt,
-                gold=labels[int(doc["answer"])],
-                meta_data={"subject": doc["subject"], "labels": labels},
-            )
-
-
-class Hellaswag_Task(MCTaskSpec):
-
-    # Preprocess hellaswag
-    @staticmethod
-    def preprocess(text: str):
-        text = text.strip()
-        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-        text = text.replace(" [title]", ". ")
-        text = re.sub("\\[.*?\\]", "", text)
-        text = text.replace("  ", " ")  # collapse double spaces left by bracket removal
-        return text
-
-    @staticmethod
-    def hellaswag_process_doc(doc: dict[str, str]):
-        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
-        question = Hellaswag_Task.preprocess(doc["activity_label"] + ": " + ctx)
-        proc_answers = [Hellaswag_Task.preprocess(answer) for answer in doc["endings"]]
-        prompt, labels = format_multiple_choice(question, proc_answers)
-        out_doc = {
-            "prompt": prompt,
-            "gold": labels[int(doc["label"])],
-        }
-        return out_doc
-
-    def __init__(self):
-        self.name = "hellaswag"
-        self.kind = "mc"
-
-    def load(self, limit, seed) -> datasets.Dataset:
-        ds = datasets.load_dataset("Rowan/hellaswag", split="validation")
-        if limit:
-            ds = ds.shuffle(seed=seed)
-            ds = ds.select(range(min(limit, len(ds))))
-        ds = ds.map(Hellaswag_Task.hellaswag_process_doc)
-
-        return ds
-
-    def iter_cases(self, limit: int, seed: int) -> Iterator[Case]:
-        ds = self.load(limit, seed)
-        for doc in ds:
-            doc = cast(Mapping[str, Any], doc)
-            yield Case(
-                task=self.name,
-                kind=self.kind,
-                case_id=f"hellaswag_{doc['split']}_{doc['ind']}",
-                prompt=doc["prompt"],
-                gold=doc["gold"],
-                meta_data={},
-            )
-
-
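-# The math tasks below render MATH_TEMPLATE, so for a hypothetical problem
-# "What is 2+2?" the server receives:
-#
-#   What is 2+2?
-#   Do not include any explanation. Put your final answer within \boxed{}.
-
-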
-class Aime_Task(MathTaskSpec):
-
-    def __init__(self):
-        self.name = "aime"
-        self.kind = "math"
-        self.split = "train"
-
-    def load(self, limit, seed) -> datasets.Dataset:
-        ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
-
-        if limit:
-            ds = ds.shuffle(seed=seed)
-            ds = ds.select(range(min(limit, len(ds))))
-
-        ds = ds.map(
-            lambda ex: {
-                "prompt": MATH_TEMPLATE.format(
-                    question=ex["problem"],
-                )
-            }
-        )
-        return ds
-
-    def iter_cases(self, limit: int, seed: int) -> Iterator[Case]:
-        ds = self.load(limit, seed)
-
-        for doc in ds:
-            doc = cast(Mapping[str, Any], doc)
-            yield Case(
-                task=self.name,
-                kind=self.kind,
-                case_id=f"aime_{self.split}_{doc['id']}",
-                prompt=doc["prompt"],
-                gold=doc["answer"],
-                meta_data={"id": doc["id"]},
-            )
-
-
-class Gsm8k_Task(MathTaskSpec):
-
-    def __init__(self):
-        self.name = "gsm8k"
-        self.kind = "math"
-        self.config = "main"
-        self.split = "test"
-
-    def load(self, limit, seed) -> datasets.Dataset:
-        ds = datasets.load_dataset("openai/gsm8k", self.config, split=self.split)
-        ds = ds.add_column("_row_id", list(range(len(ds))))
-        if limit:
-            ds = ds.shuffle(seed=seed)
-            ds = ds.select(range(min(limit, len(ds))))
-
-        ds = ds.map(
-            lambda k: {
-                "prompt": MATH_TEMPLATE.format(
-                    question=k["question"],
-                ),
-                # gsm8k solutions end with "#### <answer>"
-                "gold": k["answer"].split("#### ")[-1].rstrip(),
-            }
-        )
-        return ds
-
-    def iter_cases(self, limit: int, seed: int) -> Iterator[Case]:
-        ds = self.load(limit, seed)
-
-        for doc in ds:
-            doc = cast(Mapping[str, Any], doc)
-            yield Case(
-                task=self.name,
-                kind=self.kind,
-                case_id=f"gsm8k_{self.config}_{self.split}:{doc['_row_id']}",
-                prompt=doc["prompt"],
-                gold=doc["gold"],
-                meta_data={},
-            )
-
-
-TASK_DICT: dict[str, type[TaskSpec]] = {
-    "mmlu": MMLU_Task,
-    "aime": Aime_Task,
-    "gsm8k": Gsm8k_Task,
-    "hellaswag": Hellaswag_Task,
-    "arc": ARC_Task,
-    "winogrande": WinoGrande_Task,
-}
-
-
-def build_request(case: Case, n_predict: int) -> dict[str, Any]:
-    json_data = {
-        # set both: the native API reads n_predict, the OAI-compatible API reads max_tokens
-        "n_predict": n_predict,
-        "max_tokens": n_predict,
-        "temperature": 0,
-        "prompt": case.prompt,
-    }
-    return json_data
-
-
-def write_checkpoint_line(
-    checkpoint_file: Path,
-    row: dict[str, Any],
-    file_lock: threading.Lock,
-):
-    with file_lock:
-        with checkpoint_file.open(mode="a", encoding="utf-8") as f:
-            f.write(json.dumps(row) + "\n")
-
-
-def send_prompt(
-    case: Case,
-    data: dict,
-) -> dict[str, Union[str, int]]:
-    result = {
-        "task": case.task,
-        "case_id": case.case_id,
-        "status": "error",
-        "correct": 0,
-        "gold": case.gold,
-        "pred": "",
-        "error": "",
-    }
-    session: requests.Session = data["session"]
-    server_address: str = data["server_address"]
-    task = TASK_DICT.get(case.task)
-    if task is None:
-        result["error"] = f"unknown_task: {case.task}"
-        return result
-    logger.debug(case.prompt)
-
-    json_data = build_request(case, data["n_predict"])
-    res_json = {}
-    try:
-        response = session.post(f"{server_address}/v1/completions", json=json_data)
-        response.raise_for_status()
-        res_json = response.json()
-        result["status"] = "ok"
-    except Exception as e:
-        result["error"] = f"http_exception: {e}"
-        logger.warning(result["error"])
-
-    if result["status"] == "ok":
-        result = task.grade(case, res_json)
-
-    write_checkpoint_line(
-        data["checkpoint_file"],
-        result.copy(),
-        data["file_lock"],
-    )
-    return result
-
-
-def aggregate_by_task(results: list[dict[str, Any]]) -> dict[str, dict[str, int]]:
-    tmp = {
-        "total": 0,
-        "error": 0,
-        "invalid": 0,
-        "correct": 0,
-    }
-    agg: dict[str, dict[str, int]] = {}
-    for row in results:
-        d = agg.get(row["task"], tmp.copy())
-        d["total"] += 1
-        status = row["status"]
-        if status == "ok":
-            d["correct"] += row["correct"]
-        elif status == "invalid":
-            d["invalid"] += 1
-        elif status == "error":
-            d["error"] += 1
-
-        agg[row["task"]] = d
-    return agg
-
-
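-# Every graded case is appended to the checkpoint file as one JSON line; a
-# made-up example row:
-#   {"task": "gsm8k", "case_id": "gsm8k_main_test:17", "correct": 1,
-#    "gold": "72", "pred": 72, "status": "ok"}
-# aggregate_by_task() folds such rows into the per-task totals printed below.
-
-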
correct = stats["correct"] - invalid = stats["invalid"] - error = stats["error"] - - acc = (correct / total) if total > 0 else 0.0 - - print( - f"{task:<15} " - f"{acc:8.3f} " - f"{correct:8d} " - f"{total:8d} " - f"{invalid:8d} " - f"{error:8d}" - ) - - suite_total += total - suite_correct += correct - - # Overall summary - print("-" * 65) - suite_acc = (suite_correct / suite_total) if suite_total > 0 else 0.0 - print( - f"{'ALL':<15} " f"{suite_acc:8.3f} " f"{suite_correct:8d} " f"{suite_total:8d}" - ) - - -def read_checkpoint( - checkpoint_file: Path, resume_flag: bool -) -> tuple[Set[str], Set[str], list[dict[str, Any]]]: - done = set() - errored = set() - results = [] - if not resume_flag or not checkpoint_file.is_file(): - return done, errored, results - - with checkpoint_file.open(mode="r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - row = json.loads(line) - except Exception as e: - logger.warning(f"WARNING: malformed checkpoint line {line}\n{e}") - continue - - case_id = row.get("case_id") - if not case_id: - continue - - if row["status"] == "error": - errored.add(case_id) - else: - done.add(case_id) - results.append(row) - errored -= done - return done, errored, results - - -def benchmark( - path_server: str, - prompt_source: str, - n_prompts: int, - n_predict: int, - rng_seed: int, - resume_flag: bool, - checkpoint_file: Path, - log_level: int, -): - logger.setLevel(log_level) - done, errored, checkpoint_results = read_checkpoint(checkpoint_file, resume_flag) - - if not path_server.startswith("http://") and not path_server.startswith("https://"): - logger.error("ERROR: malformed server path") - return - - if os.environ.get("LLAMA_ARG_N_PARALLEL") is None: - logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32") - os.environ["LLAMA_ARG_N_PARALLEL"] = "32" - - parallel: int = int(os.environ.get("LLAMA_ARG_N_PARALLEL")) # type: ignore - - task_queue: set[TaskSpec] = set() - for src in prompt_source.split(","): - if src == "all": - for v in TASK_DICT.values(): - task_queue.add(v()) - break - task_queue.add(TASK_DICT[src]()) - - session = None - try: - server_address: str = path_server - - adapter = requests.adapters.HTTPAdapter(pool_connections=parallel, pool_maxsize=parallel) # type: ignore - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - file_lock = threading.Lock() - cases: list[Case] = [] - data: list[dict] = [] - for task in task_queue: - for case in task.iter_cases(n_prompts, rng_seed): - if case.case_id in done or case.case_id in errored: - logger.debug(f"Skipping case_id {case.case_id} from checkpoint") - continue - - cases.append(case) - data.append( - { - "prompt_source": prompt_source, - "session": session, - "server_address": server_address, - "n_predict": n_predict, - "file_lock": file_lock, - "checkpoint_file": checkpoint_file, - } - ) - logger.info("Starting the benchmark...\n") - t0 = time() - results: list[dict[str, Union[str, int]]] = thread_map( - send_prompt, - cases, - data, - max_workers=parallel, - chunksize=1, - ) - finally: - if session is not None: - session.close() - - t1 = time() - logger.info(f"\nllama-eval duration: {t1-t0:.2f} s") - results.extend(checkpoint_results) - pertask_results = aggregate_by_task(results) - print_summary(pertask_results) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Tool for benchmarking the throughput of the llama.cpp HTTP server. 
" - "Results are printed to console and visualized as plots (saved to current working directory). " - "To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help). " - "The reported numbers are the speeds as observed by the Python script and may differ from the performance reported by the server, " - "particularly when the server is fast vs. the network or Python script (e.g. when serving a very small model)." - ) - parser.add_argument( - "--path_server", - type=str, - default="http://localhost:8033", - help="llama-server url", - ) - parser.add_argument( - "--prompt_source", - type=str, - default="mmlu", - help=f"Eval types supported: all,{list(TASK_DICT.keys())}", - ) - parser.add_argument( - "--n_prompts", type=int, default=None, help="Number of prompts to evaluate" - ) - parser.add_argument( - "--rng_seed", - type=int, - default=42, - help="Number to see rng (Used to select prompts from datasource)", - ) - parser.add_argument( - "--n_predict", - type=int, - default=2048, - help="Max. number of tokens to predict per prompt", - ) - parser.add_argument( - "--resume", - dest="resume_flag", - action="store_true", - default=True, - help="Enable resuming from last state stored in checkpoint file", - ) - parser.add_argument( - "--no-resume", - dest="resume_flag", - action="store_false", - help="Disble resuming from last state stored in checkpoint file", - ) - parser.add_argument( - "--checkpoint-file", - type=Path, - dest="checkpoint_file", - default="./llama-eval-checkpoint.jsonl", - help="Checkpoint file to read last state from", - ) - parser.set_defaults(log_level=logging.INFO) - parser.add_argument( - "--quiet", action="store_const", dest="log_level", const=logging.ERROR - ) - parser.add_argument( - "--debug", - action="store_const", - default=True, - dest="log_level", - const=logging.DEBUG, - ) - - args = parser.parse_args() - benchmark(**vars(args)) diff --git a/examples/llama-eval/test-grader.py b/examples/llama-eval/test-grader.py deleted file mode 100755 index c32901cf70..0000000000 --- a/examples/llama-eval/test-grader.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse - -def main(): - parser = argparse.ArgumentParser(description="Test grader script") - parser.add_argument("--answer", type=str, required=True, help="Predicted answer") - parser.add_argument("--expected", type=str, required=True, help="Expected answer") - args = parser.parse_args() - - pred = args.answer.strip() - gold = args.expected.strip() - - print(f"Gold: {gold}") - print(f"Pred: {pred}") - - if pred == gold: - print("Correct!") - sys.exit(0) - else: - print("Incorrect") - sys.exit(1) - -if __name__ == "__main__": - main()