From b0d50a5681706ca965044ba378a3a1c9bf9883b7 Mon Sep 17 00:00:00 2001
From: gatbontonpc
Date: Mon, 12 Jan 2026 13:53:39 -0500
Subject: [PATCH] Add readme

---
 examples/llama-eval/README.md     | 20 ++++++++++++++++++++
 examples/llama-eval/llama-eval.py |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 examples/llama-eval/README.md

diff --git a/examples/llama-eval/README.md b/examples/llama-eval/README.md
new file mode 100644
index 0000000000..4dfaf09a22
--- /dev/null
+++ b/examples/llama-eval/README.md
@@ -0,0 +1,20 @@
+# llama.cpp/examples/llama-eval
+
+The purpose of this example is to run evaluation metrics against an OpenAI API-compatible LLM server over HTTP (llama-server).
+
+```bash
+./llama-server -m model.gguf --port 8033
+```
+
+```bash
+python examples/llama-eval/llama-eval.py --path_server http://localhost:8033 --n_prompts 100 --prompt_source arc
+```
+
+## Supported tasks (MVP)
+
+- **GSM8K** — grade-school math (final-answer only)
+- **AIME** — competition math (final-answer only)
+- **MMLU** — multi-domain knowledge (multiple choice)
+- **HellaSwag** — commonsense reasoning (multiple choice)
+- **ARC** — grade-school science reasoning (multiple choice)
+- **WinoGrande** — commonsense coreference resolution (multiple choice)
\ No newline at end of file
diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py
index 411d0adbab..0ded50545c 100644
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
@@ -576,7 +576,7 @@ if __name__ == "__main__":
         "--prompt_source",
         type=str,
         default="mmlu",
-        help=f"Eval types supported: all,{TASK_DICT.keys()}",
+        help=f"Eval types supported: all,{list(TASK_DICT.keys())}",
     )
     parser.add_argument(
         "--n_prompts", type=int, default=None, help="Number of prompts to evaluate"
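
For context on the README's two-step usage above: the eval script presumably drives llama-server through its OpenAI-compatible `/v1/chat/completions` endpoint. Below is a minimal sketch of that interaction, assuming the default `--path_server` value from the README; the `query_server` helper is hypothetical and not part of this patch.

```python
# Hypothetical sketch (not from this patch): one prompt round-trip against a
# running llama-server instance via its OpenAI-compatible chat endpoint.
import requests


def query_server(prompt: str, path_server: str = "http://localhost:8033") -> str:
    """Send a single prompt to llama-server and return the model's reply text."""
    response = requests.post(
        f"{path_server}/v1/chat/completions",
        json={
            # llama-server serves whatever model it was launched with; the
            # "model" field is part of the OpenAI request shape.
            "model": "model.gguf",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.0,  # deterministic decoding is typical for evals
        },
        timeout=300,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


if __name__ == "__main__":
    print(query_server("What is 7 * 8? Answer with the number only."))
```

Because the server speaks the standard OpenAI request/response shape, a sketch like this works against any backend the README describes, not just llama-server.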