From 013963cfd55d4f176c674500df3cc40763390a5a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Feb 2026 21:22:06 +0200 Subject: [PATCH] add html --- examples/llama-eval/llama-eval.py | 139 ++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index 726936ef40..66e7319a68 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -257,6 +257,145 @@ class EvalState: with open(self.output_file, "w") as f: json.dump(data, f, indent=2) + self.dump_html(tasks_to_save, all_cases) + + def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, Any]): + html_file = Path(str(self.output_file) + ".html") + + cases = all_cases + completed = {tid: c for tid, c in cases.items() if c.get("status") == "ok"} + correct_count = sum(1 for c in completed.values() if c.get("correct", False)) + incorrect_count = len(completed) - correct_count + pending_count = len(tasks_to_save) - len(completed) + accuracy = correct_count / len(completed) * 100 if completed else 0.0 + + sampling_parts = [] + for k, v in self.sampling_config.items(): + if v is not None: + sampling_parts.append(f"{k}={v}") + sampling_str = ", ".join(sampling_parts) if sampling_parts else "default" + + rows = [] + for i, task_id in tasks_to_save: + case = cases.get(task_id, {}) + status = case.get("status", "pending") + gold = case.get("gold", "") + extracted = case.get("extracted", "") if status == "ok" else "" + is_correct = case.get("correct", False) if status == "ok" else False + pred = case.get("pred", "") or "" + prompt = case.get("prompt", "") or "" + grader_log = case.get("grader_log", {}) + + if status == "ok": + status_class = "correct" if is_correct else "incorrect" + status_text = "✓ Correct" if is_correct else "✗ Incorrect" + elif status == "pending": + status_class = "pending" + status_text = "Pending" + else: + status_class = "error" + status_text = 
f"Error: {status}" + + pred_escaped = self._escape_html(pred) + prompt_escaped = self._escape_html(prompt) + grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) + + rows.append(f""" + {task_id} + {status_text} + {self._escape_html(gold)} + {self._escape_html(extracted)} + + + +
+

Prompt

+
{prompt_escaped}
+

Prediction

+
{pred_escaped}
+

Grader Log

+
{grader_log_str}
+
+ + """) + + rows_html = "\n".join(rows) + + html_content = f""" + + + + + Eval State - {self.dataset_type} + + + +

Eval State: {self.dataset_type.upper()}

+
+ + + + + + + + + +
Dataset{self.dataset_type}
Total Tasks{len(tasks_to_save)}
Completed{len(completed)}
Correct{correct_count}
Incorrect{incorrect_count}
Pending{pending_count}
Accuracy{accuracy:.1f}%
Sampling{sampling_str}
+
+ + + + + + + + + + + {rows_html} + +
Task IDStatusGoldExtracted
+ + +""" + + with open(html_file, "w") as f: + f.write(html_content) + + def _escape_html(self, s: str) -> str: + return (s.replace("&", "&amp;") + .replace("<", "&lt;") + .replace(">", "&gt;") + .replace('"', "&quot;") + .replace("'", "&#39;")) + @classmethod def load(cls, path: Path) -> "EvalState": with open(path, "r") as f: