diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py
index 726936ef40..66e7319a68 100755
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
@@ -257,6 +257,145 @@ class EvalState:
with open(self.output_file, "w") as f:
json.dump(data, f, indent=2)
+ self.dump_html(tasks_to_save, all_cases)
+
def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, Any]):
    """Write a human-readable HTML report next to the JSON output file.

    The report is saved as ``<output_file>.html`` and contains a summary
    table (dataset, counts, accuracy, sampling config) followed by one
    row per task with a collapsible detail panel (prompt, prediction,
    grader log).

    NOTE(review): the original literal markup was stripped by the paste;
    the tags below are a reconstruction of the intended table/details
    layout — confirm against the rendered report.

    Args:
        tasks_to_save: (index, task_id) pairs in display order.
        all_cases: task_id -> case dict with keys such as "status",
            "gold", "extracted", "correct", "pred", "prompt",
            "grader_log".
    """
    html_file = Path(str(self.output_file) + ".html")

    cases = all_cases
    # A case counts as completed only when its grader status is "ok".
    completed = {tid: c for tid, c in cases.items() if c.get("status") == "ok"}
    correct_count = sum(1 for c in completed.values() if c.get("correct", False))
    incorrect_count = len(completed) - correct_count
    pending_count = len(tasks_to_save) - len(completed)
    accuracy = correct_count / len(completed) * 100 if completed else 0.0

    # Render only sampling parameters that were explicitly set.
    sampling_parts = [f"{k}={v}" for k, v in self.sampling_config.items() if v is not None]
    sampling_str = ", ".join(sampling_parts) if sampling_parts else "default"

    rows = []
    for _idx, task_id in tasks_to_save:
        case = cases.get(task_id, {})
        status = case.get("status", "pending")
        gold = case.get("gold", "")
        extracted = case.get("extracted", "") if status == "ok" else ""
        is_correct = case.get("correct", False) if status == "ok" else False
        pred = case.get("pred", "") or ""
        prompt = case.get("prompt", "") or ""
        grader_log = case.get("grader_log", {})

        if status == "ok":
            status_class = "correct" if is_correct else "incorrect"
            status_text = "✓ Correct" if is_correct else "✗ Incorrect"
        elif status == "pending":
            status_class = "pending"
            status_text = "Pending"
        else:
            status_class = "error"
            status_text = f"Error: {status}"

        pred_escaped = self._escape_html(pred)
        prompt_escaped = self._escape_html(prompt)
        grader_log_str = self._escape_html(json.dumps(grader_log, indent=2))

        rows.append(f"""
        <tr class="{status_class}">
            <td>{task_id}</td>
            <td>{status_text}</td>
            <td>{self._escape_html(gold)}</td>
            <td>{self._escape_html(extracted)}</td>
        </tr>
        <tr>
            <td colspan="4">
                <details>
                    <summary>Prompt</summary>
                    <pre>{prompt_escaped}</pre>
                </details>
                <details>
                    <summary>Prediction</summary>
                    <pre>{pred_escaped}</pre>
                </details>
                <details>
                    <summary>Grader Log</summary>
                    <pre>{grader_log_str}</pre>
                </details>
            </td>
        </tr>
""")

    rows_html = "\n".join(rows)

    # Doubled braces below are literal CSS braces inside the f-string.
    html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Eval State - {self.dataset_type}</title>
<style>
body {{ font-family: sans-serif; margin: 2em; }}
table {{ border-collapse: collapse; width: 100%; margin-bottom: 2em; }}
th, td {{ border: 1px solid #ccc; padding: 4px 8px; text-align: left; vertical-align: top; }}
tr.correct {{ background: #e6ffe6; }}
tr.incorrect {{ background: #ffe6e6; }}
tr.pending {{ background: #f5f5f5; }}
tr.error {{ background: #fff3cd; }}
pre {{ white-space: pre-wrap; word-break: break-word; }}
</style>
</head>
<body>
<h1>Eval State: {self.dataset_type.upper()}</h1>
<table>
<tr><th>Dataset</th><td>{self.dataset_type}</td></tr>
<tr><th>Total Tasks</th><td>{len(tasks_to_save)}</td></tr>
<tr><th>Completed</th><td>{len(completed)}</td></tr>
<tr><th>Correct</th><td>{correct_count}</td></tr>
<tr><th>Incorrect</th><td>{incorrect_count}</td></tr>
<tr><th>Pending</th><td>{pending_count}</td></tr>
<tr><th>Accuracy</th><td>{accuracy:.1f}%</td></tr>
<tr><th>Sampling</th><td>{sampling_str}</td></tr>
</table>
<table>
<thead>
<tr>
<th>Task ID</th>
<th>Status</th>
<th>Gold</th>
<th>Extracted</th>
</tr>
</thead>
<tbody>
{rows_html}
</tbody>
</table>
</body>
</html>
"""

    # UTF-8 is required: status text contains non-ASCII ✓/✗ glyphs.
    with open(html_file, "w", encoding="utf-8") as f:
        f.write(html_content)
+
+ def _escape_html(self, s: str) -> str:
+ return (s.replace("&", "&")
+ .replace("<", "<")
+ .replace(">", ">")
+ .replace('"', """)
+ .replace("'", "'"))
+
@classmethod
def load(cls, path: Path) -> "EvalState":
with open(path, "r") as f: