diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index d44530e6ef..415c4472dc 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -111,6 +111,7 @@ class TaskState: correct: bool = False status: str = "pending" tokens: Optional[int] = None + reasoning_content: Optional[str] = None class EvalState: @@ -185,7 +186,8 @@ class EvalState: grader_log: Dict[str, Any], correct: bool, status: str, - tokens: Optional[int] = None + tokens: Optional[int] = None, + reasoning_content: Optional[str] = None ): if "cases" not in self.task_states: self.task_states["cases"] = {} @@ -199,7 +201,8 @@ class EvalState: "grader_log": grader_log, "correct": correct, "status": status, - "tokens": tokens + "tokens": tokens, + "reasoning_content": reasoning_content } if correct: @@ -246,7 +249,8 @@ class EvalState: "grader_log": {}, "correct": False, "status": "pending", - "tokens": None + "tokens": None, + "reasoning_content": None } data = { @@ -303,9 +307,11 @@ class EvalState: tokens = case.get("tokens") tokens_str = str(tokens) if tokens is not None else "" + reasoning_content = case.get("reasoning_content", "") or "" result_escaped = self._escape_html(result) prompt_escaped = self._escape_html(prompt) + reasoning_escaped = self._escape_html(reasoning_content) grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) rows.append(f"""
{prompt_escaped}
+ {result_escaped}