fix: address remaining PR review comments — null guard, markup escaping, empty criteria

- Add null check after _load_agent() in benchmark runner (the loader can return None on circular refs)
- Escape user-sourced content in Rich markup via _safe_render() in memory panel and skills list
- Default to passed=True, score=1.0 when a benchmark case has neither expected nor criteria

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Joao Moura
2026-05-14 14:04:40 -04:00
parent 16488f5fe5
commit 2eb7e15f89
2 changed files with 29 additions and 4 deletions

View File

@@ -1120,7 +1120,7 @@ class AgentTUI(App[None]):
lines = [f"[bold]Active Skills[/] ({len(active)})"]
for s in active:
lines.append(f" [{_CORAL}]{s.name}[/] — {s.description}")
lines.append(f" [{_CORAL}]{_safe_render(s.name)}[/] — {_safe_render(s.description)}")
self._mount_sys("\n".join(lines))
def _handle_tasks_command(self, parts: list[str]) -> None:
@@ -1219,13 +1219,13 @@ class AgentTUI(App[None]):
if mem_type == "canonical"
else f"[dim]{mem_type}[/]"
)
importance_tag = f" [yellow]★{importance}[/]" if importance else ""
scope_tag = f" [{_DIM}]scope:{scope}[/]" if scope else ""
importance_tag = f" [yellow]★{_safe_render(str(importance))}[/]" if importance else ""
scope_tag = f" [{_DIM}]scope:{_safe_render(str(scope))}[/]" if scope else ""
time_tag = f" [{_DIM}]{timestamp}[/]" if timestamp else ""
return [
f" {i}. {type_tag}{importance_tag}{scope_tag}{time_tag}",
f" {content}",
f" {_safe_render(content)}",
"",
]

View File

@@ -304,6 +304,29 @@ async def _run_model_benchmark(
score=0.0,
)
if agent is None:
emit(
{
"type": "case_done",
"model": model,
"case_index": i,
"total_cases": total,
"passed": False,
"score": 0.0,
"time_ms": 0,
"error": "agent loader returned None",
}
)
return BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual="[Agent loader returned None]",
model=model,
passed=False,
score=0.0,
)
start_ms = _current_time_ms()
try:
response = await asyncio.wait_for(
@@ -365,6 +388,8 @@ async def _run_model_benchmark(
)
passed, score = False, 0.0
if case.expected is None and case.criteria is None:
passed, score = True, 1.0
if case.expected is not None:
passed, score = _check_expected(case.expected, actual)
if case.criteria is not None: