fix: address three review comments on benchmark/test CLI

- benchmark verbose path: pass on_progress callback the same way as
  the non-verbose path (was missing entirely)
- _train_new_agents: replace per-case asyncio.run() with a single
  event loop (new_event_loop / run_until_complete / close) to avoid
  creating and destroying a loop on every case iteration
- format_results_table: use case_index + 1 so the '#' column is
  1-based, matching the display in _test_new_agents failed output

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
alex-clawd
2026-05-13 09:03:14 -07:00
parent 74bf197ccb
commit a723d991f5
2 changed files with 6 additions and 2 deletions

View File

@@ -676,7 +676,7 @@ def format_results_table(results: list[BenchmarkResult]) -> str:
status = "PASS" if r.passed else "FAIL"
tokens = f"{r.input_tokens}/{r.output_tokens}"
input_trunc = r.input[:40] + "..." if len(r.input) > 40 else r.input
- line = f"{r.case_index:<4} {status:<6} {r.score:<7.2f} {tokens:<12} {r.response_time_ms:<10} {input_trunc}"
+ line = f"{r.case_index + 1:<4} {status:<6} {r.score:<7.2f} {tokens:<12} {r.response_time_ms:<10} {input_trunc}"
lines.append(line)
if r.passed:

View File

@@ -243,6 +243,8 @@ def _train_new_agents(agent_files: list[Any], n_iterations: int) -> None:
from rich.console import Console as _Console
_console = _Console()
+ _loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(_loop)
for iteration in range(n_iterations):
click.secho(f"\n Iteration {iteration + 1}/{n_iterations}", fg="cyan")
@@ -256,7 +258,7 @@ def _train_new_agents(agent_files: list[Any], n_iterations: int) -> None:
_t0 = _time.monotonic()
with _console.status("[cyan] Running…[/]", spinner="dots"):
- response = asyncio.run(agent.amessage(user_input))
+ response = _loop.run_until_complete(agent.amessage(user_input))
_elapsed = _time.monotonic() - _t0
_console.print(f" [green]✓[/] done ({_elapsed:.1f}s)")
click.echo(f" Response: {response.content[:500]}")
@@ -279,6 +281,7 @@ def _train_new_agents(agent_files: list[Any], n_iterations: int) -> None:
)
click.secho(" ✓ Feedback saved as canonical memory", fg="green")
+ _loop.close()
agents_trained += 1
click.echo()
@@ -1755,6 +1758,7 @@ def benchmark(
cases=cases,
models=model_list,
judge_model=judge_model,
+ on_progress=progress.on_progress if progress else None,
verbose=verbose,
)
)