mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-06 17:52:35 +00:00
self improve with skills
This commit is contained in:
@@ -84,6 +84,7 @@ from crewai.rag.embeddings.types import EmbedderConfig
|
|||||||
from crewai.security.fingerprint import Fingerprint
|
from crewai.security.fingerprint import Fingerprint
|
||||||
from crewai.skills.loader import activate_skill, discover_skills
|
from crewai.skills.loader import activate_skill, discover_skills
|
||||||
from crewai.skills.models import INSTRUCTIONS, Skill as SkillModel
|
from crewai.skills.models import INSTRUCTIONS, Skill as SkillModel
|
||||||
|
from crewai.skills.self_improve.models import SelfImprovementConfig
|
||||||
from crewai.state.checkpoint_config import CheckpointConfig, apply_checkpoint
|
from crewai.state.checkpoint_config import CheckpointConfig, apply_checkpoint
|
||||||
from crewai.tools.agent_tools.agent_tools import AgentTools
|
from crewai.tools.agent_tools.agent_tools import AgentTools
|
||||||
from crewai.types.callback import SerializableCallable
|
from crewai.types.callback import SerializableCallable
|
||||||
@@ -190,6 +191,7 @@ class Agent(BaseAgent):
|
|||||||
_times_executed: int = PrivateAttr(default=0)
|
_times_executed: int = PrivateAttr(default=0)
|
||||||
_mcp_resolver: MCPToolResolver | None = PrivateAttr(default=None)
|
_mcp_resolver: MCPToolResolver | None = PrivateAttr(default=None)
|
||||||
_last_messages: list[LLMMessage] = PrivateAttr(default_factory=list)
|
_last_messages: list[LLMMessage] = PrivateAttr(default_factory=list)
|
||||||
|
_self_improve_collector: Any = PrivateAttr(default=None)
|
||||||
max_execution_time: int | None = Field(
|
max_execution_time: int | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="Maximum execution time for an agent to execute a task",
|
description="Maximum execution time for an agent to execute a task",
|
||||||
@@ -320,6 +322,15 @@ class Agent(BaseAgent):
|
|||||||
agent_executor: CrewAgentExecutor | AgentExecutor | None = Field(
|
agent_executor: CrewAgentExecutor | AgentExecutor | None = Field(
|
||||||
default=None, description="An instance of the CrewAgentExecutor class."
|
default=None, description="An instance of the CrewAgentExecutor class."
|
||||||
)
|
)
|
||||||
|
self_improve: bool | SelfImprovementConfig = Field(
|
||||||
|
default=False,
|
||||||
|
description=(
|
||||||
|
"Enable the self-improvement loop. ``True`` uses defaults; pass a "
|
||||||
|
"``SelfImprovementConfig`` to override (e.g. point ``skills_dir`` at "
|
||||||
|
"a project-relative path so accepted skills get committed alongside "
|
||||||
|
"the code). See ``crewai.skills.self_improve``."
|
||||||
|
),
|
||||||
|
)
|
||||||
executor_class: Annotated[
|
executor_class: Annotated[
|
||||||
type[CrewAgentExecutor] | type[AgentExecutor],
|
type[CrewAgentExecutor] | type[AgentExecutor],
|
||||||
BeforeValidator(_validate_executor_class),
|
BeforeValidator(_validate_executor_class),
|
||||||
@@ -360,6 +371,21 @@ class Agent(BaseAgent):
|
|||||||
|
|
||||||
self.set_skills()
|
self.set_skills()
|
||||||
|
|
||||||
|
if self.self_improve and self._self_improve_collector is None:
|
||||||
|
from crewai.skills.self_improve.collector import TraceCollector
|
||||||
|
|
||||||
|
collector = TraceCollector(self)
|
||||||
|
collector.attach(crewai_event_bus)
|
||||||
|
self._self_improve_collector = collector
|
||||||
|
|
||||||
|
def _self_improve_config(self) -> SelfImprovementConfig | None:
|
||||||
|
"""Return the active SelfImprovementConfig, or None when disabled."""
|
||||||
|
if not self.self_improve:
|
||||||
|
return None
|
||||||
|
if isinstance(self.self_improve, SelfImprovementConfig):
|
||||||
|
return self.self_improve
|
||||||
|
return SelfImprovementConfig()
|
||||||
|
|
||||||
if self.reasoning and self.planning_config is None:
|
if self.reasoning and self.planning_config is None:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The 'reasoning' parameter is deprecated. Use 'planning_config=PlanningConfig()' instead.",
|
"The 'reasoning' parameter is deprecated. Use 'planning_config=PlanningConfig()' instead.",
|
||||||
@@ -429,7 +455,22 @@ class Agent(BaseAgent):
|
|||||||
else:
|
else:
|
||||||
crew_skills = list(resolved_crew_skills)
|
crew_skills = list(resolved_crew_skills)
|
||||||
|
|
||||||
if not self.skills and not crew_skills:
|
self_improve_dir: Path | None = None
|
||||||
|
if (config := self._self_improve_config()) is not None:
|
||||||
|
from crewai.skills.self_improve.storage import SkillStore, _slug
|
||||||
|
|
||||||
|
if config.skills_dir is not None:
|
||||||
|
candidate = config.skills_dir / _slug(self.role)
|
||||||
|
else:
|
||||||
|
candidate = SkillStore().role_dir(self.role)
|
||||||
|
if candidate.is_dir() and any(
|
||||||
|
(c / "SKILL.md").is_file()
|
||||||
|
for c in candidate.iterdir()
|
||||||
|
if c.is_dir()
|
||||||
|
):
|
||||||
|
self_improve_dir = candidate
|
||||||
|
|
||||||
|
if not self.skills and not crew_skills and self_improve_dir is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
needs_work = self.skills and any(
|
needs_work = self.skills and any(
|
||||||
@@ -437,7 +478,7 @@ class Agent(BaseAgent):
|
|||||||
or (isinstance(s, SkillModel) and s.disclosure_level < INSTRUCTIONS)
|
or (isinstance(s, SkillModel) and s.disclosure_level < INSTRUCTIONS)
|
||||||
for s in self.skills
|
for s in self.skills
|
||||||
)
|
)
|
||||||
if not needs_work and not crew_skills:
|
if not needs_work and not crew_skills and self_improve_dir is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
@@ -447,6 +488,9 @@ class Agent(BaseAgent):
|
|||||||
if crew_skills:
|
if crew_skills:
|
||||||
items.extend(crew_skills)
|
items.extend(crew_skills)
|
||||||
|
|
||||||
|
if self_improve_dir is not None:
|
||||||
|
items.append(self_improve_dir)
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
if isinstance(item, Path):
|
if isinstance(item, Path):
|
||||||
discovered = discover_skills(item, source=self)
|
discovered = discover_skills(item, source=self)
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from crewai.cli.reset_memories_command import reset_memories_command
|
|||||||
from crewai.cli.run_crew import run_crew
|
from crewai.cli.run_crew import run_crew
|
||||||
from crewai.cli.settings.main import SettingsCommand
|
from crewai.cli.settings.main import SettingsCommand
|
||||||
from crewai.cli.shared.token_manager import TokenManager
|
from crewai.cli.shared.token_manager import TokenManager
|
||||||
|
from crewai.cli.skills_proposals import skills as skills_group
|
||||||
from crewai.cli.tools.main import ToolCommand
|
from crewai.cli.tools.main import ToolCommand
|
||||||
from crewai.cli.train_crew import train_crew
|
from crewai.cli.train_crew import train_crew
|
||||||
from crewai.cli.triggers.main import TriggersCommand
|
from crewai.cli.triggers.main import TriggersCommand
|
||||||
@@ -955,5 +956,8 @@ def checkpoint_prune(
|
|||||||
prune_checkpoints(ctx.obj["location"], keep, older_than, dry_run)
|
prune_checkpoints(ctx.obj["location"], keep, older_than, dry_run)
|
||||||
|
|
||||||
|
|
||||||
|
crewai.add_command(skills_group)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
crewai()
|
crewai()
|
||||||
|
|||||||
147
lib/crewai/src/crewai/cli/skill_proposals_tui.py
Normal file
147
lib/crewai/src/crewai/cli/skill_proposals_tui.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""Minimal Textual TUI for triaging skill proposals.
|
||||||
|
|
||||||
|
Two panes: the proposals list on the left, the highlighted proposal's
|
||||||
|
``SKILL.md`` body on the right. Keystrokes accept/reject in place. No
|
||||||
|
search, no scopes, no async workers — the underlying actions are the
|
||||||
|
same `accept_proposal` / `reject_proposal` calls the CLI uses.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from textual.app import App, ComposeResult
|
||||||
|
from textual.binding import Binding
|
||||||
|
from textual.containers import Horizontal, VerticalScroll
|
||||||
|
from textual.widgets import Footer, Header, OptionList, Static
|
||||||
|
|
||||||
|
from crewai.skills.self_improve import (
|
||||||
|
ProposalStore,
|
||||||
|
accept_proposal,
|
||||||
|
reject_proposal,
|
||||||
|
)
|
||||||
|
from crewai.skills.self_improve.models import SkillProposal
|
||||||
|
|
||||||
|
|
||||||
|
_PRIMARY = "#eb6658"
|
||||||
|
_SECONDARY = "#1F7982"
|
||||||
|
_TERTIARY = "#ffffff"
|
||||||
|
|
||||||
|
|
||||||
|
def _format_proposal_detail(p: SkillProposal) -> str:
|
||||||
|
kind = (
|
||||||
|
f"[bold]{p.proposal_kind}[/] → {p.target_skill}"
|
||||||
|
if p.proposal_kind == "patch_existing"
|
||||||
|
else "[bold]new[/]"
|
||||||
|
)
|
||||||
|
runs = ", ".join(p.derived_from_runs) or "-"
|
||||||
|
return (
|
||||||
|
f"[bold {_PRIMARY}]{p.name}[/]\n"
|
||||||
|
f"[dim]role:[/] {p.agent_role}\n"
|
||||||
|
f"[dim]kind:[/] {kind}\n"
|
||||||
|
f"[dim]confidence:[/] [bold]{p.confidence:.2f}[/]\n"
|
||||||
|
f"[dim]from runs:[/] {runs}\n\n"
|
||||||
|
f"[bold]Rationale[/]\n{p.rationale}\n\n"
|
||||||
|
f"[bold {_PRIMARY}]{'─' * 44}[/]\n"
|
||||||
|
f"{p.body}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SkillProposalsTUI(App[None]):
|
||||||
|
"""Triage UI: navigate the queue, accept or reject in place."""
|
||||||
|
|
||||||
|
TITLE = "CrewAI Skill Proposals"
|
||||||
|
SUB_TITLE = "↑↓ list · tab focus pane · PgUp/PgDn or mouse to scroll body · a/r/q"
|
||||||
|
|
||||||
|
BINDINGS = [
|
||||||
|
Binding("a", "accept", "Accept"),
|
||||||
|
Binding("r", "reject", "Reject"),
|
||||||
|
Binding("q", "quit", "Quit"),
|
||||||
|
Binding("tab", "focus_next", "Switch pane", show=False),
|
||||||
|
]
|
||||||
|
|
||||||
|
CSS = f"""
|
||||||
|
Header {{ background: {_PRIMARY}; color: {_TERTIARY}; }}
|
||||||
|
Footer {{ background: {_SECONDARY}; color: {_TERTIARY}; }}
|
||||||
|
Footer > .footer-key--key {{ background: {_PRIMARY}; color: {_TERTIARY}; }}
|
||||||
|
Horizontal {{ height: 1fr; }}
|
||||||
|
#list {{
|
||||||
|
width: 40%;
|
||||||
|
border-right: solid {_SECONDARY};
|
||||||
|
scrollbar-color: {_PRIMARY};
|
||||||
|
}}
|
||||||
|
#list > .option-list--option-highlighted {{
|
||||||
|
background: {_SECONDARY}; color: {_TERTIARY};
|
||||||
|
}}
|
||||||
|
#detail-scroll {{
|
||||||
|
width: 60%;
|
||||||
|
padding: 1 2;
|
||||||
|
scrollbar-color: {_PRIMARY};
|
||||||
|
}}
|
||||||
|
#detail-scroll:focus {{
|
||||||
|
background: {_SECONDARY} 5%;
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, store: ProposalStore | None = None) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self._store = store or ProposalStore()
|
||||||
|
self._proposals: list[SkillProposal] = []
|
||||||
|
|
||||||
|
def compose(self) -> ComposeResult:
|
||||||
|
yield Header(show_clock=False)
|
||||||
|
with Horizontal():
|
||||||
|
yield OptionList(id="list")
|
||||||
|
with VerticalScroll(id="detail-scroll"):
|
||||||
|
yield Static("Select a proposal to view its body.", id="detail")
|
||||||
|
yield Footer()
|
||||||
|
|
||||||
|
def on_mount(self) -> None:
|
||||||
|
self.query_one("#list", OptionList).border_title = "Pending"
|
||||||
|
self.query_one("#detail-scroll", VerticalScroll).border_title = "Detail"
|
||||||
|
self._reload()
|
||||||
|
|
||||||
|
def _reload(self) -> None:
|
||||||
|
self._proposals = self._store.list_all()
|
||||||
|
option_list = self.query_one("#list", OptionList)
|
||||||
|
option_list.clear_options()
|
||||||
|
for p in self._proposals:
|
||||||
|
kind_tag = "P" if p.proposal_kind == "patch_existing" else "N"
|
||||||
|
label = f"[{kind_tag}] {p.confidence:.2f} {p.name}"
|
||||||
|
option_list.add_option(label)
|
||||||
|
option_list.border_title = f"Pending ({len(self._proposals)})"
|
||||||
|
if not self._proposals:
|
||||||
|
self.query_one("#detail", Static).update("[dim](queue is empty)[/]")
|
||||||
|
|
||||||
|
def _selected(self) -> SkillProposal | None:
|
||||||
|
idx = self.query_one("#list", OptionList).highlighted
|
||||||
|
if idx is None or idx >= len(self._proposals):
|
||||||
|
return None
|
||||||
|
return self._proposals[idx]
|
||||||
|
|
||||||
|
def on_option_list_option_highlighted(
|
||||||
|
self, event: OptionList.OptionHighlighted
|
||||||
|
) -> None:
|
||||||
|
idx = event.option_index
|
||||||
|
if idx < len(self._proposals):
|
||||||
|
self.query_one("#detail", Static).update(
|
||||||
|
_format_proposal_detail(self._proposals[idx])
|
||||||
|
)
|
||||||
|
|
||||||
|
def action_accept(self) -> None:
|
||||||
|
prop = self._selected()
|
||||||
|
if prop is None:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
accept_proposal(prop)
|
||||||
|
self.notify(f"Accepted: {prop.name}", severity="information")
|
||||||
|
except FileExistsError as e:
|
||||||
|
self.notify(str(e), severity="warning", timeout=8)
|
||||||
|
return
|
||||||
|
self._reload()
|
||||||
|
|
||||||
|
def action_reject(self) -> None:
|
||||||
|
prop = self._selected()
|
||||||
|
if prop is None:
|
||||||
|
return
|
||||||
|
reject_proposal(prop)
|
||||||
|
self.notify(f"Rejected: {prop.name}", severity="information")
|
||||||
|
self._reload()
|
||||||
249
lib/crewai/src/crewai/cli/skills_proposals.py
Normal file
249
lib/crewai/src/crewai/cli/skills_proposals.py
Normal file
@@ -0,0 +1,249 @@
|
|||||||
|
"""``crewai skills`` subcommands for the self-improvement loop."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from crewai.skills.self_improve import (
|
||||||
|
ProposalStore,
|
||||||
|
SkillReviewer,
|
||||||
|
TraceStore,
|
||||||
|
accept_proposal,
|
||||||
|
reject_proposal,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_REVIEW_MODEL = "anthropic/claude-haiku-4-5"
|
||||||
|
|
||||||
|
|
||||||
|
def _print_proposal_summary(p) -> None:
|
||||||
|
kind = (
|
||||||
|
f"PATCH→{p.target_skill}"
|
||||||
|
if p.proposal_kind == "patch_existing"
|
||||||
|
else "NEW"
|
||||||
|
)
|
||||||
|
click.echo(
|
||||||
|
f" {p.id} {kind:<28} conf={p.confidence:.2f} role={p.agent_role}"
|
||||||
|
)
|
||||||
|
click.echo(f" {p.name}: {p.description}")
|
||||||
|
|
||||||
|
|
||||||
|
@click.group(name="skills")
|
||||||
|
def skills() -> None:
|
||||||
|
"""Manage agent skills."""
|
||||||
|
|
||||||
|
|
||||||
|
@skills.command(name="review")
|
||||||
|
@click.option(
|
||||||
|
"--role",
|
||||||
|
default=None,
|
||||||
|
help="Limit review to one role (slug or full name). Default: all roles with traces.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--model",
|
||||||
|
default=_DEFAULT_REVIEW_MODEL,
|
||||||
|
help=f"LiteLLM model id for the reviewer LLM. Default: {_DEFAULT_REVIEW_MODEL}.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--min-traces",
|
||||||
|
default=2,
|
||||||
|
type=int,
|
||||||
|
help="Minimum traces per role before review fires. Default: 2.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--floor",
|
||||||
|
default=0.6,
|
||||||
|
type=float,
|
||||||
|
help="Drop proposals below this confidence. Default: 0.6.",
|
||||||
|
)
|
||||||
|
def skills_review(role: str | None, model: str, min_traces: int, floor: float) -> None:
|
||||||
|
"""Mine accumulated traces for skill proposals.
|
||||||
|
|
||||||
|
Reads role + goal from the trace metadata, calls the reviewer LLM, and
|
||||||
|
persists each proposal that scores above ``--floor`` to the queue. Use
|
||||||
|
``crewai skills proposals list`` to see what came out.
|
||||||
|
"""
|
||||||
|
from crewai import LLM
|
||||||
|
|
||||||
|
trace_store = TraceStore()
|
||||||
|
proposal_store = ProposalStore()
|
||||||
|
|
||||||
|
if not trace_store.root.exists():
|
||||||
|
click.echo(f"No traces yet at {trace_store.root}", err=True)
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
# Group traces by role from disk; the role-slug dirs come from the store.
|
||||||
|
by_role: dict[str, list] = defaultdict(list)
|
||||||
|
role_dirs = (
|
||||||
|
[trace_store.role_dir(role)] if role else sorted(trace_store.root.iterdir())
|
||||||
|
)
|
||||||
|
for d in role_dirs:
|
||||||
|
if not d.is_dir():
|
||||||
|
continue
|
||||||
|
for path in sorted(d.glob("*.json")):
|
||||||
|
t = trace_store.load(path)
|
||||||
|
by_role[t.agent_role].append(t)
|
||||||
|
|
||||||
|
if not by_role:
|
||||||
|
click.echo("No traces found." if role is None else f"No traces for role={role!r}.")
|
||||||
|
return
|
||||||
|
|
||||||
|
reviewer_llm = LLM(model=model)
|
||||||
|
total_emitted = 0
|
||||||
|
|
||||||
|
for agent_role, traces in by_role.items():
|
||||||
|
if len(traces) < min_traces:
|
||||||
|
click.echo(
|
||||||
|
f"Skipping {agent_role!r}: {len(traces)} trace(s), "
|
||||||
|
f"need {min_traces}."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
agent_goal = next((t.agent_goal for t in traces if t.agent_goal), "")
|
||||||
|
loaded_skills_seen = sorted({s for t in traces for s in t.loaded_skills})
|
||||||
|
pending = [
|
||||||
|
proposal_store.load(p) for p in proposal_store.list_for_role(agent_role)
|
||||||
|
]
|
||||||
|
|
||||||
|
reviewer = SkillReviewer(
|
||||||
|
agent_role=agent_role,
|
||||||
|
agent_goal=agent_goal,
|
||||||
|
llm=reviewer_llm,
|
||||||
|
min_traces=min_traces,
|
||||||
|
confidence_floor=floor,
|
||||||
|
)
|
||||||
|
click.echo(
|
||||||
|
f"🧠 Reviewing {len(traces)} trace(s) for {agent_role!r} "
|
||||||
|
f"(model={model}, pending={len(pending)})…"
|
||||||
|
)
|
||||||
|
proposals_out = reviewer.review(
|
||||||
|
traces,
|
||||||
|
loaded_skill_names=loaded_skills_seen,
|
||||||
|
pending_proposals=pending,
|
||||||
|
)
|
||||||
|
|
||||||
|
for p in proposals_out:
|
||||||
|
path = proposal_store.save(p)
|
||||||
|
click.echo(f" + {p.id} conf={p.confidence:.2f} {p.name}")
|
||||||
|
click.echo(f" → {path}")
|
||||||
|
total_emitted += len(proposals_out)
|
||||||
|
|
||||||
|
click.echo(
|
||||||
|
f"\n✅ Done. {total_emitted} proposal(s) added to the queue. "
|
||||||
|
f"Run `crewai skills proposals list` to view."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@skills.group(name="proposals")
|
||||||
|
def proposals() -> None:
|
||||||
|
"""Manage skill proposals from the self-improvement reviewer."""
|
||||||
|
|
||||||
|
|
||||||
|
@proposals.command(name="list")
|
||||||
|
@click.option("--role", default=None, help="Filter by agent role (slug or full name).")
|
||||||
|
def proposals_list(role: str | None) -> None:
|
||||||
|
"""List pending proposals across all roles."""
|
||||||
|
store = ProposalStore()
|
||||||
|
items = store.list_for_role(role) if role else None
|
||||||
|
|
||||||
|
if role:
|
||||||
|
records = [store.load(p) for p in store.list_for_role(role)]
|
||||||
|
else:
|
||||||
|
records = store.list_all()
|
||||||
|
|
||||||
|
if not records:
|
||||||
|
click.echo("(no pending proposals)")
|
||||||
|
return
|
||||||
|
|
||||||
|
click.echo(f"{len(records)} proposal(s):")
|
||||||
|
for p in records:
|
||||||
|
_print_proposal_summary(p)
|
||||||
|
|
||||||
|
|
||||||
|
@proposals.command(name="show")
|
||||||
|
@click.argument("proposal_id")
|
||||||
|
def proposals_show(proposal_id: str) -> None:
|
||||||
|
"""Print the full body of a proposal."""
|
||||||
|
store = ProposalStore()
|
||||||
|
prop = store.find(proposal_id)
|
||||||
|
if prop is None:
|
||||||
|
click.echo(f"No proposal with id {proposal_id!r}", err=True)
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
click.echo(f"id: {prop.id}")
|
||||||
|
click.echo(f"role: {prop.agent_role}")
|
||||||
|
click.echo(f"name: {prop.name}")
|
||||||
|
click.echo(f"description: {prop.description}")
|
||||||
|
click.echo(f"confidence: {prop.confidence:.2f}")
|
||||||
|
click.echo(f"kind: {prop.proposal_kind}")
|
||||||
|
if prop.target_skill:
|
||||||
|
click.echo(f"target: {prop.target_skill}")
|
||||||
|
click.echo(f"derived from: {', '.join(prop.derived_from_runs)}")
|
||||||
|
click.echo("\nrationale:")
|
||||||
|
click.echo(prop.rationale)
|
||||||
|
click.echo("\n--- SKILL.md body ---")
|
||||||
|
click.echo(prop.body)
|
||||||
|
|
||||||
|
|
||||||
|
@proposals.command(name="accept")
|
||||||
|
@click.argument("proposal_id")
|
||||||
|
@click.option("--force", is_flag=True, help="Overwrite an existing skill of the same name.")
|
||||||
|
@click.option(
|
||||||
|
"--skills-dir",
|
||||||
|
type=click.Path(file_okay=False, path_type=Path),
|
||||||
|
default=None,
|
||||||
|
envvar="CREWAI_SELF_IMPROVE_SKILLS_DIR",
|
||||||
|
help=(
|
||||||
|
"Directory to write the SKILL.md to. Defaults to the env var "
|
||||||
|
"CREWAI_SELF_IMPROVE_SKILLS_DIR, then to the platform data dir. "
|
||||||
|
"Use a project-relative path (e.g. ./skills/learned) to keep "
|
||||||
|
"accepted skills under version control — and pass the same path "
|
||||||
|
"to Agent(self_improve=SelfImprovementConfig(skills_dir=...)) so "
|
||||||
|
"the agent loads it on the next kickoff."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def proposals_accept(proposal_id: str, force: bool, skills_dir: Path | None) -> None:
|
||||||
|
"""Materialize a proposal as a live SKILL.md."""
|
||||||
|
from crewai.skills.self_improve import SkillStore
|
||||||
|
|
||||||
|
store = ProposalStore()
|
||||||
|
prop = store.find(proposal_id)
|
||||||
|
if prop is None:
|
||||||
|
click.echo(f"No proposal with id {proposal_id!r}", err=True)
|
||||||
|
raise SystemExit(1)
|
||||||
|
skill_store = SkillStore(skills_root=skills_dir) if skills_dir else None
|
||||||
|
try:
|
||||||
|
path = accept_proposal(prop, force=force, skill_store=skill_store)
|
||||||
|
except FileExistsError as e:
|
||||||
|
click.echo(f"{e}", err=True)
|
||||||
|
raise SystemExit(2) from None
|
||||||
|
click.echo(f"✅ Accepted: {path}")
|
||||||
|
click.echo(
|
||||||
|
" This skill will load on the next kickoff for any agent with "
|
||||||
|
f"role={prop.agent_role!r} and self_improve enabled "
|
||||||
|
"(make sure SelfImprovementConfig.skills_dir matches if you used --skills-dir)."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@proposals.command(name="reject")
|
||||||
|
@click.argument("proposal_id")
|
||||||
|
def proposals_reject(proposal_id: str) -> None:
|
||||||
|
"""Drop a proposal from the queue without accepting."""
|
||||||
|
store = ProposalStore()
|
||||||
|
prop = store.find(proposal_id)
|
||||||
|
if prop is None:
|
||||||
|
click.echo(f"No proposal with id {proposal_id!r}", err=True)
|
||||||
|
raise SystemExit(1)
|
||||||
|
reject_proposal(prop)
|
||||||
|
click.echo(f"🗑 Rejected: {prop.id}")
|
||||||
|
|
||||||
|
|
||||||
|
@proposals.command(name="tui")
|
||||||
|
def proposals_tui() -> None:
|
||||||
|
"""Open an interactive triage TUI for the proposals queue."""
|
||||||
|
from crewai.cli.skill_proposals_tui import SkillProposalsTUI
|
||||||
|
|
||||||
|
SkillProposalsTUI().run()
|
||||||
38
lib/crewai/src/crewai/skills/self_improve/__init__.py
Normal file
38
lib/crewai/src/crewai/skills/self_improve/__init__.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
"""Self-improving skills for crewAI agents.
|
||||||
|
|
||||||
|
When an Agent is configured with ``self_improve=True``, a TraceCollector
|
||||||
|
subscribes to the event bus during kickoff, captures tool calls + outcome
|
||||||
|
signals into a RunTrace, auto-grades the run, and persists the trace to
|
||||||
|
disk. Across many runs, a SkillReviewer mines those traces for recurring
|
||||||
|
approaches and emits SkillProposals for human review.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.acceptance import accept_proposal, reject_proposal
|
||||||
|
from crewai.skills.self_improve.auto_grade import grade_trace
|
||||||
|
from crewai.skills.self_improve.collector import TraceCollector
|
||||||
|
from crewai.skills.self_improve.models import (
|
||||||
|
Outcome,
|
||||||
|
RunTrace,
|
||||||
|
SelfImprovementConfig,
|
||||||
|
SkillProposal,
|
||||||
|
ToolCallRecord,
|
||||||
|
)
|
||||||
|
from crewai.skills.self_improve.reviewer import SkillReviewer
|
||||||
|
from crewai.skills.self_improve.storage import ProposalStore, SkillStore, TraceStore
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"Outcome",
|
||||||
|
"ProposalStore",
|
||||||
|
"RunTrace",
|
||||||
|
"SelfImprovementConfig",
|
||||||
|
"SkillProposal",
|
||||||
|
"SkillReviewer",
|
||||||
|
"SkillStore",
|
||||||
|
"ToolCallRecord",
|
||||||
|
"TraceCollector",
|
||||||
|
"TraceStore",
|
||||||
|
"accept_proposal",
|
||||||
|
"grade_trace",
|
||||||
|
"reject_proposal",
|
||||||
|
]
|
||||||
91
lib/crewai/src/crewai/skills/self_improve/acceptance.py
Normal file
91
lib/crewai/src/crewai/skills/self_improve/acceptance.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
"""Materialize an accepted ``SkillProposal`` as a live ``SKILL.md`` file.
|
||||||
|
|
||||||
|
Acceptance is the human checkpoint in the self-improvement loop: until a
|
||||||
|
proposal is accepted, it lives in the proposals queue and never affects
|
||||||
|
the agent. After acceptance, the SKILL.md lands at
|
||||||
|
``~/.crewai/skills/<role-slug>/<skill-name>/SKILL.md`` where the existing
|
||||||
|
skill loader discovers it on the next kickoff.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.models import SkillProposal
|
||||||
|
from crewai.skills.self_improve.storage import ProposalStore, SkillStore
|
||||||
|
|
||||||
|
|
||||||
|
def _format_skill_md(proposal: SkillProposal) -> str:
|
||||||
|
"""Render the proposal as a SKILL.md document.
|
||||||
|
|
||||||
|
The body is written verbatim — proposals already include their own
|
||||||
|
markdown structure (title, sections). We only prepend the YAML
|
||||||
|
frontmatter the loader requires. Both fields are JSON-quoted because
|
||||||
|
JSON strings are valid YAML scalars and handle every special-char case
|
||||||
|
safely.
|
||||||
|
"""
|
||||||
|
body = proposal.body.lstrip()
|
||||||
|
# If the LLM already emitted frontmatter (defensive), don't double it.
|
||||||
|
if body.startswith("---"):
|
||||||
|
return body if body.endswith("\n") else body + "\n"
|
||||||
|
frontmatter = (
|
||||||
|
f"---\n"
|
||||||
|
f"name: {json.dumps(proposal.name)}\n"
|
||||||
|
f"description: {json.dumps(proposal.description)}\n"
|
||||||
|
f"---\n\n"
|
||||||
|
)
|
||||||
|
return frontmatter + body if body.endswith("\n") else frontmatter + body + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def accept_proposal(
|
||||||
|
proposal: SkillProposal,
|
||||||
|
*,
|
||||||
|
force: bool = False,
|
||||||
|
skill_store: SkillStore | None = None,
|
||||||
|
proposal_store: ProposalStore | None = None,
|
||||||
|
) -> Path:
|
||||||
|
"""Write the proposal as a SKILL.md and remove it from the queue.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
proposal: The proposal to materialize.
|
||||||
|
force: When True, overwrite an existing SKILL.md at the target path.
|
||||||
|
skill_store: Override for the live-skills store (test injection).
|
||||||
|
proposal_store: Override for the proposals store (test injection).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to the written ``SKILL.md``.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FileExistsError: When a SKILL.md already exists at the target and
|
||||||
|
``force=False``.
|
||||||
|
"""
|
||||||
|
skill_store = skill_store or SkillStore()
|
||||||
|
proposal_store = proposal_store or ProposalStore()
|
||||||
|
|
||||||
|
target_dir = skill_store.skill_dir(proposal.agent_role, proposal.name)
|
||||||
|
skill_md = target_dir / "SKILL.md"
|
||||||
|
|
||||||
|
if skill_md.exists() and not force:
|
||||||
|
raise FileExistsError(
|
||||||
|
f"{skill_md} already exists. Pass force=True to overwrite."
|
||||||
|
)
|
||||||
|
|
||||||
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
skill_md.write_text(_format_skill_md(proposal), encoding="utf-8")
|
||||||
|
|
||||||
|
# Once accepted, the proposal record is no longer the source of truth —
|
||||||
|
# the SKILL.md is. Drop the queue entry so it doesn't show up in `list`.
|
||||||
|
proposal_store.delete(proposal.id, proposal.agent_role)
|
||||||
|
|
||||||
|
return skill_md
|
||||||
|
|
||||||
|
|
||||||
|
def reject_proposal(
|
||||||
|
proposal: SkillProposal,
|
||||||
|
*,
|
||||||
|
proposal_store: ProposalStore | None = None,
|
||||||
|
) -> bool:
|
||||||
|
"""Delete a proposal from the queue. Returns True if it existed."""
|
||||||
|
proposal_store = proposal_store or ProposalStore()
|
||||||
|
return proposal_store.delete(proposal.id, proposal.agent_role)
|
||||||
76
lib/crewai/src/crewai/skills/self_improve/auto_grade.py
Normal file
76
lib/crewai/src/crewai/skills/self_improve/auto_grade.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""Derive a run outcome from observable signals.
|
||||||
|
|
||||||
|
No human grading required. The reviewer treats outcomes as
|
||||||
|
confidence-weighted hints, not ground truth — clustering across runs
|
||||||
|
absorbs label noise.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.models import Outcome, RunTrace
|
||||||
|
|
||||||
|
|
||||||
|
_RETRY_THRASH_THRESHOLD = 3
|
||||||
|
|
||||||
|
|
||||||
|
def _has_thrashing(trace: RunTrace) -> bool:
|
||||||
|
"""Detect a tool being called repeatedly with the same args summary.
|
||||||
|
|
||||||
|
A tool fired more than ``_RETRY_THRASH_THRESHOLD`` times with identical
|
||||||
|
args is likely stuck in a retry loop rather than making progress.
|
||||||
|
"""
|
||||||
|
if len(trace.tool_calls) <= _RETRY_THRASH_THRESHOLD:
|
||||||
|
return False
|
||||||
|
keys = [(t.name, t.args_summary) for t in trace.tool_calls]
|
||||||
|
most_common_count = Counter(keys).most_common(1)[0][1]
|
||||||
|
return most_common_count > _RETRY_THRASH_THRESHOLD
|
||||||
|
|
||||||
|
|
||||||
|
def _output_looks_empty(trace: RunTrace) -> bool:
|
||||||
|
if trace.output_summary is None:
|
||||||
|
return False
|
||||||
|
stripped = trace.output_summary.strip()
|
||||||
|
if not stripped:
|
||||||
|
return True
|
||||||
|
lowered = stripped.lower()
|
||||||
|
return lowered.startswith("error") or lowered.startswith("traceback")
|
||||||
|
|
||||||
|
|
||||||
|
def grade_trace(trace: RunTrace) -> Outcome:
|
||||||
|
"""Compute outcome from signals already present on the trace.
|
||||||
|
|
||||||
|
Signal hierarchy (strongest first):
|
||||||
|
1. explicit error → failure
|
||||||
|
2. guardrail decided → trust it
|
||||||
|
3. max_iter exhaustion → failure
|
||||||
|
4. tool error rate / thrashing / empty output → failure or partial
|
||||||
|
5. otherwise → success when we saw output, else unknown
|
||||||
|
"""
|
||||||
|
if trace.error:
|
||||||
|
return "failure"
|
||||||
|
|
||||||
|
if trace.guardrail_passed is True:
|
||||||
|
return "success"
|
||||||
|
if trace.guardrail_passed is False:
|
||||||
|
return "failure"
|
||||||
|
|
||||||
|
if trace.max_iter_exhausted:
|
||||||
|
return "failure"
|
||||||
|
|
||||||
|
if _has_thrashing(trace):
|
||||||
|
return "failure"
|
||||||
|
|
||||||
|
if _output_looks_empty(trace):
|
||||||
|
return "failure"
|
||||||
|
|
||||||
|
if trace.tool_call_count > 0:
|
||||||
|
error_rate = trace.tool_error_count / trace.tool_call_count
|
||||||
|
if error_rate >= 0.5:
|
||||||
|
return "failure"
|
||||||
|
|
||||||
|
if trace.output_summary:
|
||||||
|
return "success"
|
||||||
|
|
||||||
|
return "unknown"
|
||||||
286
lib/crewai/src/crewai/skills/self_improve/collector.py
Normal file
286
lib/crewai/src/crewai/skills/self_improve/collector.py
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
"""Trace collector that subscribes to the event bus.
|
||||||
|
|
||||||
|
One ``TraceCollector`` per Agent instance. It keeps a per-(agent, task)
|
||||||
|
in-flight trace, identifies which events belong to its agent, and
|
||||||
|
finalizes the trace on completion — auto-grading and persisting it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import atexit
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
from crewai.events.event_bus import CrewAIEventsBus, crewai_event_bus
|
||||||
|
from crewai.events.types.agent_events import (
|
||||||
|
AgentExecutionCompletedEvent,
|
||||||
|
AgentExecutionErrorEvent,
|
||||||
|
AgentExecutionStartedEvent,
|
||||||
|
LiteAgentExecutionCompletedEvent,
|
||||||
|
LiteAgentExecutionErrorEvent,
|
||||||
|
LiteAgentExecutionStartedEvent,
|
||||||
|
)
|
||||||
|
from crewai.events.types.tool_usage_events import (
|
||||||
|
ToolUsageErrorEvent,
|
||||||
|
ToolUsageFinishedEvent,
|
||||||
|
ToolUsageStartedEvent,
|
||||||
|
)
|
||||||
|
from crewai.skills.self_improve.auto_grade import grade_trace
|
||||||
|
from crewai.skills.self_improve.models import RunTrace, ToolCallRecord
|
||||||
|
from crewai.skills.self_improve.storage import TraceStore
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from crewai.agents.agent_builder.base_agent import BaseAgent
|
||||||
|
|
||||||
|
|
||||||
|
# Truncation limits for persisted trace text (characters).
_OUTPUT_TRUNCATE = 4000
_ARGS_TRUNCATE = 200
# Maximum seconds to wait for the event bus to drain at interpreter exit.
_FLUSH_TIMEOUT = 10.0

logger = logging.getLogger(__name__)

# ``Agent.kickoff()`` can return before the bus's thread-pool handlers drain;
# without this hook a script that exits immediately would lose the just-
# finalized trace. ``flush()`` is a no-op when no events are pending, so it's
# safe to register at module import time even if no agent ever opts in.
atexit.register(lambda: crewai_event_bus.flush(timeout=_FLUSH_TIMEOUT))
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate(value: Any, limit: int) -> str:
    """Stringify *value* and cap it at *limit* characters.

    ``None`` becomes the empty string; non-strings go through ``repr``.
    Over-long text keeps ``limit - 1`` characters plus a trailing ellipsis,
    so the result never exceeds *limit*.
    """
    if value is None:
        return ""
    text = value if isinstance(value, str) else repr(value)
    return text if len(text) <= limit else text[: limit - 1] + "…"
|
||||||
|
|
||||||
|
|
||||||
|
class TraceCollector:
    """Captures one ``RunTrace`` per agent execution by subscribing to events.

    Lifecycle:
    - ``attach(bus)`` registers handlers on the global event bus.
    - On ``AgentExecutionStartedEvent`` for our agent, a new trace begins.
    - ``ToolUsage*`` events for our agent append ``ToolCallRecord``s.
    - On ``AgentExecutionCompletedEvent`` / ``AgentExecutionErrorEvent``
      for our agent, the trace is auto-graded and persisted.

    The collector is intentionally tolerant: a missing Started event
    (e.g. because the agent was already executing when ``attach`` ran) just
    skips that trace. Tool events without a current trace are ignored.
    """

    def __init__(
        self,
        agent: BaseAgent,
        store: TraceStore | None = None,
    ) -> None:
        """Bind the collector to *agent*; traces persist via *store* (default ``TraceStore()``)."""
        self._agent = agent
        self._store = store or TraceStore()
        # The single in-flight trace; None when no execution is active.
        self._current: RunTrace | None = None
        # Tool start timestamps keyed by tool name, for duration computation.
        # NOTE(review): keyed by name only — overlapping calls to the same
        # tool within one run would share an entry; confirm that's acceptable.
        self._tool_started_at: dict[str, datetime] = {}
        self._attached = False
        # Bus dispatches handlers on a thread pool, and Agent.kickoff() +
        # LiteAgent each emit Started/Completed for the same logical run, so
        # the lock + ``self._current is not None`` check serializes the
        # "create or skip" decision and dedupes the second half of the pair.
        self._lock = threading.RLock()

    @property
    def current_trace(self) -> RunTrace | None:
        """The in-flight trace, if an agent execution is active."""
        return self._current

    def _is_my_agent(self, event_agent: Any) -> bool:
        """True when *event_agent* is our agent (by identity, else by ``id`` equality)."""
        if event_agent is None:
            return False
        if event_agent is self._agent:
            return True
        agent_id = getattr(event_agent, "id", None)
        my_id = getattr(self._agent, "id", None)
        return bool(agent_id is not None and my_id is not None and agent_id == my_id)

    def _is_my_id(self, event_agent_id: str | None) -> bool:
        """True when *event_agent_id* matches our agent's id (compared as strings)."""
        if not event_agent_id:
            return False
        my_id = getattr(self._agent, "id", None)
        return bool(my_id is not None and str(my_id) == str(event_agent_id))

    def attach(self, bus: CrewAIEventsBus) -> None:
        """Register event handlers. Idempotent."""
        if self._attached:
            return
        self._attached = True

        @bus.on(AgentExecutionStartedEvent)
        def _on_started(_source: Any, event: AgentExecutionStartedEvent) -> None:
            # Begin a fresh trace for our agent, unless one is already running.
            if not self._is_my_agent(event.agent):
                return
            with self._lock:
                if self._current is not None:
                    return  # duplicate Started for an in-flight trace
                task = event.task
                self._current = RunTrace(
                    agent_id=str(getattr(self._agent, "id", "") or ""),
                    agent_role=self._agent.role,
                    agent_goal=getattr(self._agent, "goal", "") or "",
                    # Empty-string ids collapse to None rather than "".
                    task_id=str(getattr(task, "id", "") or "") or None,
                    task_description=getattr(task, "description", None),
                    loaded_skills=self._collect_loaded_skills(),
                )

        @bus.on(ToolUsageStartedEvent)
        def _on_tool_started(_source: Any, event: ToolUsageStartedEvent) -> None:
            # Only record a start time; the ToolCallRecord is created on finish/error.
            if not self._is_my_id(event.agent_id):
                return
            with self._lock:
                if self._current is None:
                    return
                self._tool_started_at[event.tool_name] = datetime.now(UTC)

        @bus.on(ToolUsageFinishedEvent)
        def _on_tool_finished(_source: Any, event: ToolUsageFinishedEvent) -> None:
            # Append a successful tool call to the in-flight trace.
            if not self._is_my_id(event.agent_id):
                return
            with self._lock:
                if self._current is None:
                    return
                self._current.tool_calls.append(
                    ToolCallRecord(
                        name=event.tool_name,
                        args_summary=_truncate(event.tool_args, _ARGS_TRUNCATE),
                        ok=True,
                        duration_ms=self._duration_ms(
                            event.tool_name, event.finished_at
                        ),
                    )
                )
                self._tool_started_at.pop(event.tool_name, None)

        @bus.on(ToolUsageErrorEvent)
        def _on_tool_error(_source: Any, event: ToolUsageErrorEvent) -> None:
            # Append a failed tool call; the error event carries no finish
            # timestamp, so the duration endpoint is "now".
            if not self._is_my_id(event.agent_id):
                return
            with self._lock:
                if self._current is None:
                    return
                self._current.tool_calls.append(
                    ToolCallRecord(
                        name=event.tool_name,
                        args_summary=_truncate(event.tool_args, _ARGS_TRUNCATE),
                        ok=False,
                        error=_truncate(event.error, _ARGS_TRUNCATE),
                        duration_ms=self._duration_ms(
                            event.tool_name, datetime.now(UTC)
                        ),
                    )
                )
                self._tool_started_at.pop(event.tool_name, None)

        @bus.on(AgentExecutionCompletedEvent)
        def _on_completed(_source: Any, event: AgentExecutionCompletedEvent) -> None:
            # Successful completion: capture the output, then grade + persist.
            if not self._is_my_agent(event.agent):
                return
            with self._lock:
                if self._current is None:
                    return
                self._current.output_summary = _truncate(
                    event.output, _OUTPUT_TRUNCATE
                )
                self._finalize_locked()

        @bus.on(AgentExecutionErrorEvent)
        def _on_error(_source: Any, event: AgentExecutionErrorEvent) -> None:
            # Failed completion: capture the error, then grade + persist.
            if not self._is_my_agent(event.agent):
                return
            with self._lock:
                if self._current is None:
                    return
                self._current.error = _truncate(event.error, _OUTPUT_TRUNCATE)
                self._finalize_locked()

        @bus.on(LiteAgentExecutionStartedEvent)
        def _on_lite_started(
            _source: Any, event: LiteAgentExecutionStartedEvent
        ) -> None:
            # LiteAgent path: there is no Task object; derive a task
            # description from the message contents instead.
            if not self._is_my_id(event.agent_info.get("id")):
                return
            with self._lock:
                if self._current is not None:
                    return  # duplicate Started for an in-flight trace
                messages = event.messages
                if isinstance(messages, list):
                    task_desc = " ".join(
                        str(m.get("content", ""))
                        for m in messages
                        if isinstance(m, dict)
                    )
                else:
                    task_desc = str(messages)
                self._current = RunTrace(
                    agent_id=str(getattr(self._agent, "id", "") or ""),
                    agent_role=self._agent.role,
                    agent_goal=getattr(self._agent, "goal", "") or "",
                    task_description=_truncate(task_desc, _OUTPUT_TRUNCATE),
                    loaded_skills=self._collect_loaded_skills(),
                )

        @bus.on(LiteAgentExecutionCompletedEvent)
        def _on_lite_completed(
            _source: Any, event: LiteAgentExecutionCompletedEvent
        ) -> None:
            # LiteAgent success: same finalize path as the full-agent handler.
            if not self._is_my_id(event.agent_info.get("id")):
                return
            with self._lock:
                if self._current is None:
                    return
                self._current.output_summary = _truncate(
                    event.output, _OUTPUT_TRUNCATE
                )
                self._finalize_locked()

        @bus.on(LiteAgentExecutionErrorEvent)
        def _on_lite_error(
            _source: Any, event: LiteAgentExecutionErrorEvent
        ) -> None:
            # LiteAgent failure: same finalize path as the full-agent handler.
            if not self._is_my_id(event.agent_info.get("id")):
                return
            with self._lock:
                if self._current is None:
                    return
                self._current.error = _truncate(event.error, _OUTPUT_TRUNCATE)
                self._finalize_locked()

    def _duration_ms(self, tool_name: str, finished_at: datetime) -> int | None:
        """Elapsed ms since *tool_name*'s recorded start, or None if no start was seen."""
        started = self._tool_started_at.get(tool_name)
        if started is None:
            return None
        # Clamp at 0 to guard against clock skew between the two timestamps.
        return max(0, int((finished_at - started).total_seconds() * 1000))

    def _collect_loaded_skills(self) -> list[str]:
        """Names of the agent's currently loaded skills (string names only)."""
        skills = getattr(self._agent, "skills", None) or []
        names: list[str] = []
        for s in skills:
            name = getattr(s, "name", None)
            if isinstance(name, str):
                names.append(name)
        return names

    def _finalize_locked(self) -> None:
        """Caller holds ``self._lock``."""
        trace = self._current
        if trace is None:
            return
        # Detach the trace first so a re-entrant event can't finalize it twice.
        self._current = None
        self._tool_started_at.clear()
        trace.ended_at = datetime.now(UTC)
        trace.outcome = grade_trace(trace)
        try:
            self._store.save(trace)
        except OSError:
            # Persistence is best-effort; losing one trace must not crash the run.
            logger.exception(
                "Failed to persist run trace for role %s", trace.agent_role
            )
|
||||||
116
lib/crewai/src/crewai/skills/self_improve/models.py
Normal file
116
lib/crewai/src/crewai/skills/self_improve/models.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
"""Data models for self-improving skills."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> datetime:
|
||||||
|
return datetime.now(UTC)
|
||||||
|
|
||||||
|
|
||||||
|
class SelfImprovementConfig(BaseModel):
    """Per-agent configuration for the self-improvement loop.

    All fields are optional with sensible defaults. Pass to ``Agent`` as
    ``self_improve=SelfImprovementConfig(skills_dir=Path("./skills/learned"))``
    to override; ``Agent(self_improve=True)`` uses defaults.
    """

    # NOTE(review): arbitrary_types_allowed looks unnecessary for a plain
    # ``Path`` field — confirm before removing.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Target directory for accepted skills; None defers to the platform default.
    skills_dir: Path | None = Field(
        default=None,
        description=(
            "Where accepted SKILL.md files are written and auto-loaded from. "
            "When None, falls back to <db_storage_path>/self_improve/skills/. "
            "Set to a project-relative path (e.g. Path('./skills/learned')) "
            "to keep accepted skills under version control."
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Auto-grading verdict for a run; "unknown" means no signal either way.
Outcome = Literal["success", "failure", "unknown"]
# Whether a proposal introduces a new skill or amends an existing one.
ProposalKind = Literal["new", "patch_existing"]
|
||||||
|
|
||||||
|
|
||||||
|
def _new_id(prefix: str) -> str:
    """Generate a short id: *prefix*, an underscore, and 12 random hex chars."""
    suffix = uuid.uuid4().hex[:12]
    return f"{prefix}_{suffix}"
|
||||||
|
|
||||||
|
|
||||||
|
class ToolCallRecord(BaseModel):
    """One tool invocation within a run."""

    # Immutable: records are append-only facts about the run.
    model_config = ConfigDict(frozen=True)

    # Tool name as reported by the tool-usage event.
    name: str
    args_summary: str = Field(
        default="",
        description="Truncated string repr of args, suitable for clustering.",
    )
    # False when the tool raised; ``error`` then carries the truncated message.
    ok: bool = True
    error: str | None = None
    # Wall-clock duration; None when no matching start event was observed.
    duration_ms: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class RunTrace(BaseModel):
    """One agent + task execution.

    Built incrementally by ``TraceCollector`` from event-bus events,
    finalized at agent completion, then persisted to disk.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    id: str = Field(default_factory=lambda: _new_id("run"))
    agent_id: str | None = None
    agent_role: str
    agent_goal: str = ""
    # Absent for LiteAgent runs, which have no Task object.
    task_id: str | None = None
    task_description: str | None = None
    started_at: datetime = Field(default_factory=_now)
    # Set by the collector when the run finalizes.
    ended_at: datetime | None = None
    tool_calls: list[ToolCallRecord] = Field(default_factory=list)
    # Skill names loaded on the agent when the run started.
    loaded_skills: list[str] = Field(default_factory=list)
    # Filled in by auto-grading at finalize time.
    outcome: Outcome = "unknown"
    output_summary: str | None = None
    error: str | None = None
    max_iter_exhausted: bool = False
    # None means the guardrail rendered no verdict for this run.
    guardrail_passed: bool | None = None

    @property
    def tool_error_count(self) -> int:
        """Number of tool calls that failed."""
        return sum(1 for t in self.tool_calls if not t.ok)

    @property
    def tool_call_count(self) -> int:
        """Total number of tool calls recorded for this run."""
        return len(self.tool_calls)
|
||||||
|
|
||||||
|
|
||||||
|
class SkillProposal(BaseModel):
    """A proposed new or updated skill, awaiting human review.

    The reviewer LLM emits these directly via a thin envelope; the server
    stamps ``agent_role`` and ``derived_from_runs`` after the call, which is
    why those two have permissive defaults rather than being required.
    """

    # Immutable; updates go through ``model_copy(update=...)``.
    model_config = ConfigDict(frozen=True)

    id: str = Field(default_factory=lambda: _new_id("prop"))
    # Stamped post-call by the caller, not emitted by the LLM.
    agent_role: str = ""
    name: str
    description: str
    # Markdown destined for SKILL.md once the proposal is accepted.
    body: str
    rationale: str
    # Proposals below the reviewer's confidence floor are filtered out.
    confidence: float = Field(ge=0.0, le=1.0)
    proposal_kind: ProposalKind = "new"
    # Name of the existing skill to amend when proposal_kind == "patch_existing".
    target_skill: str | None = None
    # Trace ids this proposal was synthesized from; stamped post-call.
    derived_from_runs: list[str] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=_now)
|
||||||
208
lib/crewai/src/crewai/skills/self_improve/reviewer.py
Normal file
208
lib/crewai/src/crewai/skills/self_improve/reviewer.py
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
"""LLM-driven skill reviewer.
|
||||||
|
|
||||||
|
Reads N ``RunTrace`` records, identifies recurring *approaches* (not
|
||||||
|
facts — those go to memory), and emits ``SkillProposal``s for human
|
||||||
|
review.
|
||||||
|
|
||||||
|
The reviewer never writes to the active skills directory itself; it only
|
||||||
|
populates the proposals queue. Acceptance is a separate, explicit step.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
from crewai.llms.base_llm import BaseLLM
|
||||||
|
from crewai.skills.self_improve.models import RunTrace, SkillProposal
|
||||||
|
|
||||||
|
|
||||||
|
# Per-trace character caps when formatting traces into the review prompt.
_TRACE_OUTPUT_TRUNCATE = 1500
_TRACE_TASK_TRUNCATE = 800
|
||||||
|
|
||||||
|
|
||||||
|
class _ReviewerOutput(BaseModel):
    """Envelope around the LLM's structured proposals.

    ``SkillProposal.agent_role`` and ``derived_from_runs`` are intentionally
    server-filled post-call; the LLM doesn't need to emit them.
    """

    # Tolerate extra keys the model may emit alongside ``proposals``.
    model_config = ConfigDict(extra="ignore")

    proposals: list[SkillProposal] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
_SYSTEM_PROMPT = """\
|
||||||
|
You are reviewing execution traces from a CrewAI agent to decide what
|
||||||
|
*reusable approaches* are worth saving as agent skills.
|
||||||
|
|
||||||
|
The agent's role: {role}
|
||||||
|
The agent's goal: {goal}
|
||||||
|
|
||||||
|
You see {n} traces from past runs. Your job:
|
||||||
|
|
||||||
|
1. Identify recurring NON-DETERMINISTIC APPROACHES — patterns of how this
|
||||||
|
agent thinks about a class of problems. Examples of approaches:
|
||||||
|
- "When asked to scaffold a project, always start by listing required
|
||||||
|
config files before writing code."
|
||||||
|
- "If a tool returns empty results, rephrase the query before retrying."
|
||||||
|
|
||||||
|
2. DROP facts. Facts go to memory, not skills. Examples of facts:
|
||||||
|
- "the user prefers dark mode"
|
||||||
|
- "the API key for service X is in vault Y"
|
||||||
|
- "this project uses pytest"
|
||||||
|
|
||||||
|
3. Drop one-off observations. A skill needs ≥2 traces showing the same
|
||||||
|
approach. If only one trace shows it, omit.
|
||||||
|
|
||||||
|
4. If a proposal would patch an EXISTING loaded skill (listed below), set
|
||||||
|
proposal_kind="patch_existing" and target_skill=<that skill name>.
|
||||||
|
Otherwise leave proposal_kind="new".
|
||||||
|
|
||||||
|
5. Score confidence honestly. Below 0.6 will be auto-rejected, so don't
|
||||||
|
pad. A pattern seen in 2/{n} traces with no contradictions ≈ 0.65; in
|
||||||
|
all {n} traces with consistent outcome ≈ 0.85.
|
||||||
|
|
||||||
|
6. SKILL.md body should be markdown:
|
||||||
|
- First line: short description in italics or as a sentence
|
||||||
|
- "When to use this skill" section
|
||||||
|
- "How to apply it" section with concrete steps
|
||||||
|
- No invented facts or fabricated tool names
|
||||||
|
|
||||||
|
Loaded skills already available to this agent:
|
||||||
|
{loaded_skills}
|
||||||
|
|
||||||
|
Proposals already QUEUED for human review (do not re-propose these — a
|
||||||
|
prior reviewer round already surfaced them, they're awaiting accept/reject):
|
||||||
|
{pending_proposals}
|
||||||
|
|
||||||
|
If a recurring pattern matches a queued proposal semantically, simply
|
||||||
|
omit it from your output. Don't restate the same approach under a new
|
||||||
|
name. The human curator will resolve the queue.
|
||||||
|
|
||||||
|
Return a JSON object matching the schema. Empty proposals list is a valid
|
||||||
|
answer when no recurring approach is worth saving.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
_USER_PROMPT = """\
|
||||||
|
TRACES ({n}):
|
||||||
|
|
||||||
|
{traces}
|
||||||
|
|
||||||
|
Review the traces above and emit your proposals.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _format_trace(trace: RunTrace) -> str:
    """One block per trace, compact but enough signal.

    Truncates the task and output to their respective caps (ellipsis-
    terminated) and renders each tool call as one ``[ok]``/``[ERR]`` line.
    """
    task = (trace.task_description or "").strip()
    if len(task) > _TRACE_TASK_TRUNCATE:
        task = task[: _TRACE_TASK_TRUNCATE - 1] + "…"

    output = (trace.output_summary or "").strip()
    if len(output) > _TRACE_OUTPUT_TRUNCATE:
        output = output[: _TRACE_OUTPUT_TRUNCATE - 1] + "…"

    tool_lines: list[str] = []
    for t in trace.tool_calls:
        tag = "ok" if t.ok else "ERR"
        tool_lines.append(f" [{tag}] {t.name}({t.args_summary})")
    tools_block = "\n".join(tool_lines) if tool_lines else " (no tool calls)"

    return (
        f"--- {trace.id} outcome={trace.outcome} "
        f"max_iter_exhausted={trace.max_iter_exhausted} "
        f"guardrail_passed={trace.guardrail_passed}\n"
        f"task:\n{task}\n\n"
        f"tool_calls ({len(trace.tool_calls)}):\n{tools_block}\n\n"
        f"output_summary:\n{output}\n"
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SkillReviewer:
    """Synthesize ``SkillProposal``s from a batch of ``RunTrace``s.

    Stateless; the only state lives in the disk store. Pass an LLM (any
    ``BaseLLM`` instance — typically a small/cheap model like Haiku is
    enough since the reviewer just summarizes).
    """

    def __init__(
        self,
        *,
        agent_role: str,
        agent_goal: str,
        llm: BaseLLM,
        min_traces: int = 3,
        confidence_floor: float = 0.6,
    ) -> None:
        """Configure the reviewer.

        ``min_traces`` gates ``review`` (fewer traces → no-op);
        ``confidence_floor`` filters the LLM's proposals post-call.
        """
        self.agent_role = agent_role
        self.agent_goal = agent_goal
        self.llm = llm
        self.min_traces = min_traces
        self.confidence_floor = confidence_floor

    def review(
        self,
        traces: list[RunTrace],
        *,
        loaded_skill_names: list[str] | None = None,
        pending_proposals: list[SkillProposal] | None = None,
    ) -> list[SkillProposal]:
        """Run the LLM review and return filtered proposals.

        Returns an empty list when ``len(traces) < self.min_traces`` so
        the reviewer is safe to call early — it just no-ops.

        Pass ``pending_proposals`` (typically the current contents of the
        ProposalStore for this role) so the reviewer doesn't re-emit
        semantic duplicates of items already awaiting human review.
        """
        if len(traces) < self.min_traces:
            return []

        # Render loaded skills / pending queue as prompt bullet lists.
        loaded_skills_str = (
            "\n".join(f" - {name}" for name in loaded_skill_names)
            if loaded_skill_names
            else " (none)"
        )

        if pending_proposals:
            pending_str = "\n".join(
                f" - {p.name}: {p.description}" for p in pending_proposals
            )
        else:
            pending_str = " (none)"

        system = _SYSTEM_PROMPT.format(
            role=self.agent_role,
            goal=self.agent_goal,
            n=len(traces),
            loaded_skills=loaded_skills_str,
            pending_proposals=pending_str,
        )
        user = _USER_PROMPT.format(
            n=len(traces),
            traces="\n".join(_format_trace(t) for t in traces),
        )

        # Structured call: the LLM must return a _ReviewerOutput envelope.
        result = self.llm.call(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            response_model=_ReviewerOutput,
        )

        # Anything other than the expected envelope is treated as "no proposals".
        if not isinstance(result, _ReviewerOutput):
            return []

        # Stamp provenance server-side and drop low-confidence proposals.
        run_ids = [t.id for t in traces]
        return [
            p.model_copy(
                update={"agent_role": self.agent_role, "derived_from_runs": run_ids}
            )
            for p in result.proposals
            if p.confidence >= self.confidence_floor
        ]
|
||||||
178
lib/crewai/src/crewai/skills/self_improve/storage.py
Normal file
178
lib/crewai/src/crewai/skills/self_improve/storage.py
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
"""On-disk storage for traces and skill proposals.
|
||||||
|
|
||||||
|
Layout::
|
||||||
|
|
||||||
|
<root>/
|
||||||
|
traces/<role>/<run_id>.json
|
||||||
|
skill_proposals/<role>/<proposal_id>.json
|
||||||
|
skills/<role>/<skill-name>/SKILL.md
|
||||||
|
|
||||||
|
The default root is ``db_storage_path() / "self_improve"`` — the same
|
||||||
|
project-scoped, platform-correct data dir that memory DBs use (set by
|
||||||
|
``appdirs.user_data_dir`` and overridable via ``CREWAI_STORAGE_DIR``).
|
||||||
|
``CREWAI_SELF_IMPROVE_DIR`` overrides specifically this feature's root,
|
||||||
|
useful for tests or migrations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.models import RunTrace, SkillProposal
|
||||||
|
from crewai.utilities.paths import db_storage_path
|
||||||
|
|
||||||
|
|
||||||
|
# Feature-specific root override, useful for tests or migrations.
_ENV_VAR = "CREWAI_SELF_IMPROVE_DIR"
# Characters disallowed in role slugs (after lowercasing and space→dash).
_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
|
||||||
|
|
||||||
|
|
||||||
|
def _slug(role: str) -> str:
    """Slugify an agent role for use as a directory name.

    Lowercases, turns spaces into dashes, strips every character outside
    ``[a-z0-9_-]``, and falls back to ``"agent"`` when nothing survives.
    """
    candidate = role.strip().lower().replace(" ", "-")
    candidate = re.sub(r"[^a-z0-9_-]+", "", candidate)
    return candidate or "agent"
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_root(root: Path | None) -> Path:
    """Resolve the storage root: explicit argument, then the
    ``CREWAI_SELF_IMPROVE_DIR`` env override, then the platform default
    under ``db_storage_path()``.
    """
    if root is not None:
        return root
    override = os.environ.get(_ENV_VAR)
    if override:
        return Path(override)
    return Path(db_storage_path()) / "self_improve"
|
||||||
|
|
||||||
|
|
||||||
|
def _write_json(path: Path, payload: str) -> None:
    """Write *payload* to *path* as UTF-8, creating parent dirs as needed."""
    parent = path.parent
    parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
class TraceStore:
    """Filesystem store for ``RunTrace`` records.

    Layout: ``<root>/traces/<role-slug>/<run_id>.json``.
    """

    def __init__(self, root: Path | None = None) -> None:
        """Anchor the store at ``<resolved root>/traces``."""
        self.root = _resolve_root(root) / "traces"

    def role_dir(self, role: str) -> Path:
        """Directory holding traces for *role* (slugified)."""
        return self.root / _slug(role)

    def path_for(self, trace: RunTrace) -> Path:
        """JSON file path for *trace*."""
        return self.role_dir(trace.agent_role) / f"{trace.id}.json"

    def save(self, trace: RunTrace) -> Path:
        """Persist *trace* as pretty-printed JSON; return the written path."""
        path = self.path_for(trace)
        _write_json(path, trace.model_dump_json(indent=2))
        return path

    def list_for_role(self, role: str) -> list[Path]:
        """Sorted trace file paths for *role*; empty if none exist yet."""
        d = self.role_dir(role)
        if not d.exists():
            return []
        return sorted(d.glob("*.json"))

    def load(self, path: Path) -> RunTrace:
        """Deserialize one trace file back into a ``RunTrace``."""
        return RunTrace.model_validate_json(path.read_text(encoding="utf-8"))

    def count_for_role(self, role: str) -> int:
        """Number of persisted traces for *role*."""
        return len(self.list_for_role(role))
|
||||||
|
|
||||||
|
|
||||||
|
class ProposalStore:
    """Filesystem store for ``SkillProposal`` records pending human review.

    Layout: ``<root>/skill_proposals/<role-slug>/<proposal_id>.json``.
    """

    def __init__(self, root: Path | None = None) -> None:
        """Anchor the store at ``<resolved root>/skill_proposals``."""
        self.root = _resolve_root(root) / "skill_proposals"

    def role_dir(self, role: str) -> Path:
        """Directory holding proposals for *role* (slugified)."""
        return self.root / _slug(role)

    def path_for(self, proposal: SkillProposal) -> Path:
        """JSON file path for *proposal*."""
        return self.role_dir(proposal.agent_role) / f"{proposal.id}.json"

    def save(self, proposal: SkillProposal) -> Path:
        """Persist *proposal* as pretty-printed JSON; return the written path."""
        path = self.path_for(proposal)
        _write_json(path, proposal.model_dump_json(indent=2))
        return path

    def list_for_role(self, role: str) -> list[Path]:
        """Sorted proposal file paths for *role*; empty if none exist yet."""
        d = self.role_dir(role)
        if not d.exists():
            return []
        return sorted(d.glob("*.json"))

    def load(self, path: Path) -> SkillProposal:
        """Deserialize one proposal file back into a ``SkillProposal``."""
        return SkillProposal.model_validate_json(path.read_text(encoding="utf-8"))

    def delete(self, proposal_id: str, role: str) -> bool:
        """Remove the proposal file; True when something was deleted."""
        path = self.role_dir(role) / f"{proposal_id}.json"
        if path.exists():
            path.unlink()
            return True
        return False

    def find(self, proposal_id: str) -> SkillProposal | None:
        """Locate a proposal by id across all role dirs. Returns None if missing."""
        if not self.root.exists():
            return None
        for role_dir in self.root.iterdir():
            if not role_dir.is_dir():
                continue
            path = role_dir / f"{proposal_id}.json"
            if path.exists():
                return self.load(path)
        return None

    def list_all(self) -> list[SkillProposal]:
        """All proposals across roles, oldest first by file id."""
        if not self.root.exists():
            return []
        out: list[SkillProposal] = []
        for role_dir in sorted(self.root.iterdir()):
            if not role_dir.is_dir():
                continue
            for path in sorted(role_dir.glob("*.json")):
                out.append(self.load(path))
        return out
|
||||||
|
|
||||||
|
|
||||||
|
class SkillStore:
    """Filesystem store for accepted (live) skills.

    Each accepted ``SkillProposal`` becomes a directory under
    ``role_dir(role) / skill_name`` with a ``SKILL.md`` inside, matching
    the layout the existing skill loader discovers.

    Two ways to construct:

    - ``SkillStore()`` — root is ``<db_storage_path>/self_improve/skills/``
      (the platform default colocated with traces + proposals).
    - ``SkillStore(skills_root=Path("./skills/learned"))`` — root is the
      given path verbatim. Use this when the agent is configured with
      ``SelfImprovementConfig(skills_dir=...)`` so accepted skills land in
      the project tree.
    """

    def __init__(
        self,
        root: Path | None = None,
        skills_root: Path | None = None,
    ) -> None:
        """``skills_root`` (verbatim) wins over ``root`` (resolved + "/skills")."""
        self.root = (
            skills_root
            if skills_root is not None
            else _resolve_root(root) / "skills"
        )

    def role_dir(self, role: str) -> Path:
        """Directory holding accepted skills for *role* (slugified)."""
        return self.root / _slug(role)

    def skill_dir(self, role: str, skill_name: str) -> Path:
        """Directory for one named skill.

        NOTE(review): ``skill_name`` is used verbatim (not slugged); a name
        containing path separators would escape ``role_dir`` — confirm
        callers sanitize proposal names before acceptance.
        """
        return self.role_dir(role) / skill_name

    def has_any(self, role: str) -> bool:
        """True when at least one skill dir for *role* contains a SKILL.md."""
        d = self.role_dir(role)
        if not d.exists():
            return False
        return any((child / "SKILL.md").is_file() for child in d.iterdir() if child.is_dir())
|
||||||
0
lib/crewai/tests/skills/self_improve/__init__.py
Normal file
0
lib/crewai/tests/skills/self_improve/__init__.py
Normal file
165
lib/crewai/tests/skills/self_improve/test_acceptance.py
Normal file
165
lib/crewai/tests/skills/self_improve/test_acceptance.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
"""Tests for self_improve/acceptance.py."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.acceptance import (
|
||||||
|
_format_skill_md,
|
||||||
|
accept_proposal,
|
||||||
|
reject_proposal,
|
||||||
|
)
|
||||||
|
from crewai.skills.self_improve.models import SkillProposal
|
||||||
|
from crewai.skills.self_improve.storage import ProposalStore, SkillStore
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def proposal() -> SkillProposal:
|
||||||
|
return SkillProposal(
|
||||||
|
agent_role="Senior Researcher",
|
||||||
|
name="cite-sources",
|
||||||
|
description="Always cite sources in research outputs",
|
||||||
|
body="# Cite Sources\n\n*Always cite sources.*\n\n## When to use\n\nWhenever you write a research summary.\n",
|
||||||
|
rationale="Seen in 3 of 4 traces",
|
||||||
|
confidence=0.8,
|
||||||
|
derived_from_runs=["run_a", "run_b", "run_c"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def stores(tmp_path: Path):
|
||||||
|
return SkillStore(root=tmp_path), ProposalStore(root=tmp_path)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatSkillMd:
|
||||||
|
def test_includes_yaml_frontmatter(self, proposal: SkillProposal) -> None:
|
||||||
|
out = _format_skill_md(proposal)
|
||||||
|
assert out.startswith("---\n")
|
||||||
|
# Values are JSON-quoted (valid YAML scalars).
|
||||||
|
assert 'name: "cite-sources"' in out
|
||||||
|
assert '"Always cite sources in research outputs"' in out
|
||||||
|
assert "# Cite Sources" in out
|
||||||
|
|
||||||
|
def test_quotes_descriptions_with_special_chars(self) -> None:
|
||||||
|
prop = SkillProposal(
|
||||||
|
agent_role="r",
|
||||||
|
name="n",
|
||||||
|
description='Has a "quote" and: colon',
|
||||||
|
body="b",
|
||||||
|
rationale="r",
|
||||||
|
confidence=0.7,
|
||||||
|
)
|
||||||
|
out = _format_skill_md(prop)
|
||||||
|
# JSON-quoted: backslash-escapes the inner quote, retains the colon literally.
|
||||||
|
assert 'description: "Has a \\"quote\\" and: colon"' in out
|
||||||
|
|
||||||
|
def test_passes_through_body_with_existing_frontmatter(self) -> None:
|
||||||
|
prop = SkillProposal(
|
||||||
|
agent_role="r",
|
||||||
|
name="n",
|
||||||
|
description="d",
|
||||||
|
body="---\nname: n\n---\n# Body\n",
|
||||||
|
rationale="r",
|
||||||
|
confidence=0.7,
|
||||||
|
)
|
||||||
|
out = _format_skill_md(prop)
|
||||||
|
# No double frontmatter
|
||||||
|
assert out.count("---") == 2 # one open, one close
|
||||||
|
|
||||||
|
|
||||||
|
class TestAcceptProposal:
|
||||||
|
def test_writes_skill_md_at_expected_path(
|
||||||
|
self, stores, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
skill_store, proposal_store = stores
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
|
||||||
|
path = accept_proposal(
|
||||||
|
proposal,
|
||||||
|
skill_store=skill_store,
|
||||||
|
proposal_store=proposal_store,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert path.name == "SKILL.md"
|
||||||
|
# role gets slugified
|
||||||
|
assert "senior-researcher" in str(path)
|
||||||
|
assert "cite-sources" in str(path)
|
||||||
|
|
||||||
|
body = path.read_text()
|
||||||
|
assert body.startswith("---\n")
|
||||||
|
assert "# Cite Sources" in body
|
||||||
|
|
||||||
|
def test_removes_proposal_from_queue_on_accept(
|
||||||
|
self, stores, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
skill_store, proposal_store = stores
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
assert proposal_store.find(proposal.id) is not None
|
||||||
|
|
||||||
|
accept_proposal(
|
||||||
|
proposal, skill_store=skill_store, proposal_store=proposal_store
|
||||||
|
)
|
||||||
|
|
||||||
|
assert proposal_store.find(proposal.id) is None
|
||||||
|
|
||||||
|
def test_refuses_to_overwrite_existing(
|
||||||
|
self, stores, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
skill_store, proposal_store = stores
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
accept_proposal(
|
||||||
|
proposal, skill_store=skill_store, proposal_store=proposal_store
|
||||||
|
)
|
||||||
|
# Re-save and re-accept the same proposal id (or a fresh one) → conflict
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
with pytest.raises(FileExistsError):
|
||||||
|
accept_proposal(
|
||||||
|
proposal,
|
||||||
|
skill_store=skill_store,
|
||||||
|
proposal_store=proposal_store,
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_force_overwrites(self, stores, proposal: SkillProposal) -> None:
|
||||||
|
skill_store, proposal_store = stores
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
accept_proposal(
|
||||||
|
proposal, skill_store=skill_store, proposal_store=proposal_store
|
||||||
|
)
|
||||||
|
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
path = accept_proposal(
|
||||||
|
proposal,
|
||||||
|
skill_store=skill_store,
|
||||||
|
proposal_store=proposal_store,
|
||||||
|
force=True,
|
||||||
|
)
|
||||||
|
assert path.exists()
|
||||||
|
|
||||||
|
|
||||||
|
class TestRejectProposal:
|
||||||
|
def test_removes_from_queue(self, stores, proposal: SkillProposal) -> None:
|
||||||
|
_, proposal_store = stores
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
assert reject_proposal(proposal, proposal_store=proposal_store) is True
|
||||||
|
assert proposal_store.find(proposal.id) is None
|
||||||
|
|
||||||
|
def test_returns_false_when_missing(
|
||||||
|
self, stores, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
_, proposal_store = stores
|
||||||
|
assert reject_proposal(proposal, proposal_store=proposal_store) is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestSkillStore:
|
||||||
|
def test_has_any_detects_at_least_one_skill(
|
||||||
|
self, stores, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
skill_store, proposal_store = stores
|
||||||
|
assert skill_store.has_any("Senior Researcher") is False
|
||||||
|
proposal_store.save(proposal)
|
||||||
|
accept_proposal(
|
||||||
|
proposal, skill_store=skill_store, proposal_store=proposal_store
|
||||||
|
)
|
||||||
|
assert skill_store.has_any("Senior Researcher") is True
|
||||||
85
lib/crewai/tests/skills/self_improve/test_auto_grade.py
Normal file
85
lib/crewai/tests/skills/self_improve/test_auto_grade.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""Tests for self_improve/auto_grade.py."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.auto_grade import grade_trace
|
||||||
|
from crewai.skills.self_improve.models import RunTrace, ToolCallRecord
|
||||||
|
|
||||||
|
|
||||||
|
def _trace(**kw):
|
||||||
|
return RunTrace(agent_role="r", **kw)
|
||||||
|
|
||||||
|
|
||||||
|
def test_explicit_error_is_failure() -> None:
|
||||||
|
assert grade_trace(_trace(error="kaboom", output_summary="ok")) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_guardrail_pass_overrides_other_signals() -> None:
|
||||||
|
trace = _trace(
|
||||||
|
guardrail_passed=True,
|
||||||
|
max_iter_exhausted=True, # would normally fail, but guardrail wins
|
||||||
|
output_summary="ok",
|
||||||
|
)
|
||||||
|
assert grade_trace(trace) == "success"
|
||||||
|
|
||||||
|
|
||||||
|
def test_guardrail_fail_is_failure() -> None:
|
||||||
|
assert grade_trace(_trace(guardrail_passed=False, output_summary="x")) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_iter_is_failure() -> None:
|
||||||
|
assert grade_trace(_trace(max_iter_exhausted=True, output_summary="x")) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_thrashing_is_failure() -> None:
|
||||||
|
trace = _trace(
|
||||||
|
tool_calls=[
|
||||||
|
ToolCallRecord(name="search", args_summary="q=x") for _ in range(5)
|
||||||
|
],
|
||||||
|
output_summary="ok",
|
||||||
|
)
|
||||||
|
assert grade_trace(trace) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_output_is_failure() -> None:
|
||||||
|
assert grade_trace(_trace(output_summary=" ")) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_error_string_output_is_failure() -> None:
|
||||||
|
assert grade_trace(_trace(output_summary="Error: boom")) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_minority_tool_errors_still_count_as_success() -> None:
|
||||||
|
trace = _trace(
|
||||||
|
tool_calls=[
|
||||||
|
ToolCallRecord(name="a", ok=True),
|
||||||
|
ToolCallRecord(name="b", ok=True),
|
||||||
|
ToolCallRecord(name="c", ok=False, error="x"),
|
||||||
|
],
|
||||||
|
output_summary="answer",
|
||||||
|
)
|
||||||
|
assert grade_trace(trace) == "success"
|
||||||
|
|
||||||
|
|
||||||
|
def test_failure_when_majority_tool_errors() -> None:
|
||||||
|
trace = _trace(
|
||||||
|
tool_calls=[
|
||||||
|
ToolCallRecord(name="a", ok=False, error="x"),
|
||||||
|
ToolCallRecord(name="b", ok=False, error="x"),
|
||||||
|
ToolCallRecord(name="c", ok=True),
|
||||||
|
],
|
||||||
|
output_summary="answer",
|
||||||
|
)
|
||||||
|
assert grade_trace(trace) == "failure"
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_run_is_success() -> None:
|
||||||
|
trace = _trace(
|
||||||
|
tool_calls=[ToolCallRecord(name="a", ok=True)],
|
||||||
|
output_summary="answer",
|
||||||
|
)
|
||||||
|
assert grade_trace(trace) == "success"
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_signal_is_unknown() -> None:
|
||||||
|
assert grade_trace(_trace()) == "unknown"
|
||||||
103
lib/crewai/tests/skills/self_improve/test_cli_skills.py
Normal file
103
lib/crewai/tests/skills/self_improve/test_cli_skills.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
"""Tests for ``crewai skills proposals`` CLI commands."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from click.testing import CliRunner
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crewai.cli.skills_proposals import skills as skills_group
|
||||||
|
from crewai.skills.self_improve.models import SkillProposal
|
||||||
|
from crewai.skills.self_improve.storage import ProposalStore, SkillStore
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def proposal() -> SkillProposal:
|
||||||
|
return SkillProposal(
|
||||||
|
agent_role="researcher",
|
||||||
|
name="cite-sources",
|
||||||
|
description="Always cite sources",
|
||||||
|
body="# Cite Sources\n\nbody.",
|
||||||
|
rationale="seen in 3 traces",
|
||||||
|
confidence=0.75,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def runner_with_root(tmp_path: Path):
|
||||||
|
"""CliRunner with the self-improve root patched at the storage layer."""
|
||||||
|
runner = CliRunner()
|
||||||
|
proposal_store = ProposalStore(root=tmp_path)
|
||||||
|
skill_store = SkillStore(root=tmp_path)
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"crewai.cli.skills_proposals.ProposalStore", return_value=proposal_store
|
||||||
|
), patch(
|
||||||
|
"crewai.skills.self_improve.acceptance.ProposalStore",
|
||||||
|
return_value=proposal_store,
|
||||||
|
), patch(
|
||||||
|
"crewai.skills.self_improve.acceptance.SkillStore", return_value=skill_store
|
||||||
|
):
|
||||||
|
yield runner, proposal_store, skill_store
|
||||||
|
|
||||||
|
|
||||||
|
class TestList:
|
||||||
|
def test_empty(self, runner_with_root) -> None:
|
||||||
|
runner, _, _ = runner_with_root
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "list"])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert "no pending proposals" in result.output
|
||||||
|
|
||||||
|
def test_one_pending(self, runner_with_root, proposal: SkillProposal) -> None:
|
||||||
|
runner, ps, _ = runner_with_root
|
||||||
|
ps.save(proposal)
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "list"])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert proposal.id in result.output
|
||||||
|
assert "cite-sources" in result.output
|
||||||
|
|
||||||
|
|
||||||
|
class TestShow:
|
||||||
|
def test_unknown_id_exits_nonzero(self, runner_with_root) -> None:
|
||||||
|
runner, _, _ = runner_with_root
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "show", "prop_does_not_exist"])
|
||||||
|
assert result.exit_code == 1
|
||||||
|
assert "No proposal" in result.output
|
||||||
|
|
||||||
|
def test_prints_body(self, runner_with_root, proposal: SkillProposal) -> None:
|
||||||
|
runner, ps, _ = runner_with_root
|
||||||
|
ps.save(proposal)
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "show", proposal.id])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert "# Cite Sources" in result.output
|
||||||
|
assert "rationale" in result.output
|
||||||
|
|
||||||
|
|
||||||
|
class TestAccept:
|
||||||
|
def test_writes_skill_md_and_clears_queue(
|
||||||
|
self, runner_with_root, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
runner, ps, ss = runner_with_root
|
||||||
|
ps.save(proposal)
|
||||||
|
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "accept", proposal.id])
|
||||||
|
assert result.exit_code == 0, result.output
|
||||||
|
assert ps.find(proposal.id) is None
|
||||||
|
assert (ss.skill_dir("researcher", "cite-sources") / "SKILL.md").is_file()
|
||||||
|
|
||||||
|
def test_unknown_id_exits_nonzero(self, runner_with_root) -> None:
|
||||||
|
runner, _, _ = runner_with_root
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "accept", "prop_nope"])
|
||||||
|
assert result.exit_code == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestReject:
|
||||||
|
def test_removes_from_queue(self, runner_with_root, proposal: SkillProposal) -> None:
|
||||||
|
runner, ps, _ = runner_with_root
|
||||||
|
ps.save(proposal)
|
||||||
|
|
||||||
|
result = runner.invoke(skills_group, ["proposals", "reject", proposal.id])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert ps.find(proposal.id) is None
|
||||||
153
lib/crewai/tests/skills/self_improve/test_collector.py
Normal file
153
lib/crewai/tests/skills/self_improve/test_collector.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
"""Tests for self_improve/collector.py.
|
||||||
|
|
||||||
|
The collector is exercised through the real event bus inside a
|
||||||
|
``scoped_handlers()`` block. Events are constructed with
|
||||||
|
``model_construct`` so we don't need to spin up a real Agent + LLM.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crewai.events.event_bus import crewai_event_bus
|
||||||
|
from crewai.events.types.agent_events import (
|
||||||
|
AgentExecutionCompletedEvent,
|
||||||
|
AgentExecutionErrorEvent,
|
||||||
|
AgentExecutionStartedEvent,
|
||||||
|
)
|
||||||
|
from crewai.events.types.tool_usage_events import (
|
||||||
|
ToolUsageErrorEvent,
|
||||||
|
ToolUsageFinishedEvent,
|
||||||
|
ToolUsageStartedEvent,
|
||||||
|
)
|
||||||
|
from crewai.skills.self_improve.collector import TraceCollector
|
||||||
|
from crewai.skills.self_improve.storage import TraceStore
|
||||||
|
|
||||||
|
|
||||||
|
def _fake_agent(*, agent_id: str = "agent-1", role: str = "researcher"):
|
||||||
|
return SimpleNamespace(id=agent_id, role=role, skills=None)
|
||||||
|
|
||||||
|
|
||||||
|
def _fake_task(task_id: str = "task-1", description: str = "do the thing"):
|
||||||
|
return SimpleNamespace(id=task_id, description=description)
|
||||||
|
|
||||||
|
|
||||||
|
def _started(agent, task) -> AgentExecutionStartedEvent:
|
||||||
|
return AgentExecutionStartedEvent.model_construct(
|
||||||
|
agent=agent, task=task, tools=[], task_prompt=task.description
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _completed(agent, task, output: str) -> AgentExecutionCompletedEvent:
|
||||||
|
return AgentExecutionCompletedEvent.model_construct(
|
||||||
|
agent=agent, task=task, output=output
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _error(agent, task, msg: str) -> AgentExecutionErrorEvent:
|
||||||
|
return AgentExecutionErrorEvent.model_construct(agent=agent, task=task, error=msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _tool_started(agent_id: str, name: str, args="q=x") -> ToolUsageStartedEvent:
|
||||||
|
return ToolUsageStartedEvent.model_construct(
|
||||||
|
agent_id=agent_id, tool_name=name, tool_args=args
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _tool_finished(agent_id: str, name: str, args="q=x") -> ToolUsageFinishedEvent:
|
||||||
|
now = datetime.now(UTC)
|
||||||
|
return ToolUsageFinishedEvent.model_construct(
|
||||||
|
agent_id=agent_id,
|
||||||
|
tool_name=name,
|
||||||
|
tool_args=args,
|
||||||
|
started_at=now,
|
||||||
|
finished_at=now,
|
||||||
|
output="result",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _tool_error(agent_id: str, name: str, args="q=x") -> ToolUsageErrorEvent:
|
||||||
|
return ToolUsageErrorEvent.model_construct(
|
||||||
|
agent_id=agent_id, tool_name=name, tool_args=args, error="boom"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def store(tmp_path: Path) -> TraceStore:
|
||||||
|
return TraceStore(root=tmp_path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_collects_full_run_and_persists(store: TraceStore) -> None:
|
||||||
|
agent = _fake_agent()
|
||||||
|
task = _fake_task()
|
||||||
|
collector = TraceCollector(agent, store=store)
|
||||||
|
|
||||||
|
with crewai_event_bus.scoped_handlers():
|
||||||
|
collector.attach(crewai_event_bus)
|
||||||
|
# Flush between emits so the bus thread pool can't reorder
|
||||||
|
# tool events past the completion event in this fast test path.
|
||||||
|
crewai_event_bus.emit(None, _started(agent, task))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
crewai_event_bus.emit(None, _tool_started(agent.id, "search"))
|
||||||
|
crewai_event_bus.emit(None, _tool_finished(agent.id, "search"))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
crewai_event_bus.emit(None, _completed(agent, task, "final answer"))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
|
||||||
|
saved = store.list_for_role("researcher")
|
||||||
|
assert len(saved) == 1
|
||||||
|
|
||||||
|
trace = store.load(saved[0])
|
||||||
|
assert trace.agent_role == "researcher"
|
||||||
|
assert trace.task_id == "task-1"
|
||||||
|
assert trace.output_summary == "final answer"
|
||||||
|
assert trace.outcome == "success"
|
||||||
|
assert trace.tool_call_count == 1
|
||||||
|
assert trace.tool_error_count == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_error_path_is_graded_as_failure(store: TraceStore) -> None:
|
||||||
|
agent = _fake_agent()
|
||||||
|
task = _fake_task()
|
||||||
|
collector = TraceCollector(agent, store=store)
|
||||||
|
|
||||||
|
with crewai_event_bus.scoped_handlers():
|
||||||
|
collector.attach(crewai_event_bus)
|
||||||
|
crewai_event_bus.emit(None, _started(agent, task))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
crewai_event_bus.emit(None, _tool_error(agent.id, "search"))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
crewai_event_bus.emit(None, _error(agent, task, "agent crashed"))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
|
||||||
|
[path] = store.list_for_role("researcher")
|
||||||
|
trace = store.load(path)
|
||||||
|
assert trace.outcome == "failure"
|
||||||
|
assert trace.error == "agent crashed"
|
||||||
|
assert trace.tool_error_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_ignores_events_for_other_agents(store: TraceStore) -> None:
|
||||||
|
mine = _fake_agent(agent_id="mine", role="researcher")
|
||||||
|
other = _fake_agent(agent_id="other", role="editor")
|
||||||
|
task = _fake_task()
|
||||||
|
collector = TraceCollector(mine, store=store)
|
||||||
|
|
||||||
|
with crewai_event_bus.scoped_handlers():
|
||||||
|
collector.attach(crewai_event_bus)
|
||||||
|
crewai_event_bus.emit(None, _started(mine, task))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
# tool events for some other agent must not pollute our trace
|
||||||
|
crewai_event_bus.emit(None, _tool_finished(other.id, "leaked-tool"))
|
||||||
|
crewai_event_bus.emit(None, _tool_finished(mine.id, "real-tool"))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
crewai_event_bus.emit(None, _completed(mine, task, "ok"))
|
||||||
|
crewai_event_bus.flush(timeout=5.0)
|
||||||
|
|
||||||
|
[path] = store.list_for_role("researcher")
|
||||||
|
trace = store.load(path)
|
||||||
|
assert [t.name for t in trace.tool_calls] == ["real-tool"]
|
||||||
82
lib/crewai/tests/skills/self_improve/test_models.py
Normal file
82
lib/crewai/tests/skills/self_improve/test_models.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
"""Tests for self_improve/models.py."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.models import (
|
||||||
|
RunTrace,
|
||||||
|
SelfImprovementConfig,
|
||||||
|
SkillProposal,
|
||||||
|
ToolCallRecord,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSelfImprovementConfig:
|
||||||
|
def test_defaults(self) -> None:
|
||||||
|
cfg = SelfImprovementConfig()
|
||||||
|
assert cfg.skills_dir is None
|
||||||
|
|
||||||
|
def test_skills_dir_round_trip(self, tmp_path: Path) -> None:
|
||||||
|
cfg = SelfImprovementConfig(skills_dir=tmp_path / "learned")
|
||||||
|
assert cfg.skills_dir == tmp_path / "learned"
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunTrace:
|
||||||
|
def test_minimal_trace_has_defaults(self) -> None:
|
||||||
|
trace = RunTrace(agent_role="researcher")
|
||||||
|
assert trace.agent_role == "researcher"
|
||||||
|
assert trace.outcome == "unknown"
|
||||||
|
assert trace.tool_calls == []
|
||||||
|
assert trace.id.startswith("run_")
|
||||||
|
assert trace.tool_call_count == 0
|
||||||
|
assert trace.tool_error_count == 0
|
||||||
|
|
||||||
|
def test_tool_counters(self) -> None:
|
||||||
|
trace = RunTrace(
|
||||||
|
agent_role="researcher",
|
||||||
|
tool_calls=[
|
||||||
|
ToolCallRecord(name="search", ok=True),
|
||||||
|
ToolCallRecord(name="search", ok=False, error="boom"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
assert trace.tool_call_count == 2
|
||||||
|
assert trace.tool_error_count == 1
|
||||||
|
|
||||||
|
def test_serializes_roundtrip(self) -> None:
|
||||||
|
trace = RunTrace(
|
||||||
|
agent_role="researcher",
|
||||||
|
tool_calls=[ToolCallRecord(name="search", args_summary="q=hi")],
|
||||||
|
)
|
||||||
|
payload = trace.model_dump_json()
|
||||||
|
roundtrip = RunTrace.model_validate_json(payload)
|
||||||
|
assert roundtrip.id == trace.id
|
||||||
|
assert roundtrip.tool_calls[0].name == "search"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSkillProposal:
|
||||||
|
def test_minimal_proposal(self) -> None:
|
||||||
|
prop = SkillProposal(
|
||||||
|
agent_role="researcher",
|
||||||
|
name="my-skill",
|
||||||
|
description="A skill",
|
||||||
|
body="# body",
|
||||||
|
rationale="seen 3 times",
|
||||||
|
confidence=0.8,
|
||||||
|
)
|
||||||
|
assert prop.proposal_kind == "new"
|
||||||
|
assert prop.id.startswith("prop_")
|
||||||
|
|
||||||
|
def test_confidence_must_be_in_range(self) -> None:
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
SkillProposal(
|
||||||
|
agent_role="r",
|
||||||
|
name="n",
|
||||||
|
description="d",
|
||||||
|
body="b",
|
||||||
|
rationale="r",
|
||||||
|
confidence=1.5,
|
||||||
|
)
|
||||||
192
lib/crewai/tests/skills/self_improve/test_reviewer.py
Normal file
192
lib/crewai/tests/skills/self_improve/test_reviewer.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
"""Tests for self_improve/reviewer.py."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.models import RunTrace, SkillProposal, ToolCallRecord
|
||||||
|
from crewai.skills.self_improve.reviewer import (
|
||||||
|
SkillReviewer,
|
||||||
|
_ReviewerOutput,
|
||||||
|
_format_trace,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _llm_proposal(**kw):
|
||||||
|
"""Build a SkillProposal as the LLM would emit it (no server-filled fields)."""
|
||||||
|
base = dict(
|
||||||
|
name="x",
|
||||||
|
description="d",
|
||||||
|
body="b",
|
||||||
|
rationale="r",
|
||||||
|
confidence=0.7,
|
||||||
|
)
|
||||||
|
base.update(kw)
|
||||||
|
return SkillProposal(**base)
|
||||||
|
|
||||||
|
|
||||||
|
def _trace(**kw: Any) -> RunTrace:
|
||||||
|
base = {"agent_role": "researcher", "outcome": "success"}
|
||||||
|
base.update(kw)
|
||||||
|
return RunTrace(**base)
|
||||||
|
|
||||||
|
|
||||||
|
def _stub_llm(output: _ReviewerOutput) -> MagicMock:
|
||||||
|
"""Return a BaseLLM stand-in whose ``call`` returns the given output."""
|
||||||
|
llm = MagicMock()
|
||||||
|
llm.call = MagicMock(return_value=output)
|
||||||
|
return llm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def reviewer_factory():
|
||||||
|
def make(llm, **kw):
|
||||||
|
return SkillReviewer(
|
||||||
|
agent_role="researcher",
|
||||||
|
agent_goal="answer questions",
|
||||||
|
llm=llm,
|
||||||
|
**kw,
|
||||||
|
)
|
||||||
|
|
||||||
|
return make
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatTrace:
|
||||||
|
def test_includes_outcome_task_and_tool_calls(self) -> None:
|
||||||
|
trace = _trace(
|
||||||
|
task_description="Find papers on X",
|
||||||
|
output_summary="Found 3 papers.",
|
||||||
|
tool_calls=[
|
||||||
|
ToolCallRecord(name="search", args_summary="q=X", ok=True),
|
||||||
|
ToolCallRecord(name="fetch", args_summary="id=42", ok=False, error="404"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
block = _format_trace(trace)
|
||||||
|
assert "outcome=success" in block
|
||||||
|
assert "Find papers on X" in block
|
||||||
|
assert "Found 3 papers." in block
|
||||||
|
assert "[ok] search(q=X)" in block
|
||||||
|
assert "[ERR] fetch(id=42)" in block
|
||||||
|
|
||||||
|
def test_truncates_long_output(self) -> None:
|
||||||
|
trace = _trace(output_summary="x" * 5000)
|
||||||
|
block = _format_trace(trace)
|
||||||
|
assert "…" in block
|
||||||
|
assert len(block) < 5000
|
||||||
|
|
||||||
|
|
||||||
|
class TestSkillReviewer:
|
||||||
|
def test_returns_empty_when_below_min_traces(self, reviewer_factory) -> None:
|
||||||
|
llm = _stub_llm(_ReviewerOutput(proposals=[]))
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=3)
|
||||||
|
result = reviewer.review([_trace(), _trace()])
|
||||||
|
assert result == []
|
||||||
|
llm.call.assert_not_called()
|
||||||
|
|
||||||
|
def test_filters_by_confidence_floor(self, reviewer_factory) -> None:
|
||||||
|
llm_output = _ReviewerOutput(
|
||||||
|
proposals=[
|
||||||
|
_llm_proposal(name="keep", confidence=0.8),
|
||||||
|
_llm_proposal(name="drop", confidence=0.3),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
llm = _stub_llm(llm_output)
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2, confidence_floor=0.6)
|
||||||
|
out = reviewer.review([_trace(), _trace(), _trace()])
|
||||||
|
assert [p.name for p in out] == ["keep"]
|
||||||
|
|
||||||
|
def test_sets_agent_role_and_run_ids(self, reviewer_factory) -> None:
|
||||||
|
llm = _stub_llm(
|
||||||
|
_ReviewerOutput(proposals=[_llm_proposal(name="cite-sources")])
|
||||||
|
)
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2)
|
||||||
|
traces = [_trace(), _trace(), _trace()]
|
||||||
|
out = reviewer.review(traces)
|
||||||
|
assert len(out) == 1
|
||||||
|
prop = out[0]
|
||||||
|
assert prop.agent_role == "researcher"
|
||||||
|
assert prop.derived_from_runs == [t.id for t in traces]
|
||||||
|
assert prop.proposal_kind == "new"
|
||||||
|
|
||||||
|
def test_passes_loaded_skills_into_prompt(self, reviewer_factory) -> None:
|
||||||
|
llm = _stub_llm(_ReviewerOutput(proposals=[]))
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2)
|
||||||
|
reviewer.review(
|
||||||
|
[_trace(), _trace(), _trace()],
|
||||||
|
loaded_skill_names=["citing", "search-tactics"],
|
||||||
|
)
|
||||||
|
call_kwargs = llm.call.call_args.kwargs
|
||||||
|
messages = call_kwargs["messages"]
|
||||||
|
system_msg = next(m["content"] for m in messages if m["role"] == "system")
|
||||||
|
assert "citing" in system_msg
|
||||||
|
assert "search-tactics" in system_msg
|
||||||
|
|
||||||
|
def test_handles_non_model_response_gracefully(self, reviewer_factory) -> None:
|
||||||
|
# An LLM that returned a plain string instead of the structured model.
|
||||||
|
llm = MagicMock()
|
||||||
|
llm.call = MagicMock(return_value="totally not a model")
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2)
|
||||||
|
out = reviewer.review([_trace(), _trace(), _trace()])
|
||||||
|
assert out == []
|
||||||
|
|
||||||
|
def test_pending_proposals_appear_in_prompt(self, reviewer_factory) -> None:
|
||||||
|
llm = _stub_llm(_ReviewerOutput(proposals=[]))
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2)
|
||||||
|
pending = [
|
||||||
|
SkillProposal(
|
||||||
|
agent_role="researcher",
|
||||||
|
name="cite-sources",
|
||||||
|
description="Always cite sources in research output.",
|
||||||
|
body="b",
|
||||||
|
rationale="r",
|
||||||
|
confidence=0.8,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
reviewer.review(
|
||||||
|
[_trace(), _trace(), _trace()],
|
||||||
|
pending_proposals=pending,
|
||||||
|
)
|
||||||
|
messages = llm.call.call_args.kwargs["messages"]
|
||||||
|
system_msg = next(m["content"] for m in messages if m["role"] == "system")
|
||||||
|
assert "cite-sources" in system_msg
|
||||||
|
assert "Always cite sources" in system_msg
|
||||||
|
# And it should be in the queued-proposals section, not the loaded
|
||||||
|
# skills section.
|
||||||
|
assert "QUEUED" in system_msg
|
||||||
|
|
||||||
|
def test_pending_proposals_default_none_renders_none(
|
||||||
|
self, reviewer_factory
|
||||||
|
) -> None:
|
||||||
|
llm = _stub_llm(_ReviewerOutput(proposals=[]))
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2)
|
||||||
|
reviewer.review([_trace(), _trace(), _trace()])
|
||||||
|
system_msg = next(
|
||||||
|
m["content"]
|
||||||
|
for m in llm.call.call_args.kwargs["messages"]
|
||||||
|
if m["role"] == "system"
|
||||||
|
)
|
||||||
|
# The "(none)" sentinel renders under both sections when nothing was passed.
|
||||||
|
assert system_msg.count("(none)") >= 2
|
||||||
|
|
||||||
|
def test_patch_existing_kind_passes_through(self, reviewer_factory) -> None:
|
||||||
|
llm = _stub_llm(
|
||||||
|
_ReviewerOutput(
|
||||||
|
proposals=[
|
||||||
|
_llm_proposal(
|
||||||
|
name="citing-v2",
|
||||||
|
confidence=0.9,
|
||||||
|
proposal_kind="patch_existing",
|
||||||
|
target_skill="citing",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
reviewer = reviewer_factory(llm, min_traces=2)
|
||||||
|
[prop] = reviewer.review(
|
||||||
|
[_trace(), _trace(), _trace()], loaded_skill_names=["citing"]
|
||||||
|
)
|
||||||
|
assert prop.proposal_kind == "patch_existing"
|
||||||
|
assert prop.target_skill == "citing"
|
||||||
69
lib/crewai/tests/skills/self_improve/test_storage.py
Normal file
69
lib/crewai/tests/skills/self_improve/test_storage.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
"""Tests for self_improve/storage.py."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crewai.skills.self_improve.models import RunTrace, SkillProposal
|
||||||
|
from crewai.skills.self_improve.storage import ProposalStore, TraceStore
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def trace() -> RunTrace:
|
||||||
|
return RunTrace(agent_role="Senior Researcher", output_summary="hello")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def proposal() -> SkillProposal:
|
||||||
|
return SkillProposal(
|
||||||
|
agent_role="Senior Researcher",
|
||||||
|
name="cite-sources",
|
||||||
|
description="Always cite sources",
|
||||||
|
body="# body",
|
||||||
|
rationale="3 of 5 runs cited",
|
||||||
|
confidence=0.7,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTraceStore:
|
||||||
|
def test_save_and_load_roundtrip(self, tmp_path: Path, trace: RunTrace) -> None:
|
||||||
|
store = TraceStore(root=tmp_path)
|
||||||
|
path = store.save(trace)
|
||||||
|
assert path.exists()
|
||||||
|
# role gets slugified into the dir name
|
||||||
|
assert "senior-researcher" in str(path)
|
||||||
|
|
||||||
|
loaded = store.load(path)
|
||||||
|
assert loaded.id == trace.id
|
||||||
|
assert loaded.output_summary == "hello"
|
||||||
|
|
||||||
|
def test_list_for_role(self, tmp_path: Path) -> None:
|
||||||
|
store = TraceStore(root=tmp_path)
|
||||||
|
for _ in range(3):
|
||||||
|
store.save(RunTrace(agent_role="researcher"))
|
||||||
|
assert store.count_for_role("researcher") == 3
|
||||||
|
|
||||||
|
def test_role_slug_is_filesystem_safe(self, tmp_path: Path) -> None:
|
||||||
|
store = TraceStore(root=tmp_path)
|
||||||
|
store.save(RunTrace(agent_role="Weird/Role:Name!"))
|
||||||
|
# only safe chars survive after slugify
|
||||||
|
assert any(p.is_dir() for p in store.root.iterdir())
|
||||||
|
|
||||||
|
|
||||||
|
class TestProposalStore:
|
||||||
|
def test_save_and_load_roundtrip(
|
||||||
|
self, tmp_path: Path, proposal: SkillProposal
|
||||||
|
) -> None:
|
||||||
|
store = ProposalStore(root=tmp_path)
|
||||||
|
path = store.save(proposal)
|
||||||
|
loaded = store.load(path)
|
||||||
|
assert loaded.id == proposal.id
|
||||||
|
assert loaded.name == "cite-sources"
|
||||||
|
|
||||||
|
def test_delete(self, tmp_path: Path, proposal: SkillProposal) -> None:
|
||||||
|
store = ProposalStore(root=tmp_path)
|
||||||
|
store.save(proposal)
|
||||||
|
assert store.delete(proposal.id, "Senior Researcher") is True
|
||||||
|
assert store.delete(proposal.id, "Senior Researcher") is False
|
||||||
Reference in New Issue
Block a user