Merge branch 'main' into update_readme.md

Committed by João Moura (via GitHub) on 2024-09-06 at 23:59:41 -07:00
16 changed files with 128 additions and 101 deletions

BIN (filename not captured), before: 1.0 MiB
BIN (filename not captured), before: 810 KiB
BIN docs/assets/langtrace1.png (new file), after: 223 KiB
BIN docs/assets/langtrace2.png (new file), after: 204 KiB
BIN docs/assets/langtrace3.png (new file), after: 295 KiB

View File

@@ -7,10 +7,14 @@ description: How to monitor cost, latency, and performance of CrewAI Agents usin
 Langtrace is an open-source, external tool that helps you set up observability and evaluations for Large Language Models (LLMs), LLM frameworks, and Vector Databases. While not built directly into CrewAI, Langtrace can be used alongside CrewAI to gain deep visibility into the cost, latency, and performance of your CrewAI Agents. This integration allows you to log hyperparameters, monitor performance regressions, and establish a process for continuous improvement of your Agents.
+![Overview of a select series of agent session runs](..%2Fassets%2Flangtrace1.png)
+![Overview of agent traces](..%2Fassets%2Flangtrace2.png)
+![Overview of LLM traces in detail](..%2Fassets%2Flangtrace3.png)
 ## Setup Instructions
 1. Sign up for [Langtrace](https://langtrace.ai/) by visiting [https://langtrace.ai/signup](https://langtrace.ai/signup).
-2. Create a project and generate an API key.
+2. Create a project, set the project type to CrewAI, and generate an API key.
 3. Install Langtrace in your CrewAI project using the following commands:
 ```bash
@@ -32,58 +36,29 @@ langtrace.init(api_key='<LANGTRACE_API_KEY>')
 from crewai import Agent, Task, Crew
 ```
-2. Create your CrewAI agents and tasks as usual.
-3. Use Langtrace's tracking functions to monitor your CrewAI operations. For example:
-```python
-with langtrace.trace("CrewAI Task Execution"):
-    result = crew.kickoff()
-```
 ### Features and Their Application to CrewAI
 1. **LLM Token and Cost Tracking**
    - Monitor the token usage and associated costs for each CrewAI agent interaction.
-   - Example:
-     ```python
-     with langtrace.trace("Agent Interaction"):
-         agent_response = agent.execute(task)
-     ```
 2. **Trace Graph for Execution Steps**
    - Visualize the execution flow of your CrewAI tasks, including latency and logs.
    - Useful for identifying bottlenecks in your agent workflows.
 3. **Dataset Curation with Manual Annotation**
    - Create datasets from your CrewAI task outputs for future training or evaluation.
-   - Example:
-     ```python
-     langtrace.log_dataset_item(task_input, agent_output, {"task_type": "research"})
-     ```
 4. **Prompt Versioning and Management**
    - Keep track of different versions of prompts used in your CrewAI agents.
    - Useful for A/B testing and optimizing agent performance.
 5. **Prompt Playground with Model Comparisons**
    - Test and compare different prompts and models for your CrewAI agents before deployment.
 6. **Testing and Evaluations**
    - Set up automated tests for your CrewAI agents and tasks.
-   - Example:
-     ```python
-     langtrace.evaluate(agent_output, expected_output, "accuracy")
-     ```
-
-## Monitoring New CrewAI Features
-CrewAI has introduced several new features that can be monitored using Langtrace:
-1. **Code Execution**: Monitor the performance and output of code executed by agents.
-```python
-with langtrace.trace("Agent Code Execution"):
-    code_output = agent.execute_code(code_snippet)
-```
-2. **Third-party Agent Integration**: Track interactions with LlamaIndex, LangChain, and Autogen agents.
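
The hunks above trim the Langtrace guide down to the core integration flow: initialize Langtrace, then import CrewAI. Below is a minimal sketch of that order, assuming the langtrace_python_sdk import path and using placeholder agent and task definitions that are not part of the diff.

```python
# A minimal sketch (not from the diff): the key point of the updated doc is that
# langtrace.init() runs before any CrewAI module is imported.
from langtrace_python_sdk import langtrace  # assumed SDK import path

langtrace.init(api_key="<LANGTRACE_API_KEY>")

from crewai import Agent, Crew, Task  # imported only after Langtrace is initialized

# Illustrative agent and task; any CrewAI setup works the same way.
researcher = Agent(
    role="Researcher",
    goal="Summarize LLM observability practices",
    backstory="An analyst focused on tooling.",
)
summary_task = Task(
    description="Write a short summary of LLM observability.",
    expected_output="A one-paragraph summary.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[summary_task])
result = crew.kickoff()  # token usage, cost, and latency show up in the Langtrace dashboard
```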

View File

@@ -584,7 +584,10 @@ class Crew(BaseModel):
             self.manager_agent.allow_delegation = True
             manager = self.manager_agent
             if manager.tools is not None and len(manager.tools) > 0:
-                raise Exception("Manager agent should not have tools")
+                self._logger.log(
+                    "warning", "Manager agent should not have tools", color="orange"
+                )
+                manager.tools = []
             manager.tools = self.manager_agent.get_delegation_tools(self.agents)
         else:
             manager = Agent(
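
This hunk turns a hard failure into a warning: a manager agent configured with tools no longer raises; the crew logs a warning, clears those tools, and substitutes delegation tools. A hedged sketch of the calling side, with illustrative agents and task:

```python
# Illustrative sketch of a hierarchical crew after the change above. Agent and task
# definitions here are placeholders, not taken from the diff.
from crewai import Agent, Crew, Process, Task

manager = Agent(
    role="Project Manager",
    goal="Coordinate the team and delegate work",
    backstory="Oversees execution and quality.",
)
writer = Agent(role="Writer", goal="Draft content", backstory="Writes short reports.")

report_task = Task(
    description="Draft a one-page status report.",
    expected_output="A short report.",
    agent=writer,
)

crew = Crew(
    agents=[writer],
    tasks=[report_task],
    process=Process.hierarchical,  # hierarchical crews route work through the manager
    manager_agent=manager,         # any tools set on this agent are now cleared with a warning, not rejected
)
```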

View File

@@ -103,7 +103,8 @@ def crew(func):
         for task_name in sorted_task_names:
             task_instance = tasks[task_name]()
             instantiated_tasks.append(task_instance)
-            if hasattr(task_instance, "agent"):
-                agent_instance = task_instance.agent
+            agent_instance = getattr(task_instance, "agent", None)
+            if agent_instance is not None:
                 if agent_instance.role not in agent_roles:
                     instantiated_agents.append(agent_instance)
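
The decorator now reads each task's agent once with a None default instead of hasattr plus a second attribute lookup, which also covers tasks whose agent is explicitly None. A standalone sketch of that pattern with toy classes (not CrewAI's own):

```python
# Toy illustration of the getattr-with-default pattern adopted above.
from typing import List, Optional


class ToyAgent:
    def __init__(self, role: str) -> None:
        self.role = role


class ToyTask:
    def __init__(self, agent: Optional[ToyAgent] = None) -> None:
        self.agent = agent


def collect_agents(tasks: List[ToyTask]) -> List[ToyAgent]:
    agents: List[ToyAgent] = []
    seen_roles: set = set()
    for task in tasks:
        agent = getattr(task, "agent", None)  # one lookup; also handles agent=None
        if agent is not None and agent.role not in seen_roles:
            seen_roles.add(agent.role)
            agents.append(agent)
    return agents


print([a.role for a in collect_agents([ToyTask(ToyAgent("Writer")), ToyTask(None)])])  # ['Writer']
```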

View File

@@ -6,7 +6,7 @@ import uuid
 from concurrent.futures import Future
 from copy import copy
 from hashlib import md5
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union

 from opentelemetry.trace import Span
 from pydantic import (
@@ -108,6 +108,7 @@ class Task(BaseModel):
description="A converter class used to export structured output", description="A converter class used to export structured output",
default=None, default=None,
) )
processed_by_agents: Set[str] = Field(default_factory=set)
_telemetry: Telemetry = PrivateAttr(default_factory=Telemetry) _telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
_execution_span: Optional[Span] = PrivateAttr(default=None) _execution_span: Optional[Span] = PrivateAttr(default=None)
@@ -241,6 +242,8 @@ class Task(BaseModel):
         self.prompt_context = context
         tools = tools or self.tools or []

+        self.processed_by_agents.add(agent.role)
+
         result = agent.execute_task(
             task=self,
             context=context,
@@ -273,9 +276,7 @@ class Task(BaseModel):
         content = (
             json_output
             if json_output
-            else pydantic_output.model_dump_json()
-            if pydantic_output
-            else result
+            else pydantic_output.model_dump_json() if pydantic_output else result
         )
         self._save_file(content)
@@ -310,8 +311,10 @@ class Task(BaseModel):
"""Increment the tools errors counter.""" """Increment the tools errors counter."""
self.tools_errors += 1 self.tools_errors += 1
def increment_delegations(self) -> None: def increment_delegations(self, agent_name: Optional[str]) -> None:
"""Increment the delegations counter.""" """Increment the delegations counter."""
if agent_name:
self.processed_by_agents.add(agent_name)
self.delegations += 1 self.delegations += 1
def copy(self, agents: List["BaseAgent"]) -> "Task": def copy(self, agents: List["BaseAgent"]) -> "Task":
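
Taken together, these hunks add a processed_by_agents set that records the executing agent's role and any delegation targets, which the evaluator later prints in its Agents column. A simplified, standalone sketch of that bookkeeping (the record_execution helper is illustrative, not the real Task API):

```python
# Standalone sketch, not the Task class itself: roles are recorded at execution time
# and delegation targets are recorded when delegations are counted.
from typing import Optional, Set

from pydantic import BaseModel, Field


class MiniTask(BaseModel):
    delegations: int = 0
    processed_by_agents: Set[str] = Field(default_factory=set)

    def record_execution(self, agent_role: str) -> None:
        # Stand-in for what the real Task does just before agent.execute_task().
        self.processed_by_agents.add(agent_role)

    def increment_delegations(self, agent_name: Optional[str]) -> None:
        if agent_name:
            self.processed_by_agents.add(agent_name)
        self.delegations += 1


task = MiniTask()
task.record_execution("AI LLMs Senior Researcher")
task.increment_delegations("AI LLMs Reporting Analyst")
print(task.processed_by_agents)  # both roles end up in the evaluator's Agents column
```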

View File

@@ -1,8 +1,8 @@
 from typing import Any, Dict, Optional

+from pydantic import BaseModel, Field
 from pydantic import BaseModel as PydanticBaseModel
 from pydantic import Field as PydanticField
-from pydantic.v1 import BaseModel, Field


 class ToolCalling(BaseModel):
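
ToolCalling now builds on pydantic v2 directly instead of the pydantic.v1 compatibility layer. A minimal sketch of such a model, assuming the tool_name, arguments, and log fields that tool_usage.py passes elsewhere in this diff; the real model may declare more than this.

```python
# Hypothetical sketch of a pydantic v2 ToolCalling-style model (field set assumed
# from the arguments used in tool_usage.py in this diff).
from typing import Any, Dict, Optional

from pydantic import BaseModel, Field


class ToolCallingSketch(BaseModel):
    tool_name: str = Field(..., description="Name of the tool to call")
    arguments: Optional[Dict[str, Any]] = Field(None, description="Arguments passed to the tool")
    log: Optional[str] = Field(None, description="Raw tool-call string emitted by the LLM")


call = ToolCallingSketch(tool_name="Delegate work to coworker", arguments={"coworker": "Researcher"})
print(call.model_dump())  # v2 API; the old pydantic.v1 base exposed .dict() instead
```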

View File

@@ -5,7 +5,7 @@ import regex
 from langchain.output_parsers import PydanticOutputParser
 from langchain_core.exceptions import OutputParserException
 from langchain_core.outputs import Generation
-from langchain_core.pydantic_v1 import ValidationError
+from pydantic import ValidationError


 class ToolOutputParser(PydanticOutputParser):

View File

@@ -8,6 +8,7 @@ from langchain_core.tools import BaseTool
 from langchain_openai import ChatOpenAI

 from crewai.agents.tools_handler import ToolsHandler
+from crewai.task import Task
 from crewai.telemetry import Telemetry
 from crewai.tools.tool_calling import InstructorToolCalling, ToolCalling
 from crewai.utilities import I18N, Converter, ConverterError, Printer
@@ -51,7 +52,7 @@ class ToolUsage:
         original_tools: List[Any],
         tools_description: str,
         tools_names: str,
-        task: Any,
+        task: Task,
         function_calling_llm: Any,
         agent: Any,
         action: Any,
@@ -154,7 +155,10 @@ class ToolUsage:
"Delegate work to coworker", "Delegate work to coworker",
"Ask question to coworker", "Ask question to coworker",
]: ]:
self.task.increment_delegations() coworker = (
calling.arguments.get("coworker") if calling.arguments else None
)
self.task.increment_delegations(coworker)
if calling.arguments: if calling.arguments:
try: try:
@@ -241,7 +245,7 @@ class ToolUsage:
             result = self._remember_format(result=result)  # type: ignore # "_remember_format" of "ToolUsage" does not return a value (it only ever returns None)
         return result

-    def _should_remember_format(self) -> None:
+    def _should_remember_format(self) -> bool:
         return self.task.used_tools % self._remember_format_after_usages == 0

     def _remember_format(self, result: str) -> None:
@@ -353,10 +357,10 @@ class ToolUsage:
                 return ToolUsageErrorException(  # type: ignore # Incompatible return value type (got "ToolUsageErrorException", expected "ToolCalling | InstructorToolCalling")
                     f'{self._i18n.errors("tool_arguments_error")}'
                 )
-            calling = ToolCalling(  # type: ignore # Unexpected keyword argument "log" for "ToolCalling"
+            calling = ToolCalling(
                 tool_name=tool.name,
                 arguments=arguments,
-                log=tool_string,
+                log=tool_string,  # type: ignore
             )
         except Exception as e:
             self._run_attempts += 1
@@ -404,19 +408,19 @@ class ToolUsage:
'"' + value.replace('"', '\\"') + '"' '"' + value.replace('"', '\\"') + '"'
) # Re-encapsulate with double quotes ) # Re-encapsulate with double quotes
elif value.isdigit(): # Check if value is a digit, hence integer elif value.isdigit(): # Check if value is a digit, hence integer
formatted_value = value value = value
elif value.lower() in [ elif value.lower() in [
"true", "true",
"false", "false",
"null", "null",
]: # Check for boolean and null values ]: # Check for boolean and null values
formatted_value = value.lower() value = value.lower()
else: else:
# Assume the value is a string and needs quotes # Assume the value is a string and needs quotes
formatted_value = '"' + value.replace('"', '\\"') + '"' value = '"' + value.replace('"', '\\"') + '"'
# Rebuild the entry with proper quoting # Rebuild the entry with proper quoting
formatted_entry = f'"{key}": {formatted_value}' formatted_entry = f'"{key}": {value}'
formatted_entries.append(formatted_entry) formatted_entries.append(formatted_entry)
# Reconstruct the JSON string # Reconstruct the JSON string
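
The last hunk renames formatted_value to value while keeping the quoting rules used to repair malformed tool-argument strings: digits stay bare, booleans and null are lowercased, everything else is wrapped in escaped double quotes. A standalone sketch of those rules (not the ToolUsage method itself):

```python
# Toy version of the value-quoting rule applied when rebuilding a JSON-like
# tool-arguments string entry by entry.
import json


def format_entry(key: str, value: str) -> str:
    if value.isdigit():                                   # integers pass through unquoted
        formatted = value
    elif value.lower() in ["true", "false", "null"]:      # JSON literals, lowercased
        formatted = value.lower()
    else:                                                 # treat anything else as a string
        formatted = '"' + value.replace('"', '\\"') + '"'
    return f'"{key}": {formatted}'


entries = [
    format_entry("coworker", "Researcher"),
    format_entry("attempts", "3"),
    format_entry("final", "True"),
]
repaired = "{" + ", ".join(entries) + "}"
print(json.loads(repaired))  # {'coworker': 'Researcher', 'attempts': 3, 'final': True}
```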

View File

@@ -23,17 +23,16 @@ def process_config(
     # Copy values from config (originally from YAML) to the model's attributes.
     # Only copy if the attribute isn't already set, preserving any explicitly defined values.
     for key, value in config.items():
-        if key not in model_class.model_fields:
-            continue
-        if values.get(key) is not None:
+        if key not in model_class.model_fields or values.get(key) is not None:
             continue
-        if isinstance(value, (str, int, float, bool, list)):
-            values[key] = value
-        elif isinstance(value, dict):
+
+        if isinstance(value, dict):
             if isinstance(values.get(key), dict):
                 values[key].update(value)
             else:
                 values[key] = value
+        else:
+            values[key] = value

     # Remove the config from values to avoid duplicate processing
     values.pop("config", None)
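
After the refactor, a config key is copied only when the model declares it and the caller has not already set it; dict values either update an existing dict or are assigned outright, and everything else is assigned directly. An illustrative sketch with a toy model, not the project's process_config:

```python
# Toy re-implementation of the merge rule shown in the hunk above; the model and
# helper names are examples only.
from typing import Any, Dict, Optional, Type

from pydantic import BaseModel


class ExampleModel(BaseModel):
    role: str = ""
    metadata: Optional[Dict[str, Any]] = None


def apply_config(values: Dict[str, Any], config: Dict[str, Any], model_class: Type[BaseModel]) -> Dict[str, Any]:
    for key, value in config.items():
        if key not in model_class.model_fields or values.get(key) is not None:
            continue  # unknown keys and already-set fields are left alone
        if isinstance(value, dict):
            if isinstance(values.get(key), dict):
                values[key].update(value)  # merge into an existing dict
            else:
                values[key] = value
        else:
            values[key] = value
    return values


values = {"role": "Researcher", "metadata": None}
config = {"role": "Writer", "metadata": {"priority": "high"}, "unknown": 1}
print(apply_config(values, config, ExampleModel))
# {'role': 'Researcher', 'metadata': {'priority': 'high'}}: role is preserved,
# metadata is filled from config, and the undeclared "unknown" key is ignored.
```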

View File

@@ -5,8 +5,7 @@ import regex
 from langchain.output_parsers import PydanticOutputParser
 from langchain_core.exceptions import OutputParserException
 from langchain_core.outputs import Generation
-from langchain_core.pydantic_v1 import ValidationError
-from pydantic import BaseModel
+from pydantic import BaseModel, ValidationError


 class CrewPydanticOutputParser(PydanticOutputParser):
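
Both parser hunks swap langchain_core.pydantic_v1 for plain pydantic. A small sketch of catching the v2 ValidationError while validating model output; the schema and payload here are illustrative, not the parsers' actual models.

```python
# Hypothetical example of the pydantic v2 error handling the updated parsers rely on.
import json

from pydantic import BaseModel, ValidationError


class ParsedAnswer(BaseModel):
    answer: str
    confidence: float


def parse(text: str) -> ParsedAnswer:
    try:
        return ParsedAnswer.model_validate(json.loads(text))
    except ValidationError as exc:  # same exception type the updated imports provide
        raise ValueError(f"Output did not match the expected schema: {exc}") from exc


print(parse('{"answer": "42", "confidence": 0.9}'))
```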

View File

@@ -1,14 +1,14 @@
 from collections import defaultdict

-from langchain_openai import ChatOpenAI
-from pydantic import BaseModel, Field
-from rich.console import Console
-from rich.table import Table
-
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field
+from rich.box import HEAVY_EDGE
+from rich.console import Console
+from rich.table import Table


 class TaskEvaluationPydanticOutput(BaseModel):
@@ -77,50 +77,72 @@ class CrewEvaluator:
     def print_crew_evaluation_result(self) -> None:
         """
         Prints the evaluation result of the crew in a table.

-        A Crew with 2 tasks using the command crewai test -n 2
+        A Crew with 2 tasks using the command crewai test -n 3
         will output the following table:

-                        Task Scores
+                       Tasks Scores
                   (1-10 Higher is better)
-        ┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
-        ┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
-        ┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
-        │ Task 1     │ 10.0  │ 9.0   │ 9.5        │
-        │ Task 2     │ 9.0   │ 9.0   │ 9.0        │
-        │ Crew       │ 9.5   │ 9.0   │ 9.2        │
-        └────────────┴───────┴───────┴────────────┘
+        ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ Tasks/Crew/Agents  ┃ Run 1 ┃ Run 2 ┃ Run 3 ┃ Avg. Total ┃ Agents                       ┃
+        ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+        │ Task 1             │ 9.0   │ 10.0  │ 9.0   │ 9.3        │ - AI LLMs Senior Researcher  │
+        │                    │       │       │       │            │ - AI LLMs Reporting Analyst  │
+        │                    │       │       │       │            │                              │
+        │ Task 2             │ 9.0   │ 9.0   │ 9.0   │ 9.0        │ - AI LLMs Senior Researcher  │
+        │                    │       │       │       │            │ - AI LLMs Reporting Analyst  │
+        │                    │       │       │       │            │                              │
+        │ Crew               │ 9.0   │ 9.5   │ 9.0   │ 9.2        │                              │
+        │ Execution Time (s) │ 42    │ 79    │ 52    │ 57         │                              │
+        └────────────────────┴───────┴───────┴───────┴────────────┴──────────────────────────────┘
         """
         task_averages = [
             sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
         ]
         crew_average = sum(task_averages) / len(task_averages)

-        # Create a table
-        table = Table(title="Tasks Scores \n (1-10 Higher is better)")
-
-        # Add columns for the table
-        table.add_column("Tasks/Crew")
+        table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
+
+        table.add_column("Tasks/Crew/Agents", style="cyan")
         for run in range(1, len(self.tasks_scores) + 1):
-            table.add_column(f"Run {run}")
-        table.add_column("Avg. Total")
-
-        # Add rows for each task
-        for task_index in range(len(task_averages)):
+            table.add_column(f"Run {run}", justify="center")
+        table.add_column("Avg. Total", justify="center")
+        table.add_column("Agents", style="green")
+
+        for task_index, task in enumerate(self.crew.tasks):
             task_scores = [
                 self.tasks_scores[run][task_index]
                 for run in range(1, len(self.tasks_scores) + 1)
             ]
             avg_score = task_averages[task_index]
+            agents = list(task.processed_by_agents)
+
+            # Add the task row with the first agent
             table.add_row(
-                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
+                f"Task {task_index + 1}",
+                *[f"{score:.1f}" for score in task_scores],
+                f"{avg_score:.1f}",
+                f"- {agents[0]}" if agents else "",
             )

-        # Add a row for the crew average
+            # Add rows for additional agents
+            for agent in agents[1:]:
+                table.add_row("", "", "", "", "", f"- {agent}")
+
+            # Add a blank separator row if it's not the last task
+            if task_index < len(self.crew.tasks) - 1:
+                table.add_row("", "", "", "", "", "")
+
+        # Add Crew and Execution Time rows
         crew_scores = [
             sum(self.tasks_scores[run]) / len(self.tasks_scores[run])
             for run in range(1, len(self.tasks_scores) + 1)
         ]
-        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")
+        table.add_row(
+            "Crew",
+            *[f"{score:.2f}" for score in crew_scores],
+            f"{crew_average:.1f}",
+            "",
+        )

         run_exec_times = [
             int(sum(tasks_exec_times))
@@ -128,11 +150,9 @@ class CrewEvaluator:
         ]
         execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
         table.add_row(
-            "Execution Time (s)",
-            *map(str, run_exec_times),
-            f"{execution_time_avg}",
+            "Execution Time (s)", *map(str, run_exec_times), f"{execution_time_avg}", ""
         )
-        # Display the table in the terminal
+
         console = Console()
         console.print(table)
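
The evaluator now renders a HEAVY_EDGE table with an extra Agents column and centered per-run columns. A short sketch of that rich layout, using made-up scores rather than real evaluation output:

```python
# Standalone illustration of the table structure printed by the updated evaluator.
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table

table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
table.add_column("Tasks/Crew/Agents", style="cyan")
for run in range(1, 4):
    table.add_column(f"Run {run}", justify="center")
table.add_column("Avg. Total", justify="center")
table.add_column("Agents", style="green")

# Example rows only; the real values come from tasks_scores and run_execution_times.
table.add_row("Task 1", "9.0", "10.0", "9.0", "9.3", "- AI LLMs Senior Researcher")
table.add_row("", "", "", "", "", "- AI LLMs Reporting Analyst")
table.add_row("Crew", "9.00", "9.50", "9.00", "9.2", "")
table.add_row("Execution Time (s)", "42", "79", "52", "57", "")

Console().print(table)
```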

View File

@@ -1,7 +1,6 @@
 from unittest import mock

 import pytest

 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.task import Task
@@ -80,6 +79,7 @@ class TestCrewEvaluator:
     @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
     @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
     def test_print_crew_evaluation_result(self, table, console, crew_planner):
+        # Set up task scores and execution times
         crew_planner.tasks_scores = {
             1: [10, 9, 8],
             2: [9, 8, 7],
@@ -89,22 +89,45 @@ class TestCrewEvaluator:
             2: [55, 33, 67],
         }

+        # Mock agents and assign them to tasks
+        crew_planner.crew.agents = [
+            mock.Mock(role="Agent 1"),
+            mock.Mock(role="Agent 2"),
+        ]
+        crew_planner.crew.tasks = [
+            mock.Mock(
+                agent=crew_planner.crew.agents[0], processed_by_agents=["Agent 1"]
+            ),
+            mock.Mock(
+                agent=crew_planner.crew.agents[1], processed_by_agents=["Agent 2"]
+            ),
+        ]
+
+        # Run the method
         crew_planner.print_crew_evaluation_result()

+        # Verify that the table is created with the appropriate structure and rows
         table.assert_has_calls(
             [
-                mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
-                mock.call().add_column("Tasks/Crew"),
-                mock.call().add_column("Run 1"),
-                mock.call().add_column("Run 2"),
-                mock.call().add_column("Avg. Total"),
-                mock.call().add_row("Task 1", "10", "9", "9.5"),
-                mock.call().add_row("Task 2", "9", "8", "8.5"),
-                mock.call().add_row("Task 3", "8", "7", "7.5"),
-                mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
-                mock.call().add_row("Execution Time (s)", "135", "155", "145"),
+                mock.call(
+                    title="Tasks Scores \n (1-10 Higher is better)", box=mock.ANY
+                ),  # Title and styling
+                mock.call().add_column("Tasks/Crew/Agents", style="cyan"),  # Columns
+                mock.call().add_column("Run 1", justify="center"),
+                mock.call().add_column("Run 2", justify="center"),
+                mock.call().add_column("Avg. Total", justify="center"),
+                mock.call().add_column("Agents", style="green"),
+                # Verify rows for tasks with agents
+                mock.call().add_row("Task 1", "10.0", "9.0", "9.5", "- Agent 1"),
+                mock.call().add_row("", "", "", "", "", ""),  # Blank row between tasks
+                mock.call().add_row("Task 2", "9.0", "8.0", "8.5", "- Agent 2"),
+                # Add crew averages and execution times
+                mock.call().add_row("Crew", "9.00", "8.00", "8.5", ""),
+                mock.call().add_row("Execution Time (s)", "135", "155", "145", ""),
             ]
         )
+
+        # Ensure the console prints the table
         console.assert_has_calls([mock.call(), mock.call().print(table())])

     def test_evaluate(self, crew_planner):
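
The rewritten test leans on two unittest.mock features: assert_has_calls matches an ordered subsequence of the recorded calls, and mock.ANY accepts any value for arguments the test does not pin down (the box style above). A self-contained sketch of that assertion style, not tied to CrewAI:

```python
# Minimal illustration of assert_has_calls plus mock.ANY; build_report stands in for
# any code under test that constructs and fills a table-like object.
from unittest import mock


def build_report(table_cls) -> None:
    table = table_cls(title="Scores", box=None)
    table.add_column("Task", style="cyan")
    table.add_row("Task 1", "9.5")


table_mock = mock.MagicMock()
build_report(table_cls=table_mock)

table_mock.assert_has_calls(
    [
        mock.call(title="Scores", box=mock.ANY),          # accept any box styling
        mock.call().add_column("Task", style="cyan"),
        mock.call().add_row("Task 1", "9.5"),
    ]
)
```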