feat: add prompt observability code (#2027)

* feat: add prompt observability code * feat: improve logic for llm call * feat: add tests for traces * feat: remove unused improt * feat: add function to clear and add task traces * feat: fix import * feat: chagne time * feat: fix type checking issues * feat: add fixed time to fix test * feat: fix datetime test issue * feat: add add task traces function * feat: add same logic as entp * feat: add start_time as reference for duplication of tool call * feat: add max_depth * feat: add protocols file to properly import on LLM --------- Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com>
2026-01-10 08:38:30 +00:00 · 2025-02-19 08:52:30 -03:00
parent 1cb5f57864
commit 90f1bee602
12 changed files with 1254 additions and 48 deletions
--- a/src/crewai/llm.py
+++ b/src/crewai/llm.py
@@ -1,3 +1,4 @@
+import inspect
 import json
 import logging
 import os
@@ -5,7 +6,17 @@ import sys
 import threading
 import warnings
 from contextlib import contextmanager
-from typing import Any, Dict, List, Literal, Optional, Type, Union, cast
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)

 from dotenv import load_dotenv
 from pydantic import BaseModel
@@ -18,9 +29,11 @@ with warnings.catch_warnings():
    from litellm.utils import supports_response_schema


+from crewai.traces.unified_trace_controller import trace_llm_call
 from crewai.utilities.exceptions.context_window_exceeding_exception import (
    LLMContextLengthExceededException,
 )
+from crewai.utilities.protocols import AgentExecutorProtocol

 load_dotenv()

@@ -164,6 +177,7 @@ class LLM:
        self.context_window_size = 0
        self.reasoning_effort = reasoning_effort
        self.additional_params = kwargs
+        self._message_history: List[Dict[str, str]] = []
        self.is_anthropic = self._is_anthropic_model(model)

        litellm.drop_params = True
@@ -179,16 +193,22 @@ class LLM:
        self.set_callbacks(callbacks)
        self.set_env_callbacks()

+    @trace_llm_call
+    def _call_llm(self, params: Dict[str, Any]) -> Any:
+        with suppress_warnings():
+            response = litellm.completion(**params)
+            return response
+
    def _is_anthropic_model(self, model: str) -> bool:
        """Determine if the model is from Anthropic provider.
-        
+
        Args:
            model: The model identifier string.
-            
+
        Returns:
            bool: True if the model is from Anthropic, False otherwise.
        """
-        ANTHROPIC_PREFIXES = ('anthropic/', 'claude-', 'claude/')
+        ANTHROPIC_PREFIXES = ("anthropic/", "claude-", "claude/")
        return any(prefix in model.lower() for prefix in ANTHROPIC_PREFIXES)

    def call(
@@ -199,7 +219,7 @@ class LLM:
        available_functions: Optional[Dict[str, Any]] = None,
    ) -> Union[str, Any]:
        """High-level LLM call method.
-        
+
        Args:
            messages: Input messages for the LLM.
                     Can be a string or list of message dictionaries.
@@ -211,22 +231,22 @@ class LLM:
                      during and after the LLM call.
            available_functions: Optional dict mapping function names to callables
                               that can be invoked by the LLM.
-        
+
        Returns:
            Union[str, Any]: Either a text response from the LLM (str) or
                           the result of a tool function call (Any).
-        
+
        Raises:
            TypeError: If messages format is invalid
            ValueError: If response format is not supported
            LLMContextLengthExceededException: If input exceeds model's context limit
-        
+
        Examples:
            # Example 1: Simple string input
            >>> response = llm.call("Return the name of a random city.")
            >>> print(response)
            "Paris"
-            
+
            # Example 2: Message list with system and user messages
            >>> messages = [
            ...     {"role": "system", "content": "You are a geography expert"},
@@ -288,7 +308,7 @@ class LLM:
                params = {k: v for k, v in params.items() if v is not None}

                # --- 2) Make the completion call
-                response = litellm.completion(**params)
+                response = self._call_llm(params)
                response_message = cast(Choices, cast(ModelResponse, response).choices)[
                    0
                ].message
@@ -348,36 +368,40 @@ class LLM:
                    logging.error(f"LiteLLM call failed: {str(e)}")
                raise

-    def _format_messages_for_provider(self, messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
+    def _format_messages_for_provider(
+        self, messages: List[Dict[str, str]]
+    ) -> List[Dict[str, str]]:
        """Format messages according to provider requirements.
-        
+
        Args:
            messages: List of message dictionaries with 'role' and 'content' keys.
                     Can be empty or None.
-        
+
        Returns:
            List of formatted messages according to provider requirements.
            For Anthropic models, ensures first message has 'user' role.
-        
+
        Raises:
            TypeError: If messages is None or contains invalid message format.
        """
        if messages is None:
            raise TypeError("Messages cannot be None")
-            
+
        # Validate message format first
        for msg in messages:
            if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
-                raise TypeError("Invalid message format. Each message must be a dict with 'role' and 'content' keys")
-            
+                raise TypeError(
+                    "Invalid message format. Each message must be a dict with 'role' and 'content' keys"
+                )
+
        if not self.is_anthropic:
            return messages
-            
+
        # Anthropic requires messages to start with 'user' role
        if not messages or messages[0]["role"] == "system":
            # If first message is system or empty, add a placeholder user message
            return [{"role": "user", "content": "."}, *messages]
-                
+
        return messages

    def _get_custom_llm_provider(self) -> str:
@@ -495,3 +519,95 @@ class LLM:

                litellm.success_callback = success_callbacks
                litellm.failure_callback = failure_callbacks
+
+    def _get_execution_context(self) -> Tuple[Optional[Any], Optional[Any]]:
+        """Get the agent and task from the execution context.
+
+        Returns:
+            tuple: (agent, task) from any AgentExecutor context, or (None, None) if not found
+        """
+        frame = inspect.currentframe()
+        caller_frame = frame.f_back if frame else None
+        agent = None
+        task = None
+
+        # Add a maximum depth to prevent infinite loops
+        max_depth = 100  # Reasonable limit for call stack depth
+        current_depth = 0
+
+        while caller_frame and current_depth < max_depth:
+            if "self" in caller_frame.f_locals:
+                caller_self = caller_frame.f_locals["self"]
+                if isinstance(caller_self, AgentExecutorProtocol):
+                    agent = caller_self.agent
+                    task = caller_self.task
+                    break
+            caller_frame = caller_frame.f_back
+            current_depth += 1
+
+        return agent, task
+
+    def _get_new_messages(self, messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
+        """Get only the new messages that haven't been processed before."""
+        if not hasattr(self, "_message_history"):
+            self._message_history = []
+
+        new_messages = []
+        for message in messages:
+            message_key = (message["role"], message["content"])
+            if message_key not in [
+                (m["role"], m["content"]) for m in self._message_history
+            ]:
+                new_messages.append(message)
+                self._message_history.append(message)
+        return new_messages
+
+    def _get_new_tool_results(self, agent) -> List[Dict]:
+        """Get only the new tool results that haven't been processed before."""
+        if not agent or not agent.tools_results:
+            return []
+
+        if not hasattr(self, "_tool_results_history"):
+            self._tool_results_history: List[Dict] = []
+
+        new_tool_results = []
+
+        for result in agent.tools_results:
+            # Process tool arguments to extract actual values
+            processed_args = {}
+            if isinstance(result["tool_args"], dict):
+                for key, value in result["tool_args"].items():
+                    if isinstance(value, dict) and "type" in value:
+                        # Skip metadata and just store the actual value
+                        continue
+                    processed_args[key] = value
+
+            # Create a clean result with processed arguments
+            clean_result = {
+                "tool_name": result["tool_name"],
+                "tool_args": processed_args,
+                "result": result["result"],
+                "content": result.get("content", ""),
+                "start_time": result.get("start_time", ""),
+            }
+
+            # Check if this exact tool execution exists in history
+            is_duplicate = False
+            for history_result in self._tool_results_history:
+                if (
+                    clean_result["tool_name"] == history_result["tool_name"]
+                    and str(clean_result["tool_args"])
+                    == str(history_result["tool_args"])
+                    and str(clean_result["result"]) == str(history_result["result"])
+                    and clean_result["content"] == history_result.get("content", "")
+                    and clean_result["start_time"]
+                    == history_result.get("start_time", "")
+                ):
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate:
+                new_tool_results.append(clean_result)
+                self._tool_results_history.append(clean_result)
+
+        return new_tool_results