feat: add prompt observability code (#2027)

* feat: add prompt observability code

* feat: improve logic for llm call

* feat: add tests for traces

* feat: remove unused improt

* feat: add function to clear and add task traces

* feat: fix import

* feat:  chagne time

* feat: fix type checking issues

* feat: add fixed time to fix test

* feat: fix datetime test issue

* feat: add add task traces function

* feat: add same logic as entp

* feat: add start_time as reference for duplication of tool call

* feat: add max_depth

* feat: add protocols file to properly import on LLM

---------

Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com>
This commit is contained in:
Eduardo Chiarotti
2025-02-19 08:52:30 -03:00
committed by GitHub
parent 1cb5f57864
commit 90f1bee602
12 changed files with 1254 additions and 48 deletions

View File

@@ -1,3 +1,4 @@
import inspect
import json
import logging
import os
@@ -5,7 +6,17 @@ import sys
import threading
import warnings
from contextlib import contextmanager
from typing import Any, Dict, List, Literal, Optional, Type, Union, cast
from typing import (
Any,
Dict,
List,
Literal,
Optional,
Tuple,
Type,
Union,
cast,
)
from dotenv import load_dotenv
from pydantic import BaseModel
@@ -18,9 +29,11 @@ with warnings.catch_warnings():
from litellm.utils import supports_response_schema
from crewai.traces.unified_trace_controller import trace_llm_call
from crewai.utilities.exceptions.context_window_exceeding_exception import (
LLMContextLengthExceededException,
)
from crewai.utilities.protocols import AgentExecutorProtocol
load_dotenv()
@@ -164,6 +177,7 @@ class LLM:
self.context_window_size = 0
self.reasoning_effort = reasoning_effort
self.additional_params = kwargs
self._message_history: List[Dict[str, str]] = []
self.is_anthropic = self._is_anthropic_model(model)
litellm.drop_params = True
@@ -179,16 +193,22 @@ class LLM:
self.set_callbacks(callbacks)
self.set_env_callbacks()
@trace_llm_call
def _call_llm(self, params: Dict[str, Any]) -> Any:
with suppress_warnings():
response = litellm.completion(**params)
return response
def _is_anthropic_model(self, model: str) -> bool:
"""Determine if the model is from Anthropic provider.
Args:
model: The model identifier string.
Returns:
bool: True if the model is from Anthropic, False otherwise.
"""
ANTHROPIC_PREFIXES = ('anthropic/', 'claude-', 'claude/')
ANTHROPIC_PREFIXES = ("anthropic/", "claude-", "claude/")
return any(prefix in model.lower() for prefix in ANTHROPIC_PREFIXES)
def call(
@@ -199,7 +219,7 @@ class LLM:
available_functions: Optional[Dict[str, Any]] = None,
) -> Union[str, Any]:
"""High-level LLM call method.
Args:
messages: Input messages for the LLM.
Can be a string or list of message dictionaries.
@@ -211,22 +231,22 @@ class LLM:
during and after the LLM call.
available_functions: Optional dict mapping function names to callables
that can be invoked by the LLM.
Returns:
Union[str, Any]: Either a text response from the LLM (str) or
the result of a tool function call (Any).
Raises:
TypeError: If messages format is invalid
ValueError: If response format is not supported
LLMContextLengthExceededException: If input exceeds model's context limit
Examples:
# Example 1: Simple string input
>>> response = llm.call("Return the name of a random city.")
>>> print(response)
"Paris"
# Example 2: Message list with system and user messages
>>> messages = [
... {"role": "system", "content": "You are a geography expert"},
@@ -288,7 +308,7 @@ class LLM:
params = {k: v for k, v in params.items() if v is not None}
# --- 2) Make the completion call
response = litellm.completion(**params)
response = self._call_llm(params)
response_message = cast(Choices, cast(ModelResponse, response).choices)[
0
].message
@@ -348,36 +368,40 @@ class LLM:
logging.error(f"LiteLLM call failed: {str(e)}")
raise
def _format_messages_for_provider(self, messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
def _format_messages_for_provider(
self, messages: List[Dict[str, str]]
) -> List[Dict[str, str]]:
"""Format messages according to provider requirements.
Args:
messages: List of message dictionaries with 'role' and 'content' keys.
Can be empty or None.
Returns:
List of formatted messages according to provider requirements.
For Anthropic models, ensures first message has 'user' role.
Raises:
TypeError: If messages is None or contains invalid message format.
"""
if messages is None:
raise TypeError("Messages cannot be None")
# Validate message format first
for msg in messages:
if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
raise TypeError("Invalid message format. Each message must be a dict with 'role' and 'content' keys")
raise TypeError(
"Invalid message format. Each message must be a dict with 'role' and 'content' keys"
)
if not self.is_anthropic:
return messages
# Anthropic requires messages to start with 'user' role
if not messages or messages[0]["role"] == "system":
# If first message is system or empty, add a placeholder user message
return [{"role": "user", "content": "."}, *messages]
return messages
def _get_custom_llm_provider(self) -> str:
@@ -495,3 +519,95 @@ class LLM:
litellm.success_callback = success_callbacks
litellm.failure_callback = failure_callbacks
def _get_execution_context(self) -> Tuple[Optional[Any], Optional[Any]]:
"""Get the agent and task from the execution context.
Returns:
tuple: (agent, task) from any AgentExecutor context, or (None, None) if not found
"""
frame = inspect.currentframe()
caller_frame = frame.f_back if frame else None
agent = None
task = None
# Add a maximum depth to prevent infinite loops
max_depth = 100 # Reasonable limit for call stack depth
current_depth = 0
while caller_frame and current_depth < max_depth:
if "self" in caller_frame.f_locals:
caller_self = caller_frame.f_locals["self"]
if isinstance(caller_self, AgentExecutorProtocol):
agent = caller_self.agent
task = caller_self.task
break
caller_frame = caller_frame.f_back
current_depth += 1
return agent, task
def _get_new_messages(self, messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
"""Get only the new messages that haven't been processed before."""
if not hasattr(self, "_message_history"):
self._message_history = []
new_messages = []
for message in messages:
message_key = (message["role"], message["content"])
if message_key not in [
(m["role"], m["content"]) for m in self._message_history
]:
new_messages.append(message)
self._message_history.append(message)
return new_messages
def _get_new_tool_results(self, agent) -> List[Dict]:
"""Get only the new tool results that haven't been processed before."""
if not agent or not agent.tools_results:
return []
if not hasattr(self, "_tool_results_history"):
self._tool_results_history: List[Dict] = []
new_tool_results = []
for result in agent.tools_results:
# Process tool arguments to extract actual values
processed_args = {}
if isinstance(result["tool_args"], dict):
for key, value in result["tool_args"].items():
if isinstance(value, dict) and "type" in value:
# Skip metadata and just store the actual value
continue
processed_args[key] = value
# Create a clean result with processed arguments
clean_result = {
"tool_name": result["tool_name"],
"tool_args": processed_args,
"result": result["result"],
"content": result.get("content", ""),
"start_time": result.get("start_time", ""),
}
# Check if this exact tool execution exists in history
is_duplicate = False
for history_result in self._tool_results_history:
if (
clean_result["tool_name"] == history_result["tool_name"]
and str(clean_result["tool_args"])
== str(history_result["tool_args"])
and str(clean_result["result"]) == str(history_result["result"])
and clean_result["content"] == history_result.get("content", "")
and clean_result["start_time"]
== history_result.get("start_time", "")
):
is_duplicate = True
break
if not is_duplicate:
new_tool_results.append(clean_result)
self._tool_results_history.append(clean_result)
return new_tool_results