Merge pull request #165 from patronus-ai/feat/add-patronus-api-tool

Feat/add patronus api tool
This commit is contained in:
Brandon Hancock (bhancock_ai)
2025-01-07 12:48:08 -05:00
committed by GitHub
6 changed files with 393 additions and 0 deletions


@@ -23,6 +23,9 @@ from .tools import (
MultiOnTool,
MySQLSearchTool,
NL2SQLTool,
PatronusEvalTool,
PatronusLocalEvaluatorTool,
PatronusPredefinedCriteriaEvalTool,
PDFSearchTool,
PGSearchTool,
RagTool,
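
With these exports in place, all three Patronus tools can be imported directly from the package root. A minimal sketch of what that enables (an editor's illustration, not part of the diff; it assumes crewai-tools is installed from this branch and PATRONUS_API_KEY is exported):

from crewai_tools import (
    PatronusEvalTool,
    PatronusLocalEvaluatorTool,
    PatronusPredefinedCriteriaEvalTool,
)

# Pin the evaluator and criteria up front; this pair is taken from the
# example script bundled later in this PR.
predefined_tool = PatronusPredefinedCriteriaEvalTool(
    evaluators=[{"evaluator": "judge", "criteria": "contains-code"}]
)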


@@ -26,6 +26,7 @@ from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
from .multion_tool.multion_tool import MultiOnTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .patronus_eval_tool.eval_tool import PatronusEvalTool
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
from .pg_seach_tool.pg_search_tool import PGSearchTool
from .rag.rag_tool import RagTool


@@ -0,0 +1,59 @@
from crewai import Agent, Crew, Task
from patronus_eval_tool import (
PatronusEvalTool,
)
from patronus_local_evaluator_tool import (
PatronusLocalEvaluatorTool,
)
from patronus_predefined_criteria_eval_tool import (
PatronusPredefinedCriteriaEvalTool,
)
from patronus import Client, EvaluationResult
import random
# Test the PatronusLocalEvaluatorTool where agent uses the local evaluator
client = Client()
# Example of an evaluator that returns a random pass/fail result
@client.register_local_evaluator("random_evaluator")
def random_evaluator(**kwargs):
score = random.random()
return EvaluationResult(
score_raw=score,
pass_=score >= 0.5,
explanation="example explanation" # Optional justification for LLM judges
)
# 1. Uses PatronusEvalTool: agent can pick the best evaluator and criteria
# patronus_eval_tool = PatronusEvalTool()
# 2. Uses PatronusPredefinedCriteriaEvalTool: agent uses the defined evaluator and criteria
# patronus_eval_tool = PatronusPredefinedCriteriaEvalTool(
# evaluators=[{"evaluator": "judge", "criteria": "contains-code"}]
# )
# 3. Uses PatronusLocalEvaluatorTool: agent uses user defined evaluator
patronus_eval_tool = PatronusLocalEvaluatorTool(
patronus_client=client, evaluator="random_evaluator", evaluated_model_gold_answer="example label"
)
# Create a new agent
coding_agent = Agent(
role="Coding Agent",
goal="Generate high quality code and verify that the output is code by using Patronus AI's evaluation tool.",
backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.",
tools=[patronus_eval_tool],
verbose=True,
)
# Define tasks
generate_code = Task(
description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Select the most appropriate evaluator and criteria for evaluating your output.",
expected_output="Program that generates the first N numbers in the Fibonacci sequence.",
agent=coding_agent,
)
crew = Crew(agents=[coding_agent], tasks=[generate_code])
crew.kickoff()
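
The script assumes the patronus SDK is installed and that PATRONUS_API_KEY is set, since the tools in this PR read it via os.environ/os.getenv and the Patronus Client is typically configured the same way. A small pre-flight guard (an editor's sketch, not part of the PR) makes that failure mode explicit:

import os

# Fail fast if the key is missing; the Patronus client and the tools' HTTP
# calls depend on this environment variable.
if not os.getenv("PATRONUS_API_KEY"):
    raise RuntimeError("Set PATRONUS_API_KEY before running this example.")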


@@ -0,0 +1,139 @@
import os
import json
import requests
import warnings
from typing import Any, List, Dict, Optional
from crewai.tools import BaseTool
class PatronusEvalTool(BaseTool):
name: str = "Patronus Evaluation Tool"
evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
evaluators: List[Dict[str, str]] = []
criteria: List[Dict[str, str]] = []
description: str = ""
def __init__(self, **kwargs: Any):
super().__init__(**kwargs)
temp_evaluators, temp_criteria = self._init_run()
self.evaluators = temp_evaluators
self.criteria = temp_criteria
self.description = self._generate_description()
warnings.warn("You are allowing the agent to select the best evaluator and criteria when you use the `PatronusEvalTool`. If this is not intended then please use `PatronusPredefinedCriteriaEvalTool` instead.")
def _init_run(self):
evaluators_set = json.loads(
requests.get(
"https://api.patronus.ai/v1/evaluators",
headers={
"accept": "application/json",
"X-API-KEY": os.environ["PATRONUS_API_KEY"],
},
).text
)["evaluators"]
ids, evaluators = set(), []
for ev in evaluators_set:
if not ev["deprecated"] and ev["id"] not in ids:
evaluators.append(
{
"id": ev["id"],
"name": ev["name"],
"description": ev["description"],
"aliases": ev["aliases"],
}
)
ids.add(ev["id"])
criteria_set = json.loads(
requests.get(
"https://api.patronus.ai/v1/evaluator-criteria",
headers={
"accept": "application/json",
"X-API-KEY": os.environ["PATRONUS_API_KEY"],
},
).text
)["evaluator_criteria"]
criteria = []
for cr in criteria_set:
if cr["config"].get("pass_criteria", None):
if cr["config"].get("rubric", None):
criteria.append(
{
"evaluator": cr["evaluator_family"],
"name": cr["name"],
"pass_criteria": cr["config"]["pass_criteria"],
"rubric": cr["config"]["rubric"],
}
)
else:
criteria.append(
{
"evaluator": cr["evaluator_family"],
"name": cr["name"],
"pass_criteria": cr["config"]["pass_criteria"],
}
)
elif cr["description"]:
criteria.append(
{
"evaluator": cr["evaluator_family"],
"name": cr["name"],
"description": cr["description"],
}
)
return evaluators, criteria
def _generate_description(self) -> str:
criteria = "\n".join([json.dumps(i) for i in self.criteria])
return f"""This tool calls the Patronus Evaluation API that takes the following arguments:
1. evaluated_model_input: str: The agent's task description in simple text
2. evaluated_model_output: str: The agent's output of the task
3. evaluated_model_retrieved_context: str: The agent's context
4. evaluators: This is a list of dictionaries containing one of the following evaluators and the corresponding criteria. An example input for this field: [{{"evaluator": "Judge", "criteria": "patronus:is-code"}}]
Evaluators:
{criteria}
You must ONLY choose the most appropriate evaluator and criteria based on the "pass_criteria" or "description" fields for your evaluation task and nothing from outside of the options present."""
def _run(
self,
evaluated_model_input: Optional[str],
evaluated_model_output: Optional[str],
evaluated_model_retrieved_context: Optional[str],
evaluators: List[Dict[str, str]],
) -> Any:
        # Normalize evaluator entries into the format the /v1/evaluate API expects
evals = []
for ev in evaluators:
evals.append(
{
"evaluator": ev["evaluator"].lower(),
"criteria": ev["name"] if "name" in ev else ev["criteria"],
}
)
data = {
"evaluated_model_input": evaluated_model_input,
"evaluated_model_output": evaluated_model_output,
"evaluated_model_retrieved_context": evaluated_model_retrieved_context,
"evaluators": evals,
}
headers = {
"X-API-KEY": os.getenv("PATRONUS_API_KEY"),
"accept": "application/json",
"content-type": "application/json",
}
response = requests.post(
self.evaluate_url, headers=headers, data=json.dumps(data)
)
if response.status_code != 200:
raise Exception(
f"Failed to evaluate model input and output. Response status code: {response.status_code}. Reason: {response.text}"
)
return response.json()
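
For reference, the tool can also be exercised outside of an agent. The sketch below is an editor's illustration (it assumes PATRONUS_API_KEY is exported and that the class above is importable); construction fetches the live evaluator and criteria catalogues, and the "judge" / "patronus:is-code" pair comes from the tool's own description:

# __init__ calls /v1/evaluators and /v1/evaluator-criteria to build the
# description the agent sees.
tool = PatronusEvalTool()

result = tool._run(
    evaluated_model_input="Write a function that reverses a string.",
    evaluated_model_output="def reverse(s):\n    return s[::-1]",
    evaluated_model_retrieved_context="",
    evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}],
)
print(result)  # parsed JSON body returned by POST /v1/evaluate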


@@ -0,0 +1,85 @@
from typing import Any, Type
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from patronus import Client
class FixedLocalEvaluatorToolSchema(BaseModel):
evaluated_model_input: str = Field(
..., description="The agent's task description in simple text"
)
evaluated_model_output: str = Field(
..., description="The agent's output of the task"
)
evaluated_model_retrieved_context: str = Field(
..., description="The agent's context"
)
evaluated_model_gold_answer: str = Field(
..., description="The agent's gold answer only if available"
)
evaluator: str = Field(..., description="The registered local evaluator")
class PatronusLocalEvaluatorTool(BaseTool):
name: str = "Patronus Local Evaluator Tool"
evaluator: str = "The registered local evaluator"
evaluated_model_gold_answer: str = "The agent's gold answer"
description: str = (
"This tool is used to evaluate the model input and output using custom function evaluators."
)
client: Any = None
args_schema: Type[BaseModel] = FixedLocalEvaluatorToolSchema
class Config:
arbitrary_types_allowed = True
def __init__(self, patronus_client: Client, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any):
super().__init__(**kwargs)
self.client = patronus_client
if evaluator:
self.evaluator = evaluator
self.evaluated_model_gold_answer = evaluated_model_gold_answer
self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}"
self._generate_description()
print(
f"Updating judge evaluator, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}"
)
def _run(
self,
**kwargs: Any,
) -> Any:
evaluated_model_input = kwargs.get("evaluated_model_input")
evaluated_model_output = kwargs.get("evaluated_model_output")
evaluated_model_retrieved_context = kwargs.get(
"evaluated_model_retrieved_context"
)
evaluated_model_gold_answer = self.evaluated_model_gold_answer
evaluator = self.evaluator
result = self.client.evaluate(
evaluator=evaluator,
evaluated_model_input=(
evaluated_model_input
if isinstance(evaluated_model_input, str)
else evaluated_model_input.get("description")
),
evaluated_model_output=(
evaluated_model_output
if isinstance(evaluated_model_output, str)
else evaluated_model_output.get("description")
),
evaluated_model_retrieved_context=(
evaluated_model_retrieved_context
if isinstance(evaluated_model_retrieved_context, str)
else evaluated_model_retrieved_context.get("description")
),
evaluated_model_gold_answer=(
evaluated_model_gold_answer
if isinstance(evaluated_model_gold_answer, str)
else evaluated_model_gold_answer.get("description")
),
tags={}, # Optional metadata, supports arbitrary kv pairs
)
output = f"Evaluation result: {result.pass_}, Explanation: {result.explanation}"
return output
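
Because the evaluator name and gold answer are pinned at construction time, a caller only supplies the input, output, and context. A hedged sketch of a direct invocation, reusing the client and the "random_evaluator" registered in the example script above:

tool = PatronusLocalEvaluatorTool(
    patronus_client=client,
    evaluator="random_evaluator",
    evaluated_model_gold_answer="example label",
)

# _run reads the pinned evaluator and gold answer from the instance and
# forwards everything to client.evaluate().
print(
    tool._run(
        evaluated_model_input="Generate the first N Fibonacci numbers.",
        evaluated_model_output="def fib(n): ...",
        evaluated_model_retrieved_context="",
    )
)
# e.g. "Evaluation result: True, Explanation: example explanation"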


@@ -0,0 +1,106 @@
import os
import json
import requests
from typing import Any, List, Dict, Type
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class FixedBaseToolSchema(BaseModel):
evaluated_model_input: Dict = Field(
..., description="The agent's task description in simple text"
)
evaluated_model_output: Dict = Field(
..., description="The agent's output of the task"
)
evaluated_model_retrieved_context: Dict = Field(
..., description="The agent's context"
)
evaluated_model_gold_answer: Dict = Field(
..., description="The agent's gold answer only if available"
)
evaluators: List[Dict[str, str]] = Field(
...,
description="List of dictionaries containing the evaluator and criteria to evaluate the model input and output. An example input for this field: [{'evaluator': '[evaluator-from-user]', 'criteria': '[criteria-from-user]'}]",
)
class PatronusPredefinedCriteriaEvalTool(BaseTool):
"""
    PatronusPredefinedCriteriaEvalTool is a tool to automatically evaluate and score agent interactions.
    Results are logged to the Patronus platform at app.patronus.ai.
"""
name: str = "Call Patronus API tool for evaluation of model inputs and outputs"
description: str = (
"""This tool calls the Patronus Evaluation API that takes the following arguments:"""
)
evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
args_schema: Type[BaseModel] = FixedBaseToolSchema
evaluators: List[Dict[str, str]] = []
def __init__(self, evaluators: List[Dict[str, str]], **kwargs: Any):
super().__init__(**kwargs)
if evaluators:
self.evaluators = evaluators
self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluators}"
self._generate_description()
print(f"Updating judge criteria to: {self.evaluators}")
def _run(
self,
**kwargs: Any,
) -> Any:
evaluated_model_input = kwargs.get("evaluated_model_input")
evaluated_model_output = kwargs.get("evaluated_model_output")
evaluated_model_retrieved_context = kwargs.get(
"evaluated_model_retrieved_context"
)
evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer")
evaluators = self.evaluators
headers = {
"X-API-KEY": os.getenv("PATRONUS_API_KEY"),
"accept": "application/json",
"content-type": "application/json",
}
data = {
"evaluated_model_input": (
evaluated_model_input
if isinstance(evaluated_model_input, str)
else evaluated_model_input.get("description")
),
"evaluated_model_output": (
evaluated_model_output
if isinstance(evaluated_model_output, str)
else evaluated_model_output.get("description")
),
"evaluated_model_retrieved_context": (
evaluated_model_retrieved_context
if isinstance(evaluated_model_retrieved_context, str)
else evaluated_model_retrieved_context.get("description")
),
"evaluated_model_gold_answer": (
evaluated_model_gold_answer
if isinstance(evaluated_model_gold_answer, str)
else evaluated_model_gold_answer.get("description")
),
"evaluators": (
evaluators
if isinstance(evaluators, list)
else evaluators.get("description")
),
}
response = requests.post(
self.evaluate_url, headers=headers, data=json.dumps(data)
)
if response.status_code != 200:
raise Exception(
f"Failed to evaluate model input and output. Status code: {response.status_code}. Reason: {response.text}"
)
return response.json()
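
As with the other tools, a direct call shows the request shape. The sketch below is an editor's illustration (it assumes PATRONUS_API_KEY is exported and reuses the judge / contains-code pair from the example script); the evaluators list is fixed at construction and the remaining fields become the /v1/evaluate payload:

tool = PatronusPredefinedCriteriaEvalTool(
    evaluators=[{"evaluator": "judge", "criteria": "contains-code"}]
)

response = tool._run(
    evaluated_model_input="Create a program that prints the first N Fibonacci numbers.",
    evaluated_model_output="def fib(n):\n    a, b = 0, 1\n    ...",
    evaluated_model_retrieved_context="The Fibonacci sequence starts 0, 1, 1, 2, 3, 5, ...",
    evaluated_model_gold_answer="A working Fibonacci generator.",
)
print(response)  # parsed JSON returned by POST /v1/evaluate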