From d94f7e03dce4866ee180d7f4e97e7a2aa51b8373 Mon Sep 17 00:00:00 2001 From: DarshanDeshpande Date: Sat, 14 Dec 2024 15:46:10 -0500 Subject: [PATCH 1/3] Update Patronus AI evaluator tool and example --- .../tools/patronus_eval_tool/example.py | 17 +++----- .../patronus_eval_tool/patronus_eval_tool.py | 39 ++++++++++++------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 99088d17f..4015a5f4a 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -1,34 +1,27 @@ -import os - from crewai import Agent, Crew, Task from patronus_eval_tool import PatronusEvalTool patronus_eval_tool = PatronusEvalTool( - evaluators=[{ - "evaluator": "judge", - "criteria": "patronus:is-code" - }], - tags={} + evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={} ) # Create a new agent coding_agent = Agent( role="Coding Agent", - goal="Generate high quality code. Use the evaluation tool to score the agent outputs", - backstory="Coding agent to generate high quality code. Use the evaluation tool to score the agent outputs", + goal="Generate high quality code and verify that the code is correct by using Patronus AI's evaluation tool to check validity of your output code.", + backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.", tools=[patronus_eval_tool], verbose=True, ) # Define tasks generate_code = Task( - description="Create a simple program to generate the first N numbers in the Fibonacci sequence.", + description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Use the evaluator as `judge` from Patronus AI with the criteria `patronus:is-code` and feed your task input as input and your code as output to verify your code validity.", expected_output="Program that generates the first N numbers in the Fibonacci sequence.", agent=coding_agent, ) - crew = Crew(agents=[coding_agent], tasks=[generate_code]) -crew.kickoff() \ No newline at end of file +crew.kickoff() diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index c0e2b95e0..88ad28253 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -1,45 +1,54 @@ -from typing import Any, Optional, Type, cast, ClassVar - -from crewai.tools import BaseTool -import json import os +import json import requests +from typing import Any, List, Dict +from crewai.tools import BaseTool + class PatronusEvalTool(BaseTool): """ PatronusEvalTool is a tool to automatically evaluate and score agent interactions. - + Results are logged to the Patronus platform at app.patronus.ai """ - name: str = "Call Patronus API tool" + name: str = "Call Patronus API tool for evaluation of model inputs and outputs" description: str = ( - "This tool calls the Patronus Evaluation API. This function returns the response from the API." + """This tool calls the Patronus Evaluation API that takes the following arguments: +1. evaluated_model_input: str: The agent's task description +2. evaluated_model_output: str: The agent's output code +3. evaluators: list[dict[str,str]]: list of dictionaries, each with a an evaluator (such as `judge`) and a criteria (like `patronus:[criteria-name-here]`).""" ) evaluate_url: str = "https://api.patronus.ai/v1/evaluate" - def _run( self, evaluated_model_input: str, evaluated_model_output: str, - evaluators: list, - tags: dict + evaluators: List[Dict[str, str]], + tags: dict, ) -> Any: - + api_key = os.getenv("PATRONUS_API_KEY") headers = { "X-API-KEY": api_key, "accept": "application/json", - "content-type": "application/json" + "content-type": "application/json", } data = { "evaluated_model_input": evaluated_model_input, "evaluated_model_output": evaluated_model_output, "evaluators": evaluators, - "tags": tags + "tags": tags, } - # Make the POST request - response = requests.post(self.evaluate_url, headers=headers, data=json.dumps(data)) \ No newline at end of file + response = requests.post( + self.evaluate_url, headers=headers, data=json.dumps(data) + ) + if response.status_code != 200: + raise Exception( + f"Failed to evaluate model input and output. Reason: {response.text}" + ) + + return response.json() From 0ac6f915fb83c64cdb0b3aa1c5662077caaf66fa Mon Sep 17 00:00:00 2001 From: DarshanDeshpande Date: Thu, 26 Dec 2024 17:37:22 -0500 Subject: [PATCH 2/3] Add all Patronus eval tools and update example --- .../tools/patronus_eval_tool/example.py | 36 ++- .../patronus_eval_tool/patronus_eval_tool.py | 302 +++++++++++++++++- 2 files changed, 314 insertions(+), 24 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 4015a5f4a..1b0ba028d 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -1,15 +1,37 @@ from crewai import Agent, Crew, Task -from patronus_eval_tool import PatronusEvalTool - - -patronus_eval_tool = PatronusEvalTool( - evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={} +from patronus_eval_tool import ( + PatronusEvalTool, + PatronusPredifinedCriteriaEvalTool, + PatronusLocalEvaluatorTool, ) +from patronus import Client, EvaluationResult + +# Test the PatronusEvalTool where agent can pick the best evaluator and criteria +patronus_eval_tool = PatronusEvalTool() + +# Test the PatronusPredifinedCriteriaEvalTool where agent uses the defined evaluator and criteria +patronus_eval_tool = PatronusPredifinedCriteriaEvalTool( + evaluators=[{"evaluator": "judge", "criteria": "contains-code"}] +) + +# Test the PatronusLocalEvaluatorTool where agent uses the local evaluator +client = Client() + + +@client.register_local_evaluator("local_evaluator_name") +def my_evaluator(**kwargs): + return EvaluationResult(pass_="PASS", score=0.5, explanation="Explanation test") + + +patronus_eval_tool = PatronusLocalEvaluatorTool( + evaluator="local_evaluator_name", evaluated_model_gold_answer="test" +) + # Create a new agent coding_agent = Agent( role="Coding Agent", - goal="Generate high quality code and verify that the code is correct by using Patronus AI's evaluation tool to check validity of your output code.", + goal="Generate high quality code and verify that the output is code by using Patronus AI's evaluation tool.", backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.", tools=[patronus_eval_tool], verbose=True, @@ -17,7 +39,7 @@ coding_agent = Agent( # Define tasks generate_code = Task( - description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Use the evaluator as `judge` from Patronus AI with the criteria `patronus:is-code` and feed your task input as input and your code as output to verify your code validity.", + description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Select the most appropriate evaluator and criteria for evaluating your output.", expected_output="Program that generates the first N numbers in the Fibonacci sequence.", agent=coding_agent, ) diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index 88ad28253..d765c1701 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -1,12 +1,247 @@ import os import json import requests - -from typing import Any, List, Dict +import warnings +from typing import Any, List, Dict, Optional, Type from crewai.tools import BaseTool +from pydantic import BaseModel, Field +from patronus import Client + + +class FixedBaseToolSchema(BaseModel): + evaluated_model_input: Dict = Field( + ..., description="The agent's task description in simple text" + ) + evaluated_model_output: Dict = Field( + ..., description="The agent's output of the task" + ) + evaluated_model_retrieved_context: Dict = Field( + ..., description="The agent's context" + ) + evaluated_model_gold_answer: Dict = Field( + ..., description="The agent's gold answer only if available" + ) + evaluators: List[Dict[str, str]] = Field( + ..., + description="List of dictionaries containing the evaluator and criteria to evaluate the model input and output. An example input for this field: [{'evaluator': '[evaluator-from-user]', 'criteria': '[criteria-from-user]'}]", + ) + + +class FixedLocalEvaluatorToolSchema(BaseModel): + evaluated_model_input: Dict = Field( + ..., description="The agent's task description in simple text" + ) + evaluated_model_output: Dict = Field( + ..., description="The agent's output of the task" + ) + evaluated_model_retrieved_context: Dict = Field( + ..., description="The agent's context" + ) + evaluated_model_gold_answer: Dict = Field( + ..., description="The agent's gold answer only if available" + ) + evaluator: str = Field(..., description="The registered local evaluator") class PatronusEvalTool(BaseTool): + name: str = "Patronus Evaluation Tool" + evaluate_url: str = "https://api.patronus.ai/v1/evaluate" + evaluators: List[Dict[str, str]] = [] + criteria: List[Dict[str, str]] = [] + description: str = "" + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + temp_evaluators, temp_criteria = self._init_run() + self.evaluators = temp_evaluators + self.criteria = temp_criteria + self.description = self._generate_description() + warnings.warn("You are allowing the agent to select the best evaluator and criteria when you use the `PatronusEvalTool`. If this is not intended then please use `PatronusPredifinedCriteriaEvalTool` instead.") + + def _init_run(self): + content = json.loads( + requests.get( + "https://api.patronus.ai/v1/evaluators", + headers={ + "accept": "application/json", + "X-API-KEY": os.environ["PATRONUS_API_KEY"], + }, + ).text + )["evaluators"] + ids, evaluators = set(), [] + for i in content: + if not i["deprecated"] and i["id"] not in ids: + evaluators.append( + { + "id": i["id"], + "name": i["name"], + "description": i["description"], + "aliases": i["aliases"], + } + ) + ids.add(i["id"]) + + content = json.loads( + requests.get( + "https://api.patronus.ai/v1/evaluator-criteria", + headers={ + "accept": "application/json", + "X-API-KEY": os.environ["PATRONUS_API_KEY"], + }, + ).text + )["evaluator_criteria"] + criteria = [] + for i in content: + if i["config"].get("pass_criteria", None): + if i["config"].get("rubric", None): + criteria.append( + { + "evaluator": i["evaluator_family"], + "name": i["name"], + "pass_criteria": i["config"]["pass_criteria"], + "rubric": i["config"]["rubric"], + } + ) + else: + criteria.append( + { + "evaluator": i["evaluator_family"], + "name": i["name"], + "pass_criteria": i["config"]["pass_criteria"], + } + ) + elif i["description"]: + criteria.append( + { + "evaluator": i["evaluator_family"], + "name": i["name"], + "description": i["description"], + } + ) + + return evaluators, criteria + + def _generate_description(self) -> str: + criteria = "\n".join([json.dumps(i) for i in self.criteria]) + return f"""This tool calls the Patronus Evaluation API that takes the following arguments: +1. evaluated_model_input: str: The agent's task description in simple text +2. evaluated_model_output: str: The agent's output of the task +3. evaluated_model_retrieved_context: str: The agent's context +4. evaluators: This is a list of dictionaries containing one of the following evaluators and the corresponding criteria. An example input for this field: [{{"evaluator": "Judge", "criteria": "patronus:is-code"}}] + +Evaluators: +{criteria} + +You must ONLY choose the most appropriate evaluator and criteria based on the "pass_criteria" or "description" fields for your evaluation task and nothing from outside of the options present.""" + + def _run( + self, + evaluated_model_input: Optional[str], + evaluated_model_output: Optional[str], + evaluated_model_retrieved_context: Optional[str], + evaluators: List[Dict[str, str]], + ) -> Any: + + # Assert correct format of evaluators + evals = [] + for e in evaluators: + evals.append( + { + "evaluator": e["evaluator"].lower(), + "criteria": e["name"] if "name" in e else e["criteria"], + } + ) + + data = { + "evaluated_model_input": evaluated_model_input, + "evaluated_model_output": evaluated_model_output, + "evaluated_model_retrieved_context": evaluated_model_retrieved_context, + "evaluators": evals, + } + + headers = { + "X-API-KEY": os.getenv("PATRONUS_API_KEY"), + "accept": "application/json", + "content-type": "application/json", + } + + response = requests.post( + self.evaluate_url, headers=headers, data=json.dumps(data) + ) + if response.status_code != 200: + raise Exception( + f"Failed to evaluate model input and output. Response status code: {response.status_code}. Reason: {response.text}" + ) + + return response.json() + + +class PatronusLocalEvaluatorTool(BaseTool): + name: str = "Patronus Local Evaluator Tool" + evaluator: str = "The registered local evaluator" + evaluated_model_gold_answer: str = "The agent's gold answer" + description: str = ( + "This tool is used to evaluate the model input and output using custom function evaluators." + ) + client: Any = None + args_schema: Type[BaseModel] = FixedLocalEvaluatorToolSchema + + class Config: + arbitrary_types_allowed = True + + def __init__(self, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): + super().__init__(**kwargs) + self.client = Client() + if evaluator: + self.evaluator = evaluator + self.evaluated_model_gold_answer = evaluated_model_gold_answer + self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}" + self._generate_description() + print( + f"Updating judge criteria, project name, experiment name, and output file, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" + ) + + def _run( + self, + **kwargs: Any, + ) -> Any: + evaluated_model_input = kwargs.get("evaluated_model_input") + evaluated_model_output = kwargs.get("evaluated_model_output") + evaluated_model_retrieved_context = kwargs.get( + "evaluated_model_retrieved_context" + ) + evaluated_model_gold_answer = self.evaluated_model_gold_answer + evaluator = self.evaluator + + result = self.client.evaluate( + evaluator=evaluator, + evaluated_model_input=( + evaluated_model_input + if isinstance(evaluated_model_input, str) + else evaluated_model_input.get("description") + ), + evaluated_model_output=( + evaluated_model_output + if isinstance(evaluated_model_output, str) + else evaluated_model_output.get("description") + ), + evaluated_model_retrieved_context=( + evaluated_model_retrieved_context + if isinstance(evaluated_model_retrieved_context, str) + else evaluated_model_retrieved_context.get("description") + ), + evaluated_model_gold_answer=( + evaluated_model_gold_answer + if isinstance(evaluated_model_gold_answer, str) + else evaluated_model_gold_answer.get("description") + ), + tags={}, + ) + output = f"Evaluation result: {result.pass_}, Explanation: {result.explanation}" + return output + + +class PatronusPredifinedCriteriaEvalTool(BaseTool): """ PatronusEvalTool is a tool to automatically evaluate and score agent interactions. @@ -15,32 +250,65 @@ class PatronusEvalTool(BaseTool): name: str = "Call Patronus API tool for evaluation of model inputs and outputs" description: str = ( - """This tool calls the Patronus Evaluation API that takes the following arguments: -1. evaluated_model_input: str: The agent's task description -2. evaluated_model_output: str: The agent's output code -3. evaluators: list[dict[str,str]]: list of dictionaries, each with a an evaluator (such as `judge`) and a criteria (like `patronus:[criteria-name-here]`).""" + """This tool calls the Patronus Evaluation API that takes the following arguments:""" ) evaluate_url: str = "https://api.patronus.ai/v1/evaluate" + args_schema: Type[BaseModel] = FixedBaseToolSchema + evaluators: List[Dict[str, str]] = [] + + def __init__(self, evaluators: List[Dict[str, str]], **kwargs: Any): + super().__init__(**kwargs) + if evaluators: + self.evaluators = evaluators + self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluators}" + self._generate_description() + print(f"Updating judge criteria to: {self.evaluators}") def _run( self, - evaluated_model_input: str, - evaluated_model_output: str, - evaluators: List[Dict[str, str]], - tags: dict, + **kwargs: Any, ) -> Any: - api_key = os.getenv("PATRONUS_API_KEY") + evaluated_model_input = kwargs.get("evaluated_model_input") + evaluated_model_output = kwargs.get("evaluated_model_output") + evaluated_model_retrieved_context = kwargs.get( + "evaluated_model_retrieved_context" + ) + evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer") + evaluators = self.evaluators + headers = { - "X-API-KEY": api_key, + "X-API-KEY": os.getenv("PATRONUS_API_KEY"), "accept": "application/json", "content-type": "application/json", } + data = { - "evaluated_model_input": evaluated_model_input, - "evaluated_model_output": evaluated_model_output, - "evaluators": evaluators, - "tags": tags, + "evaluated_model_input": ( + evaluated_model_input + if isinstance(evaluated_model_input, str) + else evaluated_model_input.get("description") + ), + "evaluated_model_output": ( + evaluated_model_output + if isinstance(evaluated_model_output, str) + else evaluated_model_output.get("description") + ), + "evaluated_model_retrieved_context": ( + evaluated_model_retrieved_context + if isinstance(evaluated_model_retrieved_context, str) + else evaluated_model_retrieved_context.get("description") + ), + "evaluated_model_gold_answer": ( + evaluated_model_gold_answer + if isinstance(evaluated_model_gold_answer, str) + else evaluated_model_gold_answer.get("description") + ), + "evaluators": ( + evaluators + if isinstance(evaluators, list) + else evaluators.get("description") + ), } response = requests.post( @@ -48,7 +316,7 @@ class PatronusEvalTool(BaseTool): ) if response.status_code != 200: raise Exception( - f"Failed to evaluate model input and output. Reason: {response.text}" + f"Failed to evaluate model input and output. Status code: {response.status_code}. Reason: {response.text}" ) return response.json() From 7da783ef0ebde2895f7507d1abeba8b42aac97fa Mon Sep 17 00:00:00 2001 From: DarshanDeshpande Date: Thu, 26 Dec 2024 17:44:04 -0500 Subject: [PATCH 3/3] Minor formatting changes --- src/crewai_tools/tools/patronus_eval_tool/example.py | 2 -- src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 1b0ba028d..56b8f90d6 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -17,12 +17,10 @@ patronus_eval_tool = PatronusPredifinedCriteriaEvalTool( # Test the PatronusLocalEvaluatorTool where agent uses the local evaluator client = Client() - @client.register_local_evaluator("local_evaluator_name") def my_evaluator(**kwargs): return EvaluationResult(pass_="PASS", score=0.5, explanation="Explanation test") - patronus_eval_tool = PatronusLocalEvaluatorTool( evaluator="local_evaluator_name", evaluated_model_gold_answer="test" ) diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index d765c1701..1dfee31ba 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -198,7 +198,7 @@ class PatronusLocalEvaluatorTool(BaseTool): self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}" self._generate_description() print( - f"Updating judge criteria, project name, experiment name, and output file, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" + f"Updating judge evaluator, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" ) def _run(