From c76e0f3445ea84268be6318c4931ec729b70f2dc Mon Sep 17 00:00:00 2001 From: Rebecca Qian Date: Fri, 13 Dec 2024 18:55:42 -0500 Subject: [PATCH 1/7] Add patronus evaluation tools remove fields rename eval tool remove eval tool init files --- src/crewai_tools/__init__.py | 1 + src/crewai_tools/tools/__init__.py | 1 + .../tools/patronus_eval_tool/example.py | 34 ++++++++++++++ .../patronus_eval_tool/patronus_eval_tool.py | 45 +++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 src/crewai_tools/tools/patronus_eval_tool/example.py create mode 100644 src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py index 12523a214..7e27286e7 100644 --- a/src/crewai_tools/__init__.py +++ b/src/crewai_tools/__init__.py @@ -22,6 +22,7 @@ from .tools import ( MultiOnTool, MySQLSearchTool, NL2SQLTool, + PatronusEvalTool, PDFSearchTool, PGSearchTool, RagTool, diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py index 23565dbea..9831a2346 100644 --- a/src/crewai_tools/tools/__init__.py +++ b/src/crewai_tools/tools/__init__.py @@ -25,6 +25,7 @@ from .mdx_seach_tool.mdx_search_tool import MDXSearchTool from .multion_tool.multion_tool import MultiOnTool from .mysql_search_tool.mysql_search_tool import MySQLSearchTool from .nl2sql.nl2sql_tool import NL2SQLTool +from .patronus_eval_tool.eval_tool import PatronusEvalTool from .pdf_search_tool.pdf_search_tool import PDFSearchTool from .pg_seach_tool.pg_search_tool import PGSearchTool from .rag.rag_tool import RagTool diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py new file mode 100644 index 000000000..99088d17f --- /dev/null +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -0,0 +1,34 @@ +import os + +from crewai import Agent, Crew, Task +from patronus_eval_tool import PatronusEvalTool + + +patronus_eval_tool = PatronusEvalTool( + evaluators=[{ + "evaluator": "judge", + "criteria": "patronus:is-code" + }], + tags={} +) + +# Create a new agent +coding_agent = Agent( + role="Coding Agent", + goal="Generate high quality code. Use the evaluation tool to score the agent outputs", + backstory="Coding agent to generate high quality code. Use the evaluation tool to score the agent outputs", + tools=[patronus_eval_tool], + verbose=True, +) + +# Define tasks +generate_code = Task( + description="Create a simple program to generate the first N numbers in the Fibonacci sequence.", + expected_output="Program that generates the first N numbers in the Fibonacci sequence.", + agent=coding_agent, +) + + +crew = Crew(agents=[coding_agent], tasks=[generate_code]) + +crew.kickoff() \ No newline at end of file diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py new file mode 100644 index 000000000..c0e2b95e0 --- /dev/null +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -0,0 +1,45 @@ +from typing import Any, Optional, Type, cast, ClassVar + +from crewai.tools import BaseTool +import json +import os +import requests + + +class PatronusEvalTool(BaseTool): + """ + PatronusEvalTool is a tool to automatically evaluate and score agent interactions. + + Results are logged to the Patronus platform at app.patronus.ai + """ + + name: str = "Call Patronus API tool" + description: str = ( + "This tool calls the Patronus Evaluation API. 
This function returns the response from the API." + ) + evaluate_url: str = "https://api.patronus.ai/v1/evaluate" + + + def _run( + self, + evaluated_model_input: str, + evaluated_model_output: str, + evaluators: list, + tags: dict + ) -> Any: + + api_key = os.getenv("PATRONUS_API_KEY") + headers = { + "X-API-KEY": api_key, + "accept": "application/json", + "content-type": "application/json" + } + data = { + "evaluated_model_input": evaluated_model_input, + "evaluated_model_output": evaluated_model_output, + "evaluators": evaluators, + "tags": tags + } + + # Make the POST request + response = requests.post(self.evaluate_url, headers=headers, data=json.dumps(data)) \ No newline at end of file From d94f7e03dce4866ee180d7f4e97e7a2aa51b8373 Mon Sep 17 00:00:00 2001 From: DarshanDeshpande Date: Sat, 14 Dec 2024 15:46:10 -0500 Subject: [PATCH 2/7] Update Patronus AI evaluator tool and example --- .../tools/patronus_eval_tool/example.py | 17 +++----- .../patronus_eval_tool/patronus_eval_tool.py | 39 ++++++++++++------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 99088d17f..4015a5f4a 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -1,34 +1,27 @@ -import os - from crewai import Agent, Crew, Task from patronus_eval_tool import PatronusEvalTool patronus_eval_tool = PatronusEvalTool( - evaluators=[{ - "evaluator": "judge", - "criteria": "patronus:is-code" - }], - tags={} + evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={} ) # Create a new agent coding_agent = Agent( role="Coding Agent", - goal="Generate high quality code. Use the evaluation tool to score the agent outputs", - backstory="Coding agent to generate high quality code. Use the evaluation tool to score the agent outputs", + goal="Generate high quality code and verify that the code is correct by using Patronus AI's evaluation tool to check validity of your output code.", + backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.", tools=[patronus_eval_tool], verbose=True, ) # Define tasks generate_code = Task( - description="Create a simple program to generate the first N numbers in the Fibonacci sequence.", + description="Create a simple program to generate the first N numbers in the Fibonacci sequence. 
Use the evaluator as `judge` from Patronus AI with the criteria `patronus:is-code` and feed your task input as input and your code as output to verify your code validity.", expected_output="Program that generates the first N numbers in the Fibonacci sequence.", agent=coding_agent, ) - crew = Crew(agents=[coding_agent], tasks=[generate_code]) -crew.kickoff() \ No newline at end of file +crew.kickoff() diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index c0e2b95e0..88ad28253 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -1,45 +1,54 @@ -from typing import Any, Optional, Type, cast, ClassVar - -from crewai.tools import BaseTool -import json import os +import json import requests +from typing import Any, List, Dict +from crewai.tools import BaseTool + class PatronusEvalTool(BaseTool): """ PatronusEvalTool is a tool to automatically evaluate and score agent interactions. - + Results are logged to the Patronus platform at app.patronus.ai """ - name: str = "Call Patronus API tool" + name: str = "Call Patronus API tool for evaluation of model inputs and outputs" description: str = ( - "This tool calls the Patronus Evaluation API. This function returns the response from the API." + """This tool calls the Patronus Evaluation API that takes the following arguments: +1. evaluated_model_input: str: The agent's task description +2. evaluated_model_output: str: The agent's output code +3. evaluators: list[dict[str,str]]: list of dictionaries, each with a an evaluator (such as `judge`) and a criteria (like `patronus:[criteria-name-here]`).""" ) evaluate_url: str = "https://api.patronus.ai/v1/evaluate" - def _run( self, evaluated_model_input: str, evaluated_model_output: str, - evaluators: list, - tags: dict + evaluators: List[Dict[str, str]], + tags: dict, ) -> Any: - + api_key = os.getenv("PATRONUS_API_KEY") headers = { "X-API-KEY": api_key, "accept": "application/json", - "content-type": "application/json" + "content-type": "application/json", } data = { "evaluated_model_input": evaluated_model_input, "evaluated_model_output": evaluated_model_output, "evaluators": evaluators, - "tags": tags + "tags": tags, } - # Make the POST request - response = requests.post(self.evaluate_url, headers=headers, data=json.dumps(data)) \ No newline at end of file + response = requests.post( + self.evaluate_url, headers=headers, data=json.dumps(data) + ) + if response.status_code != 200: + raise Exception( + f"Failed to evaluate model input and output. 
Reason: {response.text}" + ) + + return response.json() From 0ac6f915fb83c64cdb0b3aa1c5662077caaf66fa Mon Sep 17 00:00:00 2001 From: DarshanDeshpande Date: Thu, 26 Dec 2024 17:37:22 -0500 Subject: [PATCH 3/7] Add all Patronus eval tools and update example --- .../tools/patronus_eval_tool/example.py | 36 ++- .../patronus_eval_tool/patronus_eval_tool.py | 302 +++++++++++++++++- 2 files changed, 314 insertions(+), 24 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 4015a5f4a..1b0ba028d 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -1,15 +1,37 @@ from crewai import Agent, Crew, Task -from patronus_eval_tool import PatronusEvalTool - - -patronus_eval_tool = PatronusEvalTool( - evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={} +from patronus_eval_tool import ( + PatronusEvalTool, + PatronusPredifinedCriteriaEvalTool, + PatronusLocalEvaluatorTool, ) +from patronus import Client, EvaluationResult + +# Test the PatronusEvalTool where agent can pick the best evaluator and criteria +patronus_eval_tool = PatronusEvalTool() + +# Test the PatronusPredifinedCriteriaEvalTool where agent uses the defined evaluator and criteria +patronus_eval_tool = PatronusPredifinedCriteriaEvalTool( + evaluators=[{"evaluator": "judge", "criteria": "contains-code"}] +) + +# Test the PatronusLocalEvaluatorTool where agent uses the local evaluator +client = Client() + + +@client.register_local_evaluator("local_evaluator_name") +def my_evaluator(**kwargs): + return EvaluationResult(pass_="PASS", score=0.5, explanation="Explanation test") + + +patronus_eval_tool = PatronusLocalEvaluatorTool( + evaluator="local_evaluator_name", evaluated_model_gold_answer="test" +) + # Create a new agent coding_agent = Agent( role="Coding Agent", - goal="Generate high quality code and verify that the code is correct by using Patronus AI's evaluation tool to check validity of your output code.", + goal="Generate high quality code and verify that the output is code by using Patronus AI's evaluation tool.", backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.", tools=[patronus_eval_tool], verbose=True, @@ -17,7 +39,7 @@ coding_agent = Agent( # Define tasks generate_code = Task( - description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Use the evaluator as `judge` from Patronus AI with the criteria `patronus:is-code` and feed your task input as input and your code as output to verify your code validity.", + description="Create a simple program to generate the first N numbers in the Fibonacci sequence. 
Select the most appropriate evaluator and criteria for evaluating your output.", expected_output="Program that generates the first N numbers in the Fibonacci sequence.", agent=coding_agent, ) diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index 88ad28253..d765c1701 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -1,12 +1,247 @@ import os import json import requests - -from typing import Any, List, Dict +import warnings +from typing import Any, List, Dict, Optional, Type from crewai.tools import BaseTool +from pydantic import BaseModel, Field +from patronus import Client + + +class FixedBaseToolSchema(BaseModel): + evaluated_model_input: Dict = Field( + ..., description="The agent's task description in simple text" + ) + evaluated_model_output: Dict = Field( + ..., description="The agent's output of the task" + ) + evaluated_model_retrieved_context: Dict = Field( + ..., description="The agent's context" + ) + evaluated_model_gold_answer: Dict = Field( + ..., description="The agent's gold answer only if available" + ) + evaluators: List[Dict[str, str]] = Field( + ..., + description="List of dictionaries containing the evaluator and criteria to evaluate the model input and output. An example input for this field: [{'evaluator': '[evaluator-from-user]', 'criteria': '[criteria-from-user]'}]", + ) + + +class FixedLocalEvaluatorToolSchema(BaseModel): + evaluated_model_input: Dict = Field( + ..., description="The agent's task description in simple text" + ) + evaluated_model_output: Dict = Field( + ..., description="The agent's output of the task" + ) + evaluated_model_retrieved_context: Dict = Field( + ..., description="The agent's context" + ) + evaluated_model_gold_answer: Dict = Field( + ..., description="The agent's gold answer only if available" + ) + evaluator: str = Field(..., description="The registered local evaluator") class PatronusEvalTool(BaseTool): + name: str = "Patronus Evaluation Tool" + evaluate_url: str = "https://api.patronus.ai/v1/evaluate" + evaluators: List[Dict[str, str]] = [] + criteria: List[Dict[str, str]] = [] + description: str = "" + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + temp_evaluators, temp_criteria = self._init_run() + self.evaluators = temp_evaluators + self.criteria = temp_criteria + self.description = self._generate_description() + warnings.warn("You are allowing the agent to select the best evaluator and criteria when you use the `PatronusEvalTool`. 
If this is not intended then please use `PatronusPredifinedCriteriaEvalTool` instead.") + + def _init_run(self): + content = json.loads( + requests.get( + "https://api.patronus.ai/v1/evaluators", + headers={ + "accept": "application/json", + "X-API-KEY": os.environ["PATRONUS_API_KEY"], + }, + ).text + )["evaluators"] + ids, evaluators = set(), [] + for i in content: + if not i["deprecated"] and i["id"] not in ids: + evaluators.append( + { + "id": i["id"], + "name": i["name"], + "description": i["description"], + "aliases": i["aliases"], + } + ) + ids.add(i["id"]) + + content = json.loads( + requests.get( + "https://api.patronus.ai/v1/evaluator-criteria", + headers={ + "accept": "application/json", + "X-API-KEY": os.environ["PATRONUS_API_KEY"], + }, + ).text + )["evaluator_criteria"] + criteria = [] + for i in content: + if i["config"].get("pass_criteria", None): + if i["config"].get("rubric", None): + criteria.append( + { + "evaluator": i["evaluator_family"], + "name": i["name"], + "pass_criteria": i["config"]["pass_criteria"], + "rubric": i["config"]["rubric"], + } + ) + else: + criteria.append( + { + "evaluator": i["evaluator_family"], + "name": i["name"], + "pass_criteria": i["config"]["pass_criteria"], + } + ) + elif i["description"]: + criteria.append( + { + "evaluator": i["evaluator_family"], + "name": i["name"], + "description": i["description"], + } + ) + + return evaluators, criteria + + def _generate_description(self) -> str: + criteria = "\n".join([json.dumps(i) for i in self.criteria]) + return f"""This tool calls the Patronus Evaluation API that takes the following arguments: +1. evaluated_model_input: str: The agent's task description in simple text +2. evaluated_model_output: str: The agent's output of the task +3. evaluated_model_retrieved_context: str: The agent's context +4. evaluators: This is a list of dictionaries containing one of the following evaluators and the corresponding criteria. An example input for this field: [{{"evaluator": "Judge", "criteria": "patronus:is-code"}}] + +Evaluators: +{criteria} + +You must ONLY choose the most appropriate evaluator and criteria based on the "pass_criteria" or "description" fields for your evaluation task and nothing from outside of the options present.""" + + def _run( + self, + evaluated_model_input: Optional[str], + evaluated_model_output: Optional[str], + evaluated_model_retrieved_context: Optional[str], + evaluators: List[Dict[str, str]], + ) -> Any: + + # Assert correct format of evaluators + evals = [] + for e in evaluators: + evals.append( + { + "evaluator": e["evaluator"].lower(), + "criteria": e["name"] if "name" in e else e["criteria"], + } + ) + + data = { + "evaluated_model_input": evaluated_model_input, + "evaluated_model_output": evaluated_model_output, + "evaluated_model_retrieved_context": evaluated_model_retrieved_context, + "evaluators": evals, + } + + headers = { + "X-API-KEY": os.getenv("PATRONUS_API_KEY"), + "accept": "application/json", + "content-type": "application/json", + } + + response = requests.post( + self.evaluate_url, headers=headers, data=json.dumps(data) + ) + if response.status_code != 200: + raise Exception( + f"Failed to evaluate model input and output. Response status code: {response.status_code}. 
Reason: {response.text}" + ) + + return response.json() + + +class PatronusLocalEvaluatorTool(BaseTool): + name: str = "Patronus Local Evaluator Tool" + evaluator: str = "The registered local evaluator" + evaluated_model_gold_answer: str = "The agent's gold answer" + description: str = ( + "This tool is used to evaluate the model input and output using custom function evaluators." + ) + client: Any = None + args_schema: Type[BaseModel] = FixedLocalEvaluatorToolSchema + + class Config: + arbitrary_types_allowed = True + + def __init__(self, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): + super().__init__(**kwargs) + self.client = Client() + if evaluator: + self.evaluator = evaluator + self.evaluated_model_gold_answer = evaluated_model_gold_answer + self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}" + self._generate_description() + print( + f"Updating judge criteria, project name, experiment name, and output file, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" + ) + + def _run( + self, + **kwargs: Any, + ) -> Any: + evaluated_model_input = kwargs.get("evaluated_model_input") + evaluated_model_output = kwargs.get("evaluated_model_output") + evaluated_model_retrieved_context = kwargs.get( + "evaluated_model_retrieved_context" + ) + evaluated_model_gold_answer = self.evaluated_model_gold_answer + evaluator = self.evaluator + + result = self.client.evaluate( + evaluator=evaluator, + evaluated_model_input=( + evaluated_model_input + if isinstance(evaluated_model_input, str) + else evaluated_model_input.get("description") + ), + evaluated_model_output=( + evaluated_model_output + if isinstance(evaluated_model_output, str) + else evaluated_model_output.get("description") + ), + evaluated_model_retrieved_context=( + evaluated_model_retrieved_context + if isinstance(evaluated_model_retrieved_context, str) + else evaluated_model_retrieved_context.get("description") + ), + evaluated_model_gold_answer=( + evaluated_model_gold_answer + if isinstance(evaluated_model_gold_answer, str) + else evaluated_model_gold_answer.get("description") + ), + tags={}, + ) + output = f"Evaluation result: {result.pass_}, Explanation: {result.explanation}" + return output + + +class PatronusPredifinedCriteriaEvalTool(BaseTool): """ PatronusEvalTool is a tool to automatically evaluate and score agent interactions. @@ -15,32 +250,65 @@ class PatronusEvalTool(BaseTool): name: str = "Call Patronus API tool for evaluation of model inputs and outputs" description: str = ( - """This tool calls the Patronus Evaluation API that takes the following arguments: -1. evaluated_model_input: str: The agent's task description -2. evaluated_model_output: str: The agent's output code -3. 
evaluators: list[dict[str,str]]: list of dictionaries, each with a an evaluator (such as `judge`) and a criteria (like `patronus:[criteria-name-here]`).""" + """This tool calls the Patronus Evaluation API that takes the following arguments:""" ) evaluate_url: str = "https://api.patronus.ai/v1/evaluate" + args_schema: Type[BaseModel] = FixedBaseToolSchema + evaluators: List[Dict[str, str]] = [] + + def __init__(self, evaluators: List[Dict[str, str]], **kwargs: Any): + super().__init__(**kwargs) + if evaluators: + self.evaluators = evaluators + self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluators}" + self._generate_description() + print(f"Updating judge criteria to: {self.evaluators}") def _run( self, - evaluated_model_input: str, - evaluated_model_output: str, - evaluators: List[Dict[str, str]], - tags: dict, + **kwargs: Any, ) -> Any: - api_key = os.getenv("PATRONUS_API_KEY") + evaluated_model_input = kwargs.get("evaluated_model_input") + evaluated_model_output = kwargs.get("evaluated_model_output") + evaluated_model_retrieved_context = kwargs.get( + "evaluated_model_retrieved_context" + ) + evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer") + evaluators = self.evaluators + headers = { - "X-API-KEY": api_key, + "X-API-KEY": os.getenv("PATRONUS_API_KEY"), "accept": "application/json", "content-type": "application/json", } + data = { - "evaluated_model_input": evaluated_model_input, - "evaluated_model_output": evaluated_model_output, - "evaluators": evaluators, - "tags": tags, + "evaluated_model_input": ( + evaluated_model_input + if isinstance(evaluated_model_input, str) + else evaluated_model_input.get("description") + ), + "evaluated_model_output": ( + evaluated_model_output + if isinstance(evaluated_model_output, str) + else evaluated_model_output.get("description") + ), + "evaluated_model_retrieved_context": ( + evaluated_model_retrieved_context + if isinstance(evaluated_model_retrieved_context, str) + else evaluated_model_retrieved_context.get("description") + ), + "evaluated_model_gold_answer": ( + evaluated_model_gold_answer + if isinstance(evaluated_model_gold_answer, str) + else evaluated_model_gold_answer.get("description") + ), + "evaluators": ( + evaluators + if isinstance(evaluators, list) + else evaluators.get("description") + ), } response = requests.post( @@ -48,7 +316,7 @@ class PatronusEvalTool(BaseTool): ) if response.status_code != 200: raise Exception( - f"Failed to evaluate model input and output. Reason: {response.text}" + f"Failed to evaluate model input and output. Status code: {response.status_code}. 
Reason: {response.text}" ) return response.json() From 7da783ef0ebde2895f7507d1abeba8b42aac97fa Mon Sep 17 00:00:00 2001 From: DarshanDeshpande Date: Thu, 26 Dec 2024 17:44:04 -0500 Subject: [PATCH 4/7] Minor formatting changes --- src/crewai_tools/tools/patronus_eval_tool/example.py | 2 -- src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 1b0ba028d..56b8f90d6 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -17,12 +17,10 @@ patronus_eval_tool = PatronusPredifinedCriteriaEvalTool( # Test the PatronusLocalEvaluatorTool where agent uses the local evaluator client = Client() - @client.register_local_evaluator("local_evaluator_name") def my_evaluator(**kwargs): return EvaluationResult(pass_="PASS", score=0.5, explanation="Explanation test") - patronus_eval_tool = PatronusLocalEvaluatorTool( evaluator="local_evaluator_name", evaluated_model_gold_answer="test" ) diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index d765c1701..1dfee31ba 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -198,7 +198,7 @@ class PatronusLocalEvaluatorTool(BaseTool): self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}" self._generate_description() print( - f"Updating judge criteria, project name, experiment name, and output file, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" + f"Updating judge evaluator, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" ) def _run( From 15d6314379cd32b29431fcff101185246fb315d3 Mon Sep 17 00:00:00 2001 From: Rebecca Qian Date: Tue, 31 Dec 2024 03:02:15 -0500 Subject: [PATCH 5/7] Create separate tool classes --- .../tools/patronus_eval_tool/example.py | 38 ++- .../patronus_eval_tool/patronus_eval_tool.py | 250 +++--------------- .../patronus_local_evaluator_tool.py | 89 +++++++ .../patronus_predefined_criteria_eval_tool.py | 108 ++++++++ 4 files changed, 257 insertions(+), 228 deletions(-) create mode 100644 src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py create mode 100644 src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 56b8f90d6..83787c86e 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -1,31 +1,43 @@ from crewai import Agent, Crew, Task from patronus_eval_tool import ( PatronusEvalTool, - PatronusPredifinedCriteriaEvalTool, +) +from patronus_local_evaluator_tool import ( PatronusLocalEvaluatorTool, ) -from patronus import Client, EvaluationResult - -# Test the PatronusEvalTool where agent can pick the best evaluator and criteria -patronus_eval_tool = PatronusEvalTool() - -# Test the PatronusPredifinedCriteriaEvalTool where agent uses the defined evaluator and criteria -patronus_eval_tool = PatronusPredifinedCriteriaEvalTool( - 
evaluators=[{"evaluator": "judge", "criteria": "contains-code"}] +from patronus_predefined_criteria_eval_tool import ( + PatronusPredefinedCriteriaEvalTool, ) +from patronus import Client, EvaluationResult +import random + # Test the PatronusLocalEvaluatorTool where agent uses the local evaluator client = Client() -@client.register_local_evaluator("local_evaluator_name") +# Example of an evaluator that returns a random pass/fail result +@client.register_local_evaluator("random_evaluator") def my_evaluator(**kwargs): - return EvaluationResult(pass_="PASS", score=0.5, explanation="Explanation test") + score = random.random() + return EvaluationResult( + score_raw=score, + pass_=score >= 0.5, + explanation="example explanation" # Optional justification for LLM judges + ) +# 1. Uses PatronusEvalTool: agent can pick the best evaluator and criteria +# patronus_eval_tool = PatronusEvalTool() + +# 2. Uses PatronusPredefinedCriteriaEvalTool: agent uses the defined evaluator and criteria +# patronus_eval_tool = PatronusPredefinedCriteriaEvalTool( +# evaluators=[{"evaluator": "judge", "criteria": "contains-code"}] +# ) + +# 3. Uses PatronusLocalEvaluatorTool: agent uses user defined evaluator patronus_eval_tool = PatronusLocalEvaluatorTool( - evaluator="local_evaluator_name", evaluated_model_gold_answer="test" + evaluator="random_evaluator", evaluated_model_gold_answer="example label" ) - # Create a new agent coding_agent = Agent( role="Coding Agent", diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index 1dfee31ba..9136cfb59 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -8,40 +8,6 @@ from pydantic import BaseModel, Field from patronus import Client -class FixedBaseToolSchema(BaseModel): - evaluated_model_input: Dict = Field( - ..., description="The agent's task description in simple text" - ) - evaluated_model_output: Dict = Field( - ..., description="The agent's output of the task" - ) - evaluated_model_retrieved_context: Dict = Field( - ..., description="The agent's context" - ) - evaluated_model_gold_answer: Dict = Field( - ..., description="The agent's gold answer only if available" - ) - evaluators: List[Dict[str, str]] = Field( - ..., - description="List of dictionaries containing the evaluator and criteria to evaluate the model input and output. An example input for this field: [{'evaluator': '[evaluator-from-user]', 'criteria': '[criteria-from-user]'}]", - ) - - -class FixedLocalEvaluatorToolSchema(BaseModel): - evaluated_model_input: Dict = Field( - ..., description="The agent's task description in simple text" - ) - evaluated_model_output: Dict = Field( - ..., description="The agent's output of the task" - ) - evaluated_model_retrieved_context: Dict = Field( - ..., description="The agent's context" - ) - evaluated_model_gold_answer: Dict = Field( - ..., description="The agent's gold answer only if available" - ) - evaluator: str = Field(..., description="The registered local evaluator") - class PatronusEvalTool(BaseTool): name: str = "Patronus Evaluation Tool" @@ -56,10 +22,10 @@ class PatronusEvalTool(BaseTool): self.evaluators = temp_evaluators self.criteria = temp_criteria self.description = self._generate_description() - warnings.warn("You are allowing the agent to select the best evaluator and criteria when you use the `PatronusEvalTool`. 
If this is not intended then please use `PatronusPredifinedCriteriaEvalTool` instead.") + warnings.warn("You are allowing the agent to select the best evaluator and criteria when you use the `PatronusEvalTool`. If this is not intended then please use `PatronusPredefinedCriteriaEvalTool` instead.") def _init_run(self): - content = json.loads( + evaluators_set = json.loads( requests.get( "https://api.patronus.ai/v1/evaluators", headers={ @@ -69,19 +35,19 @@ class PatronusEvalTool(BaseTool): ).text )["evaluators"] ids, evaluators = set(), [] - for i in content: - if not i["deprecated"] and i["id"] not in ids: + for ev in evaluators_set: + if not ev["deprecated"] and ev["id"] not in ids: evaluators.append( { - "id": i["id"], - "name": i["name"], - "description": i["description"], - "aliases": i["aliases"], + "id": ev["id"], + "name": ev["name"], + "description": ev["description"], + "aliases": ev["aliases"], } ) - ids.add(i["id"]) + ids.add(ev["id"]) - content = json.loads( + criteria_set = json.loads( requests.get( "https://api.patronus.ai/v1/evaluator-criteria", headers={ @@ -91,31 +57,31 @@ class PatronusEvalTool(BaseTool): ).text )["evaluator_criteria"] criteria = [] - for i in content: - if i["config"].get("pass_criteria", None): - if i["config"].get("rubric", None): + for cr in criteria_set: + if cr["config"].get("pass_criteria", None): + if cr["config"].get("rubric", None): criteria.append( { - "evaluator": i["evaluator_family"], - "name": i["name"], - "pass_criteria": i["config"]["pass_criteria"], - "rubric": i["config"]["rubric"], + "evaluator": cr["evaluator_family"], + "name": cr["name"], + "pass_criteria": cr["config"]["pass_criteria"], + "rubric": cr["config"]["rubric"], } ) else: criteria.append( { - "evaluator": i["evaluator_family"], - "name": i["name"], - "pass_criteria": i["config"]["pass_criteria"], + "evaluator": cr["evaluator_family"], + "name": cr["name"], + "pass_criteria": cr["config"]["pass_criteria"], } ) - elif i["description"]: + elif cr["description"]: criteria.append( { - "evaluator": i["evaluator_family"], - "name": i["name"], - "description": i["description"], + "evaluator": cr["evaluator_family"], + "name": cr["name"], + "description": cr["description"], } ) @@ -124,15 +90,15 @@ class PatronusEvalTool(BaseTool): def _generate_description(self) -> str: criteria = "\n".join([json.dumps(i) for i in self.criteria]) return f"""This tool calls the Patronus Evaluation API that takes the following arguments: -1. evaluated_model_input: str: The agent's task description in simple text -2. evaluated_model_output: str: The agent's output of the task -3. evaluated_model_retrieved_context: str: The agent's context -4. evaluators: This is a list of dictionaries containing one of the following evaluators and the corresponding criteria. An example input for this field: [{{"evaluator": "Judge", "criteria": "patronus:is-code"}}] + 1. evaluated_model_input: str: The agent's task description in simple text + 2. evaluated_model_output: str: The agent's output of the task + 3. evaluated_model_retrieved_context: str: The agent's context + 4. evaluators: This is a list of dictionaries containing one of the following evaluators and the corresponding criteria. 
An example input for this field: [{{"evaluator": "Judge", "criteria": "patronus:is-code"}}] -Evaluators: -{criteria} + Evaluators: + {criteria} -You must ONLY choose the most appropriate evaluator and criteria based on the "pass_criteria" or "description" fields for your evaluation task and nothing from outside of the options present.""" + You must ONLY choose the most appropriate evaluator and criteria based on the "pass_criteria" or "description" fields for your evaluation task and nothing from outside of the options present.""" def _run( self, @@ -144,11 +110,11 @@ You must ONLY choose the most appropriate evaluator and criteria based on the "p # Assert correct format of evaluators evals = [] - for e in evaluators: + for ev in evaluators: evals.append( { - "evaluator": e["evaluator"].lower(), - "criteria": e["name"] if "name" in e else e["criteria"], + "evaluator": ev["evaluator"].lower(), + "criteria": ev["name"] if "name" in ev else ev["criteria"], } ) @@ -173,150 +139,4 @@ You must ONLY choose the most appropriate evaluator and criteria based on the "p f"Failed to evaluate model input and output. Response status code: {response.status_code}. Reason: {response.text}" ) - return response.json() - - -class PatronusLocalEvaluatorTool(BaseTool): - name: str = "Patronus Local Evaluator Tool" - evaluator: str = "The registered local evaluator" - evaluated_model_gold_answer: str = "The agent's gold answer" - description: str = ( - "This tool is used to evaluate the model input and output using custom function evaluators." - ) - client: Any = None - args_schema: Type[BaseModel] = FixedLocalEvaluatorToolSchema - - class Config: - arbitrary_types_allowed = True - - def __init__(self, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): - super().__init__(**kwargs) - self.client = Client() - if evaluator: - self.evaluator = evaluator - self.evaluated_model_gold_answer = evaluated_model_gold_answer - self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}" - self._generate_description() - print( - f"Updating judge evaluator, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" - ) - - def _run( - self, - **kwargs: Any, - ) -> Any: - evaluated_model_input = kwargs.get("evaluated_model_input") - evaluated_model_output = kwargs.get("evaluated_model_output") - evaluated_model_retrieved_context = kwargs.get( - "evaluated_model_retrieved_context" - ) - evaluated_model_gold_answer = self.evaluated_model_gold_answer - evaluator = self.evaluator - - result = self.client.evaluate( - evaluator=evaluator, - evaluated_model_input=( - evaluated_model_input - if isinstance(evaluated_model_input, str) - else evaluated_model_input.get("description") - ), - evaluated_model_output=( - evaluated_model_output - if isinstance(evaluated_model_output, str) - else evaluated_model_output.get("description") - ), - evaluated_model_retrieved_context=( - evaluated_model_retrieved_context - if isinstance(evaluated_model_retrieved_context, str) - else evaluated_model_retrieved_context.get("description") - ), - evaluated_model_gold_answer=( - evaluated_model_gold_answer - if isinstance(evaluated_model_gold_answer, str) - else evaluated_model_gold_answer.get("description") - ), - tags={}, - ) - output = f"Evaluation result: {result.pass_}, Explanation: {result.explanation}" - return output - - -class 
PatronusPredifinedCriteriaEvalTool(BaseTool): - """ - PatronusEvalTool is a tool to automatically evaluate and score agent interactions. - - Results are logged to the Patronus platform at app.patronus.ai - """ - - name: str = "Call Patronus API tool for evaluation of model inputs and outputs" - description: str = ( - """This tool calls the Patronus Evaluation API that takes the following arguments:""" - ) - evaluate_url: str = "https://api.patronus.ai/v1/evaluate" - args_schema: Type[BaseModel] = FixedBaseToolSchema - evaluators: List[Dict[str, str]] = [] - - def __init__(self, evaluators: List[Dict[str, str]], **kwargs: Any): - super().__init__(**kwargs) - if evaluators: - self.evaluators = evaluators - self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluators}" - self._generate_description() - print(f"Updating judge criteria to: {self.evaluators}") - - def _run( - self, - **kwargs: Any, - ) -> Any: - - evaluated_model_input = kwargs.get("evaluated_model_input") - evaluated_model_output = kwargs.get("evaluated_model_output") - evaluated_model_retrieved_context = kwargs.get( - "evaluated_model_retrieved_context" - ) - evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer") - evaluators = self.evaluators - - headers = { - "X-API-KEY": os.getenv("PATRONUS_API_KEY"), - "accept": "application/json", - "content-type": "application/json", - } - - data = { - "evaluated_model_input": ( - evaluated_model_input - if isinstance(evaluated_model_input, str) - else evaluated_model_input.get("description") - ), - "evaluated_model_output": ( - evaluated_model_output - if isinstance(evaluated_model_output, str) - else evaluated_model_output.get("description") - ), - "evaluated_model_retrieved_context": ( - evaluated_model_retrieved_context - if isinstance(evaluated_model_retrieved_context, str) - else evaluated_model_retrieved_context.get("description") - ), - "evaluated_model_gold_answer": ( - evaluated_model_gold_answer - if isinstance(evaluated_model_gold_answer, str) - else evaluated_model_gold_answer.get("description") - ), - "evaluators": ( - evaluators - if isinstance(evaluators, list) - else evaluators.get("description") - ), - } - - response = requests.post( - self.evaluate_url, headers=headers, data=json.dumps(data) - ) - if response.status_code != 200: - raise Exception( - f"Failed to evaluate model input and output. Status code: {response.status_code}. 
Reason: {response.text}" - ) - - return response.json() + return response.json() \ No newline at end of file diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py new file mode 100644 index 000000000..ca4c972d1 --- /dev/null +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py @@ -0,0 +1,89 @@ +import os +import json +import requests +import warnings +from typing import Any, List, Dict, Optional, Type +from crewai.tools import BaseTool +from pydantic import BaseModel, Field +from patronus import Client + + +class FixedLocalEvaluatorToolSchema(BaseModel): + evaluated_model_input: Dict = Field( + ..., description="The agent's task description in simple text" + ) + evaluated_model_output: Dict = Field( + ..., description="The agent's output of the task" + ) + evaluated_model_retrieved_context: Dict = Field( + ..., description="The agent's context" + ) + evaluated_model_gold_answer: Dict = Field( + ..., description="The agent's gold answer only if available" + ) + evaluator: str = Field(..., description="The registered local evaluator") + + +class PatronusLocalEvaluatorTool(BaseTool): + name: str = "Patronus Local Evaluator Tool" + evaluator: str = "The registered local evaluator" + evaluated_model_gold_answer: str = "The agent's gold answer" + description: str = ( + "This tool is used to evaluate the model input and output using custom function evaluators." + ) + client: Any = None + args_schema: Type[BaseModel] = FixedLocalEvaluatorToolSchema + + class Config: + arbitrary_types_allowed = True + + def __init__(self, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): + super().__init__(**kwargs) + self.client = Client() + if evaluator: + self.evaluator = evaluator + self.evaluated_model_gold_answer = evaluated_model_gold_answer + self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}" + self._generate_description() + print( + f"Updating judge evaluator, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}" + ) + + def _run( + self, + **kwargs: Any, + ) -> Any: + evaluated_model_input = kwargs.get("evaluated_model_input") + evaluated_model_output = kwargs.get("evaluated_model_output") + evaluated_model_retrieved_context = kwargs.get( + "evaluated_model_retrieved_context" + ) + evaluated_model_gold_answer = self.evaluated_model_gold_answer + evaluator = self.evaluator + + result = self.client.evaluate( + evaluator=evaluator, + evaluated_model_input=( + evaluated_model_input + if isinstance(evaluated_model_input, str) + else evaluated_model_input.get("description") + ), + evaluated_model_output=( + evaluated_model_output + if isinstance(evaluated_model_output, str) + else evaluated_model_output.get("description") + ), + evaluated_model_retrieved_context=( + evaluated_model_retrieved_context + if isinstance(evaluated_model_retrieved_context, str) + else evaluated_model_retrieved_context.get("description") + ), + evaluated_model_gold_answer=( + evaluated_model_gold_answer + if isinstance(evaluated_model_gold_answer, str) + else evaluated_model_gold_answer.get("description") + ), + tags={}, + ) + output = f"Evaluation result: {result.pass_}, Explanation: {result.explanation}" + return output diff --git 
a/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py new file mode 100644 index 000000000..28661f64b --- /dev/null +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py @@ -0,0 +1,108 @@ +import os +import json +import requests +import warnings +from typing import Any, List, Dict, Optional, Type +from crewai.tools import BaseTool +from pydantic import BaseModel, Field +from patronus import Client + + +class FixedBaseToolSchema(BaseModel): + evaluated_model_input: Dict = Field( + ..., description="The agent's task description in simple text" + ) + evaluated_model_output: Dict = Field( + ..., description="The agent's output of the task" + ) + evaluated_model_retrieved_context: Dict = Field( + ..., description="The agent's context" + ) + evaluated_model_gold_answer: Dict = Field( + ..., description="The agent's gold answer only if available" + ) + evaluators: List[Dict[str, str]] = Field( + ..., + description="List of dictionaries containing the evaluator and criteria to evaluate the model input and output. An example input for this field: [{'evaluator': '[evaluator-from-user]', 'criteria': '[criteria-from-user]'}]", + ) + + +class PatronusPredefinedCriteriaEvalTool(BaseTool): + """ + PatronusEvalTool is a tool to automatically evaluate and score agent interactions. + + Results are logged to the Patronus platform at app.patronus.ai + """ + + name: str = "Call Patronus API tool for evaluation of model inputs and outputs" + description: str = ( + """This tool calls the Patronus Evaluation API that takes the following arguments:""" + ) + evaluate_url: str = "https://api.patronus.ai/v1/evaluate" + args_schema: Type[BaseModel] = FixedBaseToolSchema + evaluators: List[Dict[str, str]] = [] + + def __init__(self, evaluators: List[Dict[str, str]], **kwargs: Any): + super().__init__(**kwargs) + if evaluators: + self.evaluators = evaluators + self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluators}" + self._generate_description() + print(f"Updating judge criteria to: {self.evaluators}") + + def _run( + self, + **kwargs: Any, + ) -> Any: + + evaluated_model_input = kwargs.get("evaluated_model_input") + evaluated_model_output = kwargs.get("evaluated_model_output") + evaluated_model_retrieved_context = kwargs.get( + "evaluated_model_retrieved_context" + ) + evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer") + evaluators = self.evaluators + + headers = { + "X-API-KEY": os.getenv("PATRONUS_API_KEY"), + "accept": "application/json", + "content-type": "application/json", + } + + data = { + "evaluated_model_input": ( + evaluated_model_input + if isinstance(evaluated_model_input, str) + else evaluated_model_input.get("description") + ), + "evaluated_model_output": ( + evaluated_model_output + if isinstance(evaluated_model_output, str) + else evaluated_model_output.get("description") + ), + "evaluated_model_retrieved_context": ( + evaluated_model_retrieved_context + if isinstance(evaluated_model_retrieved_context, str) + else evaluated_model_retrieved_context.get("description") + ), + "evaluated_model_gold_answer": ( + evaluated_model_gold_answer + if isinstance(evaluated_model_gold_answer, str) + else evaluated_model_gold_answer.get("description") + ), + "evaluators": ( + evaluators + if isinstance(evaluators, list) + else 
evaluators.get("description") + ), + } + + response = requests.post( + self.evaluate_url, headers=headers, data=json.dumps(data) + ) + if response.status_code != 200: + raise Exception( + f"Failed to evaluate model input and output. Status code: {response.status_code}. Reason: {response.text}" + ) + + return response.json() \ No newline at end of file From a7316a86bf72bdcbabe9c80192bc726edcea9463 Mon Sep 17 00:00:00 2001 From: Rebecca Qian Date: Tue, 31 Dec 2024 04:01:26 -0500 Subject: [PATCH 6/7] fix bug in local evaluator tool --- src/crewai_tools/__init__.py | 2 ++ .../tools/patronus_eval_tool/example.py | 4 +-- .../patronus_eval_tool/patronus_eval_tool.py | 5 +--- .../patronus_local_evaluator_tool.py | 26 +++++++++---------- .../patronus_predefined_criteria_eval_tool.py | 4 +-- 5 files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py index 7e27286e7..e920a5969 100644 --- a/src/crewai_tools/__init__.py +++ b/src/crewai_tools/__init__.py @@ -23,6 +23,8 @@ from .tools import ( MySQLSearchTool, NL2SQLTool, PatronusEvalTool, + PatronusLocalEvaluatorTool, + PatronusPredefinedCriteriaEvalTool, PDFSearchTool, PGSearchTool, RagTool, diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py index 83787c86e..b9e1bad5e 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/example.py +++ b/src/crewai_tools/tools/patronus_eval_tool/example.py @@ -17,7 +17,7 @@ client = Client() # Example of an evaluator that returns a random pass/fail result @client.register_local_evaluator("random_evaluator") -def my_evaluator(**kwargs): +def random_evaluator(**kwargs): score = random.random() return EvaluationResult( score_raw=score, @@ -35,7 +35,7 @@ def my_evaluator(**kwargs): # 3. 
Uses PatronusLocalEvaluatorTool: agent uses user defined evaluator patronus_eval_tool = PatronusLocalEvaluatorTool( - evaluator="random_evaluator", evaluated_model_gold_answer="example label" + patronus_client=client, evaluator="random_evaluator", evaluated_model_gold_answer="example label" ) # Create a new agent diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py index 9136cfb59..23ffe2fd4 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py @@ -2,11 +2,8 @@ import os import json import requests import warnings -from typing import Any, List, Dict, Optional, Type +from typing import Any, List, Dict, Optional from crewai.tools import BaseTool -from pydantic import BaseModel, Field -from patronus import Client - class PatronusEvalTool(BaseTool): diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py index ca4c972d1..5f75ad26c 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py @@ -1,24 +1,20 @@ -import os -import json -import requests -import warnings -from typing import Any, List, Dict, Optional, Type +from typing import Any, Type from crewai.tools import BaseTool from pydantic import BaseModel, Field from patronus import Client class FixedLocalEvaluatorToolSchema(BaseModel): - evaluated_model_input: Dict = Field( + evaluated_model_input: str = Field( ..., description="The agent's task description in simple text" ) - evaluated_model_output: Dict = Field( + evaluated_model_output: str = Field( ..., description="The agent's output of the task" ) - evaluated_model_retrieved_context: Dict = Field( + evaluated_model_retrieved_context: str = Field( ..., description="The agent's context" ) - evaluated_model_gold_answer: Dict = Field( + evaluated_model_gold_answer: str = Field( ..., description="The agent's gold answer only if available" ) evaluator: str = Field(..., description="The registered local evaluator") @@ -37,9 +33,9 @@ class PatronusLocalEvaluatorTool(BaseTool): class Config: arbitrary_types_allowed = True - def __init__(self, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): + def __init__(self, patronus_client: Client, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): super().__init__(**kwargs) - self.client = Client() + self.client = patronus_client #Client() if evaluator: self.evaluator = evaluator self.evaluated_model_gold_answer = evaluated_model_gold_answer @@ -58,9 +54,13 @@ class PatronusLocalEvaluatorTool(BaseTool): evaluated_model_retrieved_context = kwargs.get( "evaluated_model_retrieved_context" ) - evaluated_model_gold_answer = self.evaluated_model_gold_answer + evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer") + # evaluated_model_gold_answer = self.evaluated_model_gold_answer evaluator = self.evaluator + print(kwargs) + print(self.evaluator) + result = self.client.evaluate( evaluator=evaluator, evaluated_model_input=( @@ -83,7 +83,7 @@ class PatronusLocalEvaluatorTool(BaseTool): if isinstance(evaluated_model_gold_answer, str) else evaluated_model_gold_answer.get("description") ), - tags={}, + tags={}, # Optional metadata, supports arbitrary kv pairs ) output = f"Evaluation result: {result.pass_}, Explanation: 
{result.explanation}" return output diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py index 28661f64b..28ffc2912 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py @@ -1,11 +1,9 @@ import os import json import requests -import warnings -from typing import Any, List, Dict, Optional, Type +from typing import Any, List, Dict, Type from crewai.tools import BaseTool from pydantic import BaseModel, Field -from patronus import Client class FixedBaseToolSchema(BaseModel): From 10f8a8731779c2b6a1bfce9ce6a6e87e947c8017 Mon Sep 17 00:00:00 2001 From: Rebecca Qian Date: Tue, 31 Dec 2024 04:05:46 -0500 Subject: [PATCH 7/7] update local evaluator --- .../patronus_eval_tool/patronus_local_evaluator_tool.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py index 5f75ad26c..e65cb342d 100644 --- a/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py +++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py @@ -35,7 +35,7 @@ class PatronusLocalEvaluatorTool(BaseTool): def __init__(self, patronus_client: Client, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any): super().__init__(**kwargs) - self.client = patronus_client #Client() + self.client = patronus_client if evaluator: self.evaluator = evaluator self.evaluated_model_gold_answer = evaluated_model_gold_answer @@ -54,13 +54,9 @@ class PatronusLocalEvaluatorTool(BaseTool): evaluated_model_retrieved_context = kwargs.get( "evaluated_model_retrieved_context" ) - evaluated_model_gold_answer = kwargs.get("evaluated_model_gold_answer") - # evaluated_model_gold_answer = self.evaluated_model_gold_answer + evaluated_model_gold_answer = self.evaluated_model_gold_answer evaluator = self.evaluator - print(kwargs) - print(self.evaluator) - result = self.client.evaluate( evaluator=evaluator, evaluated_model_input=(
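
For reference, below is a minimal end-state usage sketch of the three tools this series introduces, condensed from the example.py added and revised across these patches (final form as of PATCH 6/7 and 7/7). It assumes PATRONUS_API_KEY is exported, the `patronus` package is installed, and that the tools are importable from `crewai_tools` as the updated src/crewai_tools/__init__.py suggests; treat it as a sketch of intended usage, not a verified snippet.

import random

from crewai import Agent, Crew, Task
from crewai_tools import (
    PatronusEvalTool,
    PatronusLocalEvaluatorTool,
    PatronusPredefinedCriteriaEvalTool,
)
from patronus import Client, EvaluationResult

client = Client()


# Toy local evaluator mirroring the one registered in example.py:
# returns a random score and passes when the score is at least 0.5.
@client.register_local_evaluator("random_evaluator")
def random_evaluator(**kwargs):
    score = random.random()
    return EvaluationResult(
        score_raw=score,
        pass_=score >= 0.5,
        explanation="example explanation",
    )


# Option 1: let the agent choose the evaluator and criteria itself.
patronus_eval_tool = PatronusEvalTool()

# Option 2: pin a predefined evaluator and criteria up front.
patronus_eval_tool = PatronusPredefinedCriteriaEvalTool(
    evaluators=[{"evaluator": "judge", "criteria": "contains-code"}]
)

# Option 3: route evaluation through the locally registered evaluator
# (note the patronus_client argument added in the later patches).
patronus_eval_tool = PatronusLocalEvaluatorTool(
    patronus_client=client,
    evaluator="random_evaluator",
    evaluated_model_gold_answer="example label",
)

coding_agent = Agent(
    role="Coding Agent",
    goal="Generate high quality code and verify the output with Patronus AI's evaluation tool.",
    backstory="You are an experienced coder who can generate high quality python code.",
    tools=[patronus_eval_tool],
    verbose=True,
)

generate_code = Task(
    description="Create a simple program to generate the first N numbers in the Fibonacci sequence.",
    expected_output="Program that generates the first N numbers in the Fibonacci sequence.",
    agent=coding_agent,
)

Crew(agents=[coding_agent], tasks=[generate_code]).kickoff()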