From d94f7e03dce4866ee180d7f4e97e7a2aa51b8373 Mon Sep 17 00:00:00 2001
From: DarshanDeshpande
Date: Sat, 14 Dec 2024 15:46:10 -0500
Subject: [PATCH] Update Patronus AI evaluator tool and example

---
 .../tools/patronus_eval_tool/example.py       | 17 +++-----
 .../patronus_eval_tool/patronus_eval_tool.py  | 39 ++++++++++++-------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/src/crewai_tools/tools/patronus_eval_tool/example.py b/src/crewai_tools/tools/patronus_eval_tool/example.py
index 99088d17f..4015a5f4a 100644
--- a/src/crewai_tools/tools/patronus_eval_tool/example.py
+++ b/src/crewai_tools/tools/patronus_eval_tool/example.py
@@ -1,34 +1,27 @@
-import os
-
 from crewai import Agent, Crew, Task
 from patronus_eval_tool import PatronusEvalTool
 
 patronus_eval_tool = PatronusEvalTool(
-    evaluators=[{
-        "evaluator": "judge",
-        "criteria": "patronus:is-code"
-    }],
-    tags={}
+    evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={}
 )
 
 # Create a new agent
 coding_agent = Agent(
     role="Coding Agent",
-    goal="Generate high quality code. Use the evaluation tool to score the agent outputs",
-    backstory="Coding agent to generate high quality code. Use the evaluation tool to score the agent outputs",
+    goal="Generate high-quality code and verify its correctness by using Patronus AI's evaluation tool to check the validity of your output code.",
+    backstory="You are an experienced coder who can generate high-quality Python code. You follow complex instructions accurately and effectively.",
     tools=[patronus_eval_tool],
     verbose=True,
 )
 
 # Define tasks
 generate_code = Task(
-    description="Create a simple program to generate the first N numbers in the Fibonacci sequence.",
+    description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Use the `judge` evaluator from Patronus AI with the criteria `patronus:is-code`, passing your task description as the input and your generated code as the output, to verify that your code is valid.",
     expected_output="Program that generates the first N numbers in the Fibonacci sequence.",
     agent=coding_agent,
 )
 
-
 crew = Crew(agents=[coding_agent], tasks=[generate_code])
-crew.kickoff()
\ No newline at end of file
+crew.kickoff()

diff --git a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py
index c0e2b95e0..88ad28253 100644
--- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py
+++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py
@@ -1,45 +1,54 @@
-from typing import Any, Optional, Type, cast, ClassVar
-
-from crewai.tools import BaseTool
-import json
 import os
+import json
 import requests
+from typing import Any, List, Dict
+
+from crewai.tools import BaseTool
 
 
 class PatronusEvalTool(BaseTool):
     """
    PatronusEvalTool is a tool to automatically evaluate and score agent interactions.
-    
+
    Results are logged to the Patronus platform at app.patronus.ai
    """
 
-    name: str = "Call Patronus API tool"
+    name: str = "Call Patronus API tool for evaluation of model inputs and outputs"
     description: str = (
-        "This tool calls the Patronus Evaluation API. This function returns the response from the API."
+        """This tool calls the Patronus Evaluation API, which takes the following arguments (plus a `tags` dict of string key-value pairs for labeling the evaluation):
+1. evaluated_model_input: str: The agent's task description
+2. evaluated_model_output: str: The agent's output code
+3. evaluators: List[Dict[str, str]]: a list of dictionaries, each with an `evaluator` (such as `judge`) and a `criteria` (like `patronus:[criteria-name-here]`)."""
     )
     evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
-
     def _run(
         self,
         evaluated_model_input: str,
         evaluated_model_output: str,
-        evaluators: list,
-        tags: dict
+        evaluators: List[Dict[str, str]],
+        tags: Dict[str, str],
     ) -> Any:
-        
+
         api_key = os.getenv("PATRONUS_API_KEY")
         headers = {
             "X-API-KEY": api_key,
             "accept": "application/json",
-            "content-type": "application/json"
+            "content-type": "application/json",
         }
         data = {
             "evaluated_model_input": evaluated_model_input,
             "evaluated_model_output": evaluated_model_output,
             "evaluators": evaluators,
-            "tags": tags
+            "tags": tags,
         }
 
-        # Make the POST request
-        response = requests.post(self.evaluate_url, headers=headers, data=json.dumps(data))
\ No newline at end of file
+        response = requests.post(
+            self.evaluate_url, headers=headers, data=json.dumps(data)
+        )
+        if response.status_code != 200:
+            raise Exception(
+                f"Failed to evaluate model input and output. Reason: {response.text}"
+            )
+
+        return response.json()
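
Reviewer note: below is a minimal smoke-test sketch for exercising the updated tool outside of a Crew, calling _run directly. It assumes PATRONUS_API_KEY is exported and that PatronusEvalTool is importable the same way example.py imports it; the sample task/code strings and the "smoke-test" tag are hypothetical and not part of this patch.

import os

from patronus_eval_tool import PatronusEvalTool

# _run reads the API key from the environment, so fail fast if it is missing.
assert os.getenv("PATRONUS_API_KEY"), "export PATRONUS_API_KEY before running"

tool = PatronusEvalTool()

# Hypothetical payload: the task description as input, candidate code as output.
result = tool._run(
    evaluated_model_input="Write a function that returns the first N Fibonacci numbers.",
    evaluated_model_output=(
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    out = []\n"
        "    for _ in range(n):\n"
        "        out.append(a)\n"
        "        a, b = b, a + b\n"
        "    return out\n"
    ),
    evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}],
    tags={"run": "smoke-test"},
)
print(result)  # parsed JSON from /v1/evaluate; a non-200 response raises an Exception

Note that the evaluators and tags kwargs must be supplied on each _run call, since the agent passes them per evaluation rather than at tool construction time.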