Update Patronus AI evaluator tool and example

DarshanDeshpande
2024-12-14 15:46:10 -05:00
parent c76e0f3445
commit d94f7e03dc
2 changed files with 29 additions and 27 deletions

View File

@@ -1,34 +1,27 @@
 import os
 
 from crewai import Agent, Crew, Task
 from patronus_eval_tool import PatronusEvalTool
 
 patronus_eval_tool = PatronusEvalTool(
-    evaluators=[{
-        "evaluator": "judge",
-        "criteria": "patronus:is-code"
-    }],
-    tags={}
+    evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={}
 )
 
 # Create a new agent
 coding_agent = Agent(
     role="Coding Agent",
-    goal="Generate high quality code. Use the evaluation tool to score the agent outputs",
-    backstory="Coding agent to generate high quality code. Use the evaluation tool to score the agent outputs",
+    goal="Generate high quality code and verify that it is correct by using Patronus AI's evaluation tool to check the validity of your output code.",
+    backstory="You are an experienced coder who can generate high quality Python code. You can follow complex instructions accurately and effectively.",
     tools=[patronus_eval_tool],
     verbose=True,
 )
 
 # Define tasks
 generate_code = Task(
-    description="Create a simple program to generate the first N numbers in the Fibonacci sequence.",
+    description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Use the `judge` evaluator from Patronus AI with the criteria `patronus:is-code`, and pass your task description as the input and your code as the output to verify the validity of your code.",
     expected_output="Program that generates the first N numbers in the Fibonacci sequence.",
     agent=coding_agent,
 )
 
 crew = Crew(agents=[coding_agent], tasks=[generate_code])
-crew.kickoff()
+crew.kickoff()
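Note on running the updated example: the evaluation tool reads its key from the PATRONUS_API_KEY environment variable (see os.getenv in the tool file below), and the agent itself needs credentials for whichever LLM crewai is configured to use. A minimal setup sketch, with placeholder values and assuming the default OpenAI-backed configuration:

import os

# Placeholder keys; set real values before calling crew.kickoff().
os.environ["PATRONUS_API_KEY"] = "<your-patronus-api-key>"
os.environ["OPENAI_API_KEY"] = "<your-llm-provider-key>"  # assumption: default OpenAI-backed agent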

View File

@@ -1,45 +1,54 @@
-from typing import Any, Optional, Type, cast, ClassVar
-from crewai.tools import BaseTool
-import json
 import os
+import json
 import requests
+from typing import Any, List, Dict
+
+from crewai.tools import BaseTool
 
 
 class PatronusEvalTool(BaseTool):
+    """
+    PatronusEvalTool is a tool to automatically evaluate and score agent interactions.
+    Results are logged to the Patronus platform at app.patronus.ai.
+    """
+
-    name: str = "Call Patronus API tool"
+    name: str = "Call Patronus API tool for evaluation of model inputs and outputs"
     description: str = (
-        "This tool calls the Patronus Evaluation API. This function returns the response from the API."
+        """This tool calls the Patronus Evaluation API, which takes the following arguments:
+        1. evaluated_model_input: str: The agent's task description.
+        2. evaluated_model_output: str: The agent's output code.
+        3. evaluators: list[dict[str, str]]: A list of dictionaries, each with an evaluator (such as `judge`) and a criteria (like `patronus:[criteria-name-here]`)."""
     )
     evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
 
     def _run(
         self,
         evaluated_model_input: str,
         evaluated_model_output: str,
-        evaluators: list,
-        tags: dict
+        evaluators: List[Dict[str, str]],
+        tags: dict,
     ) -> Any:
         api_key = os.getenv("PATRONUS_API_KEY")
         headers = {
             "X-API-KEY": api_key,
             "accept": "application/json",
-            "content-type": "application/json"
+            "content-type": "application/json",
         }
         data = {
             "evaluated_model_input": evaluated_model_input,
             "evaluated_model_output": evaluated_model_output,
             "evaluators": evaluators,
-            "tags": tags
+            "tags": tags,
         }
         # Make the POST request
-        response = requests.post(self.evaluate_url, headers=headers, data=json.dumps(data))
+        response = requests.post(
+            self.evaluate_url, headers=headers, data=json.dumps(data)
+        )
         if response.status_code != 200:
             raise Exception(
                 f"Failed to evaluate model input and output. Reason: {response.text}"
             )
         return response.json()
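For reference, the request that _run builds is equivalent to the raw call sketched below; the URL, headers, and payload keys come from the code above, while the input and output values are placeholders rather than anything prescribed by the Patronus API:

import json
import os

import requests

# Same payload shape as PatronusEvalTool._run; the concrete values are illustrative only.
payload = {
    "evaluated_model_input": "Generate the first N numbers in the Fibonacci sequence.",
    "evaluated_model_output": "def fib(n): ...",
    "evaluators": [{"evaluator": "judge", "criteria": "patronus:is-code"}],
    "tags": {},
}
headers = {
    "X-API-KEY": os.getenv("PATRONUS_API_KEY"),
    "accept": "application/json",
    "content-type": "application/json",
}
response = requests.post(
    "https://api.patronus.ai/v1/evaluate", headers=headers, data=json.dumps(payload)
)
print(response.json())  # evaluation results, also logged to app.patronus.ai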