Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-27 17:18:13 +00:00
Update Patronus AI evaluator tool and example
Example script:
@@ -1,34 +1,27 @@
-import os
-
 from crewai import Agent, Crew, Task
 from patronus_eval_tool import PatronusEvalTool
 
 patronus_eval_tool = PatronusEvalTool(
-    evaluators=[{
-        "evaluator": "judge",
-        "criteria": "patronus:is-code"
-    }],
-    tags={}
+    evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}], tags={}
 )
 
 # Create a new agent
 coding_agent = Agent(
     role="Coding Agent",
-    goal="Generate high quality code. Use the evaluation tool to score the agent outputs",
-    backstory="Coding agent to generate high quality code. Use the evaluation tool to score the agent outputs",
+    goal="Generate high quality code and verify that the code is correct by using Patronus AI's evaluation tool to check validity of your output code.",
+    backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.",
     tools=[patronus_eval_tool],
     verbose=True,
 )
 
 # Define tasks
 generate_code = Task(
-    description="Create a simple program to generate the first N numbers in the Fibonacci sequence.",
+    description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Use the evaluator as `judge` from Patronus AI with the criteria `patronus:is-code` and feed your task input as input and your code as output to verify your code validity.",
     expected_output="Program that generates the first N numbers in the Fibonacci sequence.",
     agent=coding_agent,
 )
 
-
 crew = Crew(agents=[coding_agent], tasks=[generate_code])
 
 crew.kickoff()
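Note that the tool reads the Patronus API key from the environment only at call time, so a missing key surfaces as a failed API request midway through the run. A minimal sketch of a fail-fast guard to place before crew.kickoff(); this guard is illustrative and not part of the commit:

    import os

    # PatronusEvalTool._run reads PATRONUS_API_KEY via os.getenv when the
    # agent invokes the tool; checking up front makes a missing key fail early.
    if not os.getenv("PATRONUS_API_KEY"):
        raise RuntimeError("Set the PATRONUS_API_KEY environment variable before running this example")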
patronus_eval_tool.py:
@@ -1,45 +1,54 @@
-from typing import Any, Optional, Type, cast, ClassVar
-
-from crewai.tools import BaseTool
-import json
 import os
+import json
 import requests
+from typing import Any, List, Dict
+
+from crewai.tools import BaseTool
 
 
 class PatronusEvalTool(BaseTool):
     """
     PatronusEvalTool is a tool to automatically evaluate and score agent interactions.
 
     Results are logged to the Patronus platform at app.patronus.ai
     """
 
-    name: str = "Call Patronus API tool"
+    name: str = "Call Patronus API tool for evaluation of model inputs and outputs"
     description: str = (
-        "This tool calls the Patronus Evaluation API. This function returns the response from the API."
+        """This tool calls the Patronus Evaluation API that takes the following arguments:
+
+        1. evaluated_model_input: str: The agent's task description
+
+        2. evaluated_model_output: str: The agent's output code
+
+        3. evaluators: list[dict[str,str]]: list of dictionaries, each with an evaluator (such as `judge`) and a criteria (like `patronus:[criteria-name-here]`)."""
     )
     evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
 
     def _run(
         self,
         evaluated_model_input: str,
         evaluated_model_output: str,
-        evaluators: list,
-        tags: dict
+        evaluators: List[Dict[str, str]],
+        tags: dict,
     ) -> Any:
-
         api_key = os.getenv("PATRONUS_API_KEY")
         headers = {
             "X-API-KEY": api_key,
             "accept": "application/json",
-            "content-type": "application/json"
+            "content-type": "application/json",
         }
         data = {
             "evaluated_model_input": evaluated_model_input,
             "evaluated_model_output": evaluated_model_output,
             "evaluators": evaluators,
-            "tags": tags
+            "tags": tags,
         }
-
-        # Make the POST request
-        response = requests.post(self.evaluate_url, headers=headers, data=json.dumps(data))
+        response = requests.post(
+            self.evaluate_url, headers=headers, data=json.dumps(data)
+        )
+        if response.status_code != 200:
+            raise Exception(
+                f"Failed to evaluate model input and output. Reason: {response.text}"
+            )
+        return response.json()
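To sanity-check the request/response flow outside a crew, the tool can also be invoked directly. A minimal sketch, assuming PATRONUS_API_KEY is exported and patronus_eval_tool.py is importable; the sample input and output strings are illustrative, not from the commit:

    from patronus_eval_tool import PatronusEvalTool

    tool = PatronusEvalTool()

    # _run POSTs the payload to https://api.patronus.ai/v1/evaluate, raises on
    # any non-200 status, and returns the parsed JSON body on success.
    result = tool._run(
        evaluated_model_input="Generate the first N numbers in the Fibonacci sequence.",
        evaluated_model_output=(
            "def fib(n):\n"
            "    seq = []\n"
            "    a, b = 0, 1\n"
            "    for _ in range(n):\n"
            "        seq.append(a)\n"
            "        a, b = b, a + b\n"
            "    return seq"
        ),
        evaluators=[{"evaluator": "judge", "criteria": "patronus:is-code"}],
        tags={},
    )
    print(result)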