Merge pull request #165 from patronus-ai/feat/add-patronus-api-tool

Feat/add patronus api tool
2026-01-09 08:08:32 +00:00 · 2025-01-07 12:48:08 -05:00
parent ad4c711223 10f8a87317
commit 4388235846
6 changed files with 393 additions and 0 deletions
--- a/src/crewai_tools/init.py
+++ b/src/crewai_tools/init.py
@@ -23,6 +23,9 @@ from .tools import (
    MultiOnTool,
    MySQLSearchTool,
    NL2SQLTool,
+    PatronusEvalTool,
+    PatronusLocalEvaluatorTool,
+    PatronusPredefinedCriteriaEvalTool,
    PDFSearchTool,
    PGSearchTool,
    RagTool,
--- a/src/crewai_tools/tools/init.py
+++ b/src/crewai_tools/tools/init.py
@@ -26,6 +26,7 @@ from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
 from .multion_tool.multion_tool import MultiOnTool
 from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
 from .nl2sql.nl2sql_tool import NL2SQLTool
+# Each Patronus tool lives in its own module under patronus_eval_tool/;
+# all three are re-exported by the top-level crewai_tools package.
+from .patronus_eval_tool.patronus_eval_tool import PatronusEvalTool
+from .patronus_eval_tool.patronus_local_evaluator_tool import (
+    PatronusLocalEvaluatorTool,
+)
+from .patronus_eval_tool.patronus_predefined_criteria_eval_tool import (
+    PatronusPredefinedCriteriaEvalTool,
+)
 from .pdf_search_tool.pdf_search_tool import PDFSearchTool
 from .pg_seach_tool.pg_search_tool import PGSearchTool
 from .rag.rag_tool import RagTool
--- a/src/crewai_tools/tools/patronus_eval_tool/example.py
+++ b/src/crewai_tools/tools/patronus_eval_tool/example.py
@@ -0,0 +1,59 @@
+from crewai import Agent, Crew, Task
+from patronus_eval_tool import (
+    PatronusEvalTool,
+)
+from patronus_local_evaluator_tool import (
+    PatronusLocalEvaluatorTool,
+)
+from patronus_predefined_criteria_eval_tool import (
+    PatronusPredefinedCriteriaEvalTool,
+)
+from patronus import Client, EvaluationResult
+import random
+
+
+# Example script: a coding agent checks its own output with a Patronus tool.
+# The active variant (3) uses a locally registered evaluator.
+client = Client()
+
+# Example of an evaluator that returns a random pass/fail result
+@client.register_local_evaluator("random_evaluator")
+def random_evaluator(**kwargs):
+    """Return an EvaluationResult with a random score that passes at >= 0.5."""
+    score = random.random()
+    return EvaluationResult(
+      score_raw=score,
+      pass_=score >= 0.5,
+      explanation="example explanation" # Optional justification for LLM judges
+    )
+
+# Three interchangeable ways to build `patronus_eval_tool`; enable exactly one.
+# 1. Uses PatronusEvalTool: agent can pick the best evaluator and criteria
+# patronus_eval_tool = PatronusEvalTool()
+
+# 2. Uses PatronusPredefinedCriteriaEvalTool: agent uses the defined evaluator and criteria
+# patronus_eval_tool = PatronusPredefinedCriteriaEvalTool(
+#     evaluators=[{"evaluator": "judge", "criteria": "contains-code"}]
+# )
+
+# 3. Uses PatronusLocalEvaluatorTool: agent uses user defined evaluator
+# The evaluator name must match the one registered on `client` above.
+patronus_eval_tool = PatronusLocalEvaluatorTool(
+    patronus_client=client, evaluator="random_evaluator", evaluated_model_gold_answer="example label"
+)
+
+# Create a new agent that has the evaluation tool available
+coding_agent = Agent(
+    role="Coding Agent",
+    goal="Generate high quality code and verify that the output is code by using Patronus AI's evaluation tool.",
+    backstory="You are an experienced coder who can generate high quality python code. You can follow complex instructions accurately and effectively.",
+    tools=[patronus_eval_tool],
+    verbose=True,
+)
+
+# Define the single task the agent has to solve
+generate_code = Task(
+    description="Create a simple program to generate the first N numbers in the Fibonacci sequence. Select the most appropriate evaluator and criteria for evaluating your output.",
+    expected_output="Program that generates the first N numbers in the Fibonacci sequence.",
+    agent=coding_agent,
+)
+
+# Assemble the crew from the one agent and one task
+crew = Crew(agents=[coding_agent], tasks=[generate_code])
+
+# Run the crew; this executes at import time because the file is a plain script
+crew.kickoff()
--- a/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py
+++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_eval_tool.py
@@ -0,0 +1,139 @@
+import os
+import json
+import requests
+import warnings
+from typing import Any, List, Dict, Optional
+from crewai.tools import BaseTool
+
+
+class PatronusEvalTool(BaseTool):
+    """Patronus evaluation tool where the agent picks the evaluator and criteria.
+
+    On construction the tool downloads the evaluator and criteria catalogues
+    from the Patronus API (the ``PATRONUS_API_KEY`` environment variable must
+    be set) and embeds the criteria in its description so the agent can choose
+    among them.
+    """
+
+    name: str = "Patronus Evaluation Tool"
+    evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
+    evaluators: List[Dict[str, str]] = []
+    criteria: List[Dict[str, str]] = []
+    description: str = ""
+
+    def __init__(self, **kwargs: Any):
+        """Fetch the catalogues, build the description and emit a usage warning."""
+        super().__init__(**kwargs)
+        temp_evaluators, temp_criteria = self._init_run()
+        self.evaluators = temp_evaluators
+        self.criteria = temp_criteria
+        self.description = self._generate_description()
+        warnings.warn("You are allowing the agent to select the best evaluator and criteria when you use the `PatronusEvalTool`. If this is not intended then please use `PatronusPredefinedCriteriaEvalTool` instead.")
+
+    def _fetch_listing(self, url: str, key: str) -> Any:
+        """GET a Patronus catalogue endpoint and return the value stored under ``key``.
+
+        Raises:
+            KeyError: If ``PATRONUS_API_KEY`` is not set in the environment.
+            Exception: If the endpoint does not answer with HTTP 200.
+        """
+        response = requests.get(
+            url,
+            headers={
+                "accept": "application/json",
+                "X-API-KEY": os.environ["PATRONUS_API_KEY"],
+            },
+        )
+        # Fail with the real HTTP error (same style as `_run`) instead of an
+        # opaque KeyError/JSON error when the API rejects the request.
+        if response.status_code != 200:
+            raise Exception(
+                f"Failed to fetch {url}. Response status code: {response.status_code}. Reason: {response.text}"
+            )
+        return json.loads(response.text)[key]
+
+    def _init_run(self):
+        """Download and filter the evaluator and criteria catalogues.
+
+        Returns:
+            A ``(evaluators, criteria)`` tuple: the non-deprecated evaluators
+            de-duplicated by id, and the criteria that carry either a
+            ``pass_criteria`` config entry or a non-empty description.
+
+        Raises:
+            KeyError: If ``PATRONUS_API_KEY`` is not set.
+            Exception: If a catalogue request does not return HTTP 200.
+        """
+        evaluators_set = self._fetch_listing(
+            "https://api.patronus.ai/v1/evaluators", "evaluators"
+        )
+        ids, evaluators = set(), []
+        for ev in evaluators_set:
+            # Keep the first occurrence of every non-deprecated evaluator id.
+            if ev["deprecated"] or ev["id"] in ids:
+                continue
+            evaluators.append(
+                {
+                    "id": ev["id"],
+                    "name": ev["name"],
+                    "description": ev["description"],
+                    "aliases": ev["aliases"],
+                }
+            )
+            ids.add(ev["id"])
+
+        criteria_set = self._fetch_listing(
+            "https://api.patronus.ai/v1/evaluator-criteria", "evaluator_criteria"
+        )
+        criteria = []
+        for cr in criteria_set:
+            config = cr["config"]
+            pass_criteria = config.get("pass_criteria", None)
+            if pass_criteria:
+                entry = {
+                    "evaluator": cr["evaluator_family"],
+                    "name": cr["name"],
+                    "pass_criteria": pass_criteria,
+                }
+                # The rubric is optional; include it only when present.
+                rubric = config.get("rubric", None)
+                if rubric:
+                    entry["rubric"] = rubric
+                criteria.append(entry)
+            elif cr["description"]:
+                # No pass criteria: fall back to the free-text description.
+                criteria.append(
+                    {
+                        "evaluator": cr["evaluator_family"],
+                        "name": cr["name"],
+                        "description": cr["description"],
+                    }
+                )
+
+        return evaluators, criteria
+
+    def _generate_description(self) -> str:
+        """Return the tool description: argument contract plus one JSON line per criterion."""
+        criteria = "\n".join([json.dumps(i) for i in self.criteria])
+        return f"""This tool calls the Patronus Evaluation API that takes the following arguments:
+        1. evaluated_model_input: str: The agent's task description in simple text
+        2. evaluated_model_output: str: The agent's output of the task
+        3. evaluated_model_retrieved_context: str: The agent's context
+        4. evaluators: This is a list of dictionaries containing one of the following evaluators and the corresponding criteria. An example input for this field: [{{"evaluator": "Judge", "criteria": "patronus:is-code"}}] 
+
+        Evaluators: 
+        {criteria}
+
+        You must ONLY choose the most appropriate evaluator and criteria based on the "pass_criteria" or "description" fields for your evaluation task and nothing from outside of the options present."""
+
+    def _run(
+        self,
+        evaluated_model_input: Optional[str],
+        evaluated_model_output: Optional[str],
+        evaluated_model_retrieved_context: Optional[str],
+        evaluators: List[Dict[str, str]],
+    ) -> Any:
+        """POST an evaluation request to ``evaluate_url`` and return the decoded JSON.
+
+        Args:
+            evaluated_model_input: The agent's task description.
+            evaluated_model_output: The agent's output for the task.
+            evaluated_model_retrieved_context: The agent's context.
+            evaluators: Dicts with an ``"evaluator"`` key and either a
+                ``"name"`` or a ``"criteria"`` key (``"name"`` wins).
+
+        Raises:
+            Exception: If the API does not answer with HTTP 200.
+        """
+        # Normalise every entry to the {"evaluator", "criteria"} shape sent to
+        # the API: lower-case the evaluator and accept "name" as the criteria.
+        evals = [
+            {
+                "evaluator": ev["evaluator"].lower(),
+                "criteria": ev["name"] if "name" in ev else ev["criteria"],
+            }
+            for ev in evaluators
+        ]
+
+        data = {
+            "evaluated_model_input": evaluated_model_input,
+            "evaluated_model_output": evaluated_model_output,
+            "evaluated_model_retrieved_context": evaluated_model_retrieved_context,
+            "evaluators": evals,
+        }
+
+        headers = {
+            "X-API-KEY": os.getenv("PATRONUS_API_KEY"),
+            "accept": "application/json",
+            "content-type": "application/json",
+        }
+
+        response = requests.post(
+            self.evaluate_url, headers=headers, data=json.dumps(data)
+        )
+        if response.status_code != 200:
+            raise Exception(
+                f"Failed to evaluate model input and output. Response status code: {response.status_code}. Reason: {response.text}"
+            )
+
+        return response.json()
--- a/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py
+++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_local_evaluator_tool.py
@@ -0,0 +1,85 @@
+from typing import Any, Type
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field
+from patronus import Client
+
+
+class FixedLocalEvaluatorToolSchema(BaseModel):
+    """Argument schema for ``PatronusLocalEvaluatorTool``; every field is a required string."""
+
+    evaluated_model_input: str = Field(
+        ..., description="The agent's task description in simple text"
+    )
+    evaluated_model_output: str = Field(
+        ..., description="The agent's output of the task"
+    )
+    evaluated_model_retrieved_context: str = Field(
+        ..., description="The agent's context"
+    )
+    # NOTE(review): PatronusLocalEvaluatorTool._run takes the gold answer and
+    # the evaluator from the tool instance, not from the two arguments below.
+    evaluated_model_gold_answer: str = Field(
+        ..., description="The agent's gold answer only if available"
+    )
+    evaluator: str = Field(..., description="The registered local evaluator")
+
+
+class PatronusLocalEvaluatorTool(BaseTool):
+    """Evaluation tool that runs a locally registered Patronus evaluator.
+
+    The evaluator name and the gold answer are fixed when the tool is built;
+    the agent only supplies the input, output and retrieved context.
+    """
+
+    name: str = "Patronus Local Evaluator Tool"
+    evaluator: str = "The registered local evaluator"
+    evaluated_model_gold_answer: str = "The agent's gold answer"
+    description: str = (
+        "This tool is used to evaluate the model input and output using custom function evaluators."
+    )
+    client: Any = None
+    args_schema: Type[BaseModel] = FixedLocalEvaluatorToolSchema
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, patronus_client: Client, evaluator: str, evaluated_model_gold_answer: str, **kwargs: Any):
+        """Store the client and, when ``evaluator`` is non-empty, pin evaluator and gold answer.
+
+        Args:
+            patronus_client: Patronus client the evaluator was registered on.
+            evaluator: Name of the registered local evaluator.
+            evaluated_model_gold_answer: Gold answer passed to every evaluation.
+            **kwargs: Forwarded to ``BaseTool``.
+        """
+        super().__init__(**kwargs)
+        self.client = patronus_client
+        if evaluator:
+            self.evaluator = evaluator
+            self.evaluated_model_gold_answer = evaluated_model_gold_answer
+            self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluator}, evaluated_model_gold_answer={evaluated_model_gold_answer}"
+            self._generate_description()
+            print(
+                f"Updating judge evaluator, gold_answer to: {self.evaluator}, {self.evaluated_model_gold_answer}"
+            )
+
+    def _as_text(self, value: Any) -> Any:
+        """Return ``value`` if it is a string or ``None``, else its ``"description"`` entry."""
+        # A missing argument arrives as None; pass it through instead of
+        # failing on ``None.get``.
+        if value is None or isinstance(value, str):
+            return value
+        return value.get("description")
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        """Evaluate the agent's input/output with the configured local evaluator.
+
+        Reads ``evaluated_model_input``, ``evaluated_model_output`` and
+        ``evaluated_model_retrieved_context`` from ``kwargs``; the evaluator
+        and gold answer always come from the tool instance.
+
+        Returns:
+            A string reporting the pass flag and the explanation of the result.
+        """
+        result = self.client.evaluate(
+            evaluator=self.evaluator,
+            evaluated_model_input=self._as_text(
+                kwargs.get("evaluated_model_input")
+            ),
+            evaluated_model_output=self._as_text(
+                kwargs.get("evaluated_model_output")
+            ),
+            evaluated_model_retrieved_context=self._as_text(
+                kwargs.get("evaluated_model_retrieved_context")
+            ),
+            evaluated_model_gold_answer=self._as_text(
+                self.evaluated_model_gold_answer
+            ),
+            tags={}, # Optional metadata, supports arbitrary kv pairs
+        )
+        output = f"Evaluation result: {result.pass_}, Explanation: {result.explanation}"
+        return output
--- a/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py
+++ b/src/crewai_tools/tools/patronus_eval_tool/patronus_predefined_criteria_eval_tool.py
@@ -0,0 +1,106 @@
+import os
+import json
+import requests
+from typing import Any, List, Dict, Type
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field
+
+
+class FixedBaseToolSchema(BaseModel):
+    """Argument schema for ``PatronusPredefinedCriteriaEvalTool``; every field is required."""
+
+    # NOTE(review): the four text fields are typed Dict although their
+    # descriptions say "simple text" and the tool's _run also handles str
+    # values — confirm the intended type.
+    evaluated_model_input: Dict = Field(
+        ..., description="The agent's task description in simple text"
+    )
+    evaluated_model_output: Dict = Field(
+        ..., description="The agent's output of the task"
+    )
+    evaluated_model_retrieved_context: Dict = Field(
+        ..., description="The agent's context"
+    )
+    evaluated_model_gold_answer: Dict = Field(
+        ..., description="The agent's gold answer only if available"
+    )
+    # NOTE(review): PatronusPredefinedCriteriaEvalTool._run sends the evaluators
+    # stored on the tool instance and does not read this argument.
+    evaluators: List[Dict[str, str]] = Field(
+        ...,
+        description="List of dictionaries containing the evaluator and criteria to evaluate the model input and output. An example input for this field: [{'evaluator': '[evaluator-from-user]', 'criteria': '[criteria-from-user]'}]",
+    )
+
+
+class PatronusPredefinedCriteriaEvalTool(BaseTool):
+    """
+    PatronusPredefinedCriteriaEvalTool evaluates and scores agent interactions
+    with a fixed, user-supplied list of evaluators and criteria.
+
+    Results are logged to the Patronus platform at app.patronus.ai
+    """
+
+    name: str = "Call Patronus API tool for evaluation of model inputs and outputs"
+    description: str = (
+        """This tool calls the Patronus Evaluation API that takes the following arguments:"""
+    )
+    evaluate_url: str = "https://api.patronus.ai/v1/evaluate"
+    args_schema: Type[BaseModel] = FixedBaseToolSchema
+    evaluators: List[Dict[str, str]] = []
+
+    def __init__(self, evaluators: List[Dict[str, str]], **kwargs: Any):
+        """Pin the evaluators/criteria used for every evaluation.
+
+        Args:
+            evaluators: Dicts naming the evaluator and criteria to apply; an
+                empty list leaves the class defaults untouched.
+            **kwargs: Forwarded to ``BaseTool``.
+        """
+        super().__init__(**kwargs)
+        if evaluators:
+            self.evaluators = evaluators
+            self.description = f"This tool calls the Patronus Evaluation API that takes an additional argument in addition to the following new argument:\n evaluators={evaluators}"
+            self._generate_description()
+            print(f"Updating judge criteria to: {self.evaluators}")
+
+    def _as_text(self, value: Any) -> Any:
+        """Return ``value`` if it is a string or ``None``, else its ``"description"`` entry."""
+        # A missing argument arrives as None; pass it through instead of
+        # failing on ``None.get``.
+        if value is None or isinstance(value, str):
+            return value
+        return value.get("description")
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        """POST an evaluation request with the pinned evaluators and return the JSON reply.
+
+        Reads ``evaluated_model_input``, ``evaluated_model_output``,
+        ``evaluated_model_retrieved_context`` and ``evaluated_model_gold_answer``
+        from ``kwargs``; the evaluators always come from the tool instance.
+
+        Raises:
+            Exception: If the API does not answer with HTTP 200.
+        """
+        evaluators = self.evaluators
+
+        headers = {
+            "X-API-KEY": os.getenv("PATRONUS_API_KEY"),
+            "accept": "application/json",
+            "content-type": "application/json",
+        }
+
+        data = {
+            "evaluated_model_input": self._as_text(
+                kwargs.get("evaluated_model_input")
+            ),
+            "evaluated_model_output": self._as_text(
+                kwargs.get("evaluated_model_output")
+            ),
+            "evaluated_model_retrieved_context": self._as_text(
+                kwargs.get("evaluated_model_retrieved_context")
+            ),
+            "evaluated_model_gold_answer": self._as_text(
+                kwargs.get("evaluated_model_gold_answer")
+            ),
+            "evaluators": (
+                evaluators
+                if isinstance(evaluators, list)
+                else evaluators.get("description")
+            ),
+        }
+
+        response = requests.post(
+            self.evaluate_url, headers=headers, data=json.dumps(data)
+        )
+        if response.status_code != 200:
+            raise Exception(
+                f"Failed to evaluate model input and output. Status code: {response.status_code}. Reason: {response.text}"
+            )
+
+        return response.json()