Feature/kickoff consistent output (#847)

* Cleaned up task execution to now have separate paths for async and sync execution. Updating all kickoff functions to return CrewOutput. WIP. Waiting for Joao feedback on async task execution with task_output * Consistently storing async and sync output for context * outline tests I need to create going forward * Major rehaul of TaskOutput and CrewOutput. Updated all tests to work with new change. Need to add in a few final tricky async tests and add a few more to verify output types on TaskOutput and CrewOutput. * Encountering issues with callback. Need to test on main. WIP * working on tests. WIP * WIP. Figuring out disconnect issue. * Cleaned up logs now that I've isolated the issue to the LLM * more wip. * WIP. It looks like usage metrics has always been broken for async * Update parent crew who is managing for_each loop * Merge in main to bugfix/kickoff-for-each-usage-metrics * Clean up code for review * Add new tests * Final cleanup. Ready for review. * Moving copy functionality from Agent to BaseAgent * Fix renaming issue * Fix linting errors * use BaseAgent instead of Agent where applicable * Fixing missing function. Working on tests. * WIP. Needing team to review change * Fixing issues brought about by merge * WIP * Implement major fixes from yesterdays group conversation. Now working on tests. * The majority of tasks are working now. Need to fix converter class * Fix final failing test * Fix linting and type-checker issues * Add more tests to fully test CrewOutput and TaskOutput changes * Add in validation for async cannot depend on other async tasks. * Update validators and tests
2026-01-10 16:48:30 +00:00 · 2024-07-10 23:35:02 -04:00
parent ca22a91b8a
commit 2fcaf68312
30 changed files with 260509 additions and 697 deletions
--- a/tests/crew_test.py
+++ b/tests/crew_test.py
@@ -1,6 +1,7 @@
 """Test Agent creation and execution basic functionality."""

 import json
+from concurrent.futures import Future
 from unittest import mock
 from unittest.mock import patch

@@ -10,9 +11,11 @@ import pytest
 from crewai.agent import Agent
 from crewai.agents.cache import CacheHandler
 from crewai.crew import Crew
+from crewai.crews.crew_output import CrewOutput
 from crewai.memory.contextual.contextual_memory import ContextualMemory
 from crewai.process import Process
 from crewai.task import Task
+from crewai.tasks.task_output import TaskOutput
 from crewai.utilities import Logger, RPMController

 ceo = Agent(
@@ -84,6 +87,87 @@ def test_crew_config_conditional_requirement():
    ]


+def test_async_task_cannot_include_sequential_async_tasks_in_context():
+    task1 = Task(
+        description="Task 1",
+        async_execution=True,
+        expected_output="output",
+        agent=researcher,
+    )
+    task2 = Task(
+        description="Task 2",
+        async_execution=True,
+        expected_output="output",
+        agent=researcher,
+        context=[task1],
+    )
+    task3 = Task(
+        description="Task 3",
+        async_execution=True,
+        expected_output="output",
+        agent=researcher,
+        context=[task2],
+    )
+    task4 = Task(
+        description="Task 4",
+        expected_output="output",
+        agent=writer,
+    )
+    task5 = Task(
+        description="Task 5",
+        async_execution=True,
+        expected_output="output",
+        agent=researcher,
+        context=[task4],
+    )
+
+    # This should raise an error because task2 is async and has task1 in its context without a sync task in between
+    with pytest.raises(
+        ValueError,
+        match="Task 'Task 2' is asynchronous and cannot include other sequential asynchronous tasks in its context.",
+    ):
+        Crew(tasks=[task1, task2, task3, task4, task5], agents=[researcher, writer])
+
+    # This should not raise an error because task5 has a sync task (task4) in its context
+    try:
+        Crew(tasks=[task1, task4, task5], agents=[researcher, writer])
+    except ValueError:
+        pytest.fail("Unexpected ValidationError raised")
+
+
+def test_context_no_future_tasks():
+
+    task2 = Task(
+        description="Task 2",
+        expected_output="output",
+        agent=researcher,
+    )
+    task3 = Task(
+        description="Task 3",
+        expected_output="output",
+        agent=researcher,
+        context=[task2],
+    )
+    task4 = Task(
+        description="Task 4",
+        expected_output="output",
+        agent=researcher,
+    )
+    task1 = Task(
+        description="Task 1",
+        expected_output="output",
+        agent=researcher,
+        context=[task4],
+    )
+
+    # This should raise an error because task1 has a context dependency on a future task (task4)
+    with pytest.raises(
+        ValueError,
+        match="Task 'Task 1' has a context dependency on a future task 'Task 4', which is not allowed.",
+    ):
+        Crew(tasks=[task1, task2, task3, task4], agents=[researcher, writer])
+
+
 def test_crew_config_with_wrong_keys():
    no_tasks_config = json.dumps(
        {
@@ -136,11 +220,57 @@ def test_crew_creation():
        tasks=tasks,
    )

-    assert (
-        crew.kickoff()
-        == "1. **The Rise of AI in Healthcare**: The convergence of AI and healthcare is a promising frontier, offering unprecedented opportunities for disease diagnosis and patient outcome prediction. AI's potential to revolutionize healthcare lies in its capacity to synthesize vast amounts of data, generating precise and efficient results. This technological breakthrough, however, is not just about improving accuracy and efficiency; it's about saving lives. As we stand on the precipice of this transformative era, we must prepare for the complex challenges and ethical questions it poses, while embracing its ability to reshape healthcare as we know it.\n\n2. **Ethical Implications of AI**: As AI intertwines with our daily lives, it presents a complex web of ethical dilemmas. This fusion of technology, philosophy, and ethics is not merely academically intriguing but profoundly impacts the fabric of our society. The questions raised range from decision-making transparency to accountability, and from privacy to potential biases. As we navigate this ethical labyrinth, it is crucial to establish robust frameworks and regulations to ensure that AI serves humanity, and not the other way around.\n\n3. **AI and Data Privacy**: The rise of AI brings with it an insatiable appetite for data, spawning new debates around privacy rights. Balancing the potential benefits of AI with the right to privacy is a unique challenge that intersects technology, law, and human rights. In an increasingly digital world, where personal information forms the backbone of many services, we must grapple with these issues. It's time to redefine the concept of privacy and devise innovative solutions that ensure our digital footprints are not abused.\n\n4. **AI in Job Market**: The discourse around AI's impact on employment is a narrative of contrast, a tale of displacement and creation. On one hand, AI threatens to automate a multitude of jobs, on the other, it promises to create new roles that we cannot yet imagine. This intersection of technology, economics, and labor rights is a critical dialogue that will shape our future. As we stand at this crossroads, we must not only brace ourselves for the changes but also seize the opportunities that this technological wave brings.\n\n5. **Future of AI Agents**: The evolution of AI agents signifies a leap towards a future where AI is not just a tool, but a partner. These sophisticated AI agents, employed in customer service to personal assistants, are redefining our interactions with technology. As we gaze into the future of AI agents, we see a landscape of possibilities and challenges. This journey will be about harnessing the potential of AI agents while navigating the issues of trust, dependence, and ethical use."
+    result = crew.kickoff()
+
+    expected_string_output = "1. **The Rise of AI in Healthcare**: The convergence of AI and healthcare is a promising frontier, offering unprecedented opportunities for disease diagnosis and patient outcome prediction. AI's potential to revolutionize healthcare lies in its capacity to synthesize vast amounts of data, generating precise and efficient results. This technological breakthrough, however, is not just about improving accuracy and efficiency; it's about saving lives. As we stand on the precipice of this transformative era, we must prepare for the complex challenges and ethical questions it poses, while embracing its ability to reshape healthcare as we know it.\n\n2. **Ethical Implications of AI**: As AI intertwines with our daily lives, it presents a complex web of ethical dilemmas. This fusion of technology, philosophy, and ethics is not merely academically intriguing but profoundly impacts the fabric of our society. The questions raised range from decision-making transparency to accountability, and from privacy to potential biases. As we navigate this ethical labyrinth, it is crucial to establish robust frameworks and regulations to ensure that AI serves humanity, and not the other way around.\n\n3. **AI and Data Privacy**: The rise of AI brings with it an insatiable appetite for data, spawning new debates around privacy rights. Balancing the potential benefits of AI with the right to privacy is a unique challenge that intersects technology, law, and human rights. In an increasingly digital world, where personal information forms the backbone of many services, we must grapple with these issues. It's time to redefine the concept of privacy and devise innovative solutions that ensure our digital footprints are not abused.\n\n4. **AI in Job Market**: The discourse around AI's impact on employment is a narrative of contrast, a tale of displacement and creation. On one hand, AI threatens to automate a multitude of jobs, on the other, it promises to create new roles that we cannot yet imagine. This intersection of technology, economics, and labor rights is a critical dialogue that will shape our future. As we stand at this crossroads, we must not only brace ourselves for the changes but also seize the opportunities that this technological wave brings.\n\n5. **Future of AI Agents**: The evolution of AI agents signifies a leap towards a future where AI is not just a tool, but a partner. These sophisticated AI agents, employed in customer service to personal assistants, are redefining our interactions with technology. As we gaze into the future of AI agents, we see a landscape of possibilities and challenges. This journey will be about harnessing the potential of AI agents while navigating the issues of trust, dependence, and ethical use."
+
+    assert str(result) == expected_string_output
+    assert result.raw == expected_string_output
+    assert isinstance(result, CrewOutput)
+    assert len(result.tasks_output) == len(tasks)
+    assert result.raw == expected_string_output
+
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_sync_task_execution():
+    from unittest.mock import patch
+
+    tasks = [
+        Task(
+            description="Give me a list of 5 interesting ideas to explore for an article, what makes them unique and interesting.",
+            expected_output="Bullet point list of 5 important events.",
+            agent=researcher,
+        ),
+        Task(
+            description="Write an amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
+            expected_output="A 4 paragraph article about AI.",
+            agent=writer,
+        ),
+    ]
+
+    crew = Crew(
+        agents=[researcher, writer],
+        process=Process.sequential,
+        tasks=tasks,
    )

+    mock_task_output = TaskOutput(
+        description="Mock description", raw="mocked output", agent="mocked agent"
+    )
+
+    # Because we are mocking execute_sync, we never hit the underlying _execute_core
+    # which sets the output attribute of the task
+    for task in tasks:
+        task.output = mock_task_output
+
+    with patch.object(
+        Task, "execute_sync", return_value=mock_task_output
+    ) as mock_execute_sync:
+        crew.kickoff()
+
+        # Assert that execute_sync was called for each task
+        assert mock_execute_sync.call_count == len(tasks)
+

@pytest.mark.vcr(filter_headers=["authorization"])
 def test_hierarchical_process():
@@ -157,9 +287,11 @@ def test_hierarchical_process():
        manager_llm=ChatOpenAI(temperature=0, model="gpt-4"),
        tasks=[task],
    )
+
    result = crew.kickoff()
+
    assert (
-        result
+        result.raw
        == "1. 'Demystifying AI: An in-depth exploration of Artificial Intelligence for the layperson' - In this piece, we will unravel the enigma of AI, simplifying its complexities into digestible information for the everyday individual. By using relatable examples and analogies, we will journey through the neural networks and machine learning algorithms that define AI, without the jargon and convoluted explanations that often accompany such topics.\n\n2. 'The Role of AI in Startups: A Game Changer?' - Startups today are harnessing the power of AI to revolutionize their businesses. This article will delve into how AI, as an innovative force, is shaping the startup ecosystem, transforming everything from customer service to product development. We'll explore real-life case studies of startups that have leveraged AI to accelerate their growth and disrupt their respective industries.\n\n3. 'AI and Ethics: Navigating the Complex Landscape' - AI brings with it not just technological advancements, but ethical dilemmas as well. This article will engage readers in a thought-provoking discussion on the ethical implications of AI, exploring issues like bias in algorithms, privacy concerns, job displacement, and the moral responsibility of AI developers. We will also discuss potential solutions and frameworks to address these challenges.\n\n4. 'Unveiling the AI Agents: The Future of Customer Service' - AI agents are poised to reshape the customer service landscape, offering businesses the ability to provide round-the-clock support and personalized experiences. In this article, we'll dive deep into the world of AI agents, examining how they work, their benefits and limitations, and how they're set to redefine customer interactions in the digital age.\n\n5. 'From Science Fiction to Reality: AI in Everyday Life' - AI, once a concept limited to the realm of sci-fi, has now permeated our daily lives. This article will highlight the ubiquitous presence of AI, from voice assistants and recommendation algorithms, to autonomous vehicles and smart homes. We'll explore how AI, in its various forms, is transforming our everyday experiences, making the future seem a lot closer than we imagined."
    )

@@ -194,8 +326,10 @@ def test_crew_with_delegating_agents():
        tasks=tasks,
    )

+    result = crew.kickoff()
+
    assert (
-        crew.kickoff()
+        result.raw
        == "AI Agents, simply put, are intelligent systems that can perceive their environment and take actions to reach specific goals. Imagine them as digital assistants that can learn, adapt and make decisions. They operate in the realms of software or hardware, like a chatbot on a website or a self-driving car. The key to their intelligence is their ability to learn from their experiences, making them better at their tasks over time. In today's interconnected world, AI agents are transforming our lives. They enhance customer service experiences, streamline business processes, and even predict trends in data. Vehicles equipped with AI agents are making transportation safer. In healthcare, AI agents are helping to diagnose diseases, personalizing treatment plans, and monitoring patient health. As we embrace the digital era, these AI agents are not just important, they're becoming indispensable, shaping a future where technology works intuitively and intelligently to meet our needs."
    )

@@ -359,49 +493,8 @@ def test_api_calls_throttling(capsys):
        moveon.assert_called()


-# This test is not consistent, some issue is happening on the CI when it comes to Prompt tokens
-#  {'usage_metrics': {'completion_tokens': 34, 'prompt_tokens': 0, 'successful_requests': 2, 'total_tokens': 34}} CI OUTPUT
-#  {'usage_metrics': {'completion_tokens': 34, 'prompt_tokens': 314, 'successful_requests': 2, 'total_tokens': 348}}
-# The issue migh be related to the calculate_usage_metrics function
-# @pytest.mark.vcr(filter_headers=["authorization"])
-# def test_crew_full_output():
-#     agent = Agent(
-#         role="test role",
-#         goal="test goal",
-#         backstory="test backstory",
-#         allow_delegation=False,
-#         verbose=True,
-#     )
-
-#     task1 = Task(
-#         description="just say hi!",
-#         expected_output="your greeting",
-#         agent=agent,
-#     )
-#     task2 = Task(
-#         description="just say hello!",
-#         expected_output="your greeting",
-#         agent=agent,
-#     )
-
-#     crew = Crew(agents=[agent], tasks=[task1, task2], full_output=True)
-
-#     result = crew.kickoff()
-
-#     assert result == {
-#         "final_output": "Hello!",
-#         "tasks_outputs": [task1.output, task2.output],
-#         "usage_metrics": {
-#             "total_tokens": 348,
-#             "prompt_tokens": 314,
-#             "completion_tokens": 34,
-#             "successful_requests": 2,
-#         },
-#     }
-
-
@pytest.mark.vcr(filter_headers=["authorization"])
-def test_crew_kickoff_for_each_full_ouput():
+def test_crew_kickoff_usage_metrics():
    inputs = [
        {"topic": "dog"},
        {"topic": "cat"},
@@ -420,14 +513,11 @@ def test_crew_kickoff_for_each_full_ouput():
        agent=agent,
    )

-    crew = Crew(agents=[agent], tasks=[task], full_output=True)
+    crew = Crew(agents=[agent], tasks=[task])
    results = crew.kickoff_for_each(inputs=inputs)

    assert len(results) == len(inputs)
    for result in results:
-        assert "usage_metrics" in result
-        assert isinstance(result["usage_metrics"], dict)
-
        # Assert that all required keys are in usage_metrics and their values are not None
        for key in [
            "total_tokens",
@@ -435,49 +525,8 @@ def test_crew_kickoff_for_each_full_ouput():
            "completion_tokens",
            "successful_requests",
        ]:
-            assert key in result["usage_metrics"]
-            assert result["usage_metrics"][key] > 0
-
-
-@pytest.mark.vcr(filter_headers=["authorization"])
-@pytest.mark.asyncio
-async def test_crew_async_kickoff_for_each_full_ouput():
-    inputs = [
-        {"topic": "dog"},
-        {"topic": "cat"},
-        {"topic": "apple"},
-    ]
-
-    agent = Agent(
-        role="{topic} Researcher",
-        goal="Express hot takes on {topic}.",
-        backstory="You have a lot of experience with {topic}.",
-    )
-
-    task = Task(
-        description="Give me an analysis around {topic}.",
-        expected_output="1 bullet point about {topic} that's under 15 words.",
-        agent=agent,
-    )
-
-    crew = Crew(agents=[agent], tasks=[task], full_output=True)
-    results = await crew.kickoff_for_each_async(inputs=inputs)
-
-    assert len(results) == len(inputs)
-    for result in results:
-        assert "usage_metrics" in result
-        assert isinstance(result["usage_metrics"], dict)
-
-        # Assert that all required keys are in usage_metrics and their values are not None
-        for key in [
-            "total_tokens",
-            "prompt_tokens",
-            "completion_tokens",
-            "successful_requests",
-        ]:
-            assert key in result["usage_metrics"]
-            # TODO: FIX THIS WHEN USAGE METRICS ARE RE-DONE
-            # assert result["usage_metrics"][key] > 0
+            assert key in result.token_usage
+            assert result.token_usage[key] > 0


 def test_agents_rpm_is_never_set_if_crew_max_RPM_is_not_set():
@@ -500,12 +549,8 @@ def test_agents_rpm_is_never_set_if_crew_max_RPM_is_not_set():
    assert agent._rpm_controller is None


-def test_async_task_execution():
-    from concurrent.futures import ThreadPoolExecutor
-    from unittest.mock import MagicMock, patch
-
-    from crewai.tasks.task_output import TaskOutput
-
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_sequential_async_task_execution_completion():
    list_ideas = Task(
        description="Give me a list of 5 interesting ideas to explore for an article, what makes them unique and interesting.",
        expected_output="Bullet point list of 5 important events.",
@@ -525,41 +570,188 @@ def test_async_task_execution():
        context=[list_ideas, list_important_history],
    )

+    sequential_crew = Crew(
+        agents=[researcher, writer],
+        process=Process.sequential,
+        tasks=[list_ideas, list_important_history, write_article],
+    )
+
+    sequential_result = sequential_crew.kickoff()
+    assert sequential_result.raw.startswith(
+        "**The Evolution of Artificial Intelligence: A Journey Through Milestones**"
+    )
+
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_single_task_with_async_execution():
+
+    researcher_agent = Agent(
+        role="Researcher",
+        goal="Make the best research and analysis on content about AI and AI agents",
+        backstory="You're an expert researcher, specialized in technology, software engineering, AI and startups. You work as a freelancer and is now working on doing research and analysis for a new customer.",
+        allow_delegation=False,
+    )
+
+    list_ideas = Task(
+        description="Generate a list of 5 interesting ideas to explore for an article, where each bulletpoint is under 15 words.",
+        expected_output="Bullet point list of 5 important events. No additional commentary.",
+        agent=researcher_agent,
+        async_execution=True,
+    )
+
+    crew = Crew(
+        agents=[researcher_agent],
+        process=Process.sequential,
+        tasks=[list_ideas],
+    )
+
+    result = crew.kickoff()
+    assert result.raw.startswith(
+        "- The impact of AI agents on remote work productivity."
+    )
+
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_three_task_with_async_execution():
+    researcher_agent = Agent(
+        role="Researcher",
+        goal="Make the best research and analysis on content about AI and AI agents",
+        backstory="You're an expert researcher, specialized in technology, software engineering, AI and startups. You work as a freelancer and is now working on doing research and analysis for a new customer.",
+        allow_delegation=False,
+    )
+
+    bullet_list = Task(
+        description="Generate a list of 5 interesting ideas to explore for an article, where each bulletpoint is under 15 words.",
+        expected_output="Bullet point list of 5 important events. No additional commentary.",
+        agent=researcher_agent,
+        async_execution=True,
+    )
+    numbered_list = Task(
+        description="Generate a list of 5 interesting ideas to explore for an article, where each bulletpoint is under 15 words.",
+        expected_output="Numbered list of 5 important events. No additional commentary.",
+        agent=researcher_agent,
+        async_execution=True,
+    )
+    letter_list = Task(
+        description="Generate a list of 5 interesting ideas to explore for an article, where each bulletpoint is under 15 words.",
+        expected_output="Numbered list using [A), B), C)] list of 5 important events. No additional commentary.",
+        agent=researcher_agent,
+        async_execution=True,
+    )
+
+    # Expected result is that we will get an error
+    # because a crew can end only end with one or less
+    # async tasks
+    with pytest.raises(pydantic_core._pydantic_core.ValidationError) as error:
+        Crew(
+            agents=[researcher_agent],
+            process=Process.sequential,
+            tasks=[bullet_list, numbered_list, letter_list],
+        )
+
+    assert error.value.errors()[0]["type"] == "async_task_count"
+    assert (
+        "The crew must end with at most one asynchronous task."
+        in error.value.errors()[0]["msg"]
+    )
+
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.asyncio
+async def test_crew_async_kickoff():
+    inputs = [
+        {"topic": "dog"},
+        {"topic": "cat"},
+        {"topic": "apple"},
+    ]
+
+    agent = Agent(
+        role="{topic} Researcher",
+        goal="Express hot takes on {topic}.",
+        backstory="You have a lot of experience with {topic}.",
+    )
+
+    task = Task(
+        description="Give me an analysis around {topic}.",
+        expected_output="1 bullet point about {topic} that's under 15 words.",
+        agent=agent,
+    )
+
+    crew = Crew(agents=[agent], tasks=[task])
+    results = await crew.kickoff_for_each_async(inputs=inputs)
+
+    assert len(results) == len(inputs)
+    for result in results:
+        # Assert that all required keys are in usage_metrics and their values are not None
+        for key in [
+            "total_tokens",
+            "prompt_tokens",
+            "completion_tokens",
+            "successful_requests",
+        ]:
+            assert key in result.token_usage
+            assert result.token_usage[key] > 0
+
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_async_task_execution_call_count():
+    from unittest.mock import MagicMock, patch
+
+    list_ideas = Task(
+        description="Give me a list of 5 interesting ideas to explore for na article, what makes them unique and interesting.",
+        expected_output="Bullet point list of 5 important events.",
+        agent=researcher,
+        async_execution=True,
+    )
+    list_important_history = Task(
+        description="Research the history of AI and give me the 5 most important events that shaped the technology.",
+        expected_output="Bullet point list of 5 important events.",
+        agent=researcher,
+        async_execution=True,
+    )
+    write_article = Task(
+        description="Write an article about the history of AI and its most important events.",
+        expected_output="A 4 paragraph article about AI.",
+        agent=writer,
+    )
+
    crew = Crew(
        agents=[researcher, writer],
        process=Process.sequential,
        tasks=[list_ideas, list_important_history, write_article],
    )

-    with patch.object(Agent, "execute_task") as execute:
-        execute.return_value = "ok"
-        with patch.object(ThreadPoolExecutor, "submit") as submit:
-            future = MagicMock()
-            future.result.return_value = "ok"
-            submit.return_value = future
+    # Create a valid TaskOutput instance to mock the return value
+    mock_task_output = TaskOutput(
+        description="Mock description", raw="mocked output", agent="mocked agent"
+    )

-            list_ideas.output = TaskOutput(
-                description="A 4 paragraph article about AI.",
-                raw_output="ok",
-                agent="writer",
-            )
-            list_important_history.output = TaskOutput(
-                description="A 4 paragraph article about AI.",
-                raw_output="ok",
-                agent="writer",
-            )
-            crew.kickoff()
-            submit.assert_called()
-            future.result.assert_called()
+    # Create a MagicMock Future instance
+    mock_future = MagicMock(spec=Future)
+    mock_future.result.return_value = mock_task_output
+
+    # Directly set the output attribute for each task
+    list_ideas.output = mock_task_output
+    list_important_history.output = mock_task_output
+    write_article.output = mock_task_output
+
+    with patch.object(
+        Task, "execute_sync", return_value=mock_task_output
+    ) as mock_execute_sync, patch.object(
+        Task, "execute_async", return_value=mock_future
+    ) as mock_execute_async:
+
+        crew.kickoff()
+
+        assert mock_execute_async.call_count == 2
+        assert mock_execute_sync.call_count == 1


@pytest.mark.vcr(filter_headers=["authorization"])
 def test_kickoff_for_each_single_input():
    """Tests if kickoff_for_each works with a single input."""
-    from unittest.mock import patch

    inputs = [{"topic": "dog"}]
-    expected_outputs = ["Dogs are loyal companions and popular pets."]

    agent = Agent(
        role="{topic} Researcher",
@@ -573,30 +765,21 @@ def test_kickoff_for_each_single_input():
        agent=agent,
    )

-    with patch.object(Agent, "execute_task") as mock_execute_task:
-        mock_execute_task.side_effect = expected_outputs
-        crew = Crew(agents=[agent], tasks=[task])
-        results = crew.kickoff_for_each(inputs=inputs)
+    crew = Crew(agents=[agent], tasks=[task])
+    results = crew.kickoff_for_each(inputs=inputs)

    assert len(results) == 1
-    assert results == expected_outputs


@pytest.mark.vcr(filter_headers=["authorization"])
 def test_kickoff_for_each_multiple_inputs():
    """Tests if kickoff_for_each works with multiple inputs."""
-    from unittest.mock import patch

    inputs = [
        {"topic": "dog"},
        {"topic": "cat"},
        {"topic": "apple"},
    ]
-    expected_outputs = [
-        "Dogs are loyal companions and popular pets.",
-        "Cats are independent and low-maintenance pets.",
-        "Apples are a rich source of dietary fiber and vitamin C.",
-    ]

    agent = Agent(
        role="{topic} Researcher",
@@ -610,14 +793,10 @@ def test_kickoff_for_each_multiple_inputs():
        agent=agent,
    )

-    with patch.object(Agent, "execute_task") as mock_execute_task:
-        mock_execute_task.side_effect = expected_outputs
-        crew = Crew(agents=[agent], tasks=[task])
-        results = crew.kickoff_for_each(inputs=inputs)
+    crew = Crew(agents=[agent], tasks=[task])
+    results = crew.kickoff_for_each(inputs=inputs)

    assert len(results) == len(inputs)
-    for i, res in enumerate(results):
-        assert res == expected_outputs[i]


@pytest.mark.vcr(filter_headers=["authorization"])
@@ -889,7 +1068,7 @@ def test_crew_function_calling_llm():
    with patch.object(llm.client, "create", wraps=llm.client.create) as private_mock:

        @tool
-        def learn_about_AI(topic) -> float:
+        def learn_about_AI(topic) -> str:
            """Useful for when you need to learn about AI to write an paragraph about it."""
            return "AI is a very broad field."

@@ -938,7 +1117,7 @@ def test_task_with_no_arguments():
    crew = Crew(agents=[researcher], tasks=[task])

    result = crew.kickoff()
-    assert result == "75"
+    assert result.raw == "75"


 def test_code_execution_flag_adds_code_tool_upon_kickoff():
@@ -969,7 +1148,6 @@ def test_code_execution_flag_adds_code_tool_upon_kickoff():

@pytest.mark.vcr(filter_headers=["authorization"])
 def test_delegation_is_not_enabled_if_there_are_only_one_agent():
-    from unittest.mock import patch

    researcher = Agent(
        role="Researcher",
@@ -985,10 +1163,9 @@ def test_delegation_is_not_enabled_if_there_are_only_one_agent():
    )

    crew = Crew(agents=[researcher], tasks=[task])
-    with patch.object(Task, "execute") as execute:
-        execute.return_value = "ok"
-        crew.kickoff()
-        assert task.tools == []
+
+    crew.kickoff()
+    assert task.tools == []


@pytest.mark.vcr(filter_headers=["authorization"])
@@ -1006,35 +1183,12 @@ def test_agents_do_not_get_delegation_tools_with_there_is_only_one_agent():

    result = crew.kickoff()
    assert (
-        result
+        result.raw
        == "Howdy! I hope this message finds you well and brings a smile to your face. Have a fantastic day!"
    )
    assert len(agent.tools) == 0


-@pytest.mark.vcr(filter_headers=["authorization"])
-def test_agent_usage_metrics_are_captured_for_sequential_process():
-    agent = Agent(
-        role="Researcher",
-        goal="Be super empathetic.",
-        backstory="You're love to sey howdy.",
-        allow_delegation=False,
-    )
-
-    task = Task(description="say howdy", expected_output="Howdy!", agent=agent)
-
-    crew = Crew(agents=[agent], tasks=[task])
-
-    result = crew.kickoff()
-    assert result == "Howdy!"
-    assert crew.usage_metrics == {
-        "completion_tokens": 17,
-        "prompt_tokens": 158,
-        "successful_requests": 1,
-        "total_tokens": 175,
-    }
-
-
@pytest.mark.vcr(filter_headers=["authorization"])
 def test_sequential_crew_creation_tasks_without_agents():
    task = Task(
@@ -1079,13 +1233,15 @@ def test_agent_usage_metrics_are_captured_for_hierarchical_process():
    )

    result = crew.kickoff()
-    assert result == '"Howdy!"'
+    assert result.raw == '"Howdy!"'
+
+    print(crew.usage_metrics)

    assert crew.usage_metrics == {
-        "total_tokens": 1927,
-        "prompt_tokens": 1557,
-        "completion_tokens": 370,
-        "successful_requests": 4,
+        "total_tokens": 311,
+        "prompt_tokens": 224,
+        "completion_tokens": 87,
+        "successful_requests": 1,
    }


@@ -1110,15 +1266,17 @@ def test_hierarchical_crew_creation_tasks_with_agents():
        manager_llm=ChatOpenAI(model="gpt-4o"),
    )
    crew.kickoff()
+
    assert crew.manager_agent is not None
    assert crew.manager_agent.tools is not None
+    print("TOOL DESCRIPTION", crew.manager_agent.tools[0].description)
    assert crew.manager_agent.tools[0].description.startswith(
-        "Delegate a specific task to one of the following coworkers: [Senior Writer]"
+        "Delegate a specific task to one of the following coworkers: [Senior Writer, Researcher]"
    )


@pytest.mark.vcr(filter_headers=["authorization"])
-def test_hierarchical_crew_creation_tasks_without_async_execution():
+def test_hierarchical_crew_creation_tasks_with_async_execution():
    from langchain_openai import ChatOpenAI

    task = Task(
@@ -1198,9 +1356,81 @@ def test_crew_inputs_interpolate_both_agents_and_tasks_diff():
                interpolate_task_inputs.assert_called()


-def test_task_callback_on_crew():
+def test_crew_does_not_interpolate_without_inputs():
    from unittest.mock import patch

+    agent = Agent(
+        role="{topic} Researcher",
+        goal="Express hot takes on {topic}.",
+        backstory="You have a lot of experience with {topic}.",
+    )
+
+    task = Task(
+        description="Give me an analysis around {topic}.",
+        expected_output="{points} bullet points about {topic}.",
+        agent=agent,
+    )
+
+    crew = Crew(agents=[agent], tasks=[task])
+
+    with patch.object(Agent, "interpolate_inputs") as interpolate_agent_inputs:
+        with patch.object(Task, "interpolate_inputs") as interpolate_task_inputs:
+            crew.kickoff()
+            interpolate_agent_inputs.assert_not_called()
+            interpolate_task_inputs.assert_not_called()
+
+
+# TODO: Ask @joao if we want to start throwing errors if inputs are not provided
+# def test_crew_partial_inputs():
+#     agent = Agent(
+#         role="{topic} Researcher",
+#         goal="Express hot takes on {topic}.",
+#         backstory="You have a lot of experience with {topic}.",
+#     )
+
+#     task = Task(
+#         description="Give me an analysis around {topic}.",
+#         expected_output="{points} bullet points about {topic}.",
+#     )
+
+#     crew = Crew(agents=[agent], tasks=[task], inputs={"topic": "AI"})
+#     inputs = {"topic": "AI"}
+#     crew._interpolate_inputs(inputs=inputs)  # Manual call for now
+
+#     assert crew.tasks[0].description == "Give me an analysis around AI."
+#     assert crew.tasks[0].expected_output == "{points} bullet points about AI."
+#     assert crew.agents[0].role == "AI Researcher"
+#     assert crew.agents[0].goal == "Express hot takes on AI."
+#     assert crew.agents[0].backstory == "You have a lot of experience with AI."
+
+
+# TODO: If we do want ot throw errors if we are missing inputs. Add in this test.
+# def test_crew_invalid_inputs():
+#     agent = Agent(
+#         role="{topic} Researcher",
+#         goal="Express hot takes on {topic}.",
+#         backstory="You have a lot of experience with {topic}.",
+#     )
+
+#     task = Task(
+#         description="Give me an analysis around {topic}.",
+#         expected_output="{points} bullet points about {topic}.",
+#     )
+
+#     crew = Crew(agents=[agent], tasks=[task], inputs={"subject": "AI"})
+#     inputs = {"subject": "AI"}
+#     crew._interpolate_inputs(inputs=inputs)  # Manual call for now
+
+#     assert crew.tasks[0].description == "Give me an analysis around {topic}."
+#     assert crew.tasks[0].expected_output == "{points} bullet points about {topic}."
+#     assert crew.agents[0].role == "{topic} Researcher"
+#     assert crew.agents[0].goal == "Express hot takes on {topic}."
+#     assert crew.agents[0].backstory == "You have a lot of experience with {topic}."
+
+
+def test_task_callback_on_crew():
+    from unittest.mock import MagicMock, patch
+
    researcher_agent = Agent(
        role="Researcher",
        goal="Make the best research and analysis on content about AI and AI agents",
@@ -1215,17 +1445,23 @@ def test_task_callback_on_crew():
        async_execution=True,
    )

+    mock_callback = MagicMock()
+
    crew = Crew(
        agents=[researcher_agent],
        process=Process.sequential,
        tasks=[list_ideas],
-        task_callback=lambda: None,
+        task_callback=mock_callback,
    )

    with patch.object(Agent, "execute_task") as execute:
        execute.return_value = "ok"
        crew.kickoff()
+
        assert list_ideas.callback is not None
+        mock_callback.assert_called_once()
+        args, _ = mock_callback.call_args
+        assert isinstance(args[0], TaskOutput)


@pytest.mark.vcr(filter_headers=["authorization"])
@@ -1235,7 +1471,7 @@ def test_tools_with_custom_caching():
    from crewai_tools import tool

    @tool
-    def multiplcation_tool(first_number: int, second_number: int) -> str:
+    def multiplcation_tool(first_number: int, second_number: int) -> int:
        """Useful for when you need to multiply two numbers together."""
        return first_number * second_number

@@ -1297,7 +1533,7 @@ def test_tools_with_custom_caching():
                input={"first_number": 2, "second_number": 6},
                output=12,
            )
-            assert result == "3"
+            assert result.raw == "3"


@pytest.mark.vcr(filter_headers=["authorization"])
@@ -1395,10 +1631,20 @@ def test_manager_agent():
        tasks=[task],
    )

-    with patch.object(Task, "execute") as execute:
+    mock_task_output = TaskOutput(
+        description="Mock description", raw="mocked output", agent="mocked agent"
+    )
+
+    # Because we are mocking execute_sync, we never hit the underlying _execute_core
+    # which sets the output attribute of the task
+    task.output = mock_task_output
+
+    with patch.object(
+        Task, "execute_sync", return_value=mock_task_output
+    ) as mock_execute_sync:
        crew.kickoff()
        assert manager.allow_delegation is True
-        execute.assert_called()
+        mock_execute_sync.assert_called()


 def test_manager_agent_in_agents_raises_exception():