mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-04 08:42:38 +00:00
feat: add CheckpointConfig for automatic checkpointing
This commit is contained in:
187
docs/ar/concepts/checkpointing.mdx
Normal file
187
docs/ar/concepts/checkpointing.mdx
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
---
|
||||||
|
title: Checkpointing
|
||||||
|
description: حفظ حالة التنفيذ تلقائيا حتى تتمكن الطواقم والتدفقات والوكلاء من الاستئناف بعد الفشل.
|
||||||
|
icon: floppy-disk
|
||||||
|
mode: "wide"
|
||||||
|
---
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
الـ Checkpointing في إصدار مبكر. قد تتغير واجهات البرمجة في الإصدارات المستقبلية.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## نظرة عامة
|
||||||
|
|
||||||
|
يقوم الـ Checkpointing بحفظ حالة التنفيذ تلقائيًا أثناء التشغيل. إذا فشل طاقم أو تدفق أو وكيل أثناء التنفيذ، يمكنك الاستعادة من آخر نقطة حفظ والاستئناف دون إعادة تنفيذ العمل المكتمل.
|
||||||
|
|
||||||
|
## البداية السريعة
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True, # يستخدم الافتراضيات: ./.checkpoints, عند task_completed
|
||||||
|
)
|
||||||
|
result = crew.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
تتم كتابة ملفات نقاط الحفظ في `./.checkpoints/` بعد اكتمال كل مهمة.
|
||||||
|
|
||||||
|
## التكوين
|
||||||
|
|
||||||
|
استخدم `CheckpointConfig` للتحكم الكامل:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./my_checkpoints",
|
||||||
|
on_events=["task_completed", "crew_kickoff_completed"],
|
||||||
|
max_checkpoints=5,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### حقول CheckpointConfig
|
||||||
|
|
||||||
|
| الحقل | النوع | الافتراضي | الوصف |
|
||||||
|
|:------|:------|:----------|:------|
|
||||||
|
| `directory` | `str` | `"./.checkpoints"` | مسار ملفات نقاط الحفظ |
|
||||||
|
| `on_events` | `list[str]` | `["task_completed"]` | انواع الاحداث التي تطلق نقطة حفظ |
|
||||||
|
| `provider` | `BaseProvider` | `JsonProvider()` | واجهة التخزين |
|
||||||
|
| `max_checkpoints` | `int \| None` | `None` | الحد الاقصى للملفات؛ يتم حذف الاقدم اولا |
|
||||||
|
|
||||||
|
### الوراثة والانسحاب
|
||||||
|
|
||||||
|
يقبل حقل `checkpoint` في Crew و Flow و Agent قيم `CheckpointConfig` أو `True` أو `False` أو `None`:
|
||||||
|
|
||||||
|
| القيمة | السلوك |
|
||||||
|
|:-------|:-------|
|
||||||
|
| `None` (افتراضي) | يرث من الاصل. الوكيل يرث اعدادات الطاقم. |
|
||||||
|
| `True` | تفعيل بالاعدادات الافتراضية. |
|
||||||
|
| `False` | انسحاب صريح. يوقف الوراثة من الاصل. |
|
||||||
|
| `CheckpointConfig(...)` | اعدادات مخصصة. |
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[
|
||||||
|
Agent(role="Researcher", ...), # يرث checkpoint من الطاقم
|
||||||
|
Agent(role="Writer", ..., checkpoint=False), # منسحب، بدون نقاط حفظ
|
||||||
|
],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## الاستئناف من نقطة حفظ
|
||||||
|
|
||||||
|
```python
|
||||||
|
# استعادة واستئناف
|
||||||
|
crew = Crew.from_checkpoint("./my_checkpoints/20260407T120000_abc123.json")
|
||||||
|
result = crew.kickoff() # يستأنف من اخر مهمة مكتملة
|
||||||
|
```
|
||||||
|
|
||||||
|
يتخطى الطاقم المستعاد المهام المكتملة ويستأنف من أول مهمة غير مكتملة.
|
||||||
|
|
||||||
|
## يعمل على Crew و Flow و Agent
|
||||||
|
|
||||||
|
### Crew
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[researcher, writer],
|
||||||
|
tasks=[research_task, write_task, review_task],
|
||||||
|
checkpoint=CheckpointConfig(directory="./crew_cp"),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
المشغل الافتراضي: `task_completed` (نقطة حفظ واحدة لكل مهمة مكتملة).
|
||||||
|
|
||||||
|
### Flow
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.flow.flow import Flow, start, listen
|
||||||
|
from crewai import CheckpointConfig
|
||||||
|
|
||||||
|
class MyFlow(Flow):
|
||||||
|
@start()
|
||||||
|
def step_one(self):
|
||||||
|
return "data"
|
||||||
|
|
||||||
|
@listen(step_one)
|
||||||
|
def step_two(self, data):
|
||||||
|
return process(data)
|
||||||
|
|
||||||
|
flow = MyFlow(
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./flow_cp",
|
||||||
|
on_events=["method_execution_finished"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = flow.kickoff()
|
||||||
|
|
||||||
|
# استئناف
|
||||||
|
flow = MyFlow.from_checkpoint("./flow_cp/20260407T120000_abc123.json")
|
||||||
|
result = flow.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Agent
|
||||||
|
|
||||||
|
```python
|
||||||
|
agent = Agent(
|
||||||
|
role="Researcher",
|
||||||
|
goal="Research topics",
|
||||||
|
backstory="Expert researcher",
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./agent_cp",
|
||||||
|
on_events=["lite_agent_execution_completed"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = agent.kickoff(messages=[{"role": "user", "content": "Research AI trends"}])
|
||||||
|
```
|
||||||
|
|
||||||
|
## أنواع الأحداث
|
||||||
|
|
||||||
|
يقبل حقل `on_events` أي مجموعة من سلاسل أنواع الأحداث. الخيارات الشائعة:
|
||||||
|
|
||||||
|
| حالة الاستخدام | الاحداث |
|
||||||
|
|:---------------|:--------|
|
||||||
|
| بعد كل مهمة (Crew) | `["task_completed"]` |
|
||||||
|
| بعد كل طريقة في التدفق | `["method_execution_finished"]` |
|
||||||
|
| بعد تنفيذ الوكيل | `["agent_execution_completed"]`, `["lite_agent_execution_completed"]` |
|
||||||
|
| عند اكتمال الطاقم فقط | `["crew_kickoff_completed"]` |
|
||||||
|
| بعد كل استدعاء LLM | `["llm_call_completed"]` |
|
||||||
|
| على كل شيء | `["*"]` |
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
استخدام `["*"]` أو أحداث عالية التردد مثل `llm_call_completed` سيكتب العديد من ملفات نقاط الحفظ وقد يؤثر على الأداء. استخدم `max_checkpoints` للحد من استخدام المساحة.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## نقاط الحفظ اليدوية
|
||||||
|
|
||||||
|
للتحكم الكامل، سجّل معالج الأحداث الخاص بك واستدعِ `state.checkpoint()` مباشرة:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.events.event_bus import crewai_event_bus
|
||||||
|
from crewai.events.types.llm_events import LLMCallCompletedEvent
|
||||||
|
|
||||||
|
# معالج متزامن
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
def on_llm_done(source, event, state):
|
||||||
|
path = state.checkpoint("./my_checkpoints")
|
||||||
|
print(f"تم حفظ نقطة الحفظ: {path}")
|
||||||
|
|
||||||
|
# معالج غير متزامن
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
async def on_llm_done_async(source, event, state):
|
||||||
|
path = await state.acheckpoint("./my_checkpoints")
|
||||||
|
print(f"تم حفظ نقطة الحفظ: {path}")
|
||||||
|
```
|
||||||
|
|
||||||
|
وسيط `state` هو `RuntimeState` الذي يتم تمريره تلقائيًا بواسطة ناقل الأحداث عندما يقبل المعالج 3 معاملات. يمكنك تسجيل معالجات على أي نوع حدث مدرج في وثائق [Event Listeners](/ar/concepts/event-listener).
|
||||||
|
|
||||||
|
الـ Checkpointing يعمل بأفضل جهد: إذا فشلت كتابة نقطة حفظ، يتم تسجيل الخطأ ولكن التنفيذ يستمر دون انقطاع.
|
||||||
1582
docs/docs.json
1582
docs/docs.json
File diff suppressed because it is too large
Load Diff
187
docs/en/concepts/checkpointing.mdx
Normal file
187
docs/en/concepts/checkpointing.mdx
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
---
|
||||||
|
title: Checkpointing
|
||||||
|
description: Automatically save execution state so crews, flows, and agents can resume after failures.
|
||||||
|
icon: floppy-disk
|
||||||
|
mode: "wide"
|
||||||
|
---
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
Checkpointing is in early release. APIs may change in future versions.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Checkpointing automatically saves execution state during a run. If a crew, flow, or agent fails mid-execution, you can restore from the last checkpoint and resume without re-running completed work.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True, # uses defaults: ./.checkpoints, on task_completed
|
||||||
|
)
|
||||||
|
result = crew.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
Checkpoint files are written to `./.checkpoints/` after each completed task.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Use `CheckpointConfig` for full control:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./my_checkpoints",
|
||||||
|
on_events=["task_completed", "crew_kickoff_completed"],
|
||||||
|
max_checkpoints=5,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### CheckpointConfig Fields
|
||||||
|
|
||||||
|
| Field | Type | Default | Description |
|
||||||
|
|:------|:-----|:--------|:------------|
|
||||||
|
| `directory` | `str` | `"./.checkpoints"` | Filesystem path for checkpoint files |
|
||||||
|
| `on_events` | `list[str]` | `["task_completed"]` | Event types that trigger a checkpoint |
|
||||||
|
| `provider` | `BaseProvider` | `JsonProvider()` | Storage backend |
|
||||||
|
| `max_checkpoints` | `int \| None` | `None` | Max files to keep; oldest pruned first |
|
||||||
|
|
||||||
|
### Inheritance and Opt-Out
|
||||||
|
|
||||||
|
The `checkpoint` field on Crew, Flow, and Agent accepts `CheckpointConfig`, `True`, `False`, or `None`:
|
||||||
|
|
||||||
|
| Value | Behavior |
|
||||||
|
|:------|:---------|
|
||||||
|
| `None` (default) | Inherit from parent. An agent inherits its crew's config. |
|
||||||
|
| `True` | Enable with defaults. |
|
||||||
|
| `False` | Explicit opt-out. Stops inheritance from parent. |
|
||||||
|
| `CheckpointConfig(...)` | Custom configuration. |
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[
|
||||||
|
Agent(role="Researcher", ...), # inherits crew's checkpoint
|
||||||
|
Agent(role="Writer", ..., checkpoint=False), # opted out, no checkpoints
|
||||||
|
],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Resuming from a Checkpoint
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Restore and resume
|
||||||
|
crew = Crew.from_checkpoint("./my_checkpoints/20260407T120000_abc123.json")
|
||||||
|
result = crew.kickoff() # picks up from last completed task
|
||||||
|
```
|
||||||
|
|
||||||
|
The restored crew skips already-completed tasks and resumes from the first incomplete one.
|
||||||
|
|
||||||
|
## Works on Crew, Flow, and Agent
|
||||||
|
|
||||||
|
### Crew
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[researcher, writer],
|
||||||
|
tasks=[research_task, write_task, review_task],
|
||||||
|
checkpoint=CheckpointConfig(directory="./crew_cp"),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Default trigger: `task_completed` (one checkpoint per finished task).
|
||||||
|
|
||||||
|
### Flow
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.flow.flow import Flow, start, listen
|
||||||
|
from crewai import CheckpointConfig
|
||||||
|
|
||||||
|
class MyFlow(Flow):
|
||||||
|
@start()
|
||||||
|
def step_one(self):
|
||||||
|
return "data"
|
||||||
|
|
||||||
|
@listen(step_one)
|
||||||
|
def step_two(self, data):
|
||||||
|
return process(data)
|
||||||
|
|
||||||
|
flow = MyFlow(
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./flow_cp",
|
||||||
|
on_events=["method_execution_finished"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = flow.kickoff()
|
||||||
|
|
||||||
|
# Resume
|
||||||
|
flow = MyFlow.from_checkpoint("./flow_cp/20260407T120000_abc123.json")
|
||||||
|
result = flow.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Agent
|
||||||
|
|
||||||
|
```python
|
||||||
|
agent = Agent(
|
||||||
|
role="Researcher",
|
||||||
|
goal="Research topics",
|
||||||
|
backstory="Expert researcher",
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./agent_cp",
|
||||||
|
on_events=["lite_agent_execution_completed"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = agent.kickoff(messages=[{"role": "user", "content": "Research AI trends"}])
|
||||||
|
```
|
||||||
|
|
||||||
|
## Event Types
|
||||||
|
|
||||||
|
The `on_events` field accepts any combination of event type strings. Common choices:
|
||||||
|
|
||||||
|
| Use Case | Events |
|
||||||
|
|:---------|:-------|
|
||||||
|
| After each task (Crew) | `["task_completed"]` |
|
||||||
|
| After each flow method | `["method_execution_finished"]` |
|
||||||
|
| After agent execution | `["agent_execution_completed"]`, `["lite_agent_execution_completed"]` |
|
||||||
|
| On crew completion only | `["crew_kickoff_completed"]` |
|
||||||
|
| After every LLM call | `["llm_call_completed"]` |
|
||||||
|
| On everything | `["*"]` |
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
Using `["*"]` or high-frequency events like `llm_call_completed` will write many checkpoint files and may impact performance. Use `max_checkpoints` to limit disk usage.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## Manual Checkpointing
|
||||||
|
|
||||||
|
For full control, register your own event handler and call `state.checkpoint()` directly:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.events.event_bus import crewai_event_bus
|
||||||
|
from crewai.events.types.llm_events import LLMCallCompletedEvent
|
||||||
|
|
||||||
|
# Sync handler
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
def on_llm_done(source, event, state):
|
||||||
|
path = state.checkpoint("./my_checkpoints")
|
||||||
|
print(f"Saved checkpoint: {path}")
|
||||||
|
|
||||||
|
# Async handler
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
async def on_llm_done_async(source, event, state):
|
||||||
|
path = await state.acheckpoint("./my_checkpoints")
|
||||||
|
print(f"Saved checkpoint: {path}")
|
||||||
|
```
|
||||||
|
|
||||||
|
The `state` argument is the `RuntimeState` passed automatically by the event bus when your handler accepts 3 parameters. You can register handlers on any event type listed in the [Event Listeners](/en/concepts/event-listener) documentation.
|
||||||
|
|
||||||
|
Checkpointing is best-effort: if a checkpoint write fails, the error is logged but execution continues uninterrupted.
|
||||||
187
docs/ko/concepts/checkpointing.mdx
Normal file
187
docs/ko/concepts/checkpointing.mdx
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
---
|
||||||
|
title: Checkpointing
|
||||||
|
description: 실행 상태를 자동으로 저장하여 크루, 플로우, 에이전트가 실패 후 재개할 수 있습니다.
|
||||||
|
icon: floppy-disk
|
||||||
|
mode: "wide"
|
||||||
|
---
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
체크포인팅은 초기 릴리스 단계입니다. API는 향후 버전에서 변경될 수 있습니다.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## 개요
|
||||||
|
|
||||||
|
체크포인팅은 실행 중 자동으로 실행 상태를 저장합니다. 크루, 플로우 또는 에이전트가 실행 도중 실패하면 마지막 체크포인트에서 복원하여 이미 완료된 작업을 다시 실행하지 않고 재개할 수 있습니다.
|
||||||
|
|
||||||
|
## 빠른 시작
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True, # 기본값 사용: ./.checkpoints, task_completed 이벤트
|
||||||
|
)
|
||||||
|
result = crew.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
각 태스크가 완료된 후 `./.checkpoints/`에 체크포인트 파일이 기록됩니다.
|
||||||
|
|
||||||
|
## 설정
|
||||||
|
|
||||||
|
`CheckpointConfig`를 사용하여 세부 설정을 제어합니다:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./my_checkpoints",
|
||||||
|
on_events=["task_completed", "crew_kickoff_completed"],
|
||||||
|
max_checkpoints=5,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### CheckpointConfig 필드
|
||||||
|
|
||||||
|
| 필드 | 타입 | 기본값 | 설명 |
|
||||||
|
|:-----|:-----|:-------|:-----|
|
||||||
|
| `directory` | `str` | `"./.checkpoints"` | 체크포인트 파일 경로 |
|
||||||
|
| `on_events` | `list[str]` | `["task_completed"]` | 체크포인트를 트리거하는 이벤트 타입 |
|
||||||
|
| `provider` | `BaseProvider` | `JsonProvider()` | 스토리지 백엔드 |
|
||||||
|
| `max_checkpoints` | `int \| None` | `None` | 보관할 최대 파일 수; 오래된 것부터 삭제 |
|
||||||
|
|
||||||
|
### 상속 및 옵트아웃
|
||||||
|
|
||||||
|
Crew, Flow, Agent의 `checkpoint` 필드는 `CheckpointConfig`, `True`, `False`, `None`을 받습니다:
|
||||||
|
|
||||||
|
| 값 | 동작 |
|
||||||
|
|:---|:-----|
|
||||||
|
| `None` (기본값) | 부모에서 상속. 에이전트는 크루의 설정을 상속합니다. |
|
||||||
|
| `True` | 기본값으로 활성화. |
|
||||||
|
| `False` | 명시적 옵트아웃. 부모 상속을 중단합니다. |
|
||||||
|
| `CheckpointConfig(...)` | 사용자 정의 설정. |
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[
|
||||||
|
Agent(role="Researcher", ...), # 크루의 checkpoint 상속
|
||||||
|
Agent(role="Writer", ..., checkpoint=False), # 옵트아웃, 체크포인트 없음
|
||||||
|
],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 체크포인트에서 재개
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 복원 및 재개
|
||||||
|
crew = Crew.from_checkpoint("./my_checkpoints/20260407T120000_abc123.json")
|
||||||
|
result = crew.kickoff() # 마지막으로 완료된 태스크부터 재개
|
||||||
|
```
|
||||||
|
|
||||||
|
복원된 크루는 이미 완료된 태스크를 건너뛰고 첫 번째 미완료 태스크부터 재개합니다.
|
||||||
|
|
||||||
|
## Crew, Flow, Agent에서 사용 가능
|
||||||
|
|
||||||
|
### Crew
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[researcher, writer],
|
||||||
|
tasks=[research_task, write_task, review_task],
|
||||||
|
checkpoint=CheckpointConfig(directory="./crew_cp"),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
기본 트리거: `task_completed` (완료된 태스크당 하나의 체크포인트).
|
||||||
|
|
||||||
|
### Flow
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.flow.flow import Flow, start, listen
|
||||||
|
from crewai import CheckpointConfig
|
||||||
|
|
||||||
|
class MyFlow(Flow):
|
||||||
|
@start()
|
||||||
|
def step_one(self):
|
||||||
|
return "data"
|
||||||
|
|
||||||
|
@listen(step_one)
|
||||||
|
def step_two(self, data):
|
||||||
|
return process(data)
|
||||||
|
|
||||||
|
flow = MyFlow(
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./flow_cp",
|
||||||
|
on_events=["method_execution_finished"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = flow.kickoff()
|
||||||
|
|
||||||
|
# 재개
|
||||||
|
flow = MyFlow.from_checkpoint("./flow_cp/20260407T120000_abc123.json")
|
||||||
|
result = flow.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Agent
|
||||||
|
|
||||||
|
```python
|
||||||
|
agent = Agent(
|
||||||
|
role="Researcher",
|
||||||
|
goal="Research topics",
|
||||||
|
backstory="Expert researcher",
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./agent_cp",
|
||||||
|
on_events=["lite_agent_execution_completed"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = agent.kickoff(messages=[{"role": "user", "content": "Research AI trends"}])
|
||||||
|
```
|
||||||
|
|
||||||
|
## 이벤트 타입
|
||||||
|
|
||||||
|
`on_events` 필드는 이벤트 타입 문자열의 조합을 받습니다. 일반적인 선택:
|
||||||
|
|
||||||
|
| 사용 사례 | 이벤트 |
|
||||||
|
|:----------|:-------|
|
||||||
|
| 각 태스크 완료 후 (Crew) | `["task_completed"]` |
|
||||||
|
| 각 플로우 메서드 완료 후 | `["method_execution_finished"]` |
|
||||||
|
| 에이전트 실행 완료 후 | `["agent_execution_completed"]`, `["lite_agent_execution_completed"]` |
|
||||||
|
| 크루 완료 시에만 | `["crew_kickoff_completed"]` |
|
||||||
|
| 모든 LLM 호출 후 | `["llm_call_completed"]` |
|
||||||
|
| 모든 이벤트 | `["*"]` |
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
`["*"]` 또는 `llm_call_completed`와 같은 고빈도 이벤트를 사용하면 많은 체크포인트 파일이 생성되어 성능에 영향을 줄 수 있습니다. `max_checkpoints`를 사용하여 디스크 사용량을 제한하세요.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## 수동 체크포인팅
|
||||||
|
|
||||||
|
완전한 제어를 위해 자체 이벤트 핸들러를 등록하고 `state.checkpoint()`를 직접 호출할 수 있습니다:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.events.event_bus import crewai_event_bus
|
||||||
|
from crewai.events.types.llm_events import LLMCallCompletedEvent
|
||||||
|
|
||||||
|
# 동기 핸들러
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
def on_llm_done(source, event, state):
|
||||||
|
path = state.checkpoint("./my_checkpoints")
|
||||||
|
print(f"체크포인트 저장: {path}")
|
||||||
|
|
||||||
|
# 비동기 핸들러
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
async def on_llm_done_async(source, event, state):
|
||||||
|
path = await state.acheckpoint("./my_checkpoints")
|
||||||
|
print(f"체크포인트 저장: {path}")
|
||||||
|
```
|
||||||
|
|
||||||
|
`state` 인수는 핸들러가 3개의 매개변수를 받을 때 이벤트 버스가 자동으로 전달하는 `RuntimeState`입니다. [Event Listeners](/ko/concepts/event-listener) 문서에 나열된 모든 이벤트 타입에 핸들러를 등록할 수 있습니다.
|
||||||
|
|
||||||
|
체크포인팅은 best-effort입니다: 체크포인트 기록이 실패하면 오류가 로그에 기록되지만 실행은 중단 없이 계속됩니다.
|
||||||
187
docs/pt-BR/concepts/checkpointing.mdx
Normal file
187
docs/pt-BR/concepts/checkpointing.mdx
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
---
|
||||||
|
title: Checkpointing
|
||||||
|
description: Salve automaticamente o estado de execução para que crews, flows e agentes possam retomar após falhas.
|
||||||
|
icon: floppy-disk
|
||||||
|
mode: "wide"
|
||||||
|
---
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
O checkpointing está em versão inicial. As APIs podem mudar em versões futuras.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## Visão Geral
|
||||||
|
|
||||||
|
O checkpointing salva automaticamente o estado de execução durante uma execução. Se uma crew, flow ou agente falhar no meio da execução, você pode restaurar a partir do último checkpoint e retomar sem reexecutar o trabalho já concluído.
|
||||||
|
|
||||||
|
## Início Rápido
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True, # usa padroes: ./.checkpoints, em task_completed
|
||||||
|
)
|
||||||
|
result = crew.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
Os arquivos de checkpoint são gravados em `./.checkpoints/` após cada tarefa concluída.
|
||||||
|
|
||||||
|
## Configuração
|
||||||
|
|
||||||
|
Use `CheckpointConfig` para controle total:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai import Crew, CheckpointConfig
|
||||||
|
|
||||||
|
crew = Crew(
|
||||||
|
agents=[...],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./my_checkpoints",
|
||||||
|
on_events=["task_completed", "crew_kickoff_completed"],
|
||||||
|
max_checkpoints=5,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Campos do CheckpointConfig
|
||||||
|
|
||||||
|
| Campo | Tipo | Padrão | Descrição |
|
||||||
|
|:------|:-----|:-------|:----------|
|
||||||
|
| `directory` | `str` | `"./.checkpoints"` | Caminho para os arquivos de checkpoint |
|
||||||
|
| `on_events` | `list[str]` | `["task_completed"]` | Tipos de evento que acionam um checkpoint |
|
||||||
|
| `provider` | `BaseProvider` | `JsonProvider()` | Backend de armazenamento |
|
||||||
|
| `max_checkpoints` | `int \| None` | `None` | Máximo de arquivos a manter; os mais antigos são removidos primeiro |
|
||||||
|
|
||||||
|
### Herança e Desativação
|
||||||
|
|
||||||
|
O campo `checkpoint` em Crew, Flow e Agent aceita `CheckpointConfig`, `True`, `False` ou `None`:
|
||||||
|
|
||||||
|
| Valor | Comportamento |
|
||||||
|
|:------|:--------------|
|
||||||
|
| `None` (padrão) | Herda do pai. Um agente herda a configuração da crew. |
|
||||||
|
| `True` | Ativa com padrões. |
|
||||||
|
| `False` | Desativação explícita. Interrompe a herança do pai. |
|
||||||
|
| `CheckpointConfig(...)` | Configuração personalizada. |
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[
|
||||||
|
Agent(role="Researcher", ...), # herda checkpoint da crew
|
||||||
|
Agent(role="Writer", ..., checkpoint=False), # desativado, sem checkpoints
|
||||||
|
],
|
||||||
|
tasks=[...],
|
||||||
|
checkpoint=True,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Retomando a partir de um Checkpoint
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Restaurar e retomar
|
||||||
|
crew = Crew.from_checkpoint("./my_checkpoints/20260407T120000_abc123.json")
|
||||||
|
result = crew.kickoff() # retoma a partir da ultima tarefa concluida
|
||||||
|
```
|
||||||
|
|
||||||
|
A crew restaurada pula tarefas já concluídas e retoma a partir da primeira incompleta.
|
||||||
|
|
||||||
|
## Funciona em Crew, Flow e Agent
|
||||||
|
|
||||||
|
### Crew
|
||||||
|
|
||||||
|
```python
|
||||||
|
crew = Crew(
|
||||||
|
agents=[researcher, writer],
|
||||||
|
tasks=[research_task, write_task, review_task],
|
||||||
|
checkpoint=CheckpointConfig(directory="./crew_cp"),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Gatilho padrão: `task_completed` (um checkpoint por tarefa finalizada).
|
||||||
|
|
||||||
|
### Flow
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.flow.flow import Flow, start, listen
|
||||||
|
from crewai import CheckpointConfig
|
||||||
|
|
||||||
|
class MyFlow(Flow):
|
||||||
|
@start()
|
||||||
|
def step_one(self):
|
||||||
|
return "data"
|
||||||
|
|
||||||
|
@listen(step_one)
|
||||||
|
def step_two(self, data):
|
||||||
|
return process(data)
|
||||||
|
|
||||||
|
flow = MyFlow(
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./flow_cp",
|
||||||
|
on_events=["method_execution_finished"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = flow.kickoff()
|
||||||
|
|
||||||
|
# Retomar
|
||||||
|
flow = MyFlow.from_checkpoint("./flow_cp/20260407T120000_abc123.json")
|
||||||
|
result = flow.kickoff()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Agent
|
||||||
|
|
||||||
|
```python
|
||||||
|
agent = Agent(
|
||||||
|
role="Researcher",
|
||||||
|
goal="Research topics",
|
||||||
|
backstory="Expert researcher",
|
||||||
|
checkpoint=CheckpointConfig(
|
||||||
|
directory="./agent_cp",
|
||||||
|
on_events=["lite_agent_execution_completed"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = agent.kickoff(messages=[{"role": "user", "content": "Research AI trends"}])
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tipos de Evento
|
||||||
|
|
||||||
|
O campo `on_events` aceita qualquer combinação de strings de tipo de evento. Escolhas comuns:
|
||||||
|
|
||||||
|
| Caso de Uso | Eventos |
|
||||||
|
|:------------|:--------|
|
||||||
|
| Após cada tarefa (Crew) | `["task_completed"]` |
|
||||||
|
| Após cada método do flow | `["method_execution_finished"]` |
|
||||||
|
| Após execução do agente | `["agent_execution_completed"]`, `["lite_agent_execution_completed"]` |
|
||||||
|
| Apenas na conclusão da crew | `["crew_kickoff_completed"]` |
|
||||||
|
| Após cada chamada LLM | `["llm_call_completed"]` |
|
||||||
|
| Em tudo | `["*"]` |
|
||||||
|
|
||||||
|
<Warning>
|
||||||
|
Usar `["*"]` ou eventos de alta frequência como `llm_call_completed` gravará muitos arquivos de checkpoint e pode impactar o desempenho. Use `max_checkpoints` para limitar o uso de disco.
|
||||||
|
</Warning>
|
||||||
|
|
||||||
|
## Checkpointing Manual
|
||||||
|
|
||||||
|
Para controle total, registre seu próprio handler de evento e chame `state.checkpoint()` diretamente:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai.events.event_bus import crewai_event_bus
|
||||||
|
from crewai.events.types.llm_events import LLMCallCompletedEvent
|
||||||
|
|
||||||
|
# Handler sincrono
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
def on_llm_done(source, event, state):
|
||||||
|
path = state.checkpoint("./my_checkpoints")
|
||||||
|
print(f"Checkpoint salvo: {path}")
|
||||||
|
|
||||||
|
# Handler assincrono
|
||||||
|
@crewai_event_bus.on(LLMCallCompletedEvent)
|
||||||
|
async def on_llm_done_async(source, event, state):
|
||||||
|
path = await state.acheckpoint("./my_checkpoints")
|
||||||
|
print(f"Checkpoint salvo: {path}")
|
||||||
|
```
|
||||||
|
|
||||||
|
O argumento `state` é o `RuntimeState` passado automaticamente pelo barramento de eventos quando seu handler aceita 3 parâmetros. Você pode registrar handlers em qualquer tipo de evento listado na documentação de [Event Listeners](/pt-BR/concepts/event-listener).
|
||||||
|
|
||||||
|
O checkpointing é best-effort: se uma gravação de checkpoint falhar, o erro é registrado no log, mas a execução continua sem interrupção.
|
||||||
@@ -16,6 +16,7 @@ from crewai.knowledge.knowledge import Knowledge
|
|||||||
from crewai.llm import LLM
|
from crewai.llm import LLM
|
||||||
from crewai.llms.base_llm import BaseLLM
|
from crewai.llms.base_llm import BaseLLM
|
||||||
from crewai.process import Process
|
from crewai.process import Process
|
||||||
|
from crewai.state.checkpoint_config import CheckpointConfig # noqa: F401
|
||||||
from crewai.task import Task
|
from crewai.task import Task
|
||||||
from crewai.tasks.llm_guardrail import LLMGuardrail
|
from crewai.tasks.llm_guardrail import LLMGuardrail
|
||||||
from crewai.tasks.task_output import TaskOutput
|
from crewai.tasks.task_output import TaskOutput
|
||||||
@@ -210,6 +211,7 @@ try:
|
|||||||
Agent.model_rebuild(force=True, _types_namespace=_full_namespace)
|
Agent.model_rebuild(force=True, _types_namespace=_full_namespace)
|
||||||
except PydanticUserError:
|
except PydanticUserError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
except (ImportError, PydanticUserError):
|
except (ImportError, PydanticUserError):
|
||||||
import logging as _logging
|
import logging as _logging
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ from crewai.memory.unified_memory import Memory
|
|||||||
from crewai.rag.embeddings.types import EmbedderConfig
|
from crewai.rag.embeddings.types import EmbedderConfig
|
||||||
from crewai.security.security_config import SecurityConfig
|
from crewai.security.security_config import SecurityConfig
|
||||||
from crewai.skills.models import Skill
|
from crewai.skills.models import Skill
|
||||||
|
from crewai.state.checkpoint_config import CheckpointConfig
|
||||||
from crewai.tools.base_tool import BaseTool, Tool
|
from crewai.tools.base_tool import BaseTool, Tool
|
||||||
from crewai.types.callback import SerializableCallable
|
from crewai.types.callback import SerializableCallable
|
||||||
from crewai.utilities.config import process_config
|
from crewai.utilities.config import process_config
|
||||||
@@ -299,6 +300,11 @@ class BaseAgent(BaseModel, ABC, metaclass=AgentMeta):
|
|||||||
default_factory=SecurityConfig,
|
default_factory=SecurityConfig,
|
||||||
description="Security configuration for the agent, including fingerprinting.",
|
description="Security configuration for the agent, including fingerprinting.",
|
||||||
)
|
)
|
||||||
|
checkpoint: CheckpointConfig | bool | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Automatic checkpointing configuration. "
|
||||||
|
"True for defaults, False to opt out, None to inherit.",
|
||||||
|
)
|
||||||
callbacks: list[SerializableCallable] = Field(
|
callbacks: list[SerializableCallable] = Field(
|
||||||
default_factory=list, description="Callbacks to be used for the agent"
|
default_factory=list, description="Callbacks to be used for the agent"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -104,6 +104,7 @@ from crewai.rag.types import SearchResult
|
|||||||
from crewai.security.fingerprint import Fingerprint
|
from crewai.security.fingerprint import Fingerprint
|
||||||
from crewai.security.security_config import SecurityConfig
|
from crewai.security.security_config import SecurityConfig
|
||||||
from crewai.skills.models import Skill
|
from crewai.skills.models import Skill
|
||||||
|
from crewai.state.checkpoint_config import CheckpointConfig
|
||||||
from crewai.task import Task
|
from crewai.task import Task
|
||||||
from crewai.tasks.conditional_task import ConditionalTask
|
from crewai.tasks.conditional_task import ConditionalTask
|
||||||
from crewai.tasks.task_output import TaskOutput
|
from crewai.tasks.task_output import TaskOutput
|
||||||
@@ -340,6 +341,11 @@ class Crew(FlowTrackable, BaseModel):
|
|||||||
default_factory=SecurityConfig,
|
default_factory=SecurityConfig,
|
||||||
description="Security configuration for the crew, including fingerprinting.",
|
description="Security configuration for the crew, including fingerprinting.",
|
||||||
)
|
)
|
||||||
|
checkpoint: CheckpointConfig | bool | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Automatic checkpointing configuration. "
|
||||||
|
"True for defaults, False to opt out, None to inherit.",
|
||||||
|
)
|
||||||
token_usage: UsageMetrics | None = Field(
|
token_usage: UsageMetrics | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="Metrics for the LLM usage during all tasks execution.",
|
description="Metrics for the LLM usage during all tasks execution.",
|
||||||
|
|||||||
@@ -113,6 +113,7 @@ from crewai.flow.utils import (
|
|||||||
)
|
)
|
||||||
from crewai.memory.memory_scope import MemoryScope, MemorySlice
|
from crewai.memory.memory_scope import MemoryScope, MemorySlice
|
||||||
from crewai.memory.unified_memory import Memory
|
from crewai.memory.unified_memory import Memory
|
||||||
|
from crewai.state.checkpoint_config import CheckpointConfig
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -920,6 +921,7 @@ class Flow(BaseModel, Generic[T], metaclass=FlowMeta):
|
|||||||
max_method_calls: int = Field(default=100)
|
max_method_calls: int = Field(default=100)
|
||||||
|
|
||||||
execution_context: ExecutionContext | None = Field(default=None)
|
execution_context: ExecutionContext | None = Field(default=None)
|
||||||
|
checkpoint: CheckpointConfig | bool | None = Field(default=None)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_checkpoint(
|
def from_checkpoint(
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
from crewai.state.checkpoint_config import CheckpointConfig, CheckpointEventType
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["CheckpointConfig", "CheckpointEventType"]
|
||||||
|
|||||||
193
lib/crewai/src/crewai/state/checkpoint_config.py
Normal file
193
lib/crewai/src/crewai/state/checkpoint_config.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
"""Checkpoint configuration for automatic state persistence."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from crewai.state.provider.core import BaseProvider
|
||||||
|
from crewai.state.provider.json_provider import JsonProvider
|
||||||
|
|
||||||
|
|
||||||
|
# Closed set of event-type names accepted in ``CheckpointConfig.on_events``.
# The checkpoint listener compares an event's ``type`` attribute against these
# strings to decide whether a checkpoint should be written.
# NOTE(review): presumably each entry mirrors the ``type`` default of an event
# class elsewhere in the package; keep this list in sync with those — confirm.
CheckpointEventType = Literal[
    # Task
    "task_started",
    "task_completed",
    "task_failed",
    "task_evaluation",
    # Crew
    "crew_kickoff_started",
    "crew_kickoff_completed",
    "crew_kickoff_failed",
    "crew_train_started",
    "crew_train_completed",
    "crew_train_failed",
    "crew_test_started",
    "crew_test_completed",
    "crew_test_failed",
    "crew_test_result",
    # Agent
    "agent_execution_started",
    "agent_execution_completed",
    "agent_execution_error",
    "lite_agent_execution_started",
    "lite_agent_execution_completed",
    "lite_agent_execution_error",
    "agent_evaluation_started",
    "agent_evaluation_completed",
    "agent_evaluation_failed",
    # Flow
    "flow_created",
    "flow_started",
    "flow_finished",
    "flow_paused",
    "method_execution_started",
    "method_execution_finished",
    "method_execution_failed",
    "method_execution_paused",
    "human_feedback_requested",
    "human_feedback_received",
    "flow_input_requested",
    "flow_input_received",
    # LLM
    "llm_call_started",
    "llm_call_completed",
    "llm_call_failed",
    "llm_stream_chunk",
    "llm_thinking_chunk",
    # LLM Guardrail
    "llm_guardrail_started",
    "llm_guardrail_completed",
    "llm_guardrail_failed",
    # Tool
    "tool_usage_started",
    "tool_usage_finished",
    "tool_usage_error",
    "tool_validate_input_error",
    "tool_selection_error",
    "tool_execution_error",
    # Memory
    "memory_save_started",
    "memory_save_completed",
    "memory_save_failed",
    "memory_query_started",
    "memory_query_completed",
    "memory_query_failed",
    "memory_retrieval_started",
    "memory_retrieval_completed",
    "memory_retrieval_failed",
    # Knowledge
    "knowledge_search_query_started",
    "knowledge_search_query_completed",
    "knowledge_query_started",
    "knowledge_query_completed",
    "knowledge_query_failed",
    "knowledge_search_query_failed",
    # Reasoning
    "agent_reasoning_started",
    "agent_reasoning_completed",
    "agent_reasoning_failed",
    # MCP
    "mcp_connection_started",
    "mcp_connection_completed",
    "mcp_connection_failed",
    "mcp_tool_execution_started",
    "mcp_tool_execution_completed",
    "mcp_tool_execution_failed",
    "mcp_config_fetch_failed",
    # Observation
    "step_observation_started",
    "step_observation_completed",
    "step_observation_failed",
    "plan_refinement",
    "plan_replan_triggered",
    "goal_achieved_early",
    # Skill
    "skill_discovery_started",
    "skill_discovery_completed",
    "skill_loaded",
    "skill_activated",
    "skill_load_failed",
    # Logging
    "agent_logs_started",
    "agent_logs_execution",
    # A2A
    "a2a_delegation_started",
    "a2a_delegation_completed",
    "a2a_conversation_started",
    "a2a_conversation_completed",
    "a2a_message_sent",
    "a2a_response_received",
    "a2a_polling_started",
    "a2a_polling_status",
    "a2a_push_notification_registered",
    "a2a_push_notification_received",
    "a2a_push_notification_sent",
    "a2a_push_notification_timeout",
    "a2a_streaming_started",
    "a2a_streaming_chunk",
    "a2a_agent_card_fetched",
    "a2a_authentication_failed",
    "a2a_artifact_received",
    "a2a_connection_error",
    "a2a_server_task_started",
    "a2a_server_task_completed",
    "a2a_server_task_canceled",
    "a2a_server_task_failed",
    "a2a_parallel_delegation_started",
    "a2a_parallel_delegation_completed",
    "a2a_transport_negotiated",
    "a2a_content_type_negotiated",
    "a2a_context_created",
    "a2a_context_expired",
    "a2a_context_idle",
    "a2a_context_completed",
    "a2a_context_pruned",
    # System
    "SIGTERM",
    "SIGINT",
    "SIGHUP",
    "SIGTSTP",
    "SIGCONT",
    # Env
    "cc_env",
    "codex_env",
    "cursor_env",
    "default_env",
]
|
||||||
|
|
||||||
|
|
||||||
|
class CheckpointConfig(BaseModel):
    """Configuration for automatic checkpointing.

    When set on a Crew, Flow, or Agent, checkpoints are written
    automatically whenever the specified event(s) fire.

    Attributes:
        directory: Filesystem path where checkpoint files are written.
        on_events: Event type names that trigger a write; ``"*"`` matches
            every event.
        provider: Storage backend used to persist the serialized state.
        max_checkpoints: Upper bound on retained checkpoint files, or
            ``None`` to keep all. Must not be negative.
    """

    directory: str = Field(
        default="./.checkpoints",
        description="Filesystem path where checkpoint JSON files are written.",
    )
    on_events: list[CheckpointEventType | Literal["*"]] = Field(
        default=["task_completed"],
        description="Event types that trigger a checkpoint write. "
        'Use ["*"] to checkpoint on every event.',
    )
    provider: BaseProvider = Field(
        default_factory=JsonProvider,
        description="Storage backend. Defaults to JsonProvider.",
    )
    # ``ge=0`` rejects negative limits at construction time: a negative value
    # would otherwise reach the pruning slice and delete an arbitrary number of
    # the oldest files instead of keeping the newest ones.
    max_checkpoints: int | None = Field(
        default=None,
        ge=0,
        description="Maximum checkpoint files to keep. Oldest are pruned first. "
        "None means keep all.",
    )

    @property
    def trigger_all(self) -> bool:
        """Return True when ``on_events`` contains the ``"*"`` wildcard."""
        return "*" in self.on_events

    @property
    def trigger_events(self) -> set[str]:
        """Return ``on_events`` as a set for membership tests."""
        return set(self.on_events)
|
||||||
176
lib/crewai/src/crewai/state/checkpoint_listener.py
Normal file
176
lib/crewai/src/crewai/state/checkpoint_listener.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
"""Event listener that writes checkpoints automatically.
|
||||||
|
|
||||||
|
Handlers are registered lazily — only when the first ``CheckpointConfig``
|
||||||
|
is resolved (i.e. an entity actually has checkpointing enabled). This
|
||||||
|
avoids per-event overhead when no entity uses checkpointing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from crewai.agents.agent_builder.base_agent import BaseAgent
|
||||||
|
from crewai.crew import Crew
|
||||||
|
from crewai.events.base_events import BaseEvent
|
||||||
|
from crewai.events.event_bus import CrewAIEventsBus, crewai_event_bus
|
||||||
|
from crewai.flow.flow import Flow
|
||||||
|
from crewai.state.checkpoint_config import CheckpointConfig
|
||||||
|
from crewai.state.runtime import RuntimeState, _prepare_entities
|
||||||
|
from crewai.task import Task
|
||||||
|
|
||||||
|
|
||||||
|
# Module logger; used below to report failed file removals (debug) and
# failed auto-checkpoints (warning).
logger = logging.getLogger(__name__)

# Flipped to True by _ensure_handlers_registered() once the checkpoint handler
# has been attached to the global event bus, so that happens only once.
_handlers_registered = False
# Held by _ensure_handlers_registered() while it checks and sets the flag above.
_register_lock = threading.Lock()

# Unique marker returned by _resolve() for an explicit ``False`` (opt-out), so
# callers can tell it apart from ``None`` ("not configured, inherit").
_SENTINEL = object()
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_handlers_registered() -> None:
    """Attach the checkpoint handler to the global event bus at most once.

    The first call performs the registration and sets the module flag;
    every later call returns without doing anything.
    """
    global _handlers_registered
    if not _handlers_registered:
        with _register_lock:
            # Check again while holding the lock: another thread may have
            # completed the registration while this one was waiting.
            if not _handlers_registered:
                _register_all_handlers(crewai_event_bus)
                _handlers_registered = True
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve(value: CheckpointConfig | bool | None) -> CheckpointConfig | None | object:
    """Normalize the raw value of a ``checkpoint`` field.

    Args:
        value: A ``CheckpointConfig``, ``True``, ``False`` or ``None``.

    Returns:
        The given ``CheckpointConfig`` unchanged, a default-constructed
        ``CheckpointConfig`` for ``True``, ``_SENTINEL`` for ``False``
        (explicit opt-out, stop walking parents), or ``None`` for anything
        else (not configured, keep walking parents).
    """
    if value is False:
        return _SENTINEL

    wants_defaults = value is True
    if wants_defaults or isinstance(value, CheckpointConfig):
        # Checkpointing is enabled for some entity, so make sure the event
        # handlers exist before handing back a config.
        _ensure_handlers_registered()
        return CheckpointConfig() if wants_defaults else value

    return None  # None = inherit
|
||||||
|
|
||||||
|
|
||||||
|
def _find_checkpoint(source: Any) -> CheckpointConfig | None:
    """Locate the effective ``CheckpointConfig`` for an event source.

    Flow and Crew sources use only their own ``checkpoint`` field. An agent
    uses its own field first and falls back to its crew's; a task defers to
    its agent and follows the same fallback.

    ``None`` on a field means "not configured, inherit from parent";
    ``False`` means "opt out" and ends the lookup.

    Args:
        source: The object that emitted the event.

    Returns:
        The resolved config, or ``None`` when checkpointing is not enabled
        for *source* or *source* is of an unrecognized type.
    """
    if isinstance(source, (Flow, Crew)):
        own = _resolve(source.checkpoint)
        return own if isinstance(own, CheckpointConfig) else None

    # Reduce both the agent and the task case to "which agent do we start at".
    if isinstance(source, BaseAgent):
        agent = source
    elif isinstance(source, Task):
        agent = source.agent
        if not isinstance(agent, BaseAgent):
            return None
    else:
        return None

    agent_level = _resolve(agent.checkpoint)
    if isinstance(agent_level, CheckpointConfig):
        return agent_level
    if agent_level is _SENTINEL:
        # Explicit opt-out on the agent: do not consult the crew.
        return None

    parent = agent.crew
    if not isinstance(parent, Crew):
        return None
    crew_level = _resolve(parent.checkpoint)
    return crew_level if isinstance(crew_level, CheckpointConfig) else None
|
||||||
|
|
||||||
|
|
||||||
|
def _do_checkpoint(state: RuntimeState, cfg: CheckpointConfig) -> None:
    """Persist *state* through the configured provider, then prune if limited.

    Args:
        state: Runtime state to serialize; its ``root`` is passed through
            ``_prepare_entities`` before the JSON dump.
        cfg: Supplies the provider, the target directory and the optional
            retention limit.
    """
    _prepare_entities(state.root)
    payload = state.model_dump_json()
    cfg.provider.checkpoint(payload, cfg.directory)

    keep = cfg.max_checkpoints
    if keep is None:
        # No retention limit configured: keep every checkpoint.
        return
    _prune(cfg.directory, keep)
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_remove(path: str) -> None:
    """Delete *path*; on ``OSError`` log at debug level instead of raising."""
    try:
        os.remove(path)
    except OSError:
        logger.debug("Failed to remove checkpoint file %s", path, exc_info=True)


def _prune(directory: str, max_keep: int) -> None:
    """Remove oldest checkpoint files beyond *max_keep*.

    Args:
        directory: Directory whose ``*.json`` files are treated as checkpoints.
        max_keep: Number of most recently modified files to keep; ``0``
            removes all of them.

    Raises:
        ValueError: If *max_keep* is negative. No file is touched in that case.
    """
    if max_keep < 0:
        # A negative value would turn the "keep the newest N" slice into
        # "delete the oldest N", so refuse it before touching any file.
        raise ValueError(f"max_keep must be >= 0, got {max_keep}")

    # Escape the directory so glob metacharacters in its name (``[``, ``*``,
    # ``?``) are matched literally instead of being read as a pattern.
    pattern = os.path.join(glob.escape(directory), "*.json")

    dated: list[tuple[float, str]] = []
    for path in glob.glob(pattern):
        try:
            dated.append((os.path.getmtime(path), path))
        except OSError:
            # The file disappeared between listing and stat (for example it was
            # removed by another prune); there is nothing left to delete.
            continue

    # Oldest first; equal mtimes are ordered by path so the result is
    # deterministic regardless of directory listing order.
    dated.sort()
    excess = len(dated) - max_keep
    for _, path in dated[: max(excess, 0)]:
        _safe_remove(path)
|
||||||
|
|
||||||
|
|
||||||
|
def _should_checkpoint(source: Any, event: BaseEvent) -> CheckpointConfig | None:
    """Decide whether *event* from *source* warrants a checkpoint.

    Returns:
        The applicable config when one is found for *source* and it either
        triggers on every event or lists ``event.type``; otherwise ``None``.
    """
    cfg = _find_checkpoint(source)
    if cfg is not None and (cfg.trigger_all or event.type in cfg.trigger_events):
        return cfg
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _on_any_event(source: Any, event: BaseEvent, state: Any) -> None:
    """Event-bus handler: write a checkpoint when the event calls for one.

    Any exception raised while checkpointing is logged as a warning and
    swallowed, so it does not propagate to the caller.
    """
    cfg = _should_checkpoint(source, event)
    if cfg is not None:
        try:
            _do_checkpoint(state, cfg)
        except Exception:
            logger.warning(
                "Auto-checkpoint failed for event %s",
                event.type,
                exc_info=True,
            )
|
||||||
|
|
||||||
|
|
||||||
|
def _register_all_handlers(event_bus: CrewAIEventsBus) -> None:
    """Register ``_on_any_event`` for every concrete ``BaseEvent`` subclass.

    Walks the subclass tree of ``BaseEvent`` depth-first and registers the
    handler on each class whose ``type`` field has a truthy default other
    than ``"base_event"``. Each class is visited once.

    Only the sync handler is registered. The event bus runs sync handlers
    in a ``ThreadPoolExecutor``, so blocking I/O is safe and we avoid
    writing duplicate checkpoints from both sync and async dispatch.
    """
    visited: set[type] = set()
    # Explicit stack; children are pushed in reverse so classes are popped in
    # the same order a recursive walk over __subclasses__() would visit them.
    pending: list[type[BaseEvent]] = list(reversed(BaseEvent.__subclasses__()))

    while pending:
        event_cls = pending.pop()
        if event_cls in visited:
            continue
        visited.add(event_cls)

        type_field = event_cls.model_fields.get("type")
        default = type_field.default if type_field else None
        if default and default != "base_event":
            event_bus.register_handler(event_cls, _on_any_event)

        pending.extend(reversed(event_cls.__subclasses__()))
|
||||||
169
lib/crewai/tests/test_checkpoint.py
Normal file
169
lib/crewai/tests/test_checkpoint.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
"""Tests for CheckpointConfig, checkpoint listener, and pruning."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crewai.agent.core import Agent
|
||||||
|
from crewai.agents.agent_builder.base_agent import BaseAgent
|
||||||
|
from crewai.crew import Crew
|
||||||
|
from crewai.flow.flow import Flow, start
|
||||||
|
from crewai.state.checkpoint_config import CheckpointConfig
|
||||||
|
from crewai.state.checkpoint_listener import (
|
||||||
|
_find_checkpoint,
|
||||||
|
_prune,
|
||||||
|
_resolve,
|
||||||
|
_SENTINEL,
|
||||||
|
)
|
||||||
|
from crewai.task import Task
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- _resolve ----------
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolve:
    """Covers each kind of value ``_resolve`` accepts."""

    def test_none_returns_none(self) -> None:
        """``None`` means "inherit" and is passed through as ``None``."""
        resolved = _resolve(None)
        assert resolved is None

    def test_false_returns_sentinel(self) -> None:
        """``False`` is reported as the opt-out sentinel."""
        resolved = _resolve(False)
        assert resolved is _SENTINEL

    def test_true_returns_config(self) -> None:
        """``True`` yields a config carrying the default directory."""
        resolved = _resolve(True)
        assert isinstance(resolved, CheckpointConfig)
        assert resolved.directory == "./.checkpoints"

    def test_config_returns_config(self) -> None:
        """An explicit config comes back as the very same object."""
        explicit = CheckpointConfig(directory="/tmp/cp")
        assert _resolve(explicit) is explicit
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- _find_checkpoint inheritance ----------
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindCheckpoint:
    """Inheritance and opt-out rules applied by ``_find_checkpoint``."""

    def _make_agent(self, checkpoint: Any = None) -> Agent:
        """Build a minimal agent with the given ``checkpoint`` value."""
        return Agent(role="r", goal="g", backstory="b", checkpoint=checkpoint)

    def _make_crew(
        self, agents: list[Agent], checkpoint: Any = None
    ) -> Crew:
        """Build a task-less crew and point every agent back at it."""
        built = Crew(agents=agents, tasks=[], checkpoint=checkpoint)
        for member in agents:
            member.crew = built
        return built

    def test_crew_true(self) -> None:
        """An unconfigured agent inherits the crew's enabled checkpointing."""
        agent = self._make_agent()
        self._make_crew([agent], checkpoint=True)
        found = _find_checkpoint(agent)
        assert isinstance(found, CheckpointConfig)

    def test_crew_true_agent_false_opts_out(self) -> None:
        """``False`` on the agent wins over ``True`` on the crew."""
        agent = self._make_agent(checkpoint=False)
        self._make_crew([agent], checkpoint=True)
        assert _find_checkpoint(agent) is None

    def test_crew_none_agent_none(self) -> None:
        """Nothing configured anywhere resolves to no checkpointing."""
        agent = self._make_agent()
        self._make_crew([agent])
        assert _find_checkpoint(agent) is None

    def test_agent_config_overrides_crew(self) -> None:
        """The agent's own config is preferred over the crew's."""
        agent = self._make_agent(
            checkpoint=CheckpointConfig(directory="/agent_cp")
        )
        self._make_crew([agent], checkpoint=True)
        found = _find_checkpoint(agent)
        assert isinstance(found, CheckpointConfig)
        assert found.directory == "/agent_cp"

    def test_task_inherits_from_crew(self) -> None:
        """A task resolves through its agent up to the crew."""
        agent = self._make_agent()
        self._make_crew([agent], checkpoint=True)
        task = Task(description="d", expected_output="e", agent=agent)
        found = _find_checkpoint(task)
        assert isinstance(found, CheckpointConfig)

    def test_task_agent_false_blocks(self) -> None:
        """An opted-out agent also blocks checkpointing for its task."""
        agent = self._make_agent(checkpoint=False)
        self._make_crew([agent], checkpoint=True)
        task = Task(description="d", expected_output="e", agent=agent)
        assert _find_checkpoint(task) is None

    def test_flow_direct(self) -> None:
        """A flow uses its own ``checkpoint`` field."""
        flow = Flow(checkpoint=True)
        found = _find_checkpoint(flow)
        assert isinstance(found, CheckpointConfig)

    def test_flow_none(self) -> None:
        """A flow without configuration has no checkpointing."""
        flow = Flow()
        assert _find_checkpoint(flow) is None

    def test_unknown_source(self) -> None:
        """Sources of an unrecognized type resolve to ``None``."""
        assert _find_checkpoint("random") is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- _prune ----------
|
||||||
|
|
||||||
|
|
||||||
|
class TestPrune:
    """Retention behaviour of ``_prune``."""

    def test_prune_keeps_newest(self) -> None:
        """Only the ``max_keep`` most recently modified files survive."""
        with tempfile.TemporaryDirectory() as d:
            for i in range(5):
                path = os.path.join(d, f"cp_{i}.json")
                with open(path, "w") as f:
                    f.write("{}")
                # Set strictly increasing mtimes explicitly. Sleeping between
                # writes is slow and does not guarantee distinct timestamps on
                # filesystems with coarse mtime granularity.
                os.utime(path, (1_000_000 + i, 1_000_000 + i))

            _prune(d, max_keep=2)
            remaining = os.listdir(d)
            assert len(remaining) == 2
            assert "cp_3.json" in remaining
            assert "cp_4.json" in remaining

    def test_prune_zero_removes_all(self) -> None:
        """``max_keep=0`` deletes every checkpoint file."""
        with tempfile.TemporaryDirectory() as d:
            for i in range(3):
                with open(os.path.join(d, f"cp_{i}.json"), "w") as f:
                    f.write("{}")

            _prune(d, max_keep=0)
            assert os.listdir(d) == []

    def test_prune_more_than_existing(self) -> None:
        """A limit above the file count removes nothing."""
        with tempfile.TemporaryDirectory() as d:
            with open(os.path.join(d, "cp.json"), "w") as f:
                f.write("{}")

            _prune(d, max_keep=10)
            assert len(os.listdir(d)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- CheckpointConfig ----------
|
||||||
|
|
||||||
|
|
||||||
|
class TestCheckpointConfig:
    """Defaults and derived properties of ``CheckpointConfig``."""

    def test_defaults(self) -> None:
        """A bare config uses the documented default values."""
        default_cfg = CheckpointConfig()
        assert default_cfg.directory == "./.checkpoints"
        assert default_cfg.on_events == ["task_completed"]
        assert default_cfg.max_checkpoints is None
        assert not default_cfg.trigger_all

    def test_trigger_all(self) -> None:
        """The ``"*"`` wildcard switches ``trigger_all`` on."""
        wildcard_cfg = CheckpointConfig(on_events=["*"])
        assert wildcard_cfg.trigger_all

    def test_trigger_events(self) -> None:
        """``trigger_events`` exposes ``on_events`` as a set."""
        wanted = ["task_completed", "crew_kickoff_completed"]
        cfg = CheckpointConfig(on_events=wanted)
        assert cfg.trigger_events == {"task_completed", "crew_kickoff_completed"}
|
||||||
Reference in New Issue
Block a user