feat: Add comprehensive error tracking to steps table (#3765)

This commit is contained in:
Matthew Zhou
2025-08-06 20:19:29 -07:00
committed by GitHub
parent 8b1490c879
commit 609bcf84f2
11 changed files with 774 additions and 149 deletions

View File

@@ -0,0 +1,43 @@
"""Add error tracking to steps table
Revision ID: f7f757414d20
Revises: 05c3bc564286
Create Date: 2025-08-05 18:17:06.026153
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "f7f757414d20"
down_revision: Union[str, None] = "05c3bc564286"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add error-tracking columns (error_type, error_data, status) to the steps table."""
    bind = op.get_bind()

    # The enum type must exist in the database before any column can use it.
    status_enum = sa.Enum("PENDING", "SUCCESS", "FAILED", "CANCELLED", name="stepstatus")
    status_enum.create(bind, checkfirst=True)

    for new_column in (
        sa.Column("error_type", sa.String(), nullable=True),
        sa.Column("error_data", sa.JSON(), nullable=True),
        sa.Column("status", status_enum, nullable=True),
    ):
        op.add_column("steps", new_column)
def downgrade() -> None:
    """Remove the error-tracking columns and drop the stepstatus enum type."""
    # Drop the dependent columns first; the enum type cannot be removed while
    # a column still references it.
    for column_name in ("status", "error_data", "error_type"):
        op.drop_column("steps", column_name)

    sa.Enum("PENDING", "SUCCESS", "FAILED", "CANCELLED", name="stepstatus").drop(op.get_bind(), checkfirst=True)

View File

@@ -45,7 +45,7 @@ from letta.otel.tracing import log_event, trace_method
from letta.schemas.agent import AgentState, AgentStepResponse, UpdateAgent, get_prompt_template_for_agent_type
from letta.schemas.block import BlockUpdate
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import MessageRole, ProviderType, ToolType
from letta.schemas.enums import MessageRole, ProviderType, StepStatus, ToolType
from letta.schemas.letta_message_content import ImageContent, TextContent
from letta.schemas.memory import ContextWindowOverview, Memory
from letta.schemas.message import Message, MessageCreate, ToolReturn
@@ -991,6 +991,7 @@ class Agent(BaseAgent):
job_id=job_id,
step_id=step_id,
project_id=self.agent_state.project_id,
status=StepStatus.SUCCESS, # Set to SUCCESS since we're logging after successful completion
)
for message in all_new_messages:
message.step_id = step.id

View File

@@ -34,7 +34,7 @@ from letta.otel.context import get_ctx_attributes
from letta.otel.metric_registry import MetricRegistry
from letta.otel.tracing import log_event, trace_method, tracer
from letta.schemas.agent import AgentState, UpdateAgent
from letta.schemas.enums import JobStatus, MessageRole, ProviderType, ToolType
from letta.schemas.enums import JobStatus, MessageRole, ProviderType, StepStatus, ToolType
from letta.schemas.letta_message import MessageType
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.letta_response import LettaResponse
@@ -241,6 +241,26 @@ class LettaAgent(BaseAgent):
step_progression = StepProgression.START
should_continue = False
# Create step early with PENDING status
logged_step = await self.step_manager.log_step_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=UsageStatistics(completion_tokens=0, prompt_tokens=0, total_tokens=0),
provider_id=None,
job_id=self.current_run_id if self.current_run_id else None,
step_id=step_id,
project_id=agent_state.project_id,
status=StepStatus.PENDING,
)
# Only use step_id in messages if step was actually created
effective_step_id = step_id if logged_step else None
try:
request_data, response_data, current_in_context_messages, new_in_context_messages, valid_tool_names = (
await self._build_and_request_from_llm(
@@ -295,13 +315,17 @@ class LettaAgent(BaseAgent):
tool_rules_solver,
response.usage,
reasoning_content=reasoning,
step_id=step_id,
step_id=effective_step_id,
initial_messages=initial_messages,
agent_step_span=agent_step_span,
is_final_step=(i == max_steps - 1),
)
step_progression = StepProgression.STEP_LOGGED
# Update step with actual usage now that we have it (if step was created)
if logged_step:
await self.step_manager.update_step_success_async(self.actor, step_id, response.usage, stop_reason)
# TODO (cliandy): handle message contexts with larger refactor and dedupe logic
new_message_idx = len(initial_messages) if initial_messages else 0
self.response_messages.extend(persisted_messages[new_message_idx:])
@@ -321,7 +345,7 @@ class LettaAgent(BaseAgent):
provider_trace_create=ProviderTraceCreate(
request_json=request_data,
response_json=response_data,
step_id=step_id,
step_id=step_id, # Use original step_id for telemetry
organization_id=self.actor.organization_id,
),
)
@@ -358,54 +382,57 @@ class LettaAgent(BaseAgent):
# Update step if it needs to be updated
finally:
if settings.track_stop_reason:
if step_progression == StepProgression.FINISHED and should_continue:
continue
if step_progression == StepProgression.FINISHED and should_continue:
continue
self.logger.debug("Running cleanup for agent loop run: %s", self.current_run_id)
self.logger.info("Running final update. Step Progression: %s", step_progression)
try:
if step_progression == StepProgression.FINISHED and not should_continue:
if stop_reason is None:
stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
self.logger.debug("Running cleanup for agent loop run: %s", self.current_run_id)
self.logger.info("Running final update. Step Progression: %s", step_progression)
try:
if step_progression == StepProgression.FINISHED and not should_continue:
# Successfully completed - update with final usage and stop reason
if stop_reason is None:
stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
# Note: step already updated with success status after _handle_ai_response
if logged_step:
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
break
break
if step_progression < StepProgression.STEP_LOGGED:
await self.step_manager.log_step_async(
# Handle error cases
if step_progression < StepProgression.STEP_LOGGED:
# Error occurred before step was fully logged
import traceback
if logged_step:
await self.step_manager.update_step_error_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=UsageStatistics(completion_tokens=0, prompt_tokens=0, total_tokens=0),
provider_id=None,
job_id=self.current_run_id if self.current_run_id else None,
step_id=step_id,
project_id=agent_state.project_id,
step_id=step_id, # Use original step_id for telemetry
error_type=type(e).__name__ if "e" in locals() else "Unknown",
error_message=str(e) if "e" in locals() else "Unknown error",
error_traceback=traceback.format_exc(),
stop_reason=stop_reason,
)
if step_progression <= StepProgression.RESPONSE_RECEIVED:
# TODO (cliandy): persist response if we get it back
if settings.track_errored_messages:
for message in initial_messages:
message.is_err = True
message.step_id = step_id
await self.message_manager.create_many_messages_async(initial_messages, actor=self.actor)
elif step_progression <= StepProgression.LOGGED_TRACE:
if stop_reason is None:
self.logger.error("Error in step after logging step")
stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
else:
self.logger.error("Invalid StepProgression value")
if step_progression <= StepProgression.RESPONSE_RECEIVED:
# TODO (cliandy): persist response if we get it back
if settings.track_errored_messages:
for message in initial_messages:
message.is_err = True
message.step_id = effective_step_id
await self.message_manager.create_many_messages_async(initial_messages, actor=self.actor)
elif step_progression <= StepProgression.LOGGED_TRACE:
if stop_reason is None:
self.logger.error("Error in step after logging step")
stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
if logged_step:
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
else:
self.logger.error("Invalid StepProgression value")
if settings.track_stop_reason:
await self._log_request(request_start_timestamp_ns, request_span)
except Exception as e:
self.logger.error("Failed to update step: %s", e)
except Exception as e:
self.logger.error("Failed to update step: %s", e)
if not should_continue:
break
@@ -484,6 +511,25 @@ class LettaAgent(BaseAgent):
step_progression = StepProgression.START
should_continue = False
# Create step early with PENDING status
logged_step = await self.step_manager.log_step_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=UsageStatistics(completion_tokens=0, prompt_tokens=0, total_tokens=0),
provider_id=None,
job_id=run_id if run_id else self.current_run_id,
step_id=step_id,
project_id=agent_state.project_id,
status=StepStatus.PENDING,
)
# Only use step_id in messages if step was actually created
effective_step_id = step_id if logged_step else None
try:
request_data, response_data, current_in_context_messages, new_in_context_messages, valid_tool_names = (
await self._build_and_request_from_llm(
@@ -533,7 +579,7 @@ class LettaAgent(BaseAgent):
tool_rules_solver,
response.usage,
reasoning_content=reasoning,
step_id=step_id,
step_id=effective_step_id,
initial_messages=initial_messages,
agent_step_span=agent_step_span,
is_final_step=(i == max_steps - 1),
@@ -541,6 +587,10 @@ class LettaAgent(BaseAgent):
)
step_progression = StepProgression.STEP_LOGGED
# Update step with actual usage now that we have it (if step was created)
if logged_step:
await self.step_manager.update_step_success_async(self.actor, step_id, response.usage, stop_reason)
new_message_idx = len(initial_messages) if initial_messages else 0
self.response_messages.extend(persisted_messages[new_message_idx:])
new_in_context_messages.extend(persisted_messages[new_message_idx:])
@@ -560,7 +610,7 @@ class LettaAgent(BaseAgent):
provider_trace_create=ProviderTraceCreate(
request_json=request_data,
response_json=response_data,
step_id=step_id,
step_id=step_id, # Use original step_id for telemetry
organization_id=self.actor.organization_id,
),
)
@@ -584,54 +634,56 @@ class LettaAgent(BaseAgent):
# Update step if it needs to be updated
finally:
if settings.track_stop_reason:
if step_progression == StepProgression.FINISHED and should_continue:
continue
if step_progression == StepProgression.FINISHED and should_continue:
continue
self.logger.debug("Running cleanup for agent loop run: %s", self.current_run_id)
self.logger.info("Running final update. Step Progression: %s", step_progression)
try:
if step_progression == StepProgression.FINISHED and not should_continue:
if stop_reason is None:
stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
break
self.logger.debug("Running cleanup for agent loop run: %s", self.current_run_id)
self.logger.info("Running final update. Step Progression: %s", step_progression)
try:
if step_progression == StepProgression.FINISHED and not should_continue:
# Successfully completed - update with final usage and stop reason
if stop_reason is None:
stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
if logged_step:
await self.step_manager.update_step_success_async(self.actor, step_id, usage, stop_reason)
break
if step_progression < StepProgression.STEP_LOGGED:
await self.step_manager.log_step_async(
# Handle error cases
if step_progression < StepProgression.STEP_LOGGED:
# Error occurred before step was fully logged
import traceback
if logged_step:
await self.step_manager.update_step_error_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=UsageStatistics(completion_tokens=0, prompt_tokens=0, total_tokens=0),
provider_id=None,
job_id=self.current_run_id if self.current_run_id else None,
step_id=step_id,
project_id=agent_state.project_id,
step_id=step_id, # Use original step_id for telemetry
error_type=type(e).__name__ if "e" in locals() else "Unknown",
error_message=str(e) if "e" in locals() else "Unknown error",
error_traceback=traceback.format_exc(),
stop_reason=stop_reason,
)
if step_progression <= StepProgression.RESPONSE_RECEIVED:
# TODO (cliandy): persist response if we get it back
if settings.track_errored_messages:
for message in initial_messages:
message.is_err = True
message.step_id = step_id
await self.message_manager.create_many_messages_async(initial_messages, actor=self.actor)
elif step_progression <= StepProgression.LOGGED_TRACE:
if stop_reason is None:
self.logger.error("Error in step after logging step")
stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
else:
self.logger.error("Invalid StepProgression value")
if step_progression <= StepProgression.RESPONSE_RECEIVED:
# TODO (cliandy): persist response if we get it back
if settings.track_errored_messages:
for message in initial_messages:
message.is_err = True
message.step_id = effective_step_id
await self.message_manager.create_many_messages_async(initial_messages, actor=self.actor)
elif step_progression <= StepProgression.LOGGED_TRACE:
if stop_reason is None:
self.logger.error("Error in step after logging step")
stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
if logged_step:
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
else:
self.logger.error("Invalid StepProgression value")
if settings.track_stop_reason:
await self._log_request(request_start_timestamp_ns, request_span)
except Exception as e:
self.logger.error("Failed to update step: %s", e)
except Exception as e:
self.logger.error("Failed to update step: %s", e)
if not should_continue:
break
@@ -717,6 +769,26 @@ class LettaAgent(BaseAgent):
step_progression = StepProgression.START
should_continue = False
# Create step early with PENDING status
logged_step = await self.step_manager.log_step_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=UsageStatistics(completion_tokens=0, prompt_tokens=0, total_tokens=0),
provider_id=None,
job_id=self.current_run_id if self.current_run_id else None,
step_id=step_id,
project_id=agent_state.project_id,
status=StepStatus.PENDING,
)
# Only use step_id in messages if step was actually created
effective_step_id = step_id if logged_step else None
try:
(
request_data,
@@ -827,13 +899,26 @@ class LettaAgent(BaseAgent):
),
reasoning_content=reasoning_content,
pre_computed_assistant_message_id=interface.letta_message_id,
step_id=step_id,
step_id=effective_step_id,
initial_messages=initial_messages,
agent_step_span=agent_step_span,
is_final_step=(i == max_steps - 1),
)
step_progression = StepProgression.STEP_LOGGED
# Update step with actual usage now that we have it (if step was created)
if logged_step:
await self.step_manager.update_step_success_async(
self.actor,
step_id,
UsageStatistics(
completion_tokens=usage.completion_tokens,
prompt_tokens=usage.prompt_tokens,
total_tokens=usage.total_tokens,
),
stop_reason,
)
new_message_idx = len(initial_messages) if initial_messages else 0
self.response_messages.extend(persisted_messages[new_message_idx:])
new_in_context_messages.extend(persisted_messages[new_message_idx:])
@@ -872,7 +957,7 @@ class LettaAgent(BaseAgent):
"output_tokens": usage.completion_tokens,
},
},
step_id=step_id,
step_id=step_id, # Use original step_id for telemetry
organization_id=self.actor.organization_id,
),
)
@@ -907,54 +992,57 @@ class LettaAgent(BaseAgent):
# Update step if it needs to be updated
finally:
if settings.track_stop_reason:
if step_progression == StepProgression.FINISHED and should_continue:
continue
if step_progression == StepProgression.FINISHED and should_continue:
continue
self.logger.debug("Running cleanup for agent loop run: %s", self.current_run_id)
self.logger.info("Running final update. Step Progression: %s", step_progression)
try:
if step_progression == StepProgression.FINISHED and not should_continue:
if stop_reason is None:
stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
self.logger.debug("Running cleanup for agent loop run: %s", self.current_run_id)
self.logger.info("Running final update. Step Progression: %s", step_progression)
try:
if step_progression == StepProgression.FINISHED and not should_continue:
# Successfully completed - update with final usage and stop reason
if stop_reason is None:
stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
# Note: step already updated with success status after _handle_ai_response
if logged_step:
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
break
break
if step_progression < StepProgression.STEP_LOGGED:
await self.step_manager.log_step_async(
# Handle error cases
if step_progression < StepProgression.STEP_LOGGED:
# Error occurred before step was fully logged
import traceback
if logged_step:
await self.step_manager.update_step_error_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=UsageStatistics(completion_tokens=0, prompt_tokens=0, total_tokens=0),
provider_id=None,
job_id=self.current_run_id if self.current_run_id else None,
step_id=step_id,
project_id=agent_state.project_id,
step_id=step_id, # Use original step_id for telemetry
error_type=type(e).__name__ if "e" in locals() else "Unknown",
error_message=str(e) if "e" in locals() else "Unknown error",
error_traceback=traceback.format_exc(),
stop_reason=stop_reason,
)
if step_progression <= StepProgression.STREAM_RECEIVED:
if first_chunk and settings.track_errored_messages:
for message in initial_messages:
message.is_err = True
message.step_id = step_id
await self.message_manager.create_many_messages_async(initial_messages, actor=self.actor)
elif step_progression <= StepProgression.LOGGED_TRACE:
if stop_reason is None:
self.logger.error("Error in step after logging step")
stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
else:
self.logger.error("Invalid StepProgression value")
# Do tracking for failure cases. Can consolidate with success conditions later.
if step_progression <= StepProgression.STREAM_RECEIVED:
if first_chunk and settings.track_errored_messages:
for message in initial_messages:
message.is_err = True
message.step_id = effective_step_id
await self.message_manager.create_many_messages_async(initial_messages, actor=self.actor)
elif step_progression <= StepProgression.LOGGED_TRACE:
if stop_reason is None:
self.logger.error("Error in step after logging step")
stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
if logged_step:
await self.step_manager.update_step_stop_reason(self.actor, step_id, stop_reason.stop_reason)
else:
self.logger.error("Invalid StepProgression value")
# Do tracking for failure cases. Can consolidate with success conditions later.
if settings.track_stop_reason:
await self._log_request(request_start_timestamp_ns, request_span)
except Exception as e:
self.logger.error("Failed to update step: %s", e)
except Exception as e:
self.logger.error("Failed to update step: %s", e)
if not should_continue:
break
@@ -1315,23 +1403,7 @@ class LettaAgent(BaseAgent):
is_final_step=is_final_step,
)
# 5. Persist step + messages and propagate to jobs
logged_step = await self.step_manager.log_step_async(
actor=self.actor,
agent_id=agent_state.id,
provider_name=agent_state.llm_config.model_endpoint_type,
provider_category=agent_state.llm_config.provider_category or "base",
model=agent_state.llm_config.model,
model_endpoint=agent_state.llm_config.model_endpoint,
context_window_limit=agent_state.llm_config.context_window,
usage=usage,
provider_id=None,
job_id=run_id if run_id else self.current_run_id,
step_id=step_id,
project_id=agent_state.project_id,
stop_reason=stop_reason,
)
# 5. Create messages (step was already created at the beginning)
tool_call_messages = create_letta_messages_from_llm_response(
agent_id=agent_state.id,
model=agent_state.llm_config.model,
@@ -1347,7 +1419,7 @@ class LettaAgent(BaseAgent):
heartbeat_reason=heartbeat_reason,
reasoning_content=reasoning_content,
pre_computed_assistant_message_id=pre_computed_assistant_message_id,
step_id=logged_step.id if logged_step else None,
step_id=step_id,
)
persisted_messages = await self.message_manager.create_many_messages_async(

View File

@@ -6,6 +6,7 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship
from letta.orm.mixins import ProjectMixin
from letta.orm.sqlalchemy_base import SqlalchemyBase
from letta.schemas.enums import StepStatus
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.step import Step as PydanticStep
@@ -55,6 +56,13 @@ class Step(SqlalchemyBase, ProjectMixin):
None, nullable=True, doc="The feedback for this step. Must be either 'positive' or 'negative'."
)
# error handling
error_type: Mapped[Optional[str]] = mapped_column(None, nullable=True, doc="The type/class of the error that occurred")
error_data: Mapped[Optional[Dict]] = mapped_column(
JSON, nullable=True, doc="Error details including message, traceback, and additional context"
)
status: Mapped[Optional[StepStatus]] = mapped_column(None, nullable=True, doc="Step status: pending, success, or failed")
# Relationships (foreign keys)
organization: Mapped[Optional["Organization"]] = relationship("Organization")
provider: Mapped[Optional["Provider"]] = relationship("Provider")

View File

@@ -160,3 +160,12 @@ class SandboxType(str, Enum):
E2B = "e2b"
MODAL = "modal"
LOCAL = "local"
class StepStatus(str, Enum):
    """Status of a step execution"""
    PENDING = "pending"  # step row created up front, before the LLM request completes
    SUCCESS = "success"  # step finished; final usage statistics have been recorded
    FAILED = "failed"  # step errored; error type/message/traceback stored on the step
    CANCELLED = "cancelled"  # step was cancelled before completion

View File

@@ -3,6 +3,7 @@ from typing import Dict, List, Literal, Optional
from pydantic import Field
from letta.schemas.enums import StepStatus
from letta.schemas.letta_base import LettaBase
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message
@@ -40,6 +41,11 @@ class Step(StepBase):
)
project_id: Optional[str] = Field(None, description="The project that the agent that executed this step belongs to (cloud only).")
# error tracking fields
error_type: Optional[str] = Field(None, description="The type/class of the error that occurred")
error_data: Optional[Dict] = Field(None, description="Error details including message, traceback, and additional context")
status: Optional[StepStatus] = Field(StepStatus.PENDING, description="Step status: pending, success, or failed")
class StepProgression(int, Enum):
START = auto()

View File

@@ -1,6 +1,6 @@
from datetime import datetime
from enum import Enum
from typing import List, Literal, Optional
from typing import Dict, List, Literal, Optional
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
@@ -12,6 +12,7 @@ from letta.orm.job import Job as JobModel
from letta.orm.sqlalchemy_base import AccessType
from letta.orm.step import Step as StepModel
from letta.otel.tracing import get_trace_id, trace_method
from letta.schemas.enums import StepStatus
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.openai.chat_completion_response import UsageStatistics
from letta.schemas.step import Step as PydanticStep
@@ -87,6 +88,10 @@ class StepManager:
job_id: Optional[str] = None,
step_id: Optional[str] = None,
project_id: Optional[str] = None,
stop_reason: Optional[LettaStopReason] = None,
status: Optional[StepStatus] = None,
error_type: Optional[str] = None,
error_data: Optional[Dict] = None,
) -> PydanticStep:
step_data = {
"origin": None,
@@ -106,9 +111,14 @@ class StepManager:
"tid": None,
"trace_id": get_trace_id(), # Get the current trace ID
"project_id": project_id,
"status": status if status else StepStatus.PENDING,
"error_type": error_type,
"error_data": error_data,
}
if step_id:
step_data["id"] = step_id
if stop_reason:
step_data["stop_reason"] = stop_reason.stop_reason
with db_registry.session() as session:
if job_id:
self._verify_job_access(session, job_id, actor, access=["write"])
@@ -133,6 +143,9 @@ class StepManager:
step_id: Optional[str] = None,
project_id: Optional[str] = None,
stop_reason: Optional[LettaStopReason] = None,
status: Optional[StepStatus] = None,
error_type: Optional[str] = None,
error_data: Optional[Dict] = None,
) -> PydanticStep:
step_data = {
"origin": None,
@@ -152,6 +165,9 @@ class StepManager:
"tid": None,
"trace_id": get_trace_id(), # Get the current trace ID
"project_id": project_id,
"status": status if status else StepStatus.PENDING,
"error_type": error_type,
"error_data": error_data,
}
if step_id:
step_data["id"] = step_id
@@ -236,6 +252,126 @@ class StepManager:
await session.commit()
return step
@enforce_types
@trace_method
async def update_step_error_async(
    self,
    actor: PydanticUser,
    step_id: str,
    error_type: str,
    error_message: str,
    error_traceback: str,
    error_details: Optional[Dict] = None,
    stop_reason: Optional[LettaStopReason] = None,
) -> PydanticStep:
    """Mark a step as FAILED and persist the error details.

    Args:
        actor: The user making the request
        step_id: The ID of the step to update
        error_type: The type/class of the error
        error_message: The error message
        error_traceback: Full error traceback
        error_details: Additional error context
        stop_reason: The stop reason to set

    Returns:
        The updated step

    Raises:
        NoResultFound: If the step does not exist
    """
    async with db_registry.async_session() as session:
        step_model = await session.get(StepModel, step_id)
        if step_model is None:
            raise NoResultFound(f"Step with id {step_id} does not exist")
        # Only members of the owning organization may update the step.
        if step_model.organization_id != actor.organization_id:
            raise Exception("Unauthorized")

        step_model.status = StepStatus.FAILED
        step_model.error_type = error_type
        step_model.error_data = {
            "message": error_message,
            "traceback": error_traceback,
            "details": error_details,
        }
        if stop_reason is not None:
            step_model.stop_reason = stop_reason.stop_reason

        await session.commit()
        return step_model.to_pydantic()
@enforce_types
@trace_method
async def update_step_success_async(
    self,
    actor: PydanticUser,
    step_id: str,
    usage: UsageStatistics,
    stop_reason: Optional[LettaStopReason] = None,
) -> PydanticStep:
    """Mark a step as SUCCESS and record its final token-usage statistics.

    Args:
        actor: The user making the request
        step_id: The ID of the step to update
        usage: Final usage statistics
        stop_reason: The stop reason to set

    Returns:
        The updated step

    Raises:
        NoResultFound: If the step does not exist
    """
    async with db_registry.async_session() as session:
        step_model = await session.get(StepModel, step_id)
        if step_model is None:
            raise NoResultFound(f"Step with id {step_id} does not exist")
        # Only members of the owning organization may update the step.
        if step_model.organization_id != actor.organization_id:
            raise Exception("Unauthorized")

        step_model.status = StepStatus.SUCCESS
        # Overwrite the placeholder usage recorded when the step was created.
        for token_field in ("completion_tokens", "prompt_tokens", "total_tokens"):
            setattr(step_model, token_field, getattr(usage, token_field))
        if stop_reason is not None:
            step_model.stop_reason = stop_reason.stop_reason

        await session.commit()
        return step_model.to_pydantic()
@enforce_types
@trace_method
async def update_step_cancelled_async(
    self,
    actor: PydanticUser,
    step_id: str,
    stop_reason: Optional[LettaStopReason] = None,
) -> PydanticStep:
    """Mark a step as CANCELLED, optionally recording a stop reason.

    Args:
        actor: The user making the request
        step_id: The ID of the step to update
        stop_reason: The stop reason to set

    Returns:
        The updated step

    Raises:
        NoResultFound: If the step does not exist
    """
    async with db_registry.async_session() as session:
        step_model = await session.get(StepModel, step_id)
        if step_model is None:
            raise NoResultFound(f"Step with id {step_id} does not exist")
        # Only members of the owning organization may update the step.
        if step_model.organization_id != actor.organization_id:
            raise Exception("Unauthorized")

        step_model.status = StepStatus.CANCELLED
        if stop_reason is not None:
            step_model.stop_reason = stop_reason.stop_reason

        await session.commit()
        return step_model.to_pydantic()
def _verify_job_access(
self,
session: Session,
@@ -319,6 +455,10 @@ class NoopStepManager(StepManager):
job_id: Optional[str] = None,
step_id: Optional[str] = None,
project_id: Optional[str] = None,
stop_reason: Optional[LettaStopReason] = None,
status: Optional[StepStatus] = None,
error_type: Optional[str] = None,
error_data: Optional[Dict] = None,
) -> PydanticStep:
return
@@ -339,5 +479,43 @@ class NoopStepManager(StepManager):
step_id: Optional[str] = None,
project_id: Optional[str] = None,
stop_reason: Optional[LettaStopReason] = None,
status: Optional[StepStatus] = None,
error_type: Optional[str] = None,
error_data: Optional[Dict] = None,
) -> PydanticStep:
return
@enforce_types
@trace_method
async def update_step_error_async(
    self,
    actor: PydanticUser,
    step_id: str,
    error_type: str,
    error_message: str,
    error_traceback: str,
    error_details: Optional[Dict] = None,
    stop_reason: Optional[LettaStopReason] = None,
) -> PydanticStep:
    """No-op override: step persistence is disabled, so the error is not recorded.

    Returns None despite the PydanticStep annotation; callers of the noop
    manager are expected to ignore the result.
    """
    return
@enforce_types
@trace_method
async def update_step_success_async(
    self,
    actor: PydanticUser,
    step_id: str,
    usage: UsageStatistics,
    stop_reason: Optional[LettaStopReason] = None,
) -> PydanticStep:
    """No-op override: step persistence is disabled, so usage/status is not recorded.

    Returns None despite the PydanticStep annotation; callers of the noop
    manager are expected to ignore the result.
    """
    return
@enforce_types
@trace_method
async def update_step_cancelled_async(
    self,
    actor: PydanticUser,
    step_id: str,
    stop_reason: Optional[LettaStopReason] = None,
) -> PydanticStep:
    """No-op override: step persistence is disabled, so cancellation is not recorded.

    Returns None despite the PydanticStep annotation; callers of the noop
    manager are expected to ignore the result.
    """
    return

View File

@@ -209,7 +209,10 @@ class LettaBuiltinToolExecutor(ToolExecutor):
logger.info(f"[DEBUG] Starting Firecrawl search for query: '{task.query}' with limit={limit}")
# Perform the search for this task
search_result = await app.search(task.query, limit=limit, scrape_options=ScrapeOptions(formats=["markdown"]))
scrape_options = ScrapeOptions(
formats=["markdown"], excludeTags=["#ad", "#footer"], onlyMainContent=True, parsePDF=True, removeBase64Images=True
)
search_result = await app.search(task.query, limit=limit, scrape_options=scrape_options)
logger.info(
f"[DEBUG] Firecrawl search completed for '{task.query}': {len(search_result.get('data', [])) if search_result else 0} results"

View File

@@ -95,7 +95,7 @@ def agent_obj(client):
)
yield agent_state_instance
client.agents.delete(agent_state_instance.id)
# client.agents.delete(agent_state_instance.id)
@pytest.fixture(scope="function")
@@ -111,7 +111,7 @@ def other_agent_obj(client):
yield agent_state_instance
client.agents.delete(agent_state_instance.id)
# client.agents.delete(agent_state_instance.id)
@pytest.fixture
@@ -150,7 +150,7 @@ def test_send_message_to_agent(client, server, agent_obj, other_agent_obj):
actor = server.user_manager.get_user_or_default()
# Encourage the agent to send a message to the other agent_obj with the secret string
client.agents.messages.create(
response = client.agents.messages.create(
agent_id=agent_obj.id,
messages=[
{

View File

@@ -257,6 +257,7 @@ async def test_sleeptime_group_chat_v2(server, actor):
job_manager=server.job_manager,
actor=actor,
group=main_agent.multi_agent_group,
step_manager=server.step_manager,
)
response = await agent.step(

View File

@@ -62,6 +62,7 @@ from letta.schemas.enums import (
MessageRole,
ProviderType,
SandboxType,
StepStatus,
ToolType,
)
from letta.schemas.environment_variables import SandboxEnvironmentVariableCreate, SandboxEnvironmentVariableUpdate
@@ -74,6 +75,7 @@ from letta.schemas.job import Job as PydanticJob
from letta.schemas.job import JobUpdate, LettaRequestConfig
from letta.schemas.letta_message import UpdateAssistantMessage, UpdateReasoningMessage, UpdateSystemMessage, UpdateUserMessage
from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.llm_batch_job import AgentStepState, LLMBatchItem
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
@@ -8036,6 +8038,308 @@ async def test_job_usage_stats_add_multiple(server: SyncServer, sarah_agent, def
assert len(steps_without_feedback) == 2
@pytest.mark.asyncio
async def test_step_manager_error_tracking(server: SyncServer, sarah_agent, default_job, default_user, event_loop):
    """Test step manager error tracking functionality.

    Exercises each terminal transition from a freshly logged PENDING step:
    FAILED via update_step_error_async, SUCCESS via update_step_success_async,
    and CANCELLED via update_step_cancelled_async.
    """
    step_manager = server.step_manager

    async def _log_pending_step():
        # All three scenarios start from the same zero-usage PENDING step,
        # so the creation boilerplate lives in one place.
        return await step_manager.log_step_async(
            agent_id=sarah_agent.id,
            provider_name="openai",
            provider_category="base",
            model="gpt-4o-mini",
            model_endpoint="https://api.openai.com/v1",
            context_window_limit=8192,
            job_id=default_job.id,
            usage=UsageStatistics(
                completion_tokens=0,
                prompt_tokens=0,
                total_tokens=0,
            ),
            actor=default_user,
            project_id=sarah_agent.project_id,
            status=StepStatus.PENDING,
        )

    # Create a step with pending status
    step = await _log_pending_step()
    assert step.status == StepStatus.PENDING
    assert step.error_type is None
    assert step.error_data is None

    # Test update_step_error_async
    error_details = {"step_progression": "RESPONSE_RECEIVED", "context": "Test error context"}
    updated_step = await step_manager.update_step_error_async(
        actor=default_user,
        step_id=step.id,
        error_type="ValueError",
        error_message="Test error message",
        error_traceback="Traceback (most recent call last):\n File test.py, line 1\n raise ValueError('Test error')",
        error_details=error_details,
        stop_reason=LettaStopReason(stop_reason=StopReasonType.error.value),
    )
    assert updated_step.status == StepStatus.FAILED
    assert updated_step.error_type == "ValueError"
    assert updated_step.error_data["message"] == "Test error message"
    assert updated_step.error_data["traceback"].startswith("Traceback")
    assert updated_step.error_data["details"] == error_details
    assert updated_step.stop_reason == StopReasonType.error

    # Test update_step_success_async: usage totals are overwritten and status
    # becomes SUCCESS without touching the (empty) error fields.
    success_step = await _log_pending_step()
    final_usage = UsageStatistics(
        completion_tokens=150,
        prompt_tokens=100,
        total_tokens=250,
    )
    updated_success_step = await step_manager.update_step_success_async(
        actor=default_user,
        step_id=success_step.id,
        usage=final_usage,
        stop_reason=LettaStopReason(stop_reason=StopReasonType.end_turn.value),
    )
    assert updated_success_step.status == StepStatus.SUCCESS
    assert updated_success_step.completion_tokens == 150
    assert updated_success_step.prompt_tokens == 100
    assert updated_success_step.total_tokens == 250
    assert updated_success_step.stop_reason == StopReasonType.end_turn
    assert updated_success_step.error_type is None
    assert updated_success_step.error_data is None

    # Test update_step_cancelled_async
    cancelled_step = await _log_pending_step()
    updated_cancelled_step = await step_manager.update_step_cancelled_async(
        actor=default_user,
        step_id=cancelled_step.id,
        stop_reason=LettaStopReason(stop_reason=StopReasonType.cancelled.value),
    )
    assert updated_cancelled_step.status == StepStatus.CANCELLED
    assert updated_cancelled_step.stop_reason == StopReasonType.cancelled
    assert updated_cancelled_step.error_type is None
    assert updated_cancelled_step.error_data is None
@pytest.mark.asyncio
async def test_step_manager_error_tracking_edge_cases(server: SyncServer, sarah_agent, default_job, default_user, event_loop):
    """Test edge cases for step manager error tracking.

    Covers: updating a missing step, logging a step that is already FAILED,
    FAILED -> SUCCESS recovery, very large error payloads, and repeated
    status updates on the same step.
    """
    step_manager = server.step_manager

    async def _log_step(**overrides):
        # Defaults describe a zero-usage PENDING step; overrides let a case
        # tweak only the fields it cares about (e.g. an initial FAILED status).
        kwargs = dict(
            agent_id=sarah_agent.id,
            provider_name="openai",
            provider_category="base",
            model="gpt-4o-mini",
            model_endpoint="https://api.openai.com/v1",
            context_window_limit=8192,
            job_id=default_job.id,
            usage=UsageStatistics(
                completion_tokens=0,
                prompt_tokens=0,
                total_tokens=0,
            ),
            actor=default_user,
            project_id=sarah_agent.project_id,
            status=StepStatus.PENDING,
        )
        kwargs.update(overrides)
        return await step_manager.log_step_async(**kwargs)

    # Test 1: Attempt to update non-existent step
    with pytest.raises(NoResultFound):
        await step_manager.update_step_error_async(
            actor=default_user,
            step_id="non-existent-step-id",
            error_type="TestError",
            error_message="Test",
            error_traceback="Test traceback",
        )

    # Test 2: Create step with initial error information
    step_with_error = await _log_step(
        status=StepStatus.FAILED,
        error_type="InitialError",
        error_data={"message": "Step failed at creation", "traceback": "Initial traceback", "details": {"initial": True}},
    )
    assert step_with_error.status == StepStatus.FAILED
    assert step_with_error.error_type == "InitialError"
    assert step_with_error.error_data["message"] == "Step failed at creation"
    assert step_with_error.error_data["details"] == {"initial": True}

    # Test 3: Update from failed to success (recovery scenario)
    recovered_step = await step_manager.update_step_success_async(
        actor=default_user,
        step_id=step_with_error.id,
        usage=UsageStatistics(
            completion_tokens=50,
            prompt_tokens=30,
            total_tokens=80,
        ),
    )
    # Verify error fields are still present but status changed
    assert recovered_step.status == StepStatus.SUCCESS
    assert recovered_step.error_type == "InitialError"  # Should retain error info
    assert recovered_step.completion_tokens == 50

    # Test 4: Very long error messages and tracebacks
    long_error_step = await _log_step()
    very_long_traceback = "Traceback (most recent call last):\n" + "\n".join([f" File 'test{i}.py', line {i}" for i in range(100)])
    complex_error_details = {
        "nested": {"data": {"arrays": [1, 2, 3, 4, 5], "strings": ["error1", "error2", "error3"], "booleans": [True, False, True]}},
        "timestamp": "2024-01-01T00:00:00Z",
        "context": "Complex nested error details",
    }
    updated_long_error = await step_manager.update_step_error_async(
        actor=default_user,
        step_id=long_error_step.id,
        error_type="VeryLongError",
        error_message="A" * 500,  # Very long error message
        error_traceback=very_long_traceback,
        error_details=complex_error_details,
    )
    assert updated_long_error.status == StepStatus.FAILED
    assert len(updated_long_error.error_data["message"]) == 500
    assert "test99.py" in updated_long_error.error_data["traceback"]
    assert updated_long_error.error_data["details"]["nested"]["data"]["arrays"] == [1, 2, 3, 4, 5]

    # Test 5: Multiple status updates on same step
    multi_update_step = await _log_step()
    # First update to cancelled
    step1 = await step_manager.update_step_cancelled_async(
        actor=default_user,
        step_id=multi_update_step.id,
    )
    assert step1.status == StepStatus.CANCELLED
    # Then update to error (simulating race condition or retry)
    step2 = await step_manager.update_step_error_async(
        actor=default_user,
        step_id=multi_update_step.id,
        error_type="PostCancellationError",
        error_message="Error after cancellation",
        error_traceback="Traceback after cancel",
    )
    assert step2.status == StepStatus.FAILED
    assert step2.error_type == "PostCancellationError"
@pytest.mark.asyncio
async def test_step_manager_list_steps_with_status_filter(server: SyncServer, sarah_agent, default_job, default_user, event_loop):
    """Test listing steps with status filters.

    Seeds one step per StepStatus value, then verifies that listing the
    agent's steps surfaces at least one step in each status.
    """
    step_manager = server.step_manager

    # Create one step per status value. (The original collected the created
    # steps into an unused list; the return values are not needed here.)
    statuses = [StepStatus.PENDING, StepStatus.SUCCESS, StepStatus.FAILED, StepStatus.CANCELLED]
    for status in statuses:
        await step_manager.log_step_async(
            agent_id=sarah_agent.id,
            provider_name="openai",
            provider_category="base",
            model="gpt-4o-mini",
            model_endpoint="https://api.openai.com/v1",
            context_window_limit=8192,
            job_id=default_job.id,
            usage=UsageStatistics(
                completion_tokens=10,
                prompt_tokens=20,
                total_tokens=30,
            ),
            actor=default_user,
            project_id=sarah_agent.project_id,
            status=status,
        )

    # List all steps for the agent
    all_steps = await step_manager.list_steps_async(
        agent_id=sarah_agent.id,
        actor=default_user,
    )

    # Each seeded status should appear at least once among the listed steps.
    observed_statuses = [step.status for step in all_steps]
    for status in statuses:
        assert observed_statuses.count(status) >= 1, f"No steps found with status {status}"
def test_job_usage_stats_get_nonexistent_job(server: SyncServer, default_user):
"""Test getting usage statistics for a nonexistent job."""
job_manager = server.job_manager