From ca40eff7bcba331ed05b77413a357a4f6062d4fa Mon Sep 17 00:00:00 2001
From: cthomas
Date: Thu, 22 Jan 2026 15:57:07 -0800
Subject: [PATCH] fix: ensure stop_reason is always set when marking runs as failed (#9045)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Problem:**
Production error showed runs being marked as failed with stop_reason=None, which violates LettaStopReason's Pydantic schema (requires a valid enum value). This caused cascading validation errors that got stored in metadata.

Example error:
```
Run is already in a terminal state failed with stop reason None, but is being updated with data {'status': 'failed', 'stop_reason': None, 'metadata': {'error': "1 validation error for LettaStopReason\nstop_reason Input should be 'end_turn', 'error', ... [type=enum, input_value=None]"}}
```

**Root Causes:**
1. routers/v1/agents.py had 3 exception handlers creating RunUpdate(status=failed) without stop_reason
2. Success path assumed result.stop_reason always exists (AttributeError if None)
3. run_manager.py tried to create LettaStopReason(stop_reason=None) when refreshing result messages

**Fixes:**
1. Added stop_reason=StopReasonType.error to 3 exception handlers
2. Added defensive None checks before accessing result.stop_reason.stop_reason
3. 
Added fallback to StopReasonType.error when pydantic_run.stop_reason is None **Trigger:** OpenAI BadRequestError for invalid tool schema → exception handlers marked run as failed without stop_reason → validation error when constructing response 👾 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta --- letta/server/rest_api/routers/v1/agents.py | 24 ++++++++++++++++------ letta/services/run_manager.py | 4 +++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/letta/server/rest_api/routers/v1/agents.py b/letta/server/rest_api/routers/v1/agents.py index 1694f6df..c5ee4ed5 100644 --- a/letta/server/rest_api/routers/v1/agents.py +++ b/letta/server/rest_api/routers/v1/agents.py @@ -1859,15 +1859,24 @@ async def _process_message_background( runs_manager = RunManager() from letta.schemas.enums import RunStatus + from letta.schemas.letta_stop_reason import StopReasonType - if result.stop_reason.stop_reason == "cancelled": + # Handle cases where stop_reason might be None (defensive) + if result.stop_reason and result.stop_reason.stop_reason == "cancelled": run_status = RunStatus.cancelled - else: + stop_reason = result.stop_reason.stop_reason + elif result.stop_reason: run_status = RunStatus.completed + stop_reason = result.stop_reason.stop_reason + else: + # Fallback: no stop_reason set (shouldn't happen but defensive) + logger.error(f"Run {run_id} completed without stop_reason in result, defaulting to end_turn") + run_status = RunStatus.completed + stop_reason = StopReasonType.end_turn await runs_manager.update_run_by_id_async( run_id=run_id, - update=RunUpdate(status=run_status, stop_reason=result.stop_reason.stop_reason), + update=RunUpdate(status=run_status, stop_reason=stop_reason), actor=actor, ) @@ -1875,20 +1884,22 @@ async def _process_message_background( # Update run status to failed with specific error info runs_manager = RunManager() from letta.schemas.enums import RunStatus + from letta.schemas.letta_stop_reason import 
StopReasonType await runs_manager.update_run_by_id_async( run_id=run_id, - update=RunUpdate(status=RunStatus.failed, metadata={"error": str(e)}), + update=RunUpdate(status=RunStatus.failed, stop_reason=StopReasonType.error, metadata={"error": str(e)}), actor=actor, ) except Exception as e: # Update run status to failed runs_manager = RunManager() from letta.schemas.enums import RunStatus + from letta.schemas.letta_stop_reason import StopReasonType await runs_manager.update_run_by_id_async( run_id=run_id, - update=RunUpdate(status=RunStatus.failed, metadata={"error": str(e)}), + update=RunUpdate(status=RunStatus.failed, stop_reason=StopReasonType.error, metadata={"error": str(e)}), actor=actor, ) finally: @@ -2028,10 +2039,11 @@ async def send_message_async( async def update_failed_run(): runs_manager = RunManager() from letta.schemas.enums import RunStatus + from letta.schemas.letta_stop_reason import StopReasonType await runs_manager.update_run_by_id_async( run_id=run.id, - update=RunUpdate(status=RunStatus.failed, metadata={"error": error_str}), + update=RunUpdate(status=RunStatus.failed, stop_reason=StopReasonType.error, metadata={"error": error_str}), actor=actor, ) diff --git a/letta/services/run_manager.py b/letta/services/run_manager.py index 4aedc99e..c841a62d 100644 --- a/letta/services/run_manager.py +++ b/letta/services/run_manager.py @@ -455,9 +455,11 @@ class RunManager: # Dispatch callback outside of database session if needed if needs_callback: if refresh_result_messages: + # Defensive: LettaStopReason rejects None, so fall back to StopReasonType.error when the run has no stop_reason + stop_reason_value = pydantic_run.stop_reason if pydantic_run.stop_reason else StopReasonType.error result = LettaResponse( messages=await self.get_run_messages(run_id=run_id, actor=actor), - stop_reason=LettaStopReason(stop_reason=pydantic_run.stop_reason), + stop_reason=LettaStopReason(stop_reason=stop_reason_value), usage=await self.get_run_usage(run_id=run_id, actor=actor), ) final_metadata["result"] = result.model_dump()