fix: ensure stop_reason is always set when marking runs as failed (#9045)

**Problem:**
A production error showed runs being marked as failed with stop_reason=None,
which violates LettaStopReason's Pydantic schema (it requires a valid enum value).
The resulting validation error was then itself stored in the run's metadata.

Example error:
```
Run is already in a terminal state failed with stop reason None, but is being
updated with data {'status': 'failed', 'stop_reason': None, 'metadata':
{'error': "1 validation error for LettaStopReason\nstop_reason Input should
be 'end_turn', 'error', ... [type=enum, input_value=None]"}}
```

**Root Causes:**
1. routers/v1/agents.py had 3 exception handlers creating RunUpdate(status=failed)
   without stop_reason
2. Success path assumed result.stop_reason always exists (accessing
   result.stop_reason.stop_reason raises AttributeError if it is None)
3. run_manager.py tried to create LettaStopReason(stop_reason=None) when
   refreshing result messages

**Fixes:**
1. Added stop_reason=StopReasonType.error to 3 exception handlers
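2. Added defensive None checks before accessing result.stop_reason.stop_reason;
   if result.stop_reason is missing, an error is logged and the run is marked
   completed with StopReasonType.end_turn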
3. Added fallback to StopReasonType.completed when pydantic_run.stop_reason is None

**Trigger:**
OpenAI BadRequestError for invalid tool schema → exception handlers marked
run as failed without stop_reason → validation error when constructing response

👾 Generated with [Letta Code](https://letta.com)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
cthomas
2026-01-22 15:57:07 -08:00
committed by Caren Thomas
parent 5533c723df
commit ca40eff7bc
2 changed files with 21 additions and 7 deletions

View File

@@ -1859,15 +1859,24 @@ async def _process_message_background(
runs_manager = RunManager()
from letta.schemas.enums import RunStatus
from letta.schemas.letta_stop_reason import StopReasonType
if result.stop_reason.stop_reason == "cancelled":
# Handle cases where stop_reason might be None (defensive)
if result.stop_reason and result.stop_reason.stop_reason == "cancelled":
run_status = RunStatus.cancelled
else:
stop_reason = result.stop_reason.stop_reason
elif result.stop_reason:
run_status = RunStatus.completed
stop_reason = result.stop_reason.stop_reason
else:
# Fallback: no stop_reason set (shouldn't happen but defensive)
logger.error(f"Run {run_id} completed without stop_reason in result, defaulting to end_turn")
run_status = RunStatus.completed
stop_reason = StopReasonType.end_turn
await runs_manager.update_run_by_id_async(
run_id=run_id,
update=RunUpdate(status=run_status, stop_reason=result.stop_reason.stop_reason),
update=RunUpdate(status=run_status, stop_reason=stop_reason),
actor=actor,
)
@@ -1875,20 +1884,22 @@ async def _process_message_background(
# Update run status to failed with specific error info
runs_manager = RunManager()
from letta.schemas.enums import RunStatus
from letta.schemas.letta_stop_reason import StopReasonType
await runs_manager.update_run_by_id_async(
run_id=run_id,
update=RunUpdate(status=RunStatus.failed, metadata={"error": str(e)}),
update=RunUpdate(status=RunStatus.failed, stop_reason=StopReasonType.error, metadata={"error": str(e)}),
actor=actor,
)
except Exception as e:
# Update run status to failed
runs_manager = RunManager()
from letta.schemas.enums import RunStatus
from letta.schemas.letta_stop_reason import StopReasonType
await runs_manager.update_run_by_id_async(
run_id=run_id,
update=RunUpdate(status=RunStatus.failed, metadata={"error": str(e)}),
update=RunUpdate(status=RunStatus.failed, stop_reason=StopReasonType.error, metadata={"error": str(e)}),
actor=actor,
)
finally:
@@ -2028,10 +2039,11 @@ async def send_message_async(
async def update_failed_run():
runs_manager = RunManager()
from letta.schemas.enums import RunStatus
from letta.schemas.letta_stop_reason import StopReasonType
await runs_manager.update_run_by_id_async(
run_id=run.id,
update=RunUpdate(status=RunStatus.failed, metadata={"error": error_str}),
update=RunUpdate(status=RunStatus.failed, stop_reason=StopReasonType.error, metadata={"error": error_str}),
actor=actor,
)

View File

@@ -455,9 +455,11 @@ class RunManager:
# Dispatch callback outside of database session if needed
if needs_callback:
if refresh_result_messages:
# Defensive: ensure stop_reason is never None
stop_reason_value = pydantic_run.stop_reason if pydantic_run.stop_reason else StopReasonType.completed
result = LettaResponse(
messages=await self.get_run_messages(run_id=run_id, actor=actor),
stop_reason=LettaStopReason(stop_reason=pydantic_run.stop_reason),
stop_reason=LettaStopReason(stop_reason=stop_reason_value),
usage=await self.get_run_usage(run_id=run_id, actor=actor),
)
final_metadata["result"] = result.model_dump()