fix: add more error logging and tests for streaming LLM errors (#5844)

This commit is contained in:
Sarah Wooders
2025-10-29 19:41:32 -07:00
committed by Caren Thomas
parent 29c4ed20eb
commit 24a14490d8
2 changed files with 42 additions and 0 deletions

View File

@@ -304,6 +304,7 @@ class StreamingService:
run_status = RunStatus.failed
error_data = {"error": {"type": "llm_timeout", "message": "The LLM request timed out. Please try again.", "detail": str(e)}}
stop_reason = StopReasonType.llm_api_error
logger.error(f"Run {run_id} stopped with LLM timeout error: {e}, error_data: {error_data}")
yield (f"data: {json.dumps(error_data)}\n\n", 504)
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
@@ -317,6 +318,7 @@ class StreamingService:
}
}
stop_reason = StopReasonType.llm_api_error
logger.warning(f"Run {run_id} stopped with LLM rate limit error: {e}, error_data: {error_data}")
yield (f"data: {json.dumps(error_data)}\n\n", 429)
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
@@ -329,6 +331,7 @@ class StreamingService:
"detail": str(e),
}
}
logger.warning(f"Run {run_id} stopped with LLM authentication error: {e}, error_data: {error_data}")
stop_reason = StopReasonType.llm_api_error
yield (f"data: {json.dumps(error_data)}\n\n", 401)
# Send [DONE] marker to properly close the stream
@@ -336,6 +339,7 @@ class StreamingService:
except LLMError as e:
run_status = RunStatus.failed
error_data = {"error": {"type": "llm_error", "message": "An error occurred with the LLM request.", "detail": str(e)}}
logger.error(f"Run {run_id} stopped with LLM error: {e}, error_data: {error_data}")
yield (f"data: {json.dumps(error_data)}\n\n", 502)
# Send [DONE] marker to properly close the stream
stop_reason = StopReasonType.llm_api_error
@@ -349,6 +353,7 @@ class StreamingService:
"detail": str(e),
}
}
logger.error(f"Run {run_id} stopped with unknown error: {e}, error_data: {error_data}")
stop_reason = StopReasonType.error
yield (f"data: {json.dumps(error_data)}\n\n", 500)
# Re-raise to ensure proper error handling and Sentry capture

View File

@@ -195,6 +195,43 @@ async def test_update_run_by_id(server: SyncServer, sarah_agent, default_user):
assert updated_run.status == RunStatus.completed
@pytest.mark.asyncio
async def test_update_run_metadata_persistence(server: SyncServer, sarah_agent, default_user):
    """Verify that run metadata survives an update and a subsequent DB fetch.

    Mirrors the streaming service's failure path: a run is created with
    benign metadata, then marked failed with LLM-timeout error metadata,
    and that metadata must be visible both on the object returned by the
    update call and on a fresh read from the database.
    """
    # Create a run carrying some initial, non-error metadata.
    created_run = await server.run_manager.create_run(
        pydantic_run=PydanticRun(
            metadata={"type": "test", "initial": "value"},
            agent_id=sarah_agent.id,
        ),
        actor=default_user,
    )
    # The freshly created run should echo back exactly what it was given.
    assert created_run.metadata == {"type": "test", "initial": "value"}

    # Simulate the streaming service recording an LLM timeout failure.
    timeout_error = {
        "error": {"type": "llm_timeout", "message": "The LLM request timed out. Please try again.", "detail": "Timeout after 30s"}
    }
    failure_update = RunUpdate(
        status=RunStatus.failed,
        stop_reason=StopReasonType.llm_api_error,
        metadata=timeout_error,
    )
    updated_run = await server.run_manager.update_run_by_id_async(
        created_run.id,
        failure_update,
        actor=default_user,
    )

    # The returned run reflects both the failure state and the new metadata.
    assert updated_run.status == RunStatus.failed
    assert updated_run.stop_reason == StopReasonType.llm_api_error
    assert updated_run.metadata == timeout_error
    assert "error" in updated_run.metadata
    assert updated_run.metadata["error"]["type"] == "llm_timeout"

    # Re-fetch the run to prove the metadata actually persisted to the DB.
    fetched_run = await server.run_manager.get_run_by_id(created_run.id, actor=default_user)
    assert fetched_run.metadata == timeout_error
    assert "error" in fetched_run.metadata
    assert fetched_run.metadata["error"]["type"] == "llm_timeout"
@pytest.mark.asyncio
async def test_delete_run_by_id(server: SyncServer, sarah_agent, default_user):
"""Test deleting a run by its ID."""