fix: add more error logging and tests for streaming LLM errors (#5844)

This commit is contained in:
Sarah Wooders
2025-10-29 19:41:32 -07:00
committed by Caren Thomas
parent 29c4ed20eb
commit 24a14490d8
2 changed files with 42 additions and 0 deletions

View File

@@ -304,6 +304,7 @@ class StreamingService:
run_status = RunStatus.failed
error_data = {"error": {"type": "llm_timeout", "message": "The LLM request timed out. Please try again.", "detail": str(e)}}
stop_reason = StopReasonType.llm_api_error
logger.error(f"Run {run_id} stopped with LLM timeout error: {e}, error_data: {error_data}")
yield (f"data: {json.dumps(error_data)}\n\n", 504)
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
@@ -317,6 +318,7 @@ class StreamingService:
}
}
stop_reason = StopReasonType.llm_api_error
logger.warning(f"Run {run_id} stopped with LLM rate limit error: {e}, error_data: {error_data}")
yield (f"data: {json.dumps(error_data)}\n\n", 429)
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
@@ -329,6 +331,7 @@ class StreamingService:
"detail": str(e),
}
}
logger.warning(f"Run {run_id} stopped with LLM authentication error: {e}, error_data: {error_data}")
stop_reason = StopReasonType.llm_api_error
yield (f"data: {json.dumps(error_data)}\n\n", 401)
# Send [DONE] marker to properly close the stream
@@ -336,6 +339,7 @@ class StreamingService:
except LLMError as e:
run_status = RunStatus.failed
error_data = {"error": {"type": "llm_error", "message": "An error occurred with the LLM request.", "detail": str(e)}}
logger.error(f"Run {run_id} stopped with LLM error: {e}, error_data: {error_data}")
yield (f"data: {json.dumps(error_data)}\n\n", 502)
# Send [DONE] marker to properly close the stream
stop_reason = StopReasonType.llm_api_error
@@ -349,6 +353,7 @@ class StreamingService:
"detail": str(e),
}
}
logger.error(f"Run {run_id} stopped with unknown error: {e}, error_data: {error_data}")
stop_reason = StopReasonType.error
yield (f"data: {json.dumps(error_data)}\n\n", 500)
# Re-raise to ensure proper error handling and Sentry capture

View File

@@ -195,6 +195,43 @@ async def test_update_run_by_id(server: SyncServer, sarah_agent, default_user):
assert updated_run.status == RunStatus.completed
@pytest.mark.asyncio
async def test_update_run_metadata_persistence(server: SyncServer, sarah_agent, default_user):
    """Verify that run metadata survives an update and a subsequent DB fetch.

    Mirrors the streaming service's failure path: a run is created with
    benign metadata, then marked failed with LLM-timeout error metadata,
    and that metadata must be visible both on the object returned by the
    update call and on a fresh read from the database.
    """
    # Create a run carrying some initial, non-error metadata.
    created_run = await server.run_manager.create_run(
        pydantic_run=PydanticRun(
            metadata={"type": "test", "initial": "value"},
            agent_id=sarah_agent.id,
        ),
        actor=default_user,
    )
    # The freshly created run should echo back exactly what it was given.
    assert created_run.metadata == {"type": "test", "initial": "value"}

    # Simulate the streaming service recording an LLM timeout failure.
    timeout_error = {
        "error": {"type": "llm_timeout", "message": "The LLM request timed out. Please try again.", "detail": "Timeout after 30s"}
    }
    failure_update = RunUpdate(
        status=RunStatus.failed,
        stop_reason=StopReasonType.llm_api_error,
        metadata=timeout_error,
    )
    updated_run = await server.run_manager.update_run_by_id_async(
        created_run.id,
        failure_update,
        actor=default_user,
    )

    # The returned run reflects both the failure state and the new metadata.
    assert updated_run.status == RunStatus.failed
    assert updated_run.stop_reason == StopReasonType.llm_api_error
    assert updated_run.metadata == timeout_error
    assert "error" in updated_run.metadata
    assert updated_run.metadata["error"]["type"] == "llm_timeout"

    # Re-fetch the run to prove the metadata actually persisted to the DB.
    fetched_run = await server.run_manager.get_run_by_id(created_run.id, actor=default_user)
    assert fetched_run.metadata == timeout_error
    assert "error" in fetched_run.metadata
    assert fetched_run.metadata["error"]["type"] == "llm_timeout"
@pytest.mark.asyncio
async def test_delete_run_by_id(server: SyncServer, sarah_agent, default_user):
"""Test deleting a run by its ID."""