diff --git a/letta/adapters/letta_llm_stream_adapter.py b/letta/adapters/letta_llm_stream_adapter.py index 1d9aa396..c345ee74 100644 --- a/letta/adapters/letta_llm_stream_adapter.py +++ b/letta/adapters/letta_llm_stream_adapter.py @@ -1,6 +1,7 @@ from typing import AsyncGenerator from letta.adapters.letta_llm_adapter import LettaLLMAdapter +from letta.errors import LLMError from letta.helpers.datetime_helpers import get_utc_timestamp_ns from letta.interfaces.anthropic_streaming_interface import AnthropicStreamingInterface from letta.interfaces.openai_streaming_interface import OpenAIStreamingInterface @@ -133,6 +134,8 @@ class LettaLLMStreamAdapter(LettaLLMAdapter): error_msg=str(e), error_type=type(e).__name__, ) + if isinstance(e, LLMError): + raise raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config) # After streaming completes, extract the accumulated data diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py index 368ba5e7..4eaa6bb2 100644 --- a/letta/llm_api/google_vertex_client.py +++ b/letta/llm_api/google_vertex_client.py @@ -895,7 +895,9 @@ class GoogleVertexClient(LLMClientBase): # Handle specific error codes if e.code == 400: error_str = str(e).lower() - if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str): + if ("context" in error_str or "token count" in error_str or "tokens allowed" in error_str) and ( + "exceed" in error_str or "limit" in error_str or "too long" in error_str + ): return ContextWindowExceededError( message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}", details={"is_byok": is_byok}, diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index b48280cb..674088ba 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -1098,6 +1098,7 @@ class OpenAIClient(LLMClientBase): else: body_details = e.body if isinstance(e.body, dict) else {"body": e.body} return LLMBadRequestError( + message=f"Bad request to OpenAI-compatible endpoint: {str(e)}", code=ErrorCode.INVALID_ARGUMENT, details={**body_details, "is_byok": is_byok}, ) diff --git a/tests/adapters/test_letta_llm_stream_adapter_error_handling.py b/tests/adapters/test_letta_llm_stream_adapter_error_handling.py index 58951c59..4d3842fc 100644 --- a/tests/adapters/test_letta_llm_stream_adapter_error_handling.py +++ b/tests/adapters/test_letta_llm_stream_adapter_error_handling.py @@ -1,10 +1,12 @@ import anthropic import httpx import pytest +from google.genai import errors as google_errors from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter -from letta.errors import ContextWindowExceededError, LLMConnectionError, LLMServerError +from letta.errors import ContextWindowExceededError, LLMBadRequestError, LLMConnectionError, LLMError, LLMServerError from letta.llm_api.anthropic_client import AnthropicClient +from letta.llm_api.google_vertex_client import GoogleVertexClient from letta.schemas.enums import LLMCallType from letta.schemas.llm_config import LLMConfig @@ -188,3 +190,48 @@ def test_anthropic_client_handle_llm_error_request_too_large_string(): assert isinstance(result, ContextWindowExceededError) assert "request_too_large" in result.message.lower() or "context window exceeded" in result.message.lower() + + +@pytest.mark.parametrize( + "error_message", + [ + "The input token count exceeds the maximum number of tokens allowed 1048576.", + "Token count of 1500000 exceeds the model limit of 1048576 tokens allowed.", + ], + ids=["gemini-token-count-exceeds", "gemini-tokens-allowed-limit"], +) +def test_google_client_handle_llm_error_token_limit_returns_context_window_exceeded(error_message): + """Google 400 errors about token limits should map to ContextWindowExceededError.""" + client = GoogleVertexClient.__new__(GoogleVertexClient) + response_json = { + "message": f'{{"error": {{"code": 400, "message": "{error_message}", "status": "INVALID_ARGUMENT"}}}}', + "status": "Bad Request", + } + error = google_errors.ClientError(400, response_json) + result = client.handle_llm_error(error) + assert isinstance(result, ContextWindowExceededError) + + +def test_google_client_handle_llm_error_context_exceeded_returns_context_window_exceeded(): + """Google 400 errors with 'context' + 'exceeded' should map to ContextWindowExceededError.""" + client = GoogleVertexClient.__new__(GoogleVertexClient) + response_json = { + "message": '{"error": {"code": 400, "message": "Request context window exceeded the limit.", "status": "INVALID_ARGUMENT"}}', + "status": "Bad Request", + } + error = google_errors.ClientError(400, response_json) + result = client.handle_llm_error(error) + assert isinstance(result, ContextWindowExceededError) + + +def test_google_client_handle_llm_error_generic_400_returns_bad_request(): + """Google 400 errors without token/context keywords should map to LLMBadRequestError.""" + client = GoogleVertexClient.__new__(GoogleVertexClient) + response_json = { + "message": '{"error": {"code": 400, "message": "Invalid argument: unsupported parameter.", "status": "INVALID_ARGUMENT"}}', + "status": "Bad Request", + } + error = google_errors.ClientError(400, response_json) + result = client.handle_llm_error(error) + assert isinstance(result, LLMBadRequestError) + assert not isinstance(result, ContextWindowExceededError)