From 382e216cbbf08b8ba3bd57b5fca789ffe8fd03b9 Mon Sep 17 00:00:00 2001 From: Kian Jones <11655409+kianjones9@users.noreply.github.com> Date: Tue, 10 Feb 2026 20:38:25 -0800 Subject: [PATCH] fix(core): differentiate BYOK vs base provider in all LLM error details (#9425) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add is_byok flag to every LLMError's details dict returned from handle_llm_error across all providers (OpenAI, Anthropic, Google, ChatGPT OAuth). This enables observability into whether errors originate from Letta's production keys or user-provided BYOK keys. The rate limit handler in app.py now returns a more helpful message for BYOK users ("check your provider's rate limits and billing") versus the generic message for base provider rate limits. Datadog issues: - https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000 - https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000 - https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000 🤖 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta --- letta/adapters/letta_llm_stream_adapter.py | 4 +- letta/adapters/simple_llm_request_adapter.py | 2 +- letta/adapters/simple_llm_stream_adapter.py | 4 +- letta/agents/letta_agent.py | 2 +- letta/llm_api/anthropic_client.py | 26 ++++++++--- letta/llm_api/azure_client.py | 2 +- letta/llm_api/chatgpt_oauth_client.py | 18 ++++++-- letta/llm_api/google_vertex_client.py | 45 ++++++++++++------ letta/llm_api/llm_client_base.py | 13 ++++-- letta/llm_api/openai_client.py | 48 ++++++++++++++------ letta/server/rest_api/app.py | 9 +++- letta/services/summarizer/summarizer.py | 4 +- 12 files changed, 123 insertions(+), 54 deletions(-) diff --git a/letta/adapters/letta_llm_stream_adapter.py b/letta/adapters/letta_llm_stream_adapter.py index 2bb7ed9c..1d9aa396 100644 --- a/letta/adapters/letta_llm_stream_adapter.py +++ b/letta/adapters/letta_llm_stream_adapter.py @@ -114,7 +114,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter): error_msg=str(e), error_type=type(e).__name__, ) - raise self.llm_client.handle_llm_error(e) + raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config) # Process the stream and yield chunks immediately for TTFT # Wrap in error handling to convert provider errors to common LLMError types @@ -133,7 +133,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter): error_msg=str(e), error_type=type(e).__name__, ) - raise self.llm_client.handle_llm_error(e) + raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config) # After streaming completes, extract the accumulated data self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns() diff --git a/letta/adapters/simple_llm_request_adapter.py b/letta/adapters/simple_llm_request_adapter.py index 8ab2a904..a2818f50 100644 --- a/letta/adapters/simple_llm_request_adapter.py +++ b/letta/adapters/simple_llm_request_adapter.py @@ -54,7 +54,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter): try: self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config) except Exception as e: - raise self.llm_client.handle_llm_error(e) + raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config) self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns() diff --git a/letta/adapters/simple_llm_stream_adapter.py b/letta/adapters/simple_llm_stream_adapter.py index 216c437c..80da0a64 100644 --- a/letta/adapters/simple_llm_stream_adapter.py +++ b/letta/adapters/simple_llm_stream_adapter.py @@ -151,7 +151,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter): error_msg=str(e), error_type=type(e).__name__, ) - raise self.llm_client.handle_llm_error(e) + raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config) # Process the stream and yield chunks immediately for TTFT try: @@ -169,7 +169,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter): error_msg=str(e), error_type=type(e).__name__, ) - raise self.llm_client.handle_llm_error(e) + raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config) # After streaming completes, extract the accumulated data self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns() diff --git a/letta/agents/letta_agent.py b/letta/agents/letta_agent.py index 98e56205..219762e7 100644 --- a/letta/agents/letta_agent.py +++ b/letta/agents/letta_agent.py @@ -1547,7 +1547,7 @@ class LettaAgent(BaseAgent): step_id=step_id, ) else: - raise llm_client.handle_llm_error(e) + raise llm_client.handle_llm_error(e, llm_config=llm_config) @trace_method async def _rebuild_context_window( diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py index f3903dfe..442aceea 100644 --- a/letta/llm_api/anthropic_client.py +++ b/letta/llm_api/anthropic_client.py @@ -37,6 +37,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG from letta.log import get_logger from letta.otel.tracing import trace_method from letta.schemas.agent import AgentType +from letta.schemas.enums import ProviderCategory from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.schemas.openai.chat_completion_request import Tool as OpenAITool @@ -937,7 +938,9 @@ class AnthropicClient(LLMClientBase): ) @trace_method - def handle_llm_error(self, e: Exception) -> Exception: + def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception: + is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None + # make sure to check for overflow errors, regardless of error type error_str = str(e).lower() if ( @@ -952,6 +955,7 @@ class AnthropicClient(LLMClientBase): logger.warning(f"[Anthropic] Context window exceeded: {str(e)}") return ContextWindowExceededError( message=f"Context window exceeded for Anthropic: {str(e)}", + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.APITimeoutError): @@ -959,7 +963,7 @@ class AnthropicClient(LLMClientBase): return LLMTimeoutError( message=f"Request to Anthropic timed out: {str(e)}", code=ErrorCode.TIMEOUT, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) if isinstance(e, anthropic.APIConnectionError): @@ -967,7 +971,7 @@ class AnthropicClient(LLMClientBase): return LLMConnectionError( message=f"Failed to connect to Anthropic: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) # Handle httpx.RemoteProtocolError which can occur during streaming @@ -978,7 +982,7 @@ class AnthropicClient(LLMClientBase): return LLMConnectionError( message=f"Connection error during Anthropic streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) # Handle httpx network errors which can occur during streaming @@ -988,7 +992,7 @@ class AnthropicClient(LLMClientBase): return LLMConnectionError( message=f"Network error during Anthropic streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok}, ) if isinstance(e, anthropic.RateLimitError): @@ -996,6 +1000,7 @@ class AnthropicClient(LLMClientBase): return LLMRateLimitError( message=f"Rate limited by Anthropic: {str(e)}", code=ErrorCode.RATE_LIMIT_EXCEEDED, + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.BadRequestError): @@ -1013,11 +1018,13 @@ class AnthropicClient(LLMClientBase): # 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'input length and `max_tokens` exceed context limit: 173298 + 32000 > 200000, decrease input length or `max_tokens` and try again'}} return ContextWindowExceededError( message=f"Bad request to Anthropic (context window exceeded): {str(e)}", + details={"is_byok": is_byok}, ) else: return LLMBadRequestError( message=f"Bad request to Anthropic: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.AuthenticationError): @@ -1025,6 +1032,7 @@ class AnthropicClient(LLMClientBase): return LLMAuthenticationError( message=f"Authentication failed with Anthropic: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.PermissionDeniedError): @@ -1032,6 +1040,7 @@ class AnthropicClient(LLMClientBase): return LLMPermissionDeniedError( message=f"Permission denied by Anthropic: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.NotFoundError): @@ -1039,6 +1048,7 @@ class AnthropicClient(LLMClientBase): return LLMNotFoundError( message=f"Resource not found in Anthropic: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.UnprocessableEntityError): @@ -1046,6 +1056,7 @@ class AnthropicClient(LLMClientBase): return LLMUnprocessableEntityError( message=f"Invalid request content for Anthropic: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) if isinstance(e, anthropic.APIStatusError): @@ -1055,11 +1066,13 @@ class AnthropicClient(LLMClientBase): logger.warning(f"[Anthropic] Request too large (413): {str(e)}") return ContextWindowExceededError( message=f"Request too large for Anthropic (413): {str(e)}", + details={"is_byok": is_byok}, ) if "overloaded" in str(e).lower(): return LLMProviderOverloaded( message=f"Anthropic API is overloaded: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) return LLMServerError( message=f"Anthropic API error: {str(e)}", @@ -1067,10 +1080,11 @@ class AnthropicClient(LLMClientBase): details={ "status_code": e.status_code if hasattr(e, "status_code") else None, "response": str(e.response) if hasattr(e, "response") else None, + "is_byok": is_byok, }, ) - return super().handle_llm_error(e) + return super().handle_llm_error(e, llm_config=llm_config) def extract_usage_statistics(self, response_data: dict | None, llm_config: LLMConfig) -> LettaUsageStatistics: """Extract usage statistics from Anthropic response and return as LettaUsageStatistics.""" diff --git a/letta/llm_api/azure_client.py b/letta/llm_api/azure_client.py index 80926aec..59085100 100644 --- a/letta/llm_api/azure_client.py +++ b/letta/llm_api/azure_client.py @@ -97,7 +97,7 @@ class AzureClient(OpenAIClient): response: ChatCompletion = await client.chat.completions.create(**request_data) return response.model_dump() except Exception as e: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) @trace_method async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk | ResponseStreamEvent]: diff --git a/letta/llm_api/chatgpt_oauth_client.py b/letta/llm_api/chatgpt_oauth_client.py index 2a25e7b6..96a0f15b 100644 --- a/letta/llm_api/chatgpt_oauth_client.py +++ b/letta/llm_api/chatgpt_oauth_client.py @@ -1019,29 +1019,33 @@ class ChatGPTOAuthClient(LLMClientBase): return "o1" in model or "o3" in model or "o4" in model or "gpt-5" in model @trace_method - def handle_llm_error(self, e: Exception) -> Exception: + def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception: """Map ChatGPT-specific errors to common LLMError types. Args: e: Original exception. + llm_config: Optional LLM config to determine if this is a BYOK key. Returns: Mapped LLMError subclass. """ + is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None + # Already a typed LLM/Letta error (e.g. from SSE error handling) — pass through if isinstance(e, LettaError): return e if isinstance(e, httpx.HTTPStatusError): - return self._handle_http_error(e) + return self._handle_http_error(e, is_byok=is_byok) - return super().handle_llm_error(e) + return super().handle_llm_error(e, llm_config=llm_config) - def _handle_http_error(self, e: httpx.HTTPStatusError) -> Exception: + def _handle_http_error(self, e: httpx.HTTPStatusError, is_byok: bool | None = None) -> Exception: """Handle HTTP status errors from ChatGPT backend. Args: e: HTTP status error. + is_byok: Whether the request used a BYOK key. Returns: Appropriate LLMError subclass. @@ -1059,30 +1063,36 @@ class ChatGPTOAuthClient(LLMClientBase): return LLMAuthenticationError( message=f"ChatGPT authentication failed: {error_message}", code=ErrorCode.UNAUTHENTICATED, + details={"is_byok": is_byok}, ) elif status_code == 429: return LLMRateLimitError( message=f"ChatGPT rate limit exceeded: {error_message}", code=ErrorCode.RATE_LIMIT_EXCEEDED, + details={"is_byok": is_byok}, ) elif status_code == 400: if "context" in error_message.lower() or "token" in error_message.lower(): return ContextWindowExceededError( message=f"ChatGPT context window exceeded: {error_message}", + details={"is_byok": is_byok}, ) return LLMBadRequestError( message=f"ChatGPT bad request: {error_message}", code=ErrorCode.INVALID_ARGUMENT, + details={"is_byok": is_byok}, ) elif status_code >= 500: return LLMServerError( message=f"ChatGPT server error: {error_message}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) else: return LLMBadRequestError( message=f"ChatGPT request failed ({status_code}): {error_message}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) def _handle_sse_error_event(self, raw_event: dict) -> Exception: diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py index 9b41e918..49fc64a8 100644 --- a/letta/llm_api/google_vertex_client.py +++ b/letta/llm_api/google_vertex_client.py @@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash from letta.log import get_logger from letta.otel.tracing import trace_method from letta.schemas.agent import AgentType +from letta.schemas.enums import ProviderCategory from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool @@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase): code=ErrorCode.INTERNAL_SERVER_ERROR, ) except Exception as e: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) @trace_method async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict: @@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase): logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}") retry_count += 1 if retry_count > self.MAX_RETRIES: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) continue - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) except Exception as e: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) response_data = response.model_dump() is_malformed_function_call = self.is_malformed_function_call(response_data) if is_malformed_function_call: @@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase): if e.code == 499: logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}") return - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) except errors.APIError as e: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) @staticmethod def add_dummy_model_messages(messages: List[dict]) -> List[dict]: @@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase): return False @trace_method - def handle_llm_error(self, e: Exception) -> Exception: + def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception: + is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None + # Handle Google GenAI specific errors if isinstance(e, errors.ClientError): if e.code == 499: @@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase): return LLMConnectionError( message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"status_code": 499, "cause": "client_cancelled"}, + details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok}, ) logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}") @@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase): if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str): return ContextWindowExceededError( message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}", + details={"is_byok": is_byok}, ) else: return LLMBadRequestError( message=f"Bad request to {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) elif e.code == 401: return LLMAuthenticationError( message=f"Authentication failed with {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) elif e.code == 403: return LLMPermissionDeniedError( message=f"Permission denied by {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) elif e.code == 404: return LLMNotFoundError( message=f"Resource not found in {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) elif e.code == 408: return LLMTimeoutError( message=f"Request to {self._provider_name()} timed out: {str(e)}", code=ErrorCode.TIMEOUT, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) elif e.code == 422: return LLMUnprocessableEntityError( message=f"Invalid request content for {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, + details={"is_byok": is_byok}, ) elif e.code == 429: logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.") return LLMRateLimitError( message=f"Rate limited by {self._provider_name()}: {str(e)}", code=ErrorCode.RATE_LIMIT_EXCEEDED, + details={"is_byok": is_byok}, ) else: return LLMServerError( @@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase): details={ "status_code": e.code, "response_json": getattr(e, "response_json", None), + "is_byok": is_byok, }, ) @@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase): details={ "status_code": e.code, "response_json": getattr(e, "response_json", None), + "is_byok": is_byok, }, ) elif e.code == 502: return LLMConnectionError( message=f"Bad gateway from {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) elif e.code == 503: return LLMServerError( @@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase): details={ "status_code": e.code, "response_json": getattr(e, "response_json", None), + "is_byok": is_byok, }, ) elif e.code == 504: return LLMTimeoutError( message=f"Gateway timeout from {self._provider_name()}: {str(e)}", code=ErrorCode.TIMEOUT, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) else: return LLMServerError( @@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase): details={ "status_code": e.code, "response_json": getattr(e, "response_json", None), + "is_byok": is_byok, }, ) @@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase): details={ "status_code": e.code, "response_json": getattr(e, "response_json", None), + "is_byok": is_byok, }, ) @@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase): return LLMConnectionError( message=f"Connection error during {self._provider_name()} streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) # Handle httpx network errors which can occur during streaming @@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase): return LLMConnectionError( message=f"Network error during {self._provider_name()} streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok}, ) # Handle connection-related errors @@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase): return LLMConnectionError( message=f"Failed to connect to {self._provider_name()}: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) # Fallback to base implementation for other errors - return super().handle_llm_error(e) + return super().handle_llm_error(e, llm_config=llm_config) async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int: """ diff --git a/letta/llm_api/llm_client_base.py b/letta/llm_api/llm_client_base.py index be54b6b9..b4b8d69d 100644 --- a/letta/llm_api/llm_client_base.py +++ b/letta/llm_api/llm_client_base.py @@ -226,7 +226,7 @@ class LLMClientBase: ) log_event(name="llm_response_received", attributes=response_data) except Exception as e: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) return await self.convert_response_to_chat_completion(response_data, messages, llm_config) @@ -261,7 +261,7 @@ class LLMClientBase: log_event(name="llm_response_received", attributes=response_data) except Exception as e: - raise self.handle_llm_error(e) + raise self.handle_llm_error(e, llm_config=llm_config) return await self.convert_response_to_chat_completion(response_data, messages, llm_config) @@ -353,17 +353,20 @@ class LLMClientBase: raise NotImplementedError @abstractmethod - def handle_llm_error(self, e: Exception) -> Exception: + def handle_llm_error(self, e: Exception, llm_config: Optional["LLMConfig"] = None) -> Exception: """ Maps provider-specific errors to common LLMError types. Each LLM provider should implement this to translate their specific errors. Args: e: The original provider-specific exception + llm_config: Optional LLM config to determine if this is a BYOK key Returns: An LLMError subclass that represents the error in a provider-agnostic way """ + is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None + # Handle httpx.RemoteProtocolError which can occur during streaming # when the remote server closes the connection unexpectedly # (e.g., "peer closed connection without sending complete message body") @@ -375,10 +378,10 @@ class LLMClientBase: return LLMConnectionError( message=f"Connection error during streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) - return LLMError(f"Unhandled LLM error: {str(e)}") + return LLMError(message=f"Unhandled LLM error: {str(e)}", details={"is_byok": is_byok}) def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]: """ diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index 93ddc32b..7b6f84e2 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -40,6 +40,7 @@ from letta.log import get_logger from letta.otel.tracing import trace_method from letta.schemas.agent import AgentType from letta.schemas.embedding_config import EmbeddingConfig +from letta.schemas.enums import ProviderCategory from letta.schemas.letta_message_content import MessageContentType from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage @@ -1015,10 +1016,11 @@ class OpenAIClient(LLMClientBase): return results @trace_method - def handle_llm_error(self, e: Exception) -> Exception: + def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception: """ Maps OpenAI-specific errors to common LLMError types. """ + is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None if isinstance(e, openai.APITimeoutError): timeout_duration = getattr(e, "timeout", "unknown") logger.warning(f"[OpenAI] Request timeout after {timeout_duration} seconds: {e}") @@ -1028,6 +1030,7 @@ class OpenAIClient(LLMClientBase): details={ "timeout_duration": timeout_duration, "cause": str(e.__cause__) if e.__cause__ else None, + "is_byok": is_byok, }, ) @@ -1036,7 +1039,7 @@ class OpenAIClient(LLMClientBase): return LLMConnectionError( message=f"Failed to connect to OpenAI: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) # Handle httpx.RemoteProtocolError which can occur during streaming @@ -1047,7 +1050,7 @@ class OpenAIClient(LLMClientBase): return LLMConnectionError( message=f"Connection error during OpenAI streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok}, ) # Handle httpx network errors which can occur during streaming @@ -1057,15 +1060,16 @@ class OpenAIClient(LLMClientBase): return LLMConnectionError( message=f"Network error during OpenAI streaming: {str(e)}", code=ErrorCode.INTERNAL_SERVER_ERROR, - details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__}, + details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok}, ) if isinstance(e, openai.RateLimitError): logger.warning(f"[OpenAI] Rate limited (429). Consider backoff. Error: {e}") + body_details = e.body if isinstance(e.body, dict) else {"body": e.body} return LLMRateLimitError( message=f"Rate limited by OpenAI: {str(e)}", code=ErrorCode.RATE_LIMIT_EXCEEDED, - details=e.body, # Include body which often has rate limit details + details={**body_details, "is_byok": is_byok}, ) if isinstance(e, openai.BadRequestError): @@ -1082,12 +1086,14 @@ class OpenAIClient(LLMClientBase): if error_code == "context_length_exceeded" or is_context_window_overflow_message(str(e)): return ContextWindowExceededError( message=f"Bad request to OpenAI (context window exceeded): {str(e)}", + details={"is_byok": is_byok}, ) else: + body_details = e.body if isinstance(e.body, dict) else {"body": e.body} return LLMBadRequestError( message=f"Bad request to OpenAI: {str(e)}", - code=ErrorCode.INVALID_ARGUMENT, # Or more specific if detectable - details=e.body, + code=ErrorCode.INVALID_ARGUMENT, + details={**body_details, "is_byok": is_byok}, ) # NOTE: The OpenAI Python SDK may raise a generic `openai.APIError` while *iterating* @@ -1104,34 +1110,46 @@ class OpenAIClient(LLMClientBase): message=f"OpenAI request exceeded the context window: {msg}", details={ "provider_exception_type": type(e).__name__, - # Best-effort extraction (may not exist on APIError) "body": getattr(e, "body", None), + "is_byok": is_byok, }, ) if isinstance(e, openai.AuthenticationError): logger.error(f"[OpenAI] Authentication error (401): {str(e)}") # More severe log level + body_details = e.body if isinstance(e.body, dict) else {"body": e.body} return LLMAuthenticationError( - message=f"Authentication failed with OpenAI: {str(e)}", code=ErrorCode.UNAUTHENTICATED, details=e.body + message=f"Authentication failed with OpenAI: {str(e)}", + code=ErrorCode.UNAUTHENTICATED, + details={**body_details, "is_byok": is_byok}, ) if isinstance(e, openai.PermissionDeniedError): logger.error(f"[OpenAI] Permission denied (403): {str(e)}") # More severe log level + body_details = e.body if isinstance(e.body, dict) else {"body": e.body} return LLMPermissionDeniedError( - message=f"Permission denied by OpenAI: {str(e)}", code=ErrorCode.PERMISSION_DENIED, details=e.body + message=f"Permission denied by OpenAI: {str(e)}", + code=ErrorCode.PERMISSION_DENIED, + details={**body_details, "is_byok": is_byok}, ) if isinstance(e, openai.NotFoundError): logger.warning(f"[OpenAI] Resource not found (404): {str(e)}") # Could be invalid model name, etc. - return LLMNotFoundError(message=f"Resource not found in OpenAI: {str(e)}", code=ErrorCode.NOT_FOUND, details=e.body) + body_details = e.body if isinstance(e.body, dict) else {"body": e.body} + return LLMNotFoundError( + message=f"Resource not found in OpenAI: {str(e)}", + code=ErrorCode.NOT_FOUND, + details={**body_details, "is_byok": is_byok}, + ) if isinstance(e, openai.UnprocessableEntityError): logger.warning(f"[OpenAI] Unprocessable entity (422): {str(e)}") + body_details = e.body if isinstance(e.body, dict) else {"body": e.body} return LLMUnprocessableEntityError( message=f"Invalid request content for OpenAI: {str(e)}", - code=ErrorCode.INVALID_ARGUMENT, # Usually validation errors - details=e.body, + code=ErrorCode.INVALID_ARGUMENT, + details={**body_details, "is_byok": is_byok}, ) # General API error catch-all @@ -1141,6 +1159,7 @@ class OpenAIClient(LLMClientBase): if e.status_code == 413: return ContextWindowExceededError( message=f"Request too large for OpenAI (413): {str(e)}", + details={"is_byok": is_byok}, ) # Map based on status code potentially if e.status_code >= 500: @@ -1158,11 +1177,12 @@ class OpenAIClient(LLMClientBase): "status_code": e.status_code, "response": str(e.response), "body": e.body, + "is_byok": is_byok, }, ) # Fallback for unexpected errors - return super().handle_llm_error(e) + return super().handle_llm_error(e, llm_config=llm_config) def fill_image_content_in_messages(openai_message_list: List[dict], pydantic_message_list: List[PydanticMessage]) -> List[dict]: diff --git a/letta/server/rest_api/app.py b/letta/server/rest_api/app.py index 56a2c500..d5842cc9 100644 --- a/letta/server/rest_api/app.py +++ b/letta/server/rest_api/app.py @@ -668,12 +668,19 @@ def create_application() -> "FastAPI": @app.exception_handler(LLMRateLimitError) async def llm_rate_limit_error_handler(request: Request, exc: LLMRateLimitError): + is_byok = exc.details.get("is_byok") if isinstance(exc.details, dict) else None + if is_byok: + message = ( + "Rate limit exceeded on your API key. Please check your provider's rate limits and billing, or reduce request frequency." + ) + else: + message = "Rate limit exceeded for LLM model provider. Please wait before making another request." return JSONResponse( status_code=429, content={ "error": { "type": "llm_rate_limit", - "message": "Rate limit exceeded for LLM model provider. Please wait before making another request.", + "message": message, "detail": str(exc), } }, diff --git a/letta/services/summarizer/summarizer.py b/letta/services/summarizer/summarizer.py index a3247497..e7aabbae 100644 --- a/letta/services/summarizer/summarizer.py +++ b/letta/services/summarizer/summarizer.py @@ -592,7 +592,7 @@ async def simple_summary( except Exception as e: # handle LLM error (likely a context window exceeded error) try: - raise llm_client.handle_llm_error(e) + raise llm_client.handle_llm_error(e, llm_config=llm_config) except ContextWindowExceededError as context_error: logger.warning(f"Context window exceeded during summarization. Applying clamping fallbacks. Original error: {context_error}") @@ -667,7 +667,7 @@ async def simple_summary( except Exception as fallback_error_b: logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.") logger.info(f"Full fallback summarization payload: {request_data}") - raise llm_client.handle_llm_error(fallback_error_b) + raise llm_client.handle_llm_error(fallback_error_b, llm_config=llm_config) logger.info(f"Summarized {len(messages)}: {summary}")