fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)

Add is_byok flag to every LLMError's details dict returned from handle_llm_error across all providers (OpenAI, Anthropic, Google, ChatGPT OAuth). This enables observability into whether errors originate from Letta's production keys or user-provided BYOK keys.

The rate limit handler in app.py now returns a more helpful message for BYOK users ("check your provider's rate limits and billing") versus the generic message for base provider rate limits.

Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: Letta <noreply@letta.com>
2026-02-10 20:38:25 -08:00
parent 424a1ada64
commit 382e216cbb
12 changed files with 123 additions and 54 deletions
--- a/letta/adapters/letta_llm_stream_adapter.py
+++ b/letta/adapters/letta_llm_stream_adapter.py
@@ -114,7 +114,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
                error_msg=str(e),
                error_type=type(e).__name__,
            )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)

        # Process the stream and yield chunks immediately for TTFT
        # Wrap in error handling to convert provider errors to common LLMError types
@@ -133,7 +133,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
                error_msg=str(e),
                error_type=type(e).__name__,
            )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)

        # After streaming completes, extract the accumulated data
        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
--- a/letta/adapters/simple_llm_request_adapter.py
+++ b/letta/adapters/simple_llm_request_adapter.py
@@ -54,7 +54,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
        try:
            self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
        except Exception as e:
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)

        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

--- a/letta/adapters/simple_llm_stream_adapter.py
+++ b/letta/adapters/simple_llm_stream_adapter.py
@@ -151,7 +151,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
                error_msg=str(e),
                error_type=type(e).__name__,
            )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)

        # Process the stream and yield chunks immediately for TTFT
        try:
@@ -169,7 +169,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
                error_msg=str(e),
                error_type=type(e).__name__,
            )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)

        # After streaming completes, extract the accumulated data
        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
--- a/letta/agents/letta_agent.py
+++ b/letta/agents/letta_agent.py
@@ -1547,7 +1547,7 @@ class LettaAgent(BaseAgent):
                step_id=step_id,
            )
        else:
-            raise llm_client.handle_llm_error(e)
+            raise llm_client.handle_llm_error(e, llm_config=llm_config)

    @trace_method
    async def _rebuild_context_window(
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -37,6 +37,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool as OpenAITool
@@ -937,7 +938,9 @@ class AnthropicClient(LLMClientBase):
        )

    @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
        # make sure to check for overflow errors, regardless of error type
        error_str = str(e).lower()
        if (
@@ -952,6 +955,7 @@ class AnthropicClient(LLMClientBase):
            logger.warning(f"[Anthropic] Context window exceeded: {str(e)}")
            return ContextWindowExceededError(
                message=f"Context window exceeded for Anthropic: {str(e)}",
+                details={"is_byok": is_byok},
            )

        if isinstance(e, anthropic.APITimeoutError):
@@ -959,7 +963,7 @@ class AnthropicClient(LLMClientBase):
            return LLMTimeoutError(
                message=f"Request to Anthropic timed out: {str(e)}",
                code=ErrorCode.TIMEOUT,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        if isinstance(e, anthropic.APIConnectionError):
@@ -967,7 +971,7 @@ class AnthropicClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Failed to connect to Anthropic: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Handle httpx.RemoteProtocolError which can occur during streaming
@@ -978,7 +982,7 @@ class AnthropicClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Connection error during Anthropic streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Handle httpx network errors which can occur during streaming
@@ -988,7 +992,7 @@ class AnthropicClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Network error during Anthropic streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
            )

        if isinstance(e, anthropic.RateLimitError):
@@ -996,6 +1000,7 @@ class AnthropicClient(LLMClientBase):
            return LLMRateLimitError(
                message=f"Rate limited by Anthropic: {str(e)}",
                code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                details={"is_byok": is_byok},
            )

        if isinstance(e, anthropic.BadRequestError):
@@ -1013,11 +1018,13 @@ class AnthropicClient(LLMClientBase):
                # 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'input length and `max_tokens` exceed context limit: 173298 + 32000 > 200000, decrease input length or `max_tokens` and try again'}}
                return ContextWindowExceededError(
                    message=f"Bad request to Anthropic (context window exceeded): {str(e)}",
+                    details={"is_byok": is_byok},
                )
            else:
                return LLMBadRequestError(
                    message=f"Bad request to Anthropic: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )

        if isinstance(e, anthropic.AuthenticationError):
@@ -1025,6 +1032,7 @@ class AnthropicClient(LLMClientBase):
            return LLMAuthenticationError(
                message=f"Authentication failed with Anthropic: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
            )

        if isinstance(e, anthropic.PermissionDeniedError):
@@ -1032,6 +1040,7 @@ class AnthropicClient(LLMClientBase):
            return LLMPermissionDeniedError(
                message=f"Permission denied by Anthropic: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
            )

        if isinstance(e, anthropic.NotFoundError):
@@ -1039,6 +1048,7 @@ class AnthropicClient(LLMClientBase):
            return LLMNotFoundError(
                message=f"Resource not found in Anthropic: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
            )

        if isinstance(e, anthropic.UnprocessableEntityError):
@@ -1046,6 +1056,7 @@ class AnthropicClient(LLMClientBase):
            return LLMUnprocessableEntityError(
                message=f"Invalid request content for Anthropic: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
            )

        if isinstance(e, anthropic.APIStatusError):
@@ -1055,11 +1066,13 @@ class AnthropicClient(LLMClientBase):
                logger.warning(f"[Anthropic] Request too large (413): {str(e)}")
                return ContextWindowExceededError(
                    message=f"Request too large for Anthropic (413): {str(e)}",
+                    details={"is_byok": is_byok},
                )
            if "overloaded" in str(e).lower():
                return LLMProviderOverloaded(
                    message=f"Anthropic API is overloaded: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            return LLMServerError(
                message=f"Anthropic API error: {str(e)}",
@@ -1067,10 +1080,11 @@ class AnthropicClient(LLMClientBase):
                details={
                    "status_code": e.status_code if hasattr(e, "status_code") else None,
                    "response": str(e.response) if hasattr(e, "response") else None,
+                    "is_byok": is_byok,
                },
            )

-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)

    def extract_usage_statistics(self, response_data: dict | None, llm_config: LLMConfig) -> LettaUsageStatistics:
        """Extract usage statistics from Anthropic response and return as LettaUsageStatistics."""
--- a/letta/llm_api/azure_client.py
+++ b/letta/llm_api/azure_client.py
@@ -97,7 +97,7 @@ class AzureClient(OpenAIClient):
                response: ChatCompletion = await client.chat.completions.create(**request_data)
                return response.model_dump()
        except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

    @trace_method
    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk | ResponseStreamEvent]:
--- a/letta/llm_api/chatgpt_oauth_client.py
+++ b/letta/llm_api/chatgpt_oauth_client.py
@@ -1019,29 +1019,33 @@ class ChatGPTOAuthClient(LLMClientBase):
        return "o1" in model or "o3" in model or "o4" in model or "gpt-5" in model

    @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
        """Map ChatGPT-specific errors to common LLMError types.

        Args:
            e: Original exception.
+            llm_config: Optional LLM config to determine if this is a BYOK key.

        Returns:
            Mapped LLMError subclass.
        """
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
        # Already a typed LLM/Letta error (e.g. from SSE error handling) — pass through
        if isinstance(e, LettaError):
            return e

        if isinstance(e, httpx.HTTPStatusError):
-            return self._handle_http_error(e)
+            return self._handle_http_error(e, is_byok=is_byok)

-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)

-    def _handle_http_error(self, e: httpx.HTTPStatusError) -> Exception:
+    def _handle_http_error(self, e: httpx.HTTPStatusError, is_byok: bool | None = None) -> Exception:
        """Handle HTTP status errors from ChatGPT backend.

        Args:
            e: HTTP status error.
+            is_byok: Whether the request used a BYOK key.

        Returns:
            Appropriate LLMError subclass.
@@ -1059,30 +1063,36 @@ class ChatGPTOAuthClient(LLMClientBase):
            return LLMAuthenticationError(
                message=f"ChatGPT authentication failed: {error_message}",
                code=ErrorCode.UNAUTHENTICATED,
+                details={"is_byok": is_byok},
            )
        elif status_code == 429:
            return LLMRateLimitError(
                message=f"ChatGPT rate limit exceeded: {error_message}",
                code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                details={"is_byok": is_byok},
            )
        elif status_code == 400:
            if "context" in error_message.lower() or "token" in error_message.lower():
                return ContextWindowExceededError(
                    message=f"ChatGPT context window exceeded: {error_message}",
+                    details={"is_byok": is_byok},
                )
            return LLMBadRequestError(
                message=f"ChatGPT bad request: {error_message}",
                code=ErrorCode.INVALID_ARGUMENT,
+                details={"is_byok": is_byok},
            )
        elif status_code >= 500:
            return LLMServerError(
                message=f"ChatGPT server error: {error_message}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
            )
        else:
            return LLMBadRequestError(
                message=f"ChatGPT request failed ({status_code}): {error_message}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
            )

    def _handle_sse_error_event(self, raw_event: dict) -> Exception:
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool
@@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase):
                code=ErrorCode.INTERNAL_SERVER_ERROR,
            )
        except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

    @trace_method
    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase):
                    logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
                    retry_count += 1
                    if retry_count > self.MAX_RETRIES:
-                        raise self.handle_llm_error(e)
+                        raise self.handle_llm_error(e, llm_config=llm_config)
                    continue
-                raise self.handle_llm_error(e)
+                raise self.handle_llm_error(e, llm_config=llm_config)
            except Exception as e:
-                raise self.handle_llm_error(e)
+                raise self.handle_llm_error(e, llm_config=llm_config)
            response_data = response.model_dump()
            is_malformed_function_call = self.is_malformed_function_call(response_data)
            if is_malformed_function_call:
@@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase):
            if e.code == 499:
                logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
                return
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
        except errors.APIError as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

    @staticmethod
    def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
@@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase):
        return False

    @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
        # Handle Google GenAI specific errors
        if isinstance(e, errors.ClientError):
            if e.code == 499:
@@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase):
                return LLMConnectionError(
                    message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
-                    details={"status_code": 499, "cause": "client_cancelled"},
+                    details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok},
                )

            logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
@@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase):
                if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
                    return ContextWindowExceededError(
                        message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
+                        details={"is_byok": is_byok},
                    )
                else:
                    return LLMBadRequestError(
                        message=f"Bad request to {self._provider_name()}: {str(e)}",
                        code=ErrorCode.INTERNAL_SERVER_ERROR,
+                        details={"is_byok": is_byok},
                    )
            elif e.code == 401:
                return LLMAuthenticationError(
                    message=f"Authentication failed with {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 403:
                return LLMPermissionDeniedError(
                    message=f"Permission denied by {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 404:
                return LLMNotFoundError(
                    message=f"Resource not found in {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 408:
                return LLMTimeoutError(
                    message=f"Request to {self._provider_name()} timed out: {str(e)}",
                    code=ErrorCode.TIMEOUT,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                )
            elif e.code == 422:
                return LLMUnprocessableEntityError(
                    message=f"Invalid request content for {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 429:
                logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.")
                return LLMRateLimitError(
                    message=f"Rate limited by {self._provider_name()}: {str(e)}",
                    code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                    details={"is_byok": is_byok},
                )
            else:
                return LLMServerError(
@@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )

@@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )
            elif e.code == 502:
                return LLMConnectionError(
                    message=f"Bad gateway from {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                )
            elif e.code == 503:
                return LLMServerError(
@@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )
            elif e.code == 504:
                return LLMTimeoutError(
                    message=f"Gateway timeout from {self._provider_name()}: {str(e)}",
                    code=ErrorCode.TIMEOUT,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                )
            else:
                return LLMServerError(
@@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )

@@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase):
                details={
                    "status_code": e.code,
                    "response_json": getattr(e, "response_json", None),
+                    "is_byok": is_byok,
                },
            )

@@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Connection error during {self._provider_name()} streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Handle httpx network errors which can occur during streaming
@@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Network error during {self._provider_name()} streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
            )

        # Handle connection-related errors
@@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Failed to connect to {self._provider_name()}: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Fallback to base implementation for other errors
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)

    async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int:
        """
--- a/letta/llm_api/llm_client_base.py
+++ b/letta/llm_api/llm_client_base.py
@@ -226,7 +226,7 @@ class LLMClientBase:
                )
            log_event(name="llm_response_received", attributes=response_data)
        except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

        return await self.convert_response_to_chat_completion(response_data, messages, llm_config)

@@ -261,7 +261,7 @@ class LLMClientBase:

            log_event(name="llm_response_received", attributes=response_data)
        except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

        return await self.convert_response_to_chat_completion(response_data, messages, llm_config)

@@ -353,17 +353,20 @@ class LLMClientBase:
        raise NotImplementedError

    @abstractmethod
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional["LLMConfig"] = None) -> Exception:
        """
        Maps provider-specific errors to common LLMError types.
        Each LLM provider should implement this to translate their specific errors.

        Args:
            e: The original provider-specific exception
+            llm_config: Optional LLM config to determine if this is a BYOK key

        Returns:
            An LLMError subclass that represents the error in a provider-agnostic way
        """
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
        # Handle httpx.RemoteProtocolError which can occur during streaming
        # when the remote server closes the connection unexpectedly
        # (e.g., "peer closed connection without sending complete message body")
@@ -375,10 +378,10 @@ class LLMClientBase:
            return LLMConnectionError(
                message=f"Connection error during streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

-        return LLMError(f"Unhandled LLM error: {str(e)}")
+        return LLMError(message=f"Unhandled LLM error: {str(e)}", details={"is_byok": is_byok})

    def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -40,6 +40,7 @@ from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
 from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.letta_message_content import MessageContentType
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
@@ -1015,10 +1016,11 @@ class OpenAIClient(LLMClientBase):
        return results

    @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
        """
        Maps OpenAI-specific errors to common LLMError types.
        """
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
        if isinstance(e, openai.APITimeoutError):
            timeout_duration = getattr(e, "timeout", "unknown")
            logger.warning(f"[OpenAI] Request timeout after {timeout_duration} seconds: {e}")
@@ -1028,6 +1030,7 @@ class OpenAIClient(LLMClientBase):
                details={
                    "timeout_duration": timeout_duration,
                    "cause": str(e.__cause__) if e.__cause__ else None,
+                    "is_byok": is_byok,
                },
            )

@@ -1036,7 +1039,7 @@ class OpenAIClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Failed to connect to OpenAI: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Handle httpx.RemoteProtocolError which can occur during streaming
@@ -1047,7 +1050,7 @@ class OpenAIClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Connection error during OpenAI streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Handle httpx network errors which can occur during streaming
@@ -1057,15 +1060,16 @@ class OpenAIClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Network error during OpenAI streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
            )

        if isinstance(e, openai.RateLimitError):
            logger.warning(f"[OpenAI] Rate limited (429). Consider backoff. Error: {e}")
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
            return LLMRateLimitError(
                message=f"Rate limited by OpenAI: {str(e)}",
                code=ErrorCode.RATE_LIMIT_EXCEEDED,
-                details=e.body,  # Include body which often has rate limit details
+                details={**body_details, "is_byok": is_byok},
            )

        if isinstance(e, openai.BadRequestError):
@@ -1082,12 +1086,14 @@ class OpenAIClient(LLMClientBase):
            if error_code == "context_length_exceeded" or is_context_window_overflow_message(str(e)):
                return ContextWindowExceededError(
                    message=f"Bad request to OpenAI (context window exceeded): {str(e)}",
+                    details={"is_byok": is_byok},
                )
            else:
+                body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
                return LLMBadRequestError(
                    message=f"Bad request to OpenAI: {str(e)}",
-                    code=ErrorCode.INVALID_ARGUMENT,  # Or more specific if detectable
-                    details=e.body,
+                    code=ErrorCode.INVALID_ARGUMENT,
+                    details={**body_details, "is_byok": is_byok},
                )

        # NOTE: The OpenAI Python SDK may raise a generic `openai.APIError` while *iterating*
@@ -1104,34 +1110,46 @@ class OpenAIClient(LLMClientBase):
                    message=f"OpenAI request exceeded the context window: {msg}",
                    details={
                        "provider_exception_type": type(e).__name__,
-                        # Best-effort extraction (may not exist on APIError)
                        "body": getattr(e, "body", None),
+                        "is_byok": is_byok,
                    },
                )

        if isinstance(e, openai.AuthenticationError):
            logger.error(f"[OpenAI] Authentication error (401): {str(e)}")  # More severe log level
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
            return LLMAuthenticationError(
-                message=f"Authentication failed with OpenAI: {str(e)}", code=ErrorCode.UNAUTHENTICATED, details=e.body
+                message=f"Authentication failed with OpenAI: {str(e)}",
+                code=ErrorCode.UNAUTHENTICATED,
+                details={**body_details, "is_byok": is_byok},
            )

        if isinstance(e, openai.PermissionDeniedError):
            logger.error(f"[OpenAI] Permission denied (403): {str(e)}")  # More severe log level
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
            return LLMPermissionDeniedError(
-                message=f"Permission denied by OpenAI: {str(e)}", code=ErrorCode.PERMISSION_DENIED, details=e.body
+                message=f"Permission denied by OpenAI: {str(e)}",
+                code=ErrorCode.PERMISSION_DENIED,
+                details={**body_details, "is_byok": is_byok},
            )

        if isinstance(e, openai.NotFoundError):
            logger.warning(f"[OpenAI] Resource not found (404): {str(e)}")
            # Could be invalid model name, etc.
-            return LLMNotFoundError(message=f"Resource not found in OpenAI: {str(e)}", code=ErrorCode.NOT_FOUND, details=e.body)
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
+            return LLMNotFoundError(
+                message=f"Resource not found in OpenAI: {str(e)}",
+                code=ErrorCode.NOT_FOUND,
+                details={**body_details, "is_byok": is_byok},
+            )

        if isinstance(e, openai.UnprocessableEntityError):
            logger.warning(f"[OpenAI] Unprocessable entity (422): {str(e)}")
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
            return LLMUnprocessableEntityError(
                message=f"Invalid request content for OpenAI: {str(e)}",
-                code=ErrorCode.INVALID_ARGUMENT,  # Usually validation errors
-                details=e.body,
+                code=ErrorCode.INVALID_ARGUMENT,
+                details={**body_details, "is_byok": is_byok},
            )

        # General API error catch-all
@@ -1141,6 +1159,7 @@ class OpenAIClient(LLMClientBase):
            if e.status_code == 413:
                return ContextWindowExceededError(
                    message=f"Request too large for OpenAI (413): {str(e)}",
+                    details={"is_byok": is_byok},
                )
            # Map based on status code potentially
            if e.status_code >= 500:
@@ -1158,11 +1177,12 @@ class OpenAIClient(LLMClientBase):
                    "status_code": e.status_code,
                    "response": str(e.response),
                    "body": e.body,
+                    "is_byok": is_byok,
                },
            )

        # Fallback for unexpected errors
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)


 def fill_image_content_in_messages(openai_message_list: List[dict], pydantic_message_list: List[PydanticMessage]) -> List[dict]:
--- a/letta/server/rest_api/app.py
+++ b/letta/server/rest_api/app.py
@@ -668,12 +668,19 @@ def create_application() -> "FastAPI":

    @app.exception_handler(LLMRateLimitError)
    async def llm_rate_limit_error_handler(request: Request, exc: LLMRateLimitError):
+        is_byok = exc.details.get("is_byok") if isinstance(exc.details, dict) else None
+        if is_byok:
+            message = (
+                "Rate limit exceeded on your API key. Please check your provider's rate limits and billing, or reduce request frequency."
+            )
+        else:
+            message = "Rate limit exceeded for LLM model provider. Please wait before making another request."
        return JSONResponse(
            status_code=429,
            content={
                "error": {
                    "type": "llm_rate_limit",
-                    "message": "Rate limit exceeded for LLM model provider. Please wait before making another request.",
+                    "message": message,
                    "detail": str(exc),
                }
            },
--- a/letta/services/summarizer/summarizer.py
+++ b/letta/services/summarizer/summarizer.py
@@ -592,7 +592,7 @@ async def simple_summary(
    except Exception as e:
        # handle LLM error (likely a context window exceeded error)
        try:
-            raise llm_client.handle_llm_error(e)
+            raise llm_client.handle_llm_error(e, llm_config=llm_config)
        except ContextWindowExceededError as context_error:
            logger.warning(f"Context window exceeded during summarization. Applying clamping fallbacks. Original error: {context_error}")

@@ -667,7 +667,7 @@ async def simple_summary(
                except Exception as fallback_error_b:
                    logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.")
                    logger.info(f"Full fallback summarization payload: {request_data}")
-                    raise llm_client.handle_llm_error(fallback_error_b)
+                    raise llm_client.handle_llm_error(fallback_error_b, llm_config=llm_config)

    logger.info(f"Summarized {len(messages)}: {summary}")