From 382e216cbbf08b8ba3bd57b5fca789ffe8fd03b9 Mon Sep 17 00:00:00 2001
From: Kian Jones <11655409+kianjones9@users.noreply.github.com>
Date: Tue, 10 Feb 2026 20:38:25 -0800
Subject: [PATCH] fix(core): differentiate BYOK vs base provider in all LLM
 error details (#9425)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an is_byok flag to the details dict of every LLMError returned
from handle_llm_error across all providers (OpenAI, Anthropic, Google,
ChatGPT OAuth). This enables observability into whether errors
originate from Letta's production keys or from user-provided BYOK
(bring-your-own-key) keys. The flag is None when no llm_config is
passed to handle_llm_error.

The rate limit handler in app.py now returns a more helpful message
for BYOK users ("check your provider's rate limits and billing")
instead of the generic message still used for base provider rate
limits.

Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: Letta <noreply@letta.com>
---
 letta/adapters/letta_llm_stream_adapter.py   |  4 +-
 letta/adapters/simple_llm_request_adapter.py |  2 +-
 letta/adapters/simple_llm_stream_adapter.py  |  4 +-
 letta/agents/letta_agent.py                  |  2 +-
 letta/llm_api/anthropic_client.py            | 26 ++++++++---
 letta/llm_api/azure_client.py                |  2 +-
 letta/llm_api/chatgpt_oauth_client.py        | 18 ++++++--
 letta/llm_api/google_vertex_client.py        | 45 ++++++++++++------
 letta/llm_api/llm_client_base.py             | 13 ++++--
 letta/llm_api/openai_client.py               | 48 ++++++++++++++------
 letta/server/rest_api/app.py                 |  9 +++-
 letta/services/summarizer/summarizer.py      |  4 +-
 12 files changed, 123 insertions(+), 54 deletions(-)

diff --git a/letta/adapters/letta_llm_stream_adapter.py b/letta/adapters/letta_llm_stream_adapter.py
index 2bb7ed9c..1d9aa396 100644
--- a/letta/adapters/letta_llm_stream_adapter.py
+++ b/letta/adapters/letta_llm_stream_adapter.py
@@ -114,7 +114,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
                 error_msg=str(e),
                 error_type=type(e).__name__,
             )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
 
         # Process the stream and yield chunks immediately for TTFT
         # Wrap in error handling to convert provider errors to common LLMError types
@@ -133,7 +133,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
                 error_msg=str(e),
                 error_type=type(e).__name__,
             )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
 
         # After streaming completes, extract the accumulated data
         self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
diff --git a/letta/adapters/simple_llm_request_adapter.py b/letta/adapters/simple_llm_request_adapter.py
index 8ab2a904..a2818f50 100644
--- a/letta/adapters/simple_llm_request_adapter.py
+++ b/letta/adapters/simple_llm_request_adapter.py
@@ -54,7 +54,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
         try:
             self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
         except Exception as e:
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
 
         self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
 
diff --git a/letta/adapters/simple_llm_stream_adapter.py b/letta/adapters/simple_llm_stream_adapter.py
index 216c437c..80da0a64 100644
--- a/letta/adapters/simple_llm_stream_adapter.py
+++ b/letta/adapters/simple_llm_stream_adapter.py
@@ -151,7 +151,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
                 error_msg=str(e),
                 error_type=type(e).__name__,
             )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
 
         # Process the stream and yield chunks immediately for TTFT
         try:
@@ -169,7 +169,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
                 error_msg=str(e),
                 error_type=type(e).__name__,
             )
-            raise self.llm_client.handle_llm_error(e)
+            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
 
         # After streaming completes, extract the accumulated data
         self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
diff --git a/letta/agents/letta_agent.py b/letta/agents/letta_agent.py
index 98e56205..219762e7 100644
--- a/letta/agents/letta_agent.py
+++ b/letta/agents/letta_agent.py
@@ -1547,7 +1547,7 @@ class LettaAgent(BaseAgent):
                 step_id=step_id,
             )
         else:
-            raise llm_client.handle_llm_error(e)
+            raise llm_client.handle_llm_error(e, llm_config=llm_config)
 
     @trace_method
     async def _rebuild_context_window(
diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py
index f3903dfe..442aceea 100644
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -37,6 +37,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool as OpenAITool
@@ -937,7 +938,9 @@ class AnthropicClient(LLMClientBase):
         )
 
     @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
         # make sure to check for overflow errors, regardless of error type
         error_str = str(e).lower()
         if (
@@ -952,6 +955,7 @@ class AnthropicClient(LLMClientBase):
             logger.warning(f"[Anthropic] Context window exceeded: {str(e)}")
             return ContextWindowExceededError(
                 message=f"Context window exceeded for Anthropic: {str(e)}",
+                details={"is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.APITimeoutError):
@@ -959,7 +963,7 @@ class AnthropicClient(LLMClientBase):
             return LLMTimeoutError(
                 message=f"Request to Anthropic timed out: {str(e)}",
                 code=ErrorCode.TIMEOUT,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.APIConnectionError):
@@ -967,7 +971,7 @@ class AnthropicClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Failed to connect to Anthropic: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         # Handle httpx.RemoteProtocolError which can occur during streaming
@@ -978,7 +982,7 @@ class AnthropicClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Connection error during Anthropic streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         # Handle httpx network errors which can occur during streaming
@@ -988,7 +992,7 @@ class AnthropicClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Network error during Anthropic streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.RateLimitError):
@@ -996,6 +1000,7 @@ class AnthropicClient(LLMClientBase):
             return LLMRateLimitError(
                 message=f"Rate limited by Anthropic: {str(e)}",
                 code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                details={"is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.BadRequestError):
@@ -1013,11 +1018,13 @@ class AnthropicClient(LLMClientBase):
                 # 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'input length and `max_tokens` exceed context limit: 173298 + 32000 > 200000, decrease input length or `max_tokens` and try again'}}
                 return ContextWindowExceededError(
                     message=f"Bad request to Anthropic (context window exceeded): {str(e)}",
+                    details={"is_byok": is_byok},
                 )
             else:
                 return LLMBadRequestError(
                     message=f"Bad request to Anthropic: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                 )
 
         if isinstance(e, anthropic.AuthenticationError):
@@ -1025,6 +1032,7 @@ class AnthropicClient(LLMClientBase):
             return LLMAuthenticationError(
                 message=f"Authentication failed with Anthropic: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.PermissionDeniedError):
@@ -1032,6 +1040,7 @@ class AnthropicClient(LLMClientBase):
             return LLMPermissionDeniedError(
                 message=f"Permission denied by Anthropic: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.NotFoundError):
@@ -1039,6 +1048,7 @@ class AnthropicClient(LLMClientBase):
             return LLMNotFoundError(
                 message=f"Resource not found in Anthropic: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.UnprocessableEntityError):
@@ -1046,6 +1056,7 @@ class AnthropicClient(LLMClientBase):
             return LLMUnprocessableEntityError(
                 message=f"Invalid request content for Anthropic: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
             )
 
         if isinstance(e, anthropic.APIStatusError):
@@ -1055,11 +1066,13 @@ class AnthropicClient(LLMClientBase):
                 logger.warning(f"[Anthropic] Request too large (413): {str(e)}")
                 return ContextWindowExceededError(
                     message=f"Request too large for Anthropic (413): {str(e)}",
+                    details={"is_byok": is_byok},
                 )
             if "overloaded" in str(e).lower():
                 return LLMProviderOverloaded(
                     message=f"Anthropic API is overloaded: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                 )
             return LLMServerError(
                 message=f"Anthropic API error: {str(e)}",
@@ -1067,10 +1080,11 @@ class AnthropicClient(LLMClientBase):
                 details={
                     "status_code": e.status_code if hasattr(e, "status_code") else None,
                     "response": str(e.response) if hasattr(e, "response") else None,
+                    "is_byok": is_byok,
                 },
             )
 
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)
 
     def extract_usage_statistics(self, response_data: dict | None, llm_config: LLMConfig) -> LettaUsageStatistics:
         """Extract usage statistics from Anthropic response and return as LettaUsageStatistics."""
diff --git a/letta/llm_api/azure_client.py b/letta/llm_api/azure_client.py
index 80926aec..59085100 100644
--- a/letta/llm_api/azure_client.py
+++ b/letta/llm_api/azure_client.py
@@ -97,7 +97,7 @@ class AzureClient(OpenAIClient):
                 response: ChatCompletion = await client.chat.completions.create(**request_data)
                 return response.model_dump()
         except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
 
     @trace_method
     async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk | ResponseStreamEvent]:
diff --git a/letta/llm_api/chatgpt_oauth_client.py b/letta/llm_api/chatgpt_oauth_client.py
index 2a25e7b6..96a0f15b 100644
--- a/letta/llm_api/chatgpt_oauth_client.py
+++ b/letta/llm_api/chatgpt_oauth_client.py
@@ -1019,29 +1019,33 @@ class ChatGPTOAuthClient(LLMClientBase):
         return "o1" in model or "o3" in model or "o4" in model or "gpt-5" in model
 
     @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
         """Map ChatGPT-specific errors to common LLMError types.
 
         Args:
             e: Original exception.
+            llm_config: Optional LLM config to determine if this is a BYOK key.
 
         Returns:
             Mapped LLMError subclass.
         """
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
         # Already a typed LLM/Letta error (e.g. from SSE error handling) — pass through
         if isinstance(e, LettaError):
             return e
 
         if isinstance(e, httpx.HTTPStatusError):
-            return self._handle_http_error(e)
+            return self._handle_http_error(e, is_byok=is_byok)
 
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)
 
-    def _handle_http_error(self, e: httpx.HTTPStatusError) -> Exception:
+    def _handle_http_error(self, e: httpx.HTTPStatusError, is_byok: bool | None = None) -> Exception:
         """Handle HTTP status errors from ChatGPT backend.
 
         Args:
             e: HTTP status error.
+            is_byok: Whether the request used a BYOK key.
 
         Returns:
             Appropriate LLMError subclass.
@@ -1059,30 +1063,36 @@ class ChatGPTOAuthClient(LLMClientBase):
             return LLMAuthenticationError(
                 message=f"ChatGPT authentication failed: {error_message}",
                 code=ErrorCode.UNAUTHENTICATED,
+                details={"is_byok": is_byok},
             )
         elif status_code == 429:
             return LLMRateLimitError(
                 message=f"ChatGPT rate limit exceeded: {error_message}",
                 code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                details={"is_byok": is_byok},
             )
         elif status_code == 400:
             if "context" in error_message.lower() or "token" in error_message.lower():
                 return ContextWindowExceededError(
                     message=f"ChatGPT context window exceeded: {error_message}",
+                    details={"is_byok": is_byok},
                 )
             return LLMBadRequestError(
                 message=f"ChatGPT bad request: {error_message}",
                 code=ErrorCode.INVALID_ARGUMENT,
+                details={"is_byok": is_byok},
             )
         elif status_code >= 500:
             return LLMServerError(
                 message=f"ChatGPT server error: {error_message}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
             )
         else:
             return LLMBadRequestError(
                 message=f"ChatGPT request failed ({status_code}): {error_message}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"is_byok": is_byok},
             )
 
     def _handle_sse_error_event(self, raw_event: dict) -> Exception:
diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index 9b41e918..49fc64a8 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool
@@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase):
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
             )
         except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
 
     @trace_method
     async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase):
                     logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
                     retry_count += 1
                     if retry_count > self.MAX_RETRIES:
-                        raise self.handle_llm_error(e)
+                        raise self.handle_llm_error(e, llm_config=llm_config)
                     continue
-                raise self.handle_llm_error(e)
+                raise self.handle_llm_error(e, llm_config=llm_config)
             except Exception as e:
-                raise self.handle_llm_error(e)
+                raise self.handle_llm_error(e, llm_config=llm_config)
             response_data = response.model_dump()
             is_malformed_function_call = self.is_malformed_function_call(response_data)
             if is_malformed_function_call:
@@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase):
             if e.code == 499:
                 logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
                 return
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
         except errors.APIError as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
 
     @staticmethod
     def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
@@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase):
         return False
 
     @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
         # Handle Google GenAI specific errors
         if isinstance(e, errors.ClientError):
             if e.code == 499:
@@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase):
                 return LLMConnectionError(
                     message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
-                    details={"status_code": 499, "cause": "client_cancelled"},
+                    details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok},
                 )
 
             logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
@@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase):
                 if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
                     return ContextWindowExceededError(
                         message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
+                        details={"is_byok": is_byok},
                     )
                 else:
                     return LLMBadRequestError(
                         message=f"Bad request to {self._provider_name()}: {str(e)}",
                         code=ErrorCode.INTERNAL_SERVER_ERROR,
+                        details={"is_byok": is_byok},
                     )
             elif e.code == 401:
                 return LLMAuthenticationError(
                     message=f"Authentication failed with {self._provider_name()}: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                 )
             elif e.code == 403:
                 return LLMPermissionDeniedError(
                     message=f"Permission denied by {self._provider_name()}: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                 )
             elif e.code == 404:
                 return LLMNotFoundError(
                     message=f"Resource not found in {self._provider_name()}: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                 )
             elif e.code == 408:
                 return LLMTimeoutError(
                     message=f"Request to {self._provider_name()} timed out: {str(e)}",
                     code=ErrorCode.TIMEOUT,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                 )
             elif e.code == 422:
                 return LLMUnprocessableEntityError(
                     message=f"Invalid request content for {self._provider_name()}: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                 )
             elif e.code == 429:
                 logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.")
                 return LLMRateLimitError(
                     message=f"Rate limited by {self._provider_name()}: {str(e)}",
                     code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                    details={"is_byok": is_byok},
                 )
             else:
                 return LLMServerError(
@@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase):
                     details={
                         "status_code": e.code,
                         "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                     },
                 )
 
@@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase):
                     details={
                         "status_code": e.code,
                         "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                     },
                 )
             elif e.code == 502:
                 return LLMConnectionError(
                     message=f"Bad gateway from {self._provider_name()}: {str(e)}",
                     code=ErrorCode.INTERNAL_SERVER_ERROR,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                 )
             elif e.code == 503:
                 return LLMServerError(
@@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase):
                     details={
                         "status_code": e.code,
                         "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                     },
                 )
             elif e.code == 504:
                 return LLMTimeoutError(
                     message=f"Gateway timeout from {self._provider_name()}: {str(e)}",
                     code=ErrorCode.TIMEOUT,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                 )
             else:
                 return LLMServerError(
@@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase):
                     details={
                         "status_code": e.code,
                         "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                     },
                 )
 
@@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase):
                 details={
                     "status_code": e.code,
                     "response_json": getattr(e, "response_json", None),
+                    "is_byok": is_byok,
                 },
             )
 
@@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Connection error during {self._provider_name()} streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         # Handle httpx network errors which can occur during streaming
@@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Network error during {self._provider_name()} streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
             )
 
         # Handle connection-related errors
@@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Failed to connect to {self._provider_name()}: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         # Fallback to base implementation for other errors
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)
 
     async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int:
         """
diff --git a/letta/llm_api/llm_client_base.py b/letta/llm_api/llm_client_base.py
index be54b6b9..b4b8d69d 100644
--- a/letta/llm_api/llm_client_base.py
+++ b/letta/llm_api/llm_client_base.py
@@ -226,7 +226,7 @@ class LLMClientBase:
                 )
             log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
 
         return await self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
@@ -261,7 +261,7 @@ class LLMClientBase:
 
             log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
 
         return await self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
@@ -353,17 +353,20 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional["LLMConfig"] = None) -> Exception:
         """
         Maps provider-specific errors to common LLMError types.
         Each LLM provider should implement this to translate their specific errors.
 
         Args:
             e: The original provider-specific exception
+            llm_config: Optional LLM config to determine if this is a BYOK key
 
         Returns:
             An LLMError subclass that represents the error in a provider-agnostic way
         """
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
         # Handle httpx.RemoteProtocolError which can occur during streaming
         # when the remote server closes the connection unexpectedly
         # (e.g., "peer closed connection without sending complete message body")
@@ -375,10 +378,10 @@ class LLMClientBase:
             return LLMConnectionError(
                 message=f"Connection error during streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
-        return LLMError(f"Unhandled LLM error: {str(e)}")
+        return LLMError(message=f"Unhandled LLM error: {str(e)}", details={"is_byok": is_byok})
 
     def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
         """
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 93ddc32b..7b6f84e2 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -40,6 +40,7 @@ from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
 from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.letta_message_content import MessageContentType
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
@@ -1015,10 +1016,11 @@ class OpenAIClient(LLMClientBase):
         return results
 
     @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
         """
         Maps OpenAI-specific errors to common LLMError types.
         """
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
         if isinstance(e, openai.APITimeoutError):
             timeout_duration = getattr(e, "timeout", "unknown")
             logger.warning(f"[OpenAI] Request timeout after {timeout_duration} seconds: {e}")
@@ -1028,6 +1030,7 @@ class OpenAIClient(LLMClientBase):
                 details={
                     "timeout_duration": timeout_duration,
                     "cause": str(e.__cause__) if e.__cause__ else None,
+                    "is_byok": is_byok,
                 },
             )
 
@@ -1036,7 +1039,7 @@ class OpenAIClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Failed to connect to OpenAI: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         # Handle httpx.RemoteProtocolError which can occur during streaming
@@ -1047,7 +1050,7 @@ class OpenAIClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Connection error during OpenAI streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
             )
 
         # Handle httpx network errors which can occur during streaming
@@ -1057,15 +1060,16 @@ class OpenAIClient(LLMClientBase):
             return LLMConnectionError(
                 message=f"Network error during OpenAI streaming: {str(e)}",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
             )
 
         if isinstance(e, openai.RateLimitError):
             logger.warning(f"[OpenAI] Rate limited (429). Consider backoff. Error: {e}")
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
             return LLMRateLimitError(
                 message=f"Rate limited by OpenAI: {str(e)}",
                 code=ErrorCode.RATE_LIMIT_EXCEEDED,
-                details=e.body,  # Include body which often has rate limit details
+                details={**body_details, "is_byok": is_byok},
             )
 
         if isinstance(e, openai.BadRequestError):
@@ -1082,12 +1086,14 @@ class OpenAIClient(LLMClientBase):
             if error_code == "context_length_exceeded" or is_context_window_overflow_message(str(e)):
                 return ContextWindowExceededError(
                     message=f"Bad request to OpenAI (context window exceeded): {str(e)}",
+                    details={"is_byok": is_byok},
                 )
             else:
+                body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
                 return LLMBadRequestError(
                     message=f"Bad request to OpenAI: {str(e)}",
-                    code=ErrorCode.INVALID_ARGUMENT,  # Or more specific if detectable
-                    details=e.body,
+                    code=ErrorCode.INVALID_ARGUMENT,
+                    details={**body_details, "is_byok": is_byok},
                 )
 
         # NOTE: The OpenAI Python SDK may raise a generic `openai.APIError` while *iterating*
@@ -1104,34 +1110,46 @@ class OpenAIClient(LLMClientBase):
                     message=f"OpenAI request exceeded the context window: {msg}",
                     details={
                         "provider_exception_type": type(e).__name__,
-                        # Best-effort extraction (may not exist on APIError)
                         "body": getattr(e, "body", None),
+                        "is_byok": is_byok,
                     },
                 )
 
         if isinstance(e, openai.AuthenticationError):
             logger.error(f"[OpenAI] Authentication error (401): {str(e)}")  # More severe log level
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
             return LLMAuthenticationError(
-                message=f"Authentication failed with OpenAI: {str(e)}", code=ErrorCode.UNAUTHENTICATED, details=e.body
+                message=f"Authentication failed with OpenAI: {str(e)}",
+                code=ErrorCode.UNAUTHENTICATED,
+                details={**body_details, "is_byok": is_byok},
             )
 
         if isinstance(e, openai.PermissionDeniedError):
             logger.error(f"[OpenAI] Permission denied (403): {str(e)}")  # More severe log level
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
             return LLMPermissionDeniedError(
-                message=f"Permission denied by OpenAI: {str(e)}", code=ErrorCode.PERMISSION_DENIED, details=e.body
+                message=f"Permission denied by OpenAI: {str(e)}",
+                code=ErrorCode.PERMISSION_DENIED,
+                details={**body_details, "is_byok": is_byok},
             )
 
         if isinstance(e, openai.NotFoundError):
             logger.warning(f"[OpenAI] Resource not found (404): {str(e)}")
             # Could be invalid model name, etc.
-            return LLMNotFoundError(message=f"Resource not found in OpenAI: {str(e)}", code=ErrorCode.NOT_FOUND, details=e.body)
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
+            return LLMNotFoundError(
+                message=f"Resource not found in OpenAI: {str(e)}",
+                code=ErrorCode.NOT_FOUND,
+                details={**body_details, "is_byok": is_byok},
+            )
 
         if isinstance(e, openai.UnprocessableEntityError):
             logger.warning(f"[OpenAI] Unprocessable entity (422): {str(e)}")
+            body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
             return LLMUnprocessableEntityError(
                 message=f"Invalid request content for OpenAI: {str(e)}",
-                code=ErrorCode.INVALID_ARGUMENT,  # Usually validation errors
-                details=e.body,
+                code=ErrorCode.INVALID_ARGUMENT,
+                details={**body_details, "is_byok": is_byok},
             )
 
         # General API error catch-all
@@ -1141,6 +1159,7 @@ class OpenAIClient(LLMClientBase):
             if e.status_code == 413:
                 return ContextWindowExceededError(
                     message=f"Request too large for OpenAI (413): {str(e)}",
+                    details={"is_byok": is_byok},
                 )
             # Map based on status code potentially
             if e.status_code >= 500:
@@ -1158,11 +1177,12 @@ class OpenAIClient(LLMClientBase):
                     "status_code": e.status_code,
                     "response": str(e.response),
                     "body": e.body,
+                    "is_byok": is_byok,
                 },
             )
 
         # Fallback for unexpected errors
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)
 
 
 def fill_image_content_in_messages(openai_message_list: List[dict], pydantic_message_list: List[PydanticMessage]) -> List[dict]:
diff --git a/letta/server/rest_api/app.py b/letta/server/rest_api/app.py
index 56a2c500..d5842cc9 100644
--- a/letta/server/rest_api/app.py
+++ b/letta/server/rest_api/app.py
@@ -668,12 +668,19 @@ def create_application() -> "FastAPI":
 
     @app.exception_handler(LLMRateLimitError)
     async def llm_rate_limit_error_handler(request: Request, exc: LLMRateLimitError):
+        is_byok = exc.details.get("is_byok") if isinstance(exc.details, dict) else None
+        if is_byok:
+            message = (
+                "Rate limit exceeded on your API key. Please check your provider's rate limits and billing, or reduce request frequency."
+            )
+        else:
+            message = "Rate limit exceeded for LLM model provider. Please wait before making another request."
         return JSONResponse(
             status_code=429,
             content={
                 "error": {
                     "type": "llm_rate_limit",
-                    "message": "Rate limit exceeded for LLM model provider. Please wait before making another request.",
+                    "message": message,
                     "detail": str(exc),
                 }
             },
diff --git a/letta/services/summarizer/summarizer.py b/letta/services/summarizer/summarizer.py
index a3247497..e7aabbae 100644
--- a/letta/services/summarizer/summarizer.py
+++ b/letta/services/summarizer/summarizer.py
@@ -592,7 +592,7 @@ async def simple_summary(
     except Exception as e:
         # handle LLM error (likely a context window exceeded error)
         try:
-            raise llm_client.handle_llm_error(e)
+            raise llm_client.handle_llm_error(e, llm_config=llm_config)
         except ContextWindowExceededError as context_error:
             logger.warning(f"Context window exceeded during summarization. Applying clamping fallbacks. Original error: {context_error}")
 
@@ -667,7 +667,7 @@ async def simple_summary(
                 except Exception as fallback_error_b:
                     logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.")
                     logger.info(f"Full fallback summarization payload: {request_data}")
-                    raise llm_client.handle_llm_error(fallback_error_b)
+                    raise llm_client.handle_llm_error(fallback_error_b, llm_config=llm_config)
 
     logger.info(f"Summarized {len(messages)}: {summary}")