fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)

Add is_byok flag to every LLMError's details dict returned from handle_llm_error across all providers (OpenAI, Anthropic, Google, ChatGPT OAuth). This enables observability into whether errors originate from Letta's production keys or user-provided BYOK keys. The rate limit handler in app.py now returns a more helpful message for BYOK users ("check your provider's rate limits and billing") versus the generic message for base provider rate limits. Datadog issues: - https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000 - https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000 - https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000 🤖 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta <noreply@letta.com>
2026-02-10 20:38:25 -08:00
parent 424a1ada64
commit 382e216cbb
12 changed files with 123 additions and 54 deletions
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.agent import AgentType
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool
@@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase):
                code=ErrorCode.INTERNAL_SERVER_ERROR,
            )
        except Exception as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

    @trace_method
    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase):
                    logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
                    retry_count += 1
                    if retry_count > self.MAX_RETRIES:
-                        raise self.handle_llm_error(e)
+                        raise self.handle_llm_error(e, llm_config=llm_config)
                    continue
-                raise self.handle_llm_error(e)
+                raise self.handle_llm_error(e, llm_config=llm_config)
            except Exception as e:
-                raise self.handle_llm_error(e)
+                raise self.handle_llm_error(e, llm_config=llm_config)
            response_data = response.model_dump()
            is_malformed_function_call = self.is_malformed_function_call(response_data)
            if is_malformed_function_call:
@@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase):
            if e.code == 499:
                logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
                return
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)
        except errors.APIError as e:
-            raise self.handle_llm_error(e)
+            raise self.handle_llm_error(e, llm_config=llm_config)

    @staticmethod
    def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
@@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase):
        return False

    @trace_method
-    def handle_llm_error(self, e: Exception) -> Exception:
+    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
+
        # Handle Google GenAI specific errors
        if isinstance(e, errors.ClientError):
            if e.code == 499:
@@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase):
                return LLMConnectionError(
                    message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
-                    details={"status_code": 499, "cause": "client_cancelled"},
+                    details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok},
                )

            logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
@@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase):
                if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
                    return ContextWindowExceededError(
                        message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
+                        details={"is_byok": is_byok},
                    )
                else:
                    return LLMBadRequestError(
                        message=f"Bad request to {self._provider_name()}: {str(e)}",
                        code=ErrorCode.INTERNAL_SERVER_ERROR,
+                        details={"is_byok": is_byok},
                    )
            elif e.code == 401:
                return LLMAuthenticationError(
                    message=f"Authentication failed with {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 403:
                return LLMPermissionDeniedError(
                    message=f"Permission denied by {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 404:
                return LLMNotFoundError(
                    message=f"Resource not found in {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 408:
                return LLMTimeoutError(
                    message=f"Request to {self._provider_name()} timed out: {str(e)}",
                    code=ErrorCode.TIMEOUT,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                )
            elif e.code == 422:
                return LLMUnprocessableEntityError(
                    message=f"Invalid request content for {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"is_byok": is_byok},
                )
            elif e.code == 429:
                logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.")
                return LLMRateLimitError(
                    message=f"Rate limited by {self._provider_name()}: {str(e)}",
                    code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                    details={"is_byok": is_byok},
                )
            else:
                return LLMServerError(
@@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )

@@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )
            elif e.code == 502:
                return LLMConnectionError(
                    message=f"Bad gateway from {self._provider_name()}: {str(e)}",
                    code=ErrorCode.INTERNAL_SERVER_ERROR,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                )
            elif e.code == 503:
                return LLMServerError(
@@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )
            elif e.code == 504:
                return LLMTimeoutError(
                    message=f"Gateway timeout from {self._provider_name()}: {str(e)}",
                    code=ErrorCode.TIMEOUT,
-                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                    details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
                )
            else:
                return LLMServerError(
@@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase):
                    details={
                        "status_code": e.code,
                        "response_json": getattr(e, "response_json", None),
+                        "is_byok": is_byok,
                    },
                )

@@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase):
                details={
                    "status_code": e.code,
                    "response_json": getattr(e, "response_json", None),
+                    "is_byok": is_byok,
                },
            )

@@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Connection error during {self._provider_name()} streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Handle httpx network errors which can occur during streaming
@@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Network error during {self._provider_name()} streaming: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
            )

        # Handle connection-related errors
@@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase):
            return LLMConnectionError(
                message=f"Failed to connect to {self._provider_name()}: {str(e)}",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
-                details={"cause": str(e.__cause__) if e.__cause__ else None},
+                details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
            )

        # Fallback to base implementation for other errors
-        return super().handle_llm_error(e)
+        return super().handle_llm_error(e, llm_config=llm_config)

    async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int:
        """