fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)
Add an `is_byok` flag to the details dict of every LLMError returned from
handle_llm_error across all providers (OpenAI, Anthropic, Google,
ChatGPT OAuth). This makes it observable whether an error originated
from Letta's production keys or from a user-provided BYOK (bring-your-own-key) key.
The rate limit handler in app.py now returns a more helpful message
for BYOK users ("check your provider's rate limits and billing"),
while rate limits hit on the base provider keep the generic message.
Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000
🤖 Generated with [Letta Code](https://letta.com)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
|
||||
from letta.log import get_logger
|
||||
from letta.otel.tracing import trace_method
|
||||
from letta.schemas.agent import AgentType
|
||||
from letta.schemas.enums import ProviderCategory
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool
|
||||
@@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
)
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
@trace_method
|
||||
async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
|
||||
@@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase):
|
||||
logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
|
||||
retry_count += 1
|
||||
if retry_count > self.MAX_RETRIES:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
continue
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
response_data = response.model_dump()
|
||||
is_malformed_function_call = self.is_malformed_function_call(response_data)
|
||||
if is_malformed_function_call:
|
||||
@@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase):
|
||||
if e.code == 499:
|
||||
logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
|
||||
return
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
except errors.APIError as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
@staticmethod
|
||||
def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
|
||||
@@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return False
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception) -> Exception:
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# Handle Google GenAI specific errors
|
||||
if isinstance(e, errors.ClientError):
|
||||
if e.code == 499:
|
||||
@@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"status_code": 499, "cause": "client_cancelled"},
|
||||
details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok},
|
||||
)
|
||||
|
||||
logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
|
||||
@@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase):
|
||||
if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
|
||||
return ContextWindowExceededError(
|
||||
message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMBadRequestError(
|
||||
message=f"Bad request to {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 401:
|
||||
return LLMAuthenticationError(
|
||||
message=f"Authentication failed with {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 403:
|
||||
return LLMPermissionDeniedError(
|
||||
message=f"Permission denied by {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 404:
|
||||
return LLMNotFoundError(
|
||||
message=f"Resource not found in {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 408:
|
||||
return LLMTimeoutError(
|
||||
message=f"Request to {self._provider_name()} timed out: {str(e)}",
|
||||
code=ErrorCode.TIMEOUT,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 422:
|
||||
return LLMUnprocessableEntityError(
|
||||
message=f"Invalid request content for {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 429:
|
||||
logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.")
|
||||
return LLMRateLimitError(
|
||||
message=f"Rate limited by {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.RATE_LIMIT_EXCEEDED,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMServerError(
|
||||
@@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
elif e.code == 502:
|
||||
return LLMConnectionError(
|
||||
message=f"Bad gateway from {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 503:
|
||||
return LLMServerError(
|
||||
@@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
elif e.code == 504:
|
||||
return LLMTimeoutError(
|
||||
message=f"Gateway timeout from {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.TIMEOUT,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMServerError(
|
||||
@@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Connection error during {self._provider_name()} streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle httpx network errors which can occur during streaming
|
||||
@@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Network error during {self._provider_name()} streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle connection-related errors
|
||||
@@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Failed to connect to {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Fallback to base implementation for other errors
|
||||
return super().handle_llm_error(e)
|
||||
return super().handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user