fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)
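Add an is_byok flag to the details dict of every LLMError returned from
handle_llm_error across all providers (OpenAI, Anthropic, Google,
ChatGPT OAuth). The flag is derived from llm_config.provider_category
and is None when no llm_config is passed to handle_llm_error, so callers
that omit it keep working. This enables observability into whether errors
originate from Letta's production keys or user-provided BYOK keys.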
The rate limit handler in app.py now returns a more helpful message
for BYOK users ("check your provider's rate limits and billing")
versus the generic message for base provider rate limits.
Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000
🤖 Generated with [Letta Code](https://letta.com)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -114,7 +114,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
|
||||
error_msg=str(e),
|
||||
error_type=type(e).__name__,
|
||||
)
|
||||
raise self.llm_client.handle_llm_error(e)
|
||||
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
|
||||
|
||||
# Process the stream and yield chunks immediately for TTFT
|
||||
# Wrap in error handling to convert provider errors to common LLMError types
|
||||
@@ -133,7 +133,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
|
||||
error_msg=str(e),
|
||||
error_type=type(e).__name__,
|
||||
)
|
||||
raise self.llm_client.handle_llm_error(e)
|
||||
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
|
||||
|
||||
# After streaming completes, extract the accumulated data
|
||||
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
|
||||
|
||||
@@ -54,7 +54,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
|
||||
try:
|
||||
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
|
||||
except Exception as e:
|
||||
raise self.llm_client.handle_llm_error(e)
|
||||
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
|
||||
|
||||
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
|
||||
|
||||
|
||||
@@ -151,7 +151,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
|
||||
error_msg=str(e),
|
||||
error_type=type(e).__name__,
|
||||
)
|
||||
raise self.llm_client.handle_llm_error(e)
|
||||
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
|
||||
|
||||
# Process the stream and yield chunks immediately for TTFT
|
||||
try:
|
||||
@@ -169,7 +169,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
|
||||
error_msg=str(e),
|
||||
error_type=type(e).__name__,
|
||||
)
|
||||
raise self.llm_client.handle_llm_error(e)
|
||||
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
|
||||
|
||||
# After streaming completes, extract the accumulated data
|
||||
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
|
||||
|
||||
@@ -1547,7 +1547,7 @@ class LettaAgent(BaseAgent):
|
||||
step_id=step_id,
|
||||
)
|
||||
else:
|
||||
raise llm_client.handle_llm_error(e)
|
||||
raise llm_client.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
@trace_method
|
||||
async def _rebuild_context_window(
|
||||
|
||||
@@ -37,6 +37,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
|
||||
from letta.log import get_logger
|
||||
from letta.otel.tracing import trace_method
|
||||
from letta.schemas.agent import AgentType
|
||||
from letta.schemas.enums import ProviderCategory
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_request import Tool as OpenAITool
|
||||
@@ -937,7 +938,9 @@ class AnthropicClient(LLMClientBase):
|
||||
)
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception) -> Exception:
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# make sure to check for overflow errors, regardless of error type
|
||||
error_str = str(e).lower()
|
||||
if (
|
||||
@@ -952,6 +955,7 @@ class AnthropicClient(LLMClientBase):
|
||||
logger.warning(f"[Anthropic] Context window exceeded: {str(e)}")
|
||||
return ContextWindowExceededError(
|
||||
message=f"Context window exceeded for Anthropic: {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.APITimeoutError):
|
||||
@@ -959,7 +963,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMTimeoutError(
|
||||
message=f"Request to Anthropic timed out: {str(e)}",
|
||||
code=ErrorCode.TIMEOUT,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.APIConnectionError):
|
||||
@@ -967,7 +971,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Failed to connect to Anthropic: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle httpx.RemoteProtocolError which can occur during streaming
|
||||
@@ -978,7 +982,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Connection error during Anthropic streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle httpx network errors which can occur during streaming
|
||||
@@ -988,7 +992,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Network error during Anthropic streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.RateLimitError):
|
||||
@@ -996,6 +1000,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMRateLimitError(
|
||||
message=f"Rate limited by Anthropic: {str(e)}",
|
||||
code=ErrorCode.RATE_LIMIT_EXCEEDED,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.BadRequestError):
|
||||
@@ -1013,11 +1018,13 @@ class AnthropicClient(LLMClientBase):
|
||||
# 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'input length and `max_tokens` exceed context limit: 173298 + 32000 > 200000, decrease input length or `max_tokens` and try again'}}
|
||||
return ContextWindowExceededError(
|
||||
message=f"Bad request to Anthropic (context window exceeded): {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMBadRequestError(
|
||||
message=f"Bad request to Anthropic: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.AuthenticationError):
|
||||
@@ -1025,6 +1032,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMAuthenticationError(
|
||||
message=f"Authentication failed with Anthropic: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.PermissionDeniedError):
|
||||
@@ -1032,6 +1040,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMPermissionDeniedError(
|
||||
message=f"Permission denied by Anthropic: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.NotFoundError):
|
||||
@@ -1039,6 +1048,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMNotFoundError(
|
||||
message=f"Resource not found in Anthropic: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.UnprocessableEntityError):
|
||||
@@ -1046,6 +1056,7 @@ class AnthropicClient(LLMClientBase):
|
||||
return LLMUnprocessableEntityError(
|
||||
message=f"Invalid request content for Anthropic: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, anthropic.APIStatusError):
|
||||
@@ -1055,11 +1066,13 @@ class AnthropicClient(LLMClientBase):
|
||||
logger.warning(f"[Anthropic] Request too large (413): {str(e)}")
|
||||
return ContextWindowExceededError(
|
||||
message=f"Request too large for Anthropic (413): {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
if "overloaded" in str(e).lower():
|
||||
return LLMProviderOverloaded(
|
||||
message=f"Anthropic API is overloaded: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
return LLMServerError(
|
||||
message=f"Anthropic API error: {str(e)}",
|
||||
@@ -1067,10 +1080,11 @@ class AnthropicClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.status_code if hasattr(e, "status_code") else None,
|
||||
"response": str(e.response) if hasattr(e, "response") else None,
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
return super().handle_llm_error(e)
|
||||
return super().handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
def extract_usage_statistics(self, response_data: dict | None, llm_config: LLMConfig) -> LettaUsageStatistics:
|
||||
"""Extract usage statistics from Anthropic response and return as LettaUsageStatistics."""
|
||||
|
||||
@@ -97,7 +97,7 @@ class AzureClient(OpenAIClient):
|
||||
response: ChatCompletion = await client.chat.completions.create(**request_data)
|
||||
return response.model_dump()
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
@trace_method
|
||||
async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk | ResponseStreamEvent]:
|
||||
|
||||
@@ -1019,29 +1019,33 @@ class ChatGPTOAuthClient(LLMClientBase):
|
||||
return "o1" in model or "o3" in model or "o4" in model or "gpt-5" in model
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception) -> Exception:
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
"""Map ChatGPT-specific errors to common LLMError types.
|
||||
|
||||
Args:
|
||||
e: Original exception.
|
||||
llm_config: Optional LLM config to determine if this is a BYOK key.
|
||||
|
||||
Returns:
|
||||
Mapped LLMError subclass.
|
||||
"""
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# Already a typed LLM/Letta error (e.g. from SSE error handling) — pass through
|
||||
if isinstance(e, LettaError):
|
||||
return e
|
||||
|
||||
if isinstance(e, httpx.HTTPStatusError):
|
||||
return self._handle_http_error(e)
|
||||
return self._handle_http_error(e, is_byok=is_byok)
|
||||
|
||||
return super().handle_llm_error(e)
|
||||
return super().handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
def _handle_http_error(self, e: httpx.HTTPStatusError) -> Exception:
|
||||
def _handle_http_error(self, e: httpx.HTTPStatusError, is_byok: bool | None = None) -> Exception:
|
||||
"""Handle HTTP status errors from ChatGPT backend.
|
||||
|
||||
Args:
|
||||
e: HTTP status error.
|
||||
is_byok: Whether the request used a BYOK key.
|
||||
|
||||
Returns:
|
||||
Appropriate LLMError subclass.
|
||||
@@ -1059,30 +1063,36 @@ class ChatGPTOAuthClient(LLMClientBase):
|
||||
return LLMAuthenticationError(
|
||||
message=f"ChatGPT authentication failed: {error_message}",
|
||||
code=ErrorCode.UNAUTHENTICATED,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif status_code == 429:
|
||||
return LLMRateLimitError(
|
||||
message=f"ChatGPT rate limit exceeded: {error_message}",
|
||||
code=ErrorCode.RATE_LIMIT_EXCEEDED,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif status_code == 400:
|
||||
if "context" in error_message.lower() or "token" in error_message.lower():
|
||||
return ContextWindowExceededError(
|
||||
message=f"ChatGPT context window exceeded: {error_message}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
return LLMBadRequestError(
|
||||
message=f"ChatGPT bad request: {error_message}",
|
||||
code=ErrorCode.INVALID_ARGUMENT,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif status_code >= 500:
|
||||
return LLMServerError(
|
||||
message=f"ChatGPT server error: {error_message}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMBadRequestError(
|
||||
message=f"ChatGPT request failed ({status_code}): {error_message}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
|
||||
def _handle_sse_error_event(self, raw_event: dict) -> Exception:
|
||||
|
||||
@@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
|
||||
from letta.log import get_logger
|
||||
from letta.otel.tracing import trace_method
|
||||
from letta.schemas.agent import AgentType
|
||||
from letta.schemas.enums import ProviderCategory
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool
|
||||
@@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
)
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
@trace_method
|
||||
async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
|
||||
@@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase):
|
||||
logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
|
||||
retry_count += 1
|
||||
if retry_count > self.MAX_RETRIES:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
continue
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
response_data = response.model_dump()
|
||||
is_malformed_function_call = self.is_malformed_function_call(response_data)
|
||||
if is_malformed_function_call:
|
||||
@@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase):
|
||||
if e.code == 499:
|
||||
logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
|
||||
return
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
except errors.APIError as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
@staticmethod
|
||||
def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
|
||||
@@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return False
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception) -> Exception:
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# Handle Google GenAI specific errors
|
||||
if isinstance(e, errors.ClientError):
|
||||
if e.code == 499:
|
||||
@@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"status_code": 499, "cause": "client_cancelled"},
|
||||
details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok},
|
||||
)
|
||||
|
||||
logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
|
||||
@@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase):
|
||||
if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
|
||||
return ContextWindowExceededError(
|
||||
message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMBadRequestError(
|
||||
message=f"Bad request to {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 401:
|
||||
return LLMAuthenticationError(
|
||||
message=f"Authentication failed with {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 403:
|
||||
return LLMPermissionDeniedError(
|
||||
message=f"Permission denied by {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 404:
|
||||
return LLMNotFoundError(
|
||||
message=f"Resource not found in {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 408:
|
||||
return LLMTimeoutError(
|
||||
message=f"Request to {self._provider_name()} timed out: {str(e)}",
|
||||
code=ErrorCode.TIMEOUT,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 422:
|
||||
return LLMUnprocessableEntityError(
|
||||
message=f"Invalid request content for {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 429:
|
||||
logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.")
|
||||
return LLMRateLimitError(
|
||||
message=f"Rate limited by {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.RATE_LIMIT_EXCEEDED,
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMServerError(
|
||||
@@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
elif e.code == 502:
|
||||
return LLMConnectionError(
|
||||
message=f"Bad gateway from {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
elif e.code == 503:
|
||||
return LLMServerError(
|
||||
@@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
elif e.code == 504:
|
||||
return LLMTimeoutError(
|
||||
message=f"Gateway timeout from {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.TIMEOUT,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
return LLMServerError(
|
||||
@@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
details={
|
||||
"status_code": e.code,
|
||||
"response_json": getattr(e, "response_json", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Connection error during {self._provider_name()} streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle httpx network errors which can occur during streaming
|
||||
@@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Network error during {self._provider_name()} streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle connection-related errors
|
||||
@@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Failed to connect to {self._provider_name()}: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Fallback to base implementation for other errors
|
||||
return super().handle_llm_error(e)
|
||||
return super().handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int:
|
||||
"""
|
||||
|
||||
@@ -226,7 +226,7 @@ class LLMClientBase:
|
||||
)
|
||||
log_event(name="llm_response_received", attributes=response_data)
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
return await self.convert_response_to_chat_completion(response_data, messages, llm_config)
|
||||
|
||||
@@ -261,7 +261,7 @@ class LLMClientBase:
|
||||
|
||||
log_event(name="llm_response_received", attributes=response_data)
|
||||
except Exception as e:
|
||||
raise self.handle_llm_error(e)
|
||||
raise self.handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
return await self.convert_response_to_chat_completion(response_data, messages, llm_config)
|
||||
|
||||
@@ -353,17 +353,20 @@ class LLMClientBase:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def handle_llm_error(self, e: Exception) -> Exception:
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional["LLMConfig"] = None) -> Exception:
|
||||
"""
|
||||
Maps provider-specific errors to common LLMError types.
|
||||
Each LLM provider should implement this to translate their specific errors.
|
||||
|
||||
Args:
|
||||
e: The original provider-specific exception
|
||||
llm_config: Optional LLM config to determine if this is a BYOK key
|
||||
|
||||
Returns:
|
||||
An LLMError subclass that represents the error in a provider-agnostic way
|
||||
"""
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# Handle httpx.RemoteProtocolError which can occur during streaming
|
||||
# when the remote server closes the connection unexpectedly
|
||||
# (e.g., "peer closed connection without sending complete message body")
|
||||
@@ -375,10 +378,10 @@ class LLMClientBase:
|
||||
return LLMConnectionError(
|
||||
message=f"Connection error during streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
return LLMError(f"Unhandled LLM error: {str(e)}")
|
||||
return LLMError(message=f"Unhandled LLM error: {str(e)}", details={"is_byok": is_byok})
|
||||
|
||||
def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
||||
"""
|
||||
|
||||
@@ -40,6 +40,7 @@ from letta.log import get_logger
|
||||
from letta.otel.tracing import trace_method
|
||||
from letta.schemas.agent import AgentType
|
||||
from letta.schemas.embedding_config import EmbeddingConfig
|
||||
from letta.schemas.enums import ProviderCategory
|
||||
from letta.schemas.letta_message_content import MessageContentType
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
@@ -1015,10 +1016,11 @@ class OpenAIClient(LLMClientBase):
|
||||
return results
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception) -> Exception:
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
"""
|
||||
Maps OpenAI-specific errors to common LLMError types.
|
||||
"""
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
if isinstance(e, openai.APITimeoutError):
|
||||
timeout_duration = getattr(e, "timeout", "unknown")
|
||||
logger.warning(f"[OpenAI] Request timeout after {timeout_duration} seconds: {e}")
|
||||
@@ -1028,6 +1030,7 @@ class OpenAIClient(LLMClientBase):
|
||||
details={
|
||||
"timeout_duration": timeout_duration,
|
||||
"cause": str(e.__cause__) if e.__cause__ else None,
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -1036,7 +1039,7 @@ class OpenAIClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Failed to connect to OpenAI: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle httpx.RemoteProtocolError which can occur during streaming
|
||||
@@ -1047,7 +1050,7 @@ class OpenAIClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Connection error during OpenAI streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# Handle httpx network errors which can occur during streaming
|
||||
@@ -1057,15 +1060,16 @@ class OpenAIClient(LLMClientBase):
|
||||
return LLMConnectionError(
|
||||
message=f"Network error during OpenAI streaming: {str(e)}",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
|
||||
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, openai.RateLimitError):
|
||||
logger.warning(f"[OpenAI] Rate limited (429). Consider backoff. Error: {e}")
|
||||
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
|
||||
return LLMRateLimitError(
|
||||
message=f"Rate limited by OpenAI: {str(e)}",
|
||||
code=ErrorCode.RATE_LIMIT_EXCEEDED,
|
||||
details=e.body, # Include body which often has rate limit details
|
||||
details={**body_details, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, openai.BadRequestError):
|
||||
@@ -1082,12 +1086,14 @@ class OpenAIClient(LLMClientBase):
|
||||
if error_code == "context_length_exceeded" or is_context_window_overflow_message(str(e)):
|
||||
return ContextWindowExceededError(
|
||||
message=f"Bad request to OpenAI (context window exceeded): {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
else:
|
||||
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
|
||||
return LLMBadRequestError(
|
||||
message=f"Bad request to OpenAI: {str(e)}",
|
||||
code=ErrorCode.INVALID_ARGUMENT, # Or more specific if detectable
|
||||
details=e.body,
|
||||
code=ErrorCode.INVALID_ARGUMENT,
|
||||
details={**body_details, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# NOTE: The OpenAI Python SDK may raise a generic `openai.APIError` while *iterating*
|
||||
@@ -1104,34 +1110,46 @@ class OpenAIClient(LLMClientBase):
|
||||
message=f"OpenAI request exceeded the context window: {msg}",
|
||||
details={
|
||||
"provider_exception_type": type(e).__name__,
|
||||
# Best-effort extraction (may not exist on APIError)
|
||||
"body": getattr(e, "body", None),
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
if isinstance(e, openai.AuthenticationError):
|
||||
logger.error(f"[OpenAI] Authentication error (401): {str(e)}") # More severe log level
|
||||
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
|
||||
return LLMAuthenticationError(
|
||||
message=f"Authentication failed with OpenAI: {str(e)}", code=ErrorCode.UNAUTHENTICATED, details=e.body
|
||||
message=f"Authentication failed with OpenAI: {str(e)}",
|
||||
code=ErrorCode.UNAUTHENTICATED,
|
||||
details={**body_details, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, openai.PermissionDeniedError):
|
||||
logger.error(f"[OpenAI] Permission denied (403): {str(e)}") # More severe log level
|
||||
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
|
||||
return LLMPermissionDeniedError(
|
||||
message=f"Permission denied by OpenAI: {str(e)}", code=ErrorCode.PERMISSION_DENIED, details=e.body
|
||||
message=f"Permission denied by OpenAI: {str(e)}",
|
||||
code=ErrorCode.PERMISSION_DENIED,
|
||||
details={**body_details, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, openai.NotFoundError):
|
||||
logger.warning(f"[OpenAI] Resource not found (404): {str(e)}")
|
||||
# Could be invalid model name, etc.
|
||||
return LLMNotFoundError(message=f"Resource not found in OpenAI: {str(e)}", code=ErrorCode.NOT_FOUND, details=e.body)
|
||||
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
|
||||
return LLMNotFoundError(
|
||||
message=f"Resource not found in OpenAI: {str(e)}",
|
||||
code=ErrorCode.NOT_FOUND,
|
||||
details={**body_details, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
if isinstance(e, openai.UnprocessableEntityError):
|
||||
logger.warning(f"[OpenAI] Unprocessable entity (422): {str(e)}")
|
||||
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
|
||||
return LLMUnprocessableEntityError(
|
||||
message=f"Invalid request content for OpenAI: {str(e)}",
|
||||
code=ErrorCode.INVALID_ARGUMENT, # Usually validation errors
|
||||
details=e.body,
|
||||
code=ErrorCode.INVALID_ARGUMENT,
|
||||
details={**body_details, "is_byok": is_byok},
|
||||
)
|
||||
|
||||
# General API error catch-all
|
||||
@@ -1141,6 +1159,7 @@ class OpenAIClient(LLMClientBase):
|
||||
if e.status_code == 413:
|
||||
return ContextWindowExceededError(
|
||||
message=f"Request too large for OpenAI (413): {str(e)}",
|
||||
details={"is_byok": is_byok},
|
||||
)
|
||||
# Map based on status code potentially
|
||||
if e.status_code >= 500:
|
||||
@@ -1158,11 +1177,12 @@ class OpenAIClient(LLMClientBase):
|
||||
"status_code": e.status_code,
|
||||
"response": str(e.response),
|
||||
"body": e.body,
|
||||
"is_byok": is_byok,
|
||||
},
|
||||
)
|
||||
|
||||
# Fallback for unexpected errors
|
||||
return super().handle_llm_error(e)
|
||||
return super().handle_llm_error(e, llm_config=llm_config)
|
||||
|
||||
|
||||
def fill_image_content_in_messages(openai_message_list: List[dict], pydantic_message_list: List[PydanticMessage]) -> List[dict]:
|
||||
|
||||
@@ -668,12 +668,19 @@ def create_application() -> "FastAPI":
|
||||
|
||||
@app.exception_handler(LLMRateLimitError)
|
||||
async def llm_rate_limit_error_handler(request: Request, exc: LLMRateLimitError):
|
||||
is_byok = exc.details.get("is_byok") if isinstance(exc.details, dict) else None
|
||||
if is_byok:
|
||||
message = (
|
||||
"Rate limit exceeded on your API key. Please check your provider's rate limits and billing, or reduce request frequency."
|
||||
)
|
||||
else:
|
||||
message = "Rate limit exceeded for LLM model provider. Please wait before making another request."
|
||||
return JSONResponse(
|
||||
status_code=429,
|
||||
content={
|
||||
"error": {
|
||||
"type": "llm_rate_limit",
|
||||
"message": "Rate limit exceeded for LLM model provider. Please wait before making another request.",
|
||||
"message": message,
|
||||
"detail": str(exc),
|
||||
}
|
||||
},
|
||||
|
||||
@@ -592,7 +592,7 @@ async def simple_summary(
|
||||
except Exception as e:
|
||||
# handle LLM error (likely a context window exceeded error)
|
||||
try:
|
||||
raise llm_client.handle_llm_error(e)
|
||||
raise llm_client.handle_llm_error(e, llm_config=llm_config)
|
||||
except ContextWindowExceededError as context_error:
|
||||
logger.warning(f"Context window exceeded during summarization. Applying clamping fallbacks. Original error: {context_error}")
|
||||
|
||||
@@ -667,7 +667,7 @@ async def simple_summary(
|
||||
except Exception as fallback_error_b:
|
||||
logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.")
|
||||
logger.info(f"Full fallback summarization payload: {request_data}")
|
||||
raise llm_client.handle_llm_error(fallback_error_b)
|
||||
raise llm_client.handle_llm_error(fallback_error_b, llm_config=llm_config)
|
||||
|
||||
logger.info(f"Summarized {len(messages)}: {summary}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user