fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)

Add an is_byok flag to every LLMError's details dict returned by
handle_llm_error across all providers (OpenAI, Anthropic, Google Vertex,
ChatGPT OAuth). This enables observability into whether errors
originate from Letta's production keys or user-provided BYOK keys.
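
For context, the pattern applied in each provider's handle_llm_error is roughly the sketch below (condensed from the diff; tag_rate_limit_error is a hypothetical helper name, and the letta.errors import path is an assumption):

    # Minimal sketch of the per-provider pattern (condensed from the diff below).
    # Assumption: ErrorCode and LLMRateLimitError live in letta.errors; adjust to the real path.
    from typing import Optional

    from letta.errors import ErrorCode, LLMRateLimitError
    from letta.schemas.enums import ProviderCategory
    from letta.schemas.llm_config import LLMConfig


    def tag_rate_limit_error(e: Exception, llm_config: Optional[LLMConfig] = None) -> LLMRateLimitError:
        # None when no config is available; True for user-provided (BYOK) keys,
        # False for Letta's base provider keys.
        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
        return LLMRateLimitError(
            message=f"Rate limited by provider: {e}",
            code=ErrorCode.RATE_LIMIT_EXCEEDED,
            details={"is_byok": is_byok},
        )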

The rate limit handler in app.py now returns a more helpful message
for BYOK users ("check your provider's rate limits and billing")
instead of the generic message used for base-provider rate limits.
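
Condensed, the updated handler (shown in full in the app.py hunk below) reads:

    # Condensed from the app.py diff below; `app` is the existing FastAPI instance.
    from fastapi import Request
    from fastapi.responses import JSONResponse


    @app.exception_handler(LLMRateLimitError)
    async def llm_rate_limit_error_handler(request: Request, exc: LLMRateLimitError):
        is_byok = exc.details.get("is_byok") if isinstance(exc.details, dict) else None
        if is_byok:
            message = "Rate limit exceeded on your API key. Please check your provider's rate limits and billing, or reduce request frequency."
        else:
            message = "Rate limit exceeded for LLM model provider. Please wait before making another request."
        return JSONResponse(
            status_code=429,
            content={"error": {"type": "llm_rate_limit", "message": message, "detail": str(exc)}},
        )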

Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: Letta <noreply@letta.com>
Kian Jones
2026-02-10 20:38:25 -08:00
committed by Caren Thomas
parent 424a1ada64
commit 382e216cbb
12 changed files with 123 additions and 54 deletions

View File

@@ -114,7 +114,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
error_msg=str(e),
error_type=type(e).__name__,
)
raise self.llm_client.handle_llm_error(e)
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
# Process the stream and yield chunks immediately for TTFT
# Wrap in error handling to convert provider errors to common LLMError types
@@ -133,7 +133,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
error_msg=str(e),
error_type=type(e).__name__,
)
raise self.llm_client.handle_llm_error(e)
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
# After streaming completes, extract the accumulated data
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

View File

@@ -54,7 +54,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
try:
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
except Exception as e:
raise self.llm_client.handle_llm_error(e)
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

View File

@@ -151,7 +151,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
error_msg=str(e),
error_type=type(e).__name__,
)
raise self.llm_client.handle_llm_error(e)
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
# Process the stream and yield chunks immediately for TTFT
try:
@@ -169,7 +169,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
error_msg=str(e),
error_type=type(e).__name__,
)
raise self.llm_client.handle_llm_error(e)
raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
# After streaming completes, extract the accumulated data
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

View File

@@ -1547,7 +1547,7 @@ class LettaAgent(BaseAgent):
step_id=step_id,
)
else:
raise llm_client.handle_llm_error(e)
raise llm_client.handle_llm_error(e, llm_config=llm_config)
@trace_method
async def _rebuild_context_window(

View File

@@ -37,6 +37,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.enums import ProviderCategory
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
from letta.schemas.openai.chat_completion_request import Tool as OpenAITool
@@ -937,7 +938,9 @@ class AnthropicClient(LLMClientBase):
)
@trace_method
def handle_llm_error(self, e: Exception) -> Exception:
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# make sure to check for overflow errors, regardless of error type
error_str = str(e).lower()
if (
@@ -952,6 +955,7 @@ class AnthropicClient(LLMClientBase):
logger.warning(f"[Anthropic] Context window exceeded: {str(e)}")
return ContextWindowExceededError(
message=f"Context window exceeded for Anthropic: {str(e)}",
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.APITimeoutError):
@@ -959,7 +963,7 @@ class AnthropicClient(LLMClientBase):
return LLMTimeoutError(
message=f"Request to Anthropic timed out: {str(e)}",
code=ErrorCode.TIMEOUT,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
if isinstance(e, anthropic.APIConnectionError):
@@ -967,7 +971,7 @@ class AnthropicClient(LLMClientBase):
return LLMConnectionError(
message=f"Failed to connect to Anthropic: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
# Handle httpx.RemoteProtocolError which can occur during streaming
@@ -978,7 +982,7 @@ class AnthropicClient(LLMClientBase):
return LLMConnectionError(
message=f"Connection error during Anthropic streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
# Handle httpx network errors which can occur during streaming
@@ -988,7 +992,7 @@ class AnthropicClient(LLMClientBase):
return LLMConnectionError(
message=f"Network error during Anthropic streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
)
if isinstance(e, anthropic.RateLimitError):
@@ -996,6 +1000,7 @@ class AnthropicClient(LLMClientBase):
return LLMRateLimitError(
message=f"Rate limited by Anthropic: {str(e)}",
code=ErrorCode.RATE_LIMIT_EXCEEDED,
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.BadRequestError):
@@ -1013,11 +1018,13 @@ class AnthropicClient(LLMClientBase):
# 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'input length and `max_tokens` exceed context limit: 173298 + 32000 > 200000, decrease input length or `max_tokens` and try again'}}
return ContextWindowExceededError(
message=f"Bad request to Anthropic (context window exceeded): {str(e)}",
details={"is_byok": is_byok},
)
else:
return LLMBadRequestError(
message=f"Bad request to Anthropic: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.AuthenticationError):
@@ -1025,6 +1032,7 @@ class AnthropicClient(LLMClientBase):
return LLMAuthenticationError(
message=f"Authentication failed with Anthropic: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.PermissionDeniedError):
@@ -1032,6 +1040,7 @@ class AnthropicClient(LLMClientBase):
return LLMPermissionDeniedError(
message=f"Permission denied by Anthropic: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.NotFoundError):
@@ -1039,6 +1048,7 @@ class AnthropicClient(LLMClientBase):
return LLMNotFoundError(
message=f"Resource not found in Anthropic: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.UnprocessableEntityError):
@@ -1046,6 +1056,7 @@ class AnthropicClient(LLMClientBase):
return LLMUnprocessableEntityError(
message=f"Invalid request content for Anthropic: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
if isinstance(e, anthropic.APIStatusError):
@@ -1055,11 +1066,13 @@ class AnthropicClient(LLMClientBase):
logger.warning(f"[Anthropic] Request too large (413): {str(e)}")
return ContextWindowExceededError(
message=f"Request too large for Anthropic (413): {str(e)}",
details={"is_byok": is_byok},
)
if "overloaded" in str(e).lower():
return LLMProviderOverloaded(
message=f"Anthropic API is overloaded: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
return LLMServerError(
message=f"Anthropic API error: {str(e)}",
@@ -1067,10 +1080,11 @@ class AnthropicClient(LLMClientBase):
details={
"status_code": e.status_code if hasattr(e, "status_code") else None,
"response": str(e.response) if hasattr(e, "response") else None,
"is_byok": is_byok,
},
)
return super().handle_llm_error(e)
return super().handle_llm_error(e, llm_config=llm_config)
def extract_usage_statistics(self, response_data: dict | None, llm_config: LLMConfig) -> LettaUsageStatistics:
"""Extract usage statistics from Anthropic response and return as LettaUsageStatistics."""

View File

@@ -97,7 +97,7 @@ class AzureClient(OpenAIClient):
response: ChatCompletion = await client.chat.completions.create(**request_data)
return response.model_dump()
except Exception as e:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
@trace_method
async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk | ResponseStreamEvent]:

View File

@@ -1019,29 +1019,33 @@ class ChatGPTOAuthClient(LLMClientBase):
return "o1" in model or "o3" in model or "o4" in model or "gpt-5" in model
@trace_method
def handle_llm_error(self, e: Exception) -> Exception:
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
"""Map ChatGPT-specific errors to common LLMError types.
Args:
e: Original exception.
llm_config: Optional LLM config to determine if this is a BYOK key.
Returns:
Mapped LLMError subclass.
"""
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# Already a typed LLM/Letta error (e.g. from SSE error handling) — pass through
if isinstance(e, LettaError):
return e
if isinstance(e, httpx.HTTPStatusError):
return self._handle_http_error(e)
return self._handle_http_error(e, is_byok=is_byok)
return super().handle_llm_error(e)
return super().handle_llm_error(e, llm_config=llm_config)
def _handle_http_error(self, e: httpx.HTTPStatusError) -> Exception:
def _handle_http_error(self, e: httpx.HTTPStatusError, is_byok: bool | None = None) -> Exception:
"""Handle HTTP status errors from ChatGPT backend.
Args:
e: HTTP status error.
is_byok: Whether the request used a BYOK key.
Returns:
Appropriate LLMError subclass.
@@ -1059,30 +1063,36 @@ class ChatGPTOAuthClient(LLMClientBase):
return LLMAuthenticationError(
message=f"ChatGPT authentication failed: {error_message}",
code=ErrorCode.UNAUTHENTICATED,
details={"is_byok": is_byok},
)
elif status_code == 429:
return LLMRateLimitError(
message=f"ChatGPT rate limit exceeded: {error_message}",
code=ErrorCode.RATE_LIMIT_EXCEEDED,
details={"is_byok": is_byok},
)
elif status_code == 400:
if "context" in error_message.lower() or "token" in error_message.lower():
return ContextWindowExceededError(
message=f"ChatGPT context window exceeded: {error_message}",
details={"is_byok": is_byok},
)
return LLMBadRequestError(
message=f"ChatGPT bad request: {error_message}",
code=ErrorCode.INVALID_ARGUMENT,
details={"is_byok": is_byok},
)
elif status_code >= 500:
return LLMServerError(
message=f"ChatGPT server error: {error_message}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
else:
return LLMBadRequestError(
message=f"ChatGPT request failed ({status_code}): {error_message}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
def _handle_sse_error_event(self, raw_event: dict) -> Exception:

View File

@@ -37,6 +37,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.enums import ProviderCategory
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
from letta.schemas.openai.chat_completion_request import Tool, Tool as OpenAITool
@@ -93,7 +94,7 @@ class GoogleVertexClient(LLMClientBase):
code=ErrorCode.INTERNAL_SERVER_ERROR,
)
except Exception as e:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
@trace_method
async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -135,11 +136,11 @@ class GoogleVertexClient(LLMClientBase):
logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
retry_count += 1
if retry_count > self.MAX_RETRIES:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
continue
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
except Exception as e:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
response_data = response.model_dump()
is_malformed_function_call = self.is_malformed_function_call(response_data)
if is_malformed_function_call:
@@ -211,9 +212,9 @@ class GoogleVertexClient(LLMClientBase):
if e.code == 499:
logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
return
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
except errors.APIError as e:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
@staticmethod
def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
@@ -851,7 +852,9 @@ class GoogleVertexClient(LLMClientBase):
return False
@trace_method
def handle_llm_error(self, e: Exception) -> Exception:
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# Handle Google GenAI specific errors
if isinstance(e, errors.ClientError):
if e.code == 499:
@@ -859,7 +862,7 @@ class GoogleVertexClient(LLMClientBase):
return LLMConnectionError(
message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"status_code": 499, "cause": "client_cancelled"},
details={"status_code": 499, "cause": "client_cancelled", "is_byok": is_byok},
)
logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
@@ -870,43 +873,50 @@ class GoogleVertexClient(LLMClientBase):
if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
return ContextWindowExceededError(
message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
details={"is_byok": is_byok},
)
else:
return LLMBadRequestError(
message=f"Bad request to {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
elif e.code == 401:
return LLMAuthenticationError(
message=f"Authentication failed with {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
elif e.code == 403:
return LLMPermissionDeniedError(
message=f"Permission denied by {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
elif e.code == 404:
return LLMNotFoundError(
message=f"Resource not found in {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
elif e.code == 408:
return LLMTimeoutError(
message=f"Request to {self._provider_name()} timed out: {str(e)}",
code=ErrorCode.TIMEOUT,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
elif e.code == 422:
return LLMUnprocessableEntityError(
message=f"Invalid request content for {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"is_byok": is_byok},
)
elif e.code == 429:
logger.warning(f"{self._provider_prefix()} Rate limited (429). Consider backoff.")
return LLMRateLimitError(
message=f"Rate limited by {self._provider_name()}: {str(e)}",
code=ErrorCode.RATE_LIMIT_EXCEEDED,
details={"is_byok": is_byok},
)
else:
return LLMServerError(
@@ -915,6 +925,7 @@ class GoogleVertexClient(LLMClientBase):
details={
"status_code": e.code,
"response_json": getattr(e, "response_json", None),
"is_byok": is_byok,
},
)
@@ -929,13 +940,14 @@ class GoogleVertexClient(LLMClientBase):
details={
"status_code": e.code,
"response_json": getattr(e, "response_json", None),
"is_byok": is_byok,
},
)
elif e.code == 502:
return LLMConnectionError(
message=f"Bad gateway from {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
elif e.code == 503:
return LLMServerError(
@@ -944,13 +956,14 @@ class GoogleVertexClient(LLMClientBase):
details={
"status_code": e.code,
"response_json": getattr(e, "response_json", None),
"is_byok": is_byok,
},
)
elif e.code == 504:
return LLMTimeoutError(
message=f"Gateway timeout from {self._provider_name()}: {str(e)}",
code=ErrorCode.TIMEOUT,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
else:
return LLMServerError(
@@ -959,6 +972,7 @@ class GoogleVertexClient(LLMClientBase):
details={
"status_code": e.code,
"response_json": getattr(e, "response_json", None),
"is_byok": is_byok,
},
)
@@ -970,6 +984,7 @@ class GoogleVertexClient(LLMClientBase):
details={
"status_code": e.code,
"response_json": getattr(e, "response_json", None),
"is_byok": is_byok,
},
)
@@ -981,7 +996,7 @@ class GoogleVertexClient(LLMClientBase):
return LLMConnectionError(
message=f"Connection error during {self._provider_name()} streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
# Handle httpx network errors which can occur during streaming
@@ -991,7 +1006,7 @@ class GoogleVertexClient(LLMClientBase):
return LLMConnectionError(
message=f"Network error during {self._provider_name()} streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
)
# Handle connection-related errors
@@ -1000,11 +1015,11 @@ class GoogleVertexClient(LLMClientBase):
return LLMConnectionError(
message=f"Failed to connect to {self._provider_name()}: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
# Fallback to base implementation for other errors
return super().handle_llm_error(e)
return super().handle_llm_error(e, llm_config=llm_config)
async def count_tokens(self, messages: List[dict] = None, model: str = None, tools: List[OpenAITool] = None) -> int:
"""

View File

@@ -226,7 +226,7 @@ class LLMClientBase:
)
log_event(name="llm_response_received", attributes=response_data)
except Exception as e:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
return await self.convert_response_to_chat_completion(response_data, messages, llm_config)
@@ -261,7 +261,7 @@ class LLMClientBase:
log_event(name="llm_response_received", attributes=response_data)
except Exception as e:
raise self.handle_llm_error(e)
raise self.handle_llm_error(e, llm_config=llm_config)
return await self.convert_response_to_chat_completion(response_data, messages, llm_config)
@@ -353,17 +353,20 @@ class LLMClientBase:
raise NotImplementedError
@abstractmethod
def handle_llm_error(self, e: Exception) -> Exception:
def handle_llm_error(self, e: Exception, llm_config: Optional["LLMConfig"] = None) -> Exception:
"""
Maps provider-specific errors to common LLMError types.
Each LLM provider should implement this to translate their specific errors.
Args:
e: The original provider-specific exception
llm_config: Optional LLM config to determine if this is a BYOK key
Returns:
An LLMError subclass that represents the error in a provider-agnostic way
"""
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# Handle httpx.RemoteProtocolError which can occur during streaming
# when the remote server closes the connection unexpectedly
# (e.g., "peer closed connection without sending complete message body")
@@ -375,10 +378,10 @@ class LLMClientBase:
return LLMConnectionError(
message=f"Connection error during streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
return LLMError(f"Unhandled LLM error: {str(e)}")
return LLMError(message=f"Unhandled LLM error: {str(e)}", details={"is_byok": is_byok})
def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
"""

View File

@@ -40,6 +40,7 @@ from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import ProviderCategory
from letta.schemas.letta_message_content import MessageContentType
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
@@ -1015,10 +1016,11 @@ class OpenAIClient(LLMClientBase):
return results
@trace_method
def handle_llm_error(self, e: Exception) -> Exception:
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
"""
Maps OpenAI-specific errors to common LLMError types.
"""
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
if isinstance(e, openai.APITimeoutError):
timeout_duration = getattr(e, "timeout", "unknown")
logger.warning(f"[OpenAI] Request timeout after {timeout_duration} seconds: {e}")
@@ -1028,6 +1030,7 @@ class OpenAIClient(LLMClientBase):
details={
"timeout_duration": timeout_duration,
"cause": str(e.__cause__) if e.__cause__ else None,
"is_byok": is_byok,
},
)
@@ -1036,7 +1039,7 @@ class OpenAIClient(LLMClientBase):
return LLMConnectionError(
message=f"Failed to connect to OpenAI: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
# Handle httpx.RemoteProtocolError which can occur during streaming
@@ -1047,7 +1050,7 @@ class OpenAIClient(LLMClientBase):
return LLMConnectionError(
message=f"Connection error during OpenAI streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None},
details={"cause": str(e.__cause__) if e.__cause__ else None, "is_byok": is_byok},
)
# Handle httpx network errors which can occur during streaming
@@ -1057,15 +1060,16 @@ class OpenAIClient(LLMClientBase):
return LLMConnectionError(
message=f"Network error during OpenAI streaming: {str(e)}",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__},
details={"cause": str(e.__cause__) if e.__cause__ else None, "error_type": type(e).__name__, "is_byok": is_byok},
)
if isinstance(e, openai.RateLimitError):
logger.warning(f"[OpenAI] Rate limited (429). Consider backoff. Error: {e}")
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
return LLMRateLimitError(
message=f"Rate limited by OpenAI: {str(e)}",
code=ErrorCode.RATE_LIMIT_EXCEEDED,
details=e.body, # Include body which often has rate limit details
details={**body_details, "is_byok": is_byok},
)
if isinstance(e, openai.BadRequestError):
@@ -1082,12 +1086,14 @@ class OpenAIClient(LLMClientBase):
if error_code == "context_length_exceeded" or is_context_window_overflow_message(str(e)):
return ContextWindowExceededError(
message=f"Bad request to OpenAI (context window exceeded): {str(e)}",
details={"is_byok": is_byok},
)
else:
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
return LLMBadRequestError(
message=f"Bad request to OpenAI: {str(e)}",
code=ErrorCode.INVALID_ARGUMENT, # Or more specific if detectable
details=e.body,
code=ErrorCode.INVALID_ARGUMENT,
details={**body_details, "is_byok": is_byok},
)
# NOTE: The OpenAI Python SDK may raise a generic `openai.APIError` while *iterating*
@@ -1104,34 +1110,46 @@ class OpenAIClient(LLMClientBase):
message=f"OpenAI request exceeded the context window: {msg}",
details={
"provider_exception_type": type(e).__name__,
# Best-effort extraction (may not exist on APIError)
"body": getattr(e, "body", None),
"is_byok": is_byok,
},
)
if isinstance(e, openai.AuthenticationError):
logger.error(f"[OpenAI] Authentication error (401): {str(e)}") # More severe log level
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
return LLMAuthenticationError(
message=f"Authentication failed with OpenAI: {str(e)}", code=ErrorCode.UNAUTHENTICATED, details=e.body
message=f"Authentication failed with OpenAI: {str(e)}",
code=ErrorCode.UNAUTHENTICATED,
details={**body_details, "is_byok": is_byok},
)
if isinstance(e, openai.PermissionDeniedError):
logger.error(f"[OpenAI] Permission denied (403): {str(e)}") # More severe log level
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
return LLMPermissionDeniedError(
message=f"Permission denied by OpenAI: {str(e)}", code=ErrorCode.PERMISSION_DENIED, details=e.body
message=f"Permission denied by OpenAI: {str(e)}",
code=ErrorCode.PERMISSION_DENIED,
details={**body_details, "is_byok": is_byok},
)
if isinstance(e, openai.NotFoundError):
logger.warning(f"[OpenAI] Resource not found (404): {str(e)}")
# Could be invalid model name, etc.
return LLMNotFoundError(message=f"Resource not found in OpenAI: {str(e)}", code=ErrorCode.NOT_FOUND, details=e.body)
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
return LLMNotFoundError(
message=f"Resource not found in OpenAI: {str(e)}",
code=ErrorCode.NOT_FOUND,
details={**body_details, "is_byok": is_byok},
)
if isinstance(e, openai.UnprocessableEntityError):
logger.warning(f"[OpenAI] Unprocessable entity (422): {str(e)}")
body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
return LLMUnprocessableEntityError(
message=f"Invalid request content for OpenAI: {str(e)}",
code=ErrorCode.INVALID_ARGUMENT, # Usually validation errors
details=e.body,
code=ErrorCode.INVALID_ARGUMENT,
details={**body_details, "is_byok": is_byok},
)
# General API error catch-all
@@ -1141,6 +1159,7 @@ class OpenAIClient(LLMClientBase):
if e.status_code == 413:
return ContextWindowExceededError(
message=f"Request too large for OpenAI (413): {str(e)}",
details={"is_byok": is_byok},
)
# Map based on status code potentially
if e.status_code >= 500:
@@ -1158,11 +1177,12 @@ class OpenAIClient(LLMClientBase):
"status_code": e.status_code,
"response": str(e.response),
"body": e.body,
"is_byok": is_byok,
},
)
# Fallback for unexpected errors
return super().handle_llm_error(e)
return super().handle_llm_error(e, llm_config=llm_config)
def fill_image_content_in_messages(openai_message_list: List[dict], pydantic_message_list: List[PydanticMessage]) -> List[dict]:

View File

@@ -668,12 +668,19 @@ def create_application() -> "FastAPI":
@app.exception_handler(LLMRateLimitError)
async def llm_rate_limit_error_handler(request: Request, exc: LLMRateLimitError):
is_byok = exc.details.get("is_byok") if isinstance(exc.details, dict) else None
if is_byok:
message = (
"Rate limit exceeded on your API key. Please check your provider's rate limits and billing, or reduce request frequency."
)
else:
message = "Rate limit exceeded for LLM model provider. Please wait before making another request."
return JSONResponse(
status_code=429,
content={
"error": {
"type": "llm_rate_limit",
"message": "Rate limit exceeded for LLM model provider. Please wait before making another request.",
"message": message,
"detail": str(exc),
}
},

View File

@@ -592,7 +592,7 @@ async def simple_summary(
except Exception as e:
# handle LLM error (likely a context window exceeded error)
try:
raise llm_client.handle_llm_error(e)
raise llm_client.handle_llm_error(e, llm_config=llm_config)
except ContextWindowExceededError as context_error:
logger.warning(f"Context window exceeded during summarization. Applying clamping fallbacks. Original error: {context_error}")
@@ -667,7 +667,7 @@ async def simple_summary(
except Exception as fallback_error_b:
logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.")
logger.info(f"Full fallback summarization payload: {request_data}")
raise llm_client.handle_llm_error(fallback_error_b)
raise llm_client.handle_llm_error(fallback_error_b, llm_config=llm_config)
logger.info(f"Summarized {len(messages)}: {summary}")