letta-server/letta/adapters/simple_llm_request_adapter.py
Kian Jones 382e216cbb fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)
Add is_byok flag to every LLMError's details dict returned from
handle_llm_error across all providers (OpenAI, Anthropic, Google,
ChatGPT OAuth). This enables observability into whether errors
originate from Letta's production keys or user-provided BYOK keys.

The rate limit handler in app.py now returns a more helpful message
for BYOK users ("check your provider's rate limits and billing")
versus the generic message for base provider rate limits.
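
Roughly, the shape of the change (a sketch with illustrative names and
messages, not the exact Letta internals):

    # Hypothetical: every normalized error now tags its key origin
    details = {"provider": "anthropic", "is_byok": True}
    raise LLMError(message="rate limited", details=details)

    # app.py's rate limit handler can then branch on the flag
    if error.details.get("is_byok"):
        msg = "Rate limited: check your provider's rate limits and billing."
    else:
        msg = "Rate limited: please retry shortly."  # illustrative wording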

Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: Letta <noreply@letta.com>
2026-02-24 10:52:07 -08:00


from typing import AsyncGenerator

from letta.adapters.letta_llm_request_adapter import LettaLLMRequestAdapter
from letta.helpers.datetime_helpers import get_utc_timestamp_ns
from letta.schemas.enums import LLMCallType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, TextContent
from letta.schemas.usage import normalize_cache_tokens, normalize_reasoning_tokens


class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
    """Simplifying assumptions:
    - No inner thoughts in kwargs
    - No forced tool calls
    - Content native as assistant message
    """

    async def invoke_llm(
        self,
        request_data: dict,
        messages: list,
        tools: list,
        use_assistant_message: bool,
        requires_approval_tools: list[str] = [],
        step_id: str | None = None,
        actor: str | None = None,
    ) -> AsyncGenerator[LettaMessage | None, None]:
"""
Execute a blocking LLM request and yield the response.
This adapter:
1. Makes a blocking request to the LLM
2. Converts the response to chat completion format
3. Extracts reasoning and tool call information
4. Updates all instance variables
5. Yields nothing (blocking mode doesn't stream)
"""
        # Store request data
        self.request_data = request_data

        # Set telemetry context and make the blocking LLM request
        self.llm_client.set_telemetry_context(
            telemetry_manager=self.telemetry_manager,
            step_id=step_id,
            agent_id=self.agent_id,
            agent_tags=self.agent_tags,
            run_id=self.run_id,
            call_type=LLMCallType.agent_step,
            org_id=self.org_id,
            user_id=self.user_id,
            llm_config=self.llm_config.model_dump() if self.llm_config else None,
        )
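        # Note: handle_llm_error normalizes provider exceptions into LLMError;
        # per this commit, its details dict also carries an is_byok flag so
        # dashboards can separate BYOK failures from base-provider ones.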
        try:
            self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
        except Exception as e:
            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

        # Convert response to chat completion format
        self.chat_completions_response = await self.llm_client.convert_response_to_chat_completion(
            self.response_data, messages, self.llm_config
        )
        # Extract reasoning content from the response
        if self.chat_completions_response.choices[0].message.reasoning_content:
            self.reasoning_content = [
                ReasoningContent(
                    reasoning=self.chat_completions_response.choices[0].message.reasoning_content,
                    is_native=True,
                    signature=self.chat_completions_response.choices[0].message.reasoning_content_signature,
                )
            ]
        elif self.chat_completions_response.choices[0].message.omitted_reasoning_content:
            self.reasoning_content = [OmittedReasoningContent()]
        else:
            # logger.info("No reasoning content found.")
            self.reasoning_content = None
        if self.chat_completions_response.choices[0].message.content:
            # NOTE: big difference - 'content' goes into 'content'
            # Reasoning placed into content for legacy reasons
            # Carry thought_signature on TextContent when ReasoningContent doesn't exist to hold it
            # (e.g. Gemini 2.5 Flash with include_thoughts=False still returns thought_signature)
            orphan_sig = (
                self.chat_completions_response.choices[0].message.reasoning_content_signature if not self.reasoning_content else None
            )
            self.content = [TextContent(text=self.chat_completions_response.choices[0].message.content, signature=orphan_sig)]
        else:
            self.content = None

        if self.reasoning_content and len(self.reasoning_content) > 0:
            # Temp workaround to consolidate parts to persist reasoning content, this should be integrated better
            self.content = self.reasoning_content + (self.content or [])
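        # After this consolidation, self.content is ordered reasoning-first,
        # e.g. [ReasoningContent(...), TextContent(...)]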
        # Extract tool call
        tool_calls = self.chat_completions_response.choices[0].message.tool_calls or []
        self.tool_calls = list(tool_calls)
        self.tool_call = self.tool_calls[0] if self.tool_calls else None

        # Extract logprobs if present
        self.logprobs = self.chat_completions_response.choices[0].logprobs
        # Extract usage statistics
        self.usage.step_count = 1
        self.usage.completion_tokens = self.chat_completions_response.usage.completion_tokens
        self.usage.prompt_tokens = self.chat_completions_response.usage.prompt_tokens
        self.usage.total_tokens = self.chat_completions_response.usage.total_tokens

        # Extract cache and reasoning token details using normalized helpers
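        # (Assumption, based on the unpacking below: normalize_cache_tokens
        # folds provider-specific prompt_tokens_details into a
        # (cached reads, cache writes) pair; normalize_reasoning_tokens does
        # the same for reasoning/thinking token counts.)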
        usage = self.chat_completions_response.usage
        self.usage.cached_input_tokens, self.usage.cache_write_tokens = normalize_cache_tokens(usage.prompt_tokens_details)
        self.usage.reasoning_tokens = normalize_reasoning_tokens(usage.completion_tokens_details)

        self.log_provider_trace(step_id=step_id, actor=actor)

        yield None
        return
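
For context, a minimal sketch of consuming this blocking adapter (assuming an
already-constructed instance; construction isn't shown in this file):

    # Hypothetical driver: blocking mode yields exactly one None, with the
    # results left on the adapter's attributes.
    async def run_step(adapter: SimpleLLMRequestAdapter, request_data: dict, messages: list, tools: list):
        async for _ in adapter.invoke_llm(request_data, messages, tools, use_assistant_message=False):
            pass  # nothing is streamed in blocking mode
        return adapter.content, adapter.tool_calls, adapter.usage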