letta-server/letta/adapters/simple_llm_request_adapter.py
Kian Jones 382e216cbb fix(core): differentiate BYOK vs base provider in all LLM error details (#9425)
Add is_byok flag to every LLMError's details dict returned from
handle_llm_error across all providers (OpenAI, Anthropic, Google,
ChatGPT OAuth). This enables observability into whether errors
originate from Letta's production keys or user-provided BYOK keys.

The rate limit handler in app.py now returns a more helpful message
for BYOK users ("check your provider's rate limits and billing")
versus the generic message for base provider rate limits.
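
Roughly, the shape of the change (a sketch with illustrative names and
messages, not the exact Letta internals):

    # Hypothetical: every normalized error now tags its key origin
    details = {"provider": "anthropic", "is_byok": True}
    raise LLMError(message="rate limited", details=details)

    # app.py's rate limit handler can then branch on the flag
    if error.details.get("is_byok"):
        msg = "Rate limited: check your provider's rate limits and billing."
    else:
        msg = "Rate limited: please retry shortly."  # illustrative wording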

Datadog issues:
- https://us5.datadoghq.com/error-tracking/issue/b711c824-f490-11f0-96e4-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/76623036-f4de-11f0-8697-da7ad0900000
- https://us5.datadoghq.com/error-tracking/issue/43e9888a-dfcf-11f0-a645-da7ad0900000

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: Letta <noreply@letta.com>
2026-02-24 10:52:07 -08:00


from typing import AsyncGenerator

from letta.adapters.letta_llm_request_adapter import LettaLLMRequestAdapter
from letta.helpers.datetime_helpers import get_utc_timestamp_ns
from letta.schemas.enums import LLMCallType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, TextContent
from letta.schemas.usage import normalize_cache_tokens, normalize_reasoning_tokens


class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
    """Simplifying assumptions:
    - No inner thoughts in kwargs
    - No forced tool calls
    - Content native as assistant message
    """

    async def invoke_llm(
        self,
        request_data: dict,
        messages: list,
        tools: list,
        use_assistant_message: bool,
        requires_approval_tools: list[str] = [],
        step_id: str | None = None,
        actor: str | None = None,
    ) -> AsyncGenerator[LettaMessage | None, None]:
"""
Execute a blocking LLM request and yield the response.
This adapter:
1. Makes a blocking request to the LLM
2. Converts the response to chat completion format
3. Extracts reasoning and tool call information
4. Updates all instance variables
5. Yields nothing (blocking mode doesn't stream)
"""
        # Store request data
        self.request_data = request_data

        # Set telemetry context and make the blocking LLM request
        self.llm_client.set_telemetry_context(
            telemetry_manager=self.telemetry_manager,
            step_id=step_id,
            agent_id=self.agent_id,
            agent_tags=self.agent_tags,
            run_id=self.run_id,
            call_type=LLMCallType.agent_step,
            org_id=self.org_id,
            user_id=self.user_id,
            llm_config=self.llm_config.model_dump() if self.llm_config else None,
        )
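        # Note: handle_llm_error normalizes provider exceptions into LLMError;
        # per this commit, its details dict also carries an is_byok flag so
        # dashboards can separate BYOK failures from base-provider ones.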
        try:
            self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
        except Exception as e:
            raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

        # Convert response to chat completion format
        self.chat_completions_response = await self.llm_client.convert_response_to_chat_completion(
            self.response_data, messages, self.llm_config
        )
        # Extract reasoning content from the response
        if self.chat_completions_response.choices[0].message.reasoning_content:
            self.reasoning_content = [
                ReasoningContent(
                    reasoning=self.chat_completions_response.choices[0].message.reasoning_content,
                    is_native=True,
                    signature=self.chat_completions_response.choices[0].message.reasoning_content_signature,
                )
            ]
        elif self.chat_completions_response.choices[0].message.omitted_reasoning_content:
            self.reasoning_content = [OmittedReasoningContent()]
        else:
            # logger.info("No reasoning content found.")
            self.reasoning_content = None
        if self.chat_completions_response.choices[0].message.content:
            # NOTE: big difference - 'content' goes into 'content'
            # Reasoning placed into content for legacy reasons
            # Carry thought_signature on TextContent when ReasoningContent doesn't exist to hold it
            # (e.g. Gemini 2.5 Flash with include_thoughts=False still returns thought_signature)
            orphan_sig = (
                self.chat_completions_response.choices[0].message.reasoning_content_signature if not self.reasoning_content else None
            )
            self.content = [TextContent(text=self.chat_completions_response.choices[0].message.content, signature=orphan_sig)]
        else:
            self.content = None

        if self.reasoning_content and len(self.reasoning_content) > 0:
            # Temp workaround to consolidate parts to persist reasoning content, this should be integrated better
            self.content = self.reasoning_content + (self.content or [])
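        # After this consolidation, self.content is ordered reasoning-first,
        # e.g. [ReasoningContent(...), TextContent(...)]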
        # Extract tool call
        tool_calls = self.chat_completions_response.choices[0].message.tool_calls or []
        self.tool_calls = list(tool_calls)
        self.tool_call = self.tool_calls[0] if self.tool_calls else None

        # Extract logprobs if present
        self.logprobs = self.chat_completions_response.choices[0].logprobs
        # Extract usage statistics
        self.usage.step_count = 1
        self.usage.completion_tokens = self.chat_completions_response.usage.completion_tokens
        self.usage.prompt_tokens = self.chat_completions_response.usage.prompt_tokens
        self.usage.total_tokens = self.chat_completions_response.usage.total_tokens

        # Extract cache and reasoning token details using normalized helpers
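        # (Assumption, based on the unpacking below: normalize_cache_tokens
        # folds provider-specific prompt_tokens_details into a
        # (cached reads, cache writes) pair; normalize_reasoning_tokens does
        # the same for reasoning/thinking token counts.)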
        usage = self.chat_completions_response.usage
        self.usage.cached_input_tokens, self.usage.cache_write_tokens = normalize_cache_tokens(usage.prompt_tokens_details)
        self.usage.reasoning_tokens = normalize_reasoning_tokens(usage.completion_tokens_details)

        self.log_provider_trace(step_id=step_id, actor=actor)

        yield None
        return
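
For context, a minimal sketch of consuming this blocking adapter (assuming an
already-constructed instance; construction isn't shown in this file):

    # Hypothetical driver: blocking mode yields exactly one None, with the
    # results left on the adapter's attributes.
    async def run_step(adapter: SimpleLLMRequestAdapter, request_data: dict, messages: list, tools: list):
        async for _ in adapter.invoke_llm(request_data, messages, tools, use_assistant_message=False):
            pass  # nothing is streamed in blocking mode
        return adapter.content, adapter.tool_calls, adapter.usage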