letta-server/letta/agents/letta_agent_v3.py

import asyncio
import json
import uuid
from typing import Any, AsyncGenerator, Dict, Optional

from opentelemetry.trace import Span

from letta.adapters.letta_llm_adapter import LettaLLMAdapter
from letta.adapters.sglang_native_adapter import SGLangNativeAdapter
from letta.adapters.simple_llm_request_adapter import SimpleLLMRequestAdapter
from letta.adapters.simple_llm_stream_adapter import SimpleLLMStreamAdapter
from letta.agents.helpers import (
    _build_rule_violation_result,
    _load_last_function_response,
    _maybe_get_approval_messages,
    _maybe_get_pending_tool_call_message,
    _prepare_in_context_messages_no_persist_async,
    _safe_load_tool_call_str,
    generate_step_id,
    merge_and_validate_prefilled_args,
)
from letta.agents.letta_agent_v2 import LettaAgentV2
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
from letta.helpers.tool_execution_helper import enable_strict_mode
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentState
from letta.schemas.enums import LLMCallType
from letta.schemas.letta_message import (
    ApprovalReturn,
    CompactionStats,
    EventMessage,
    LettaErrorMessage,
    LettaMessage,
    MessageType,
    SummaryMessage,
    extract_compaction_stats_from_packed_json,
)
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse, TurnTokenData
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, ToolReturn
from letta.schemas.openai.chat_completion_response import ChoiceLogprobs, ToolCall, ToolCallDenial, UsageStatistics
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
from letta.schemas.user import User
from letta.server.rest_api.utils import (
    create_approval_request_message_from_llm_response,
    create_letta_messages_from_llm_response,
    create_parallel_tool_messages_from_llm_response,
    create_tool_returns_for_denials,
)
from letta.services.conversation_manager import ConversationManager
from letta.services.helpers.tool_parser_helper import runtime_override_tool_json_schema
from letta.services.summarizer.compact import compact_messages
from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.services.summarizer.summarizer_sliding_window import count_tokens
from letta.settings import settings, summarizer_settings
from letta.system import package_function_response
from letta.utils import safe_create_task_with_return, validate_function_response


def extract_compaction_stats_from_message(message: Message) -> CompactionStats | None:
    """
    Extract CompactionStats from a Message object's packed content.

    Args:
        message: Message object with packed JSON content

    Returns:
        CompactionStats if found and valid, None otherwise
    """
    try:
        if message.content and len(message.content) == 1:
            text_content = message.content[0].text
            return extract_compaction_stats_from_packed_json(text_content)
    except AttributeError:
        pass
    return None


class LettaAgentV3(LettaAgentV2):
    """
    Similar to V2, but stripped down / simplified, while also generalized:
    * Supports non-tool returns
    * No inner thoughts in kwargs
    * No heartbeats (loops happen on tool calls)

    TODOs:
    * Support tool rules
    * Support Gemini / OpenAI client
    """

    def __init__(
        self,
        agent_state: AgentState,
        actor: User,
        conversation_id: str | None = None,
    ):
        super().__init__(agent_state, actor)
        # Set conversation_id after parent init (which calls _initialize_state)
        self.conversation_id = conversation_id

    def _initialize_state(self):
        super()._initialize_state()
        self._require_tool_call = False
        # Approximate token count for the *current* in-context buffer, used
        # only for proactive summarization / eviction logic. This is derived
        # from per-step usage but can be updated after summarization without
        # affecting step-level telemetry.
        self.context_token_estimate: int | None = None
        self.in_context_messages: list[Message] = []  # in-memory tracker
        # Conversation mode: when set, messages are tracked per-conversation
        self.conversation_id: str | None = None
        # Client-side tools passed in the request (executed by client, not server)
        self.client_tools: list[ClientToolSchema] = []
        # Log probabilities from the most recent LLM call (for RL training)
        self.logprobs: ChoiceLogprobs | None = None
        # Multi-turn token tracking for RL training (accumulated across all LLM calls)
        self.turns: list[TurnTokenData] = []
        self.return_token_ids: bool = False

    def _compute_tool_return_truncation_chars(self) -> int:
        """Compute a dynamic cap for tool returns in requests.

        Heuristic: ~20% of context window × 4 chars/token, minimum 5k chars.
        This prevents any single tool return from consuming too much context.
        """
        try:
            cap = int(self.agent_state.llm_config.context_window * 0.2 * 4)  # 20% of tokens → chars
        except Exception:
            cap = 5000
        return max(5000, cap)

    @trace_method
    async def step(
        self,
        input_messages: list[MessageCreate],
        max_steps: int = DEFAULT_MAX_STEPS,
        run_id: str | None = None,
        use_assistant_message: bool = True,  # NOTE: not used
        include_return_message_types: list[MessageType] | None = None,
        request_start_timestamp_ns: int | None = None,
        conversation_id: str | None = None,
        client_tools: list[ClientToolSchema] | None = None,
        include_compaction_messages: bool = False,
    ) -> LettaResponse:
        """
        Execute the agent loop in blocking mode, returning all messages at once.

        Args:
            input_messages: List of new messages to process
            max_steps: Maximum number of agent steps to execute
            run_id: Optional job/run ID for tracking
            use_assistant_message: Whether to use assistant message format
            include_return_message_types: Filter for which message types to return
            request_start_timestamp_ns: Start time for tracking request duration
            conversation_id: Optional conversation ID for conversation-scoped messaging
            client_tools: Optional list of client-side tools. When called, execution pauses
                for client to provide tool returns.
            include_compaction_messages: Whether to include SummaryMessage/EventMessage in response
                and use role=summary for stored summary messages.

        Returns:
            LettaResponse: Complete response with all messages and metadata
        """
        self._initialize_state()
        self.conversation_id = conversation_id
        self.client_tools = client_tools or []

        # Apply conversation-specific block overrides if conversation_id is provided
        if conversation_id:
            self.agent_state = await ConversationManager().apply_isolated_blocks_to_agent_state(
                agent_state=self.agent_state,
                conversation_id=conversation_id,
                actor=self.actor,
            )

        request_span = self._request_checkpoint_start(request_start_timestamp_ns=request_start_timestamp_ns)
        response_letta_messages = []

        # Prepare in-context messages (conversation mode if conversation_id provided)
        curr_in_context_messages, input_messages_to_persist = await _prepare_in_context_messages_no_persist_async(
            input_messages,
            self.agent_state,
            self.message_manager,
            self.actor,
            run_id,
            conversation_id=conversation_id,
        )
        follow_up_messages = []
        if len(input_messages_to_persist) > 1 and input_messages_to_persist[0].role == "approval":
            follow_up_messages = input_messages_to_persist[1:]
            input_messages_to_persist = [input_messages_to_persist[0]]

        self.in_context_messages = curr_in_context_messages

        # Check if we should use SGLang native adapter for multi-turn RL training
        use_sglang_native = (
            self.agent_state.llm_config.return_token_ids
            and self.agent_state.llm_config.handle
            and self.agent_state.llm_config.handle.startswith("sglang/")
        )
        self.return_token_ids = use_sglang_native

        if use_sglang_native:
            # Use SGLang native adapter for multi-turn RL training
            llm_adapter = SGLangNativeAdapter(
                llm_client=self.llm_client,
                llm_config=self.agent_state.llm_config,
                call_type=LLMCallType.agent_step,
                agent_id=self.agent_state.id,
                agent_tags=self.agent_state.tags,
                run_id=run_id,
                org_id=self.actor.organization_id,
                user_id=self.actor.id,
            )
            # Reset turns tracking for this step
            self.turns = []
        else:
            llm_adapter = SimpleLLMRequestAdapter(
                llm_client=self.llm_client,
                llm_config=self.agent_state.llm_config,
                call_type=LLMCallType.agent_step,
                agent_id=self.agent_state.id,
                agent_tags=self.agent_state.tags,
                run_id=run_id,
                org_id=self.actor.organization_id,
                user_id=self.actor.id,
            )

        credit_task = None
        for i in range(max_steps):
            if i == 1 and follow_up_messages:
                input_messages_to_persist = follow_up_messages
                follow_up_messages = []

            # Await credit check from previous iteration before running next step
            if credit_task is not None:
                if not await credit_task:
                    self.should_continue = False
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.insufficient_credits)
                    break
                credit_task = None

            response = self._step(
                # we append input_messages_to_persist since they aren't checkpointed as in-context until the end of the step (may be rolled back)
                messages=list(self.in_context_messages + input_messages_to_persist),
                input_messages_to_persist=input_messages_to_persist,
                llm_adapter=llm_adapter,
                run_id=run_id,
                # use_assistant_message=use_assistant_message,
                include_return_message_types=include_return_message_types,
                request_start_timestamp_ns=request_start_timestamp_ns,
                include_compaction_messages=include_compaction_messages,
            )
            input_messages_to_persist = []  # clear after first step

            async for chunk in response:
                response_letta_messages.append(chunk)

            # Check if step was cancelled - break out of the step loop
            if not self.should_continue and self.stop_reason.stop_reason == StopReasonType.cancelled.value:
                break

            # TODO: persist the input messages if successful first step completion
            # TODO: persist the new messages / step / run

            ## Proactive summarization if approaching context limit
            # if (
            #    self.context_token_estimate is not None
            #    and self.context_token_estimate > self.agent_state.llm_config.context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
            #    and not self.agent_state.message_buffer_autoclear
            # ):
            #    self.logger.warning(
            #        f"Step usage ({self.last_step_usage.total_tokens} tokens) approaching "
            #        f"context limit ({self.agent_state.llm_config.context_window}), triggering summarization."
            #    )

            #    in_context_messages = await self.summarize_conversation_history(
            #        in_context_messages=in_context_messages,
            #        new_letta_messages=self.response_messages,
            #        total_tokens=self.context_token_estimate,
            #        force=True,
            #    )

            #    # Clear to avoid duplication in next iteration
            #    self.response_messages = []

            if not self.should_continue:
                break

            # Fire credit check to run in parallel with loop overhead / next step setup
            credit_task = safe_create_task_with_return(self._check_credits())

            # input_messages_to_persist = []

            if i == max_steps - 1 and self.stop_reason is None:
                self.stop_reason = LettaStopReason(stop_reason=StopReasonType.max_steps.value)

        ## Rebuild context window after stepping (safety net)
        # if not self.agent_state.message_buffer_autoclear:
        #    if self.context_token_estimate is not None:
        #        await self.summarize_conversation_history(
        #            in_context_messages=in_context_messages,
        #            new_letta_messages=self.response_messages,
        #            total_tokens=self.context_token_estimate,
        #            force=False,
        #        )
        #    else:
        #        self.logger.warning(
        #            "Post-loop summarization skipped: last_step_usage is None. "
        #            "No step completed successfully or usage stats were not updated."
        #        )

        if self.stop_reason is None:
            self.stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)

        # construct the response
        response_letta_messages = Message.to_letta_messages_from_list(
            self.response_messages,
            use_assistant_message=False,  # NOTE: set to false
            reverse=False,
            text_is_assistant_message=True,
        )
        if include_return_message_types:
            response_letta_messages = [m for m in response_letta_messages if m.message_type in include_return_message_types]
        # Set context_tokens to expose actual context window usage (vs accumulated prompt_tokens)
        self.usage.context_tokens = self.context_token_estimate
        result = LettaResponse(
            messages=response_letta_messages,
            stop_reason=self.stop_reason,
            usage=self.usage,
            logprobs=self.logprobs,
            turns=self.turns if self.return_token_ids and self.turns else None,
        )
        if run_id:
            if self.job_update_metadata is None:
                self.job_update_metadata = {}
            self.job_update_metadata["result"] = result.model_dump(mode="json")

        await self._request_checkpoint_finish(
            request_span=request_span, request_start_timestamp_ns=request_start_timestamp_ns, run_id=run_id
        )
        return result

    @trace_method
    async def stream(
        self,
        input_messages: list[MessageCreate],
        max_steps: int = DEFAULT_MAX_STEPS,
        stream_tokens: bool = False,
        run_id: str | None = None,
        use_assistant_message: bool = True,  # NOTE: not used
        include_return_message_types: list[MessageType] | None = None,
        request_start_timestamp_ns: int | None = None,
        conversation_id: str | None = None,
        client_tools: list[ClientToolSchema] | None = None,
        include_compaction_messages: bool = False,
    ) -> AsyncGenerator[str, None]:
        """
        Execute the agent loop in streaming mode, yielding chunks as they become available.
        If stream_tokens is True, individual tokens are streamed as they arrive from the LLM,
        providing the lowest latency experience, otherwise each complete step (reasoning +
        tool call + tool return) is yielded as it completes.

        Args:
            input_messages: List of new messages to process
            max_steps: Maximum number of agent steps to execute
            stream_tokens: Whether to stream back individual tokens. Not all llm
                providers offer native token streaming functionality; in these cases,
                this api streams back steps rather than individual tokens.
            run_id: Optional job/run ID for tracking
            use_assistant_message: Whether to use assistant message format
            include_return_message_types: Filter for which message types to return
            request_start_timestamp_ns: Start time for tracking request duration
            conversation_id: Optional conversation ID for conversation-scoped messaging
            client_tools: Optional list of client-side tools. When called, execution pauses
                for client to provide tool returns.

        Yields:
            str: JSON-formatted SSE data chunks for each completed step
        """
        self._initialize_state()
        self.conversation_id = conversation_id
        self.client_tools = client_tools or []
        request_span = self._request_checkpoint_start(request_start_timestamp_ns=request_start_timestamp_ns)
        response_letta_messages = []
        first_chunk = True

        # Apply conversation-specific block overrides if conversation_id is provided
        if conversation_id:
            self.agent_state = await ConversationManager().apply_isolated_blocks_to_agent_state(
                agent_state=self.agent_state,
                conversation_id=conversation_id,
                actor=self.actor,
            )

        # Check if we should use SGLang native adapter for multi-turn RL training
        use_sglang_native = (
            self.agent_state.llm_config.return_token_ids
            and self.agent_state.llm_config.handle
            and self.agent_state.llm_config.handle.startswith("sglang/")
        )
        self.return_token_ids = use_sglang_native

        if stream_tokens:
            llm_adapter = SimpleLLMStreamAdapter(
                llm_client=self.llm_client,
                llm_config=self.agent_state.llm_config,
                call_type=LLMCallType.agent_step,
                agent_id=self.agent_state.id,
                agent_tags=self.agent_state.tags,
                run_id=run_id,
                org_id=self.actor.organization_id,
                user_id=self.actor.id,
            )
        elif use_sglang_native:
            # Use SGLang native adapter for multi-turn RL training
            llm_adapter = SGLangNativeAdapter(
                llm_client=self.llm_client,
                llm_config=self.agent_state.llm_config,
                call_type=LLMCallType.agent_step,
                agent_id=self.agent_state.id,
                agent_tags=self.agent_state.tags,
                run_id=run_id,
                org_id=self.actor.organization_id,
                user_id=self.actor.id,
            )
            # Reset turns tracking for this step
            self.turns = []
        else:
            llm_adapter = SimpleLLMRequestAdapter(
                llm_client=self.llm_client,
                llm_config=self.agent_state.llm_config,
                call_type=LLMCallType.agent_step,
                agent_id=self.agent_state.id,
                agent_tags=self.agent_state.tags,
                run_id=run_id,
                org_id=self.actor.organization_id,
                user_id=self.actor.id,
            )

        try:
            # Prepare in-context messages (conversation mode if conversation_id provided)
            in_context_messages, input_messages_to_persist = await _prepare_in_context_messages_no_persist_async(
                input_messages,
                self.agent_state,
                self.message_manager,
                self.actor,
                run_id,
                conversation_id=conversation_id,
            )
            follow_up_messages = []
            if len(input_messages_to_persist) > 1 and input_messages_to_persist[0].role == "approval":
                follow_up_messages = input_messages_to_persist[1:]
                input_messages_to_persist = [input_messages_to_persist[0]]

            self.in_context_messages = in_context_messages
            credit_task = None
            for i in range(max_steps):
                if i == 1 and follow_up_messages:
                    input_messages_to_persist = follow_up_messages
                    follow_up_messages = []

                # Await credit check from previous iteration before running next step
                if credit_task is not None:
                    if not await credit_task:
                        self.should_continue = False
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.insufficient_credits)
                        break
                    credit_task = None

                response = self._step(
                    # we append input_messages_to_persist since they aren't checkpointed as in-context until the end of the step (may be rolled back)
                    messages=list(self.in_context_messages + input_messages_to_persist),
                    input_messages_to_persist=input_messages_to_persist,
                    llm_adapter=llm_adapter,
                    run_id=run_id,
                    # use_assistant_message=use_assistant_message,
                    include_return_message_types=include_return_message_types,
                    request_start_timestamp_ns=request_start_timestamp_ns,
                    include_compaction_messages=include_compaction_messages,
                )
                input_messages_to_persist = []  # clear after first step
                async for chunk in response:
                    response_letta_messages.append(chunk)
                    if first_chunk:
                        request_span = self._request_checkpoint_ttft(request_span, request_start_timestamp_ns)

                    # Log chunks with missing id or otid for debugging.
                    # Compaction EventMessage is intentionally metadata-only and may omit otid.
                    is_compaction_event = isinstance(chunk, EventMessage) and chunk.event_type == "compaction"
                    if isinstance(chunk, LettaMessage) and (not chunk.id or not chunk.otid) and not is_compaction_event:
                        self.logger.warning(
                            "Streaming chunk missing id or otid: message_type=%s id=%s otid=%s step_id=%s",
                            chunk.message_type,
                            chunk.id,
                            chunk.otid,
                            chunk.step_id,
                        )

                    yield f"data: {chunk.model_dump_json()}\n\n"
                    first_chunk = False

                # Check if step was cancelled - break out of the step loop
                if not self.should_continue and self.stop_reason.stop_reason == StopReasonType.cancelled.value:
                    break

                # refresh in-context messages (TODO: remove?)
                # in_context_messages = await self._refresh_messages(in_context_messages)

                if not self.should_continue:
                    break

                # Fire credit check to run in parallel with loop overhead / next step setup
                credit_task = safe_create_task_with_return(self._check_credits())

                if i == max_steps - 1 and self.stop_reason is None:
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.max_steps.value)

            ## Rebuild context window after stepping (safety net)
            # if not self.agent_state.message_buffer_autoclear:
            #    if self.context_token_estimate is not None:
            #        await self.summarize_conversation_history(
            #            in_context_messages=in_context_messages,
            #            new_letta_messages=self.response_messages,
            #            total_tokens=self.context_token_estimate,
            #            force=False,
            #        )
            #    else:
            #        self.logger.warning(
            #            "Post-loop summarization skipped: last_step_usage is None. "
            #            "No step completed successfully or usage stats were not updated."
            #        )

            if self.stop_reason is None:
                self.stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)

        except Exception as e:
            # Use repr() if str() is empty (happens with Exception() with no args)
            error_detail = str(e) or repr(e)
            self.logger.warning(f"Error during agent stream: {error_detail}", exc_info=True)

            # Set stop_reason if not already set
            if self.stop_reason is None:
                # Classify error type
                if isinstance(e, LLMError):
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
                else:
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)

            if first_chunk:
                # Raise if no chunks sent yet (response not started, can return error status code)
                raise
            else:
                yield f"data: {self.stop_reason.model_dump_json()}\n\n"

                # Mid-stream error: yield error event to client in SSE format
                error_message = LettaErrorMessage(
                    run_id=run_id,
                    error_type="internal_error",
                    message="An error occurred during agent execution.",
                    detail=error_detail,
                )
                yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"

                # Return immediately - don't fall through to finish chunks
                # This prevents sending end_turn finish chunks after an error
                return

        # Cleanup and finalize (only runs if no exception occurred)
        try:
            # Set context_tokens to expose actual context window usage (vs accumulated prompt_tokens)
            self.usage.context_tokens = self.context_token_estimate

            if run_id:
                # Filter out LettaStopReason from messages (only valid in LettaStreamingResponse, not LettaResponse)
                filtered_messages = [m for m in response_letta_messages if not isinstance(m, LettaStopReason)]
                result = LettaResponse(
                    messages=filtered_messages,
                    stop_reason=self.stop_reason,
                    usage=self.usage,
                    logprobs=self.logprobs,
                    turns=self.turns if self.return_token_ids and self.turns else None,
                )
                if self.job_update_metadata is None:
                    self.job_update_metadata = {}
                self.job_update_metadata["result"] = result.model_dump(mode="json")

            await self._request_checkpoint_finish(
                request_span=request_span, request_start_timestamp_ns=request_start_timestamp_ns, run_id=run_id
            )
            for finish_chunk in self.get_finish_chunks_for_stream(self.usage, self.stop_reason):
                yield f"data: {finish_chunk}\n\n"
        except Exception as cleanup_error:
            # Error during cleanup/finalization - ensure we still send a terminal event
            self.logger.error(f"Error during stream cleanup: {cleanup_error}", exc_info=True)

            # Set stop_reason if not already set
            if self.stop_reason is None:
                self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)

            yield f"data: {self.stop_reason.model_dump_json()}\n\n"

            # Send error event
            error_message = LettaErrorMessage(
                run_id=run_id,
                error_type="cleanup_error",
                message="An error occurred during stream finalization.",
                detail=str(cleanup_error),
            )
            yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
            # Note: we don't send finish chunks here since we already errored

    async def _check_for_system_prompt_overflow(self, system_message):
        """
        Since the system prompt cannot be compacted, we need to check to see if it is the cause of the context overflow
        """
        system_prompt_token_estimate = await count_tokens(
            actor=self.actor,
            llm_config=self.agent_state.llm_config,
            messages=[system_message],
        )
        if system_prompt_token_estimate is not None and system_prompt_token_estimate >= self.agent_state.llm_config.context_window:
            self.should_continue = False
            self.stop_reason = LettaStopReason(stop_reason=StopReasonType.context_window_overflow_in_system_prompt.value)
            raise SystemPromptTokenExceededError(
                system_prompt_token_estimate=system_prompt_token_estimate,
                context_window=self.agent_state.llm_config.context_window,
            )

    async def _checkpoint_messages(self, run_id: str, step_id: str, new_messages: list[Message], in_context_messages: list[Message]):
        """
        Checkpoint the current message state - run this only when the current messages are 'safe' - meaning the step has completed successfully.

        This handles:
        - Persisting the new messages into the `messages` table
        - Updating the in-memory trackers for in-context messages (`self.in_context_messages`) and agent state (`self.agent_state.message_ids`)
        - Updating the DB with the current in-context messages (`self.agent_state.message_ids`) OR conversation_messages table

        Args:
            run_id: The run ID to associate with the messages
            step_id: The step ID to associate with the messages
            new_messages: The new messages to persist
            in_context_messages: The current in-context messages
        """
        # make sure all the new messages have the correct run_id, step_id, and conversation_id
        for message in new_messages:
            message.step_id = step_id
            message.run_id = run_id
            message.conversation_id = self.conversation_id

        # persist the new message objects - ONLY place where messages are persisted
        await self.message_manager.create_many_messages_async(
            new_messages,
            actor=self.actor,
            run_id=run_id,
            project_id=self.agent_state.project_id,
            template_id=self.agent_state.template_id,
        )

        if self.conversation_id:
            # Conversation mode: update conversation_messages table
            # Add new messages to conversation tracking
            new_message_ids = [m.id for m in new_messages]
            if new_message_ids:
                await ConversationManager().add_messages_to_conversation(
                    conversation_id=self.conversation_id,
                    agent_id=self.agent_state.id,
                    message_ids=new_message_ids,
                    actor=self.actor,
                )

            # Update which messages are in context
            # Note: update_in_context_messages also updates positions to preserve order
            await ConversationManager().update_in_context_messages(
                conversation_id=self.conversation_id,
                in_context_message_ids=[m.id for m in in_context_messages],
                actor=self.actor,
            )
        else:
            # Default mode: update agent.message_ids
            await self.agent_manager.update_message_ids_async(
                agent_id=self.agent_state.id,
                message_ids=[m.id for m in in_context_messages],
                actor=self.actor,
            )
            self.agent_state.message_ids = [m.id for m in in_context_messages]  # update in-memory state

        self.in_context_messages = in_context_messages  # update in-memory state

    def _create_compaction_event_message(
        self,
        step_id: str | None,
        run_id: str | None,
        trigger: str,
    ) -> EventMessage:
        """
        Create an EventMessage to notify the client that compaction is starting.

        Args:
            step_id: The current step ID
            run_id: The current run ID
            trigger: The trigger that caused compaction (e.g., "context_window_exceeded", "post_step_context_check")

        Returns:
            EventMessage to yield before compaction starts
        """
        return EventMessage(
            id=str(uuid.uuid4()),
            date=get_utc_time(),
            event_type="compaction",
            event_data={
                "trigger": trigger,
                "context_token_estimate": self.context_token_estimate,
                "context_window": self.agent_state.llm_config.context_window,
            },
            run_id=run_id,
            step_id=step_id,
        )

    def _create_summary_result_message(
        self,
        summary_message: Message,
        summary_text: str,
        step_id: str | None,
        run_id: str | None,
        include_compaction_messages: bool,
    ) -> list[LettaMessage]:
        """
        Create the summary message to yield to the client after compaction completes.

        Args:
            summary_message: The persisted summary Message object
            summary_text: The raw summary text (unpacked)
            step_id: The current step ID
            run_id: The current run ID
            include_compaction_messages: If True, return SummaryMessage; if False, return UserMessage

        Returns:
            List of LettaMessage objects to yield to the client
        """
        if include_compaction_messages:
            # Extract compaction_stats from the packed message content if available
            compaction_stats = extract_compaction_stats_from_message(summary_message)

            # New behavior: structured SummaryMessage
            return [
                SummaryMessage(
                    id=summary_message.id,
                    date=summary_message.created_at,
                    summary=summary_text,
                    otid=Message.generate_otid_from_id(summary_message.id, 0),
                    step_id=step_id,
                    run_id=run_id,
                    compaction_stats=compaction_stats,
                ),
            ]
        else:
            # Old behavior: UserMessage with packed JSON
            return list(Message.to_letta_messages(summary_message))

    @trace_method
    async def _step(
        self,
        messages: list[Message],  # current in-context messages
        llm_adapter: LettaLLMAdapter,
        input_messages_to_persist: list[Message] | None = None,
        run_id: str | None = None,
        # use_assistant_message: bool = True,
        include_return_message_types: list[MessageType] | None = None,
        request_start_timestamp_ns: int | None = None,
        remaining_turns: int = -1,
        dry_run: bool = False,
        enforce_run_id_set: bool = True,
        include_compaction_messages: bool = False,
    ) -> AsyncGenerator[LettaMessage | dict, None]:
        """
        Execute a single agent step (one LLM call and tool execution).

        This is the core execution method that all public methods (step, stream_steps,
        stream_tokens) funnel through. It handles the complete flow of making an LLM
        request, processing the response, executing tools, and persisting messages.

        Args:
            messages: Current in-context messages
            llm_adapter: Adapter for LLM interaction (blocking or streaming)
            input_messages_to_persist: New messages to persist after execution
            run_id: Optional job/run ID for tracking
            include_return_message_types: Filter for which message types to yield
            request_start_timestamp_ns: Start time for tracking request duration
            remaining_turns: Number of turns remaining (for max_steps enforcement)
            dry_run: If true, only build and return the request without executing

        Yields:
            LettaMessage or dict: Chunks for streaming mode, or request data for dry_run
        """
        if enforce_run_id_set and run_id is None:
            raise AssertionError("run_id is required when enforce_run_id_set is True")

        input_messages_to_persist = input_messages_to_persist or []

        if self.context_token_estimate is None:
            self.logger.warning("Context token estimate is not set")

        step_progression = StepProgression.START
        caught_exception = None
        # TODO(@caren): clean this up
        tool_calls, content, agent_step_span, _first_chunk, step_id, logged_step, _step_start_ns, step_metrics = (
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        try:
            self.last_function_response = _load_last_function_response(messages)
            valid_tools = await self._get_valid_tools()
            require_tool_call = self.tool_rules_solver.should_force_tool_call()

            if self._require_tool_call != require_tool_call:
                if require_tool_call:
                    self.logger.info("switching to constrained mode (forcing tool call)")
                else:
                    self.logger.info("switching to unconstrained mode (allowing non-tool responses)")
            self._require_tool_call = require_tool_call

            # Refresh messages at the start of each step to scrub inner thoughts.
            # NOTE: We skip system prompt refresh during normal steps to preserve prefix caching.
            # The system prompt is only rebuilt after compaction or message reset.
            try:
                messages = await self._refresh_messages(messages, force_system_prompt_refresh=False)
            except Exception as e:
                self.logger.warning(f"Failed to refresh messages at step start: {e}")

            approval_request, approval_response = _maybe_get_approval_messages(messages)
            tool_call_denials, tool_returns = [], []
            if approval_request and approval_response:
                # case of handling approval responses
                content = approval_request.content

                # Get tool calls that are pending
                backfill_tool_call_id = approval_request.tool_calls[0].id  # legacy case
                if approval_response.approvals:
                    approved_tool_call_ids = {
                        backfill_tool_call_id if a.tool_call_id.startswith("message-") else a.tool_call_id
                        for a in approval_response.approvals
                        if isinstance(a, ApprovalReturn) and a.approve
                    }
                else:
                    approved_tool_call_ids = {}
                tool_calls = [tool_call for tool_call in approval_request.tool_calls if tool_call.id in approved_tool_call_ids]
                pending_tool_call_message = _maybe_get_pending_tool_call_message(messages)
                if pending_tool_call_message:
                    tool_calls.extend(pending_tool_call_message.tool_calls)

                # Get tool calls that were denied
                if approval_response.approvals:
                    denies = {d.tool_call_id: d for d in approval_response.approvals if isinstance(d, ApprovalReturn) and not d.approve}
                else:
                    denies = {}
                tool_call_denials = [
                    ToolCallDenial(**t.model_dump(), reason=denies.get(t.id).reason) for t in approval_request.tool_calls if t.id in denies
                ]

                # Get tool calls that were executed client side
                if approval_response.approvals:
                    tool_returns = [r for r in approval_response.approvals if isinstance(r, ToolReturn)]

                # Validate that the approval response contains meaningful data
                # If all three lists are empty, this is a malformed approval response
                if not tool_calls and not tool_call_denials and not tool_returns:
                    self.logger.error(
                        f"Invalid approval response: approval_response.approvals is {approval_response.approvals} "
                        f"but no tool calls, denials, or returns were extracted. "
                        f"This likely indicates a corrupted or malformed approval payload."
                    )
                    self.should_continue = False
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_tool_call.value)
                    return

                step_id = approval_request.step_id
                if step_id is None:
                    # Old approval messages may not have step_id set - generate a new one
                    self.logger.warning(f"Approval request message {approval_request.id} has no step_id, generating new step_id")
                    step_id = generate_step_id()
                    step_progression, logged_step, step_metrics, agent_step_span = await self._step_checkpoint_start(
                        step_id=step_id, run_id=run_id
                    )
                else:
                    step_metrics = await self.step_manager.get_step_metrics_async(step_id=step_id, actor=self.actor)
            else:
                # Check for job cancellation at the start of each step
                if run_id and await self._check_run_cancellation(run_id):
                    self.should_continue = False
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.cancelled.value)
                    self.logger.info(f"Agent execution cancelled for run {run_id}")
                    return

                step_id = generate_step_id()
                step_progression, logged_step, step_metrics, agent_step_span = await self._step_checkpoint_start(
                    step_id=step_id, run_id=run_id
                )

                force_tool_call = valid_tools[0]["name"] if len(valid_tools) == 1 and self._require_tool_call else None
                for llm_request_attempt in range(summarizer_settings.max_summarizer_retries + 1):
                    try:
                        request_data = self.llm_client.build_request_data(
                            agent_type=self.agent_state.agent_type,
                            messages=messages,
                            llm_config=self.agent_state.llm_config,
                            tools=valid_tools,
                            force_tool_call=force_tool_call,
                            requires_subsequent_tool_call=self._require_tool_call,
                            tool_return_truncation_chars=self._compute_tool_return_truncation_chars(),
                        )
                        # TODO: Extend to more providers, and also approval tool rules
                        # TODO: this entire code block should be inside of the clients
                        # Enable parallel tool use when no tool rules are attached
                        try:
                            no_tool_rules = (
                                not self.agent_state.tool_rules
                                or len([t for t in self.agent_state.tool_rules if t.type != "requires_approval"]) == 0
                            )

                            # Anthropic/Bedrock/MiniMax parallel tool use (MiniMax uses Anthropic-compatible API)
                            if self.agent_state.llm_config.model_endpoint_type in ["anthropic", "bedrock", "minimax"]:
                                if (
                                    isinstance(request_data.get("tool_choice"), dict)
                                    and "disable_parallel_tool_use" in request_data["tool_choice"]
                                ):
                                    # Gate parallel tool use on both: no tool rules and toggled on
                                    if no_tool_rules and self.agent_state.llm_config.parallel_tool_calls:
                                        request_data["tool_choice"]["disable_parallel_tool_use"] = False
                                    else:
                                        # Explicitly disable when tool rules present or llm_config toggled off
                                        request_data["tool_choice"]["disable_parallel_tool_use"] = True

                            # OpenAI parallel tool use
                            elif self.agent_state.llm_config.model_endpoint_type == "openai":
                                # For OpenAI, we control parallel tool calling via parallel_tool_calls field
                                # Only allow parallel tool calls when no tool rules and enabled in config
                                if "parallel_tool_calls" in request_data:
                                    if no_tool_rules and self.agent_state.llm_config.parallel_tool_calls:
                                        request_data["parallel_tool_calls"] = True
                                    else:
                                        request_data["parallel_tool_calls"] = False

                            # Gemini (Google AI/Vertex) parallel tool use
                            elif self.agent_state.llm_config.model_endpoint_type in ["google_ai", "google_vertex"]:
                                # Gemini supports parallel tool calling natively through multiple parts in the response
                                # We just need to ensure the config flag is set for tracking purposes
                                # The actual handling happens in GoogleVertexClient.convert_response_to_chat_completion
                                pass  # No specific request_data field needed for Gemini
                        except Exception:
                            # if this fails, we simply don't enable parallel tool use
                            pass
                        if dry_run:
                            yield request_data
                            return

                        step_progression, step_metrics = self._step_checkpoint_llm_request_start(step_metrics, agent_step_span)
                        invocation = llm_adapter.invoke_llm(
                            request_data=request_data,
                            messages=messages,
                            tools=valid_tools,
                            use_assistant_message=False,  # NOTE: set to false
                            requires_approval_tools=self.tool_rules_solver.get_requires_approval_tools(
                                set([t["name"] for t in valid_tools])
                            )
                            + [ct.name for ct in self.client_tools],
                            step_id=step_id,
                            actor=self.actor,
                        )
                        async for chunk in invocation:
                            if llm_adapter.supports_token_streaming():
                                if include_return_message_types is None or chunk.message_type in include_return_message_types:
                                    yield chunk
                        # If you've reached this point without an error, break out of retry loop
                        break
                    except ValueError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
                        raise e
                    except LLMError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
                        raise e
                    except Exception as e:
                        if isinstance(e, ContextWindowExceededError) and llm_request_attempt < summarizer_settings.max_summarizer_retries:
                            # Retry case
                            self.logger.info(
                                f"Context window exceeded (error {e}), trying to compact messages attempt {llm_request_attempt + 1} of {summarizer_settings.max_summarizer_retries + 1}"
                            )
                            try:
                                # Capture pre-compaction state for metadata
                                context_tokens_before = self.context_token_estimate
                                messages_count_before = len(messages)

                                # Yield event notification before compaction starts
                                if include_compaction_messages:
                                    yield self._create_compaction_event_message(
                                        step_id=step_id,
                                        run_id=run_id,
                                        trigger="context_window_exceeded",
                                    )

                                # Ensure system prompt is recompiled before summarization so compaction
                                # operates on the latest system+memory state (including recent repairs).
                                messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)

                                summary_message, messages, summary_text = await self.compact(
                                    messages,
                                    trigger_threshold=self.agent_state.llm_config.context_window,
                                    run_id=run_id,
                                    step_id=step_id,
                                    use_summary_role=include_compaction_messages,
                                    trigger="context_window_exceeded",
                                    context_tokens_before=context_tokens_before,
                                    messages_count_before=messages_count_before,
                                )

                                # Recompile the persisted system prompt after compaction so subsequent
                                # turns load the repaired system+memory state from message_ids[0].
                                await self.agent_manager.rebuild_system_prompt_async(
                                    agent_id=self.agent_state.id,
                                    actor=self.actor,
                                    force=True,
                                    update_timestamp=True,
                                )
                                # Force system prompt rebuild after compaction to update memory blocks and timestamps
                                messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
                                self.logger.info("Summarization succeeded, continuing to retry LLM request")

                                # Persist the summary message
                                self.response_messages.append(summary_message)
                                await self._checkpoint_messages(
                                    run_id=run_id,
                                    step_id=step_id,
                                    new_messages=[summary_message],
                                    in_context_messages=messages,
                                )

                                # Yield summary result message to client
                                for msg in self._create_summary_result_message(
                                    summary_message=summary_message,
                                    summary_text=summary_text,
                                    step_id=step_id,
                                    run_id=run_id,
                                    include_compaction_messages=include_compaction_messages,
                                ):
                                    yield msg

                                continue
                            except SystemPromptTokenExceededError:
                                self.should_continue = False
                                self.stop_reason = LettaStopReason(
                                    stop_reason=StopReasonType.context_window_overflow_in_system_prompt.value
                                )
                                raise
                            except Exception as e:
                                self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
                                self.logger.error(f"Unknown error occured for summarization run {run_id}: {e}")
                                raise e

                        else:
                            self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
                            self.logger.error(f"Unknown error occured for run {run_id}: {e}")
                            raise e

                step_progression, step_metrics = self._step_checkpoint_llm_request_finish(
                    step_metrics, agent_step_span, llm_adapter.llm_request_finish_timestamp_ns
                )
                # update metrics
                self._update_global_usage_stats(llm_adapter.usage)
                self.context_token_estimate = llm_adapter.usage.total_tokens
                self.logger.info(f"Context token estimate after LLM request: {self.context_token_estimate}")

                # Extract logprobs if present (for RL training)
                if llm_adapter.logprobs is not None:
                    self.logprobs = llm_adapter.logprobs

                # Track turn data for multi-turn RL training (SGLang native mode)
                if self.return_token_ids and hasattr(llm_adapter, "output_ids") and llm_adapter.output_ids:
                    self.turns.append(
                        TurnTokenData(
                            role="assistant",
                            output_ids=llm_adapter.output_ids,
                            output_token_logprobs=llm_adapter.output_token_logprobs,
                            content=llm_adapter.chat_completions_response.choices[0].message.content
                            if llm_adapter.chat_completions_response
                            else None,
                        )
                    )

                # Handle the AI response with the extracted data (supports multiple tool calls)
                # Gather tool calls - check for multi-call API first, then fall back to single
                if hasattr(llm_adapter, "tool_calls") and llm_adapter.tool_calls:
                    tool_calls = llm_adapter.tool_calls
                elif llm_adapter.tool_call is not None:
                    tool_calls = [llm_adapter.tool_call]
                else:
                    tool_calls = []

                # Enforce parallel_tool_calls=false by truncating to first tool call
                # Some providers (e.g. Gemini) don't respect this setting via API, so we enforce it client-side
                if len(tool_calls) > 1 and not self.agent_state.llm_config.parallel_tool_calls:
                    self.logger.warning(
                        f"LLM returned {len(tool_calls)} tool calls but parallel_tool_calls=false. "
                        f"Truncating to first tool call: {tool_calls[0].function.name}"
                    )
                    tool_calls = [tool_calls[0]]

            # get the new generated `Message` objects from handling the LLM response
            new_messages, self.should_continue, self.stop_reason = await self._handle_ai_response(
                tool_calls=tool_calls,
                valid_tool_names=[tool["name"] for tool in valid_tools],
                tool_rules_solver=self.tool_rules_solver,
                usage=UsageStatistics(
                    completion_tokens=self.usage.completion_tokens,
                    prompt_tokens=self.usage.prompt_tokens,
                    total_tokens=self.usage.total_tokens,
                ),
                content=content or llm_adapter.content,
                pre_computed_assistant_message_id=llm_adapter.message_id,
                step_id=step_id,
                initial_messages=[],  # input_messages_to_persist, # TODO: deprecate - super confusing
                agent_step_span=agent_step_span,
                is_final_step=(remaining_turns == 0),
                run_id=run_id,
                step_metrics=step_metrics,
                is_approval_response=approval_response is not None,
                tool_call_denials=tool_call_denials,
                tool_returns=tool_returns,
                finish_reason=llm_adapter.finish_reason,
            )

            # extend trackers with new messages
            self.response_messages.extend(new_messages)
            messages.extend(new_messages)

            # Track tool return turns for multi-turn RL training
            if self.return_token_ids:
                for msg in new_messages:
                    if msg.role == "tool":
                        # Get tool return content
                        tool_content = None
                        tool_name = None
                        if hasattr(msg, "tool_returns") and msg.tool_returns:
                            # Aggregate all tool returns into content (func_response is the actual content)
                            parts = []
                            for tr in msg.tool_returns:
                                if hasattr(tr, "func_response") and tr.func_response:
                                    if isinstance(tr.func_response, str):
                                        parts.append(tr.func_response)
                                    else:
                                        parts.append(str(tr.func_response))
                            tool_content = "\n".join(parts)
                        elif hasattr(msg, "content") and msg.content:
                            tool_content = msg.content if isinstance(msg.content, str) else str(msg.content)
                        if hasattr(msg, "name"):
                            tool_name = msg.name
                        if tool_content:
                            self.turns.append(
                                TurnTokenData(
                                    role="tool",
                                    content=tool_content,
                                    tool_name=tool_name,
                                )
                            )

            # step(...) has successfully completed! now we can persist messages and update the in-context messages + save metrics
            # persistence needs to happen before streaming to minimize chances of agent getting into an inconsistent state
            step_progression, step_metrics = await self._step_checkpoint_finish(step_metrics, agent_step_span, logged_step)
            await self._checkpoint_messages(
                run_id=run_id,
                step_id=step_id,
                new_messages=input_messages_to_persist + new_messages,
                in_context_messages=messages,  # update the in-context messages
            )

            # yield back generated messages
            if llm_adapter.supports_token_streaming():
                if tool_calls:
                    # Stream each tool return if tools were executed
                    response_tool_returns = [msg for msg in new_messages if msg.role == "tool"]
                    for tr in response_tool_returns:
                        # Skip streaming for aggregated parallel tool returns (no per-call tool_call_id)
                        if tr.tool_call_id is None and tr.tool_returns:
                            continue
                        tool_return_letta = tr.to_letta_messages()[0]
                        if include_return_message_types is None or tool_return_letta.message_type in include_return_message_types:
                            yield tool_return_letta
            else:
                # TODO: modify this use step_response_messages
                filter_user_messages = [m for m in new_messages if m.role != "user"]
                letta_messages = Message.to_letta_messages_from_list(
                    filter_user_messages,
                    use_assistant_message=False,  # NOTE: set to false
                    reverse=False,
                    # text_is_assistant_message=(self.agent_state.agent_type == AgentType.react_agent),
                    text_is_assistant_message=True,
                )
                for message in letta_messages:
                    if include_return_message_types is None or message.message_type in include_return_message_types:
                        yield message

            # check compaction
            if self.context_token_estimate is not None and self.context_token_estimate > self.agent_state.llm_config.context_window:
                self.logger.info(
                    f"Context window exceeded (current: {self.context_token_estimate}, threshold: {self.agent_state.llm_config.context_window}), trying to compact messages"
                )

                # Capture pre-compaction state for metadata
                context_tokens_before = self.context_token_estimate
                messages_count_before = len(messages)

                # Yield event notification before compaction starts
                if include_compaction_messages:
                    yield self._create_compaction_event_message(
                        step_id=step_id,
                        run_id=run_id,
                        trigger="post_step_context_check",
                    )

                try:
                    # Ensure system prompt is recompiled before summarization so compaction
                    # operates on the latest system+memory state (including recent repairs).
                    messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)

                    summary_message, messages, summary_text = await self.compact(
                        messages,
                        trigger_threshold=self.agent_state.llm_config.context_window,
                        run_id=run_id,
                        step_id=step_id,
                        use_summary_role=include_compaction_messages,
                        trigger="post_step_context_check",
                        context_tokens_before=context_tokens_before,
                        messages_count_before=messages_count_before,
                    )

                    # Recompile the persisted system prompt after compaction so subsequent
                    # turns load the repaired system+memory state from message_ids[0].
                    await self.agent_manager.rebuild_system_prompt_async(
                        agent_id=self.agent_state.id,
                        actor=self.actor,
                        force=True,
                        update_timestamp=True,
                    )
                    # Force system prompt rebuild after compaction to update memory blocks and timestamps
                    messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
                    # TODO: persist + return the summary message
                    # TODO: convert this to a SummaryMessage
                    self.response_messages.append(summary_message)

                    # Yield summary result message to client
                    for msg in self._create_summary_result_message(
                        summary_message=summary_message,
                        summary_text=summary_text,
                        step_id=step_id,
                        run_id=run_id,
                        include_compaction_messages=include_compaction_messages,
                    ):
                        yield msg

                    await self._checkpoint_messages(
                        run_id=run_id,
                        step_id=step_id,
                        new_messages=[summary_message],
                        in_context_messages=messages,
                    )
                except SystemPromptTokenExceededError:
                    self.should_continue = False
                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.context_window_overflow_in_system_prompt.value)
                    raise

        except Exception as e:
            caught_exception = e
            # NOTE: message persistence does not happen in the case of an exception (rollback to previous state)
            # Use repr() if str() is empty (happens with Exception() with no args)
            error_detail = str(e) or repr(e)
            self.logger.warning(f"Error during step processing: {error_detail}")
            self.job_update_metadata = {"error": error_detail}

            # Stop the agent loop on any exception to prevent wasteful retry loops
            # (e.g., if post-step compaction fails, we don't want to keep retrying)
            self.should_continue = False
            self.logger.warning(
                f"Agent loop stopped due to exception (step_progression={step_progression.name}, "
                f"exception_type={type(e).__name__}): {error_detail}"
            )

            # This indicates we failed after we decided to stop stepping, which indicates a bug with our flow.
            if not self.stop_reason:
                self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
            elif self.stop_reason.stop_reason in (StopReasonType.end_turn, StopReasonType.max_steps, StopReasonType.tool_rule):
                self.logger.warning("Error occurred during step processing, with valid stop reason: %s", self.stop_reason.stop_reason)
            elif self.stop_reason.stop_reason not in (
                StopReasonType.no_tool_call,
                StopReasonType.invalid_tool_call,
                StopReasonType.invalid_llm_response,
                StopReasonType.llm_api_error,
                StopReasonType.context_window_overflow_in_system_prompt,
            ):
                self.logger.warning("Error occurred during step processing, with unexpected stop reason: %s", self.stop_reason.stop_reason)
            raise e
        finally:
            # always make sure we update the step/run metadata
            self.logger.debug("Running cleanup for agent loop run: %s", run_id)
            self.logger.info("Running final update. Step Progression: %s", step_progression)
            try:
                if step_progression == StepProgression.FINISHED:
                    if not self.should_continue:
                        if self.stop_reason is None:
                            self.stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
                        if logged_step and step_id:
                            await self.step_manager.update_step_stop_reason(self.actor, step_id, self.stop_reason.stop_reason)
                    if not self.stop_reason or self.stop_reason.stop_reason != StopReasonType.context_window_overflow_in_system_prompt:
                        # only return if the stop reason is not context window overflow in system prompt
                        return
                if step_progression < StepProgression.STEP_LOGGED:
                    # Error occurred before step was fully logged
                    import traceback

                    if logged_step:
                        await self.step_manager.update_step_error_async(
                            actor=self.actor,
                            step_id=step_id,  # Use original step_id for telemetry
                            error_type=type(caught_exception).__name__ if caught_exception is not None else "Unknown",
                            error_message=str(caught_exception) if caught_exception is not None else "Unknown error",
                            error_traceback=traceback.format_exc(),
                            stop_reason=self.stop_reason,
                        )
                elif step_progression <= StepProgression.LOGGED_TRACE:
                    if self.stop_reason is None:
                        self.logger.warning("Error in step after logging step")
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value)
                    if logged_step:
                        await self.step_manager.update_step_stop_reason(self.actor, step_id, self.stop_reason.stop_reason)
                else:
                    self.logger.warning("Invalid StepProgression value")

                # Do tracking for failure cases. Can consolidate with success conditions later.
                if settings.track_stop_reason:
                    await self._log_request(request_start_timestamp_ns, None, self.job_update_metadata, is_error=True, run_id=run_id)

                # Record partial step metrics on failure (capture whatever timing data we have)
                if logged_step and step_metrics and step_progression < StepProgression.FINISHED:
                    # Calculate total step time up to the failure point
                    step_metrics.step_ns = get_utc_timestamp_ns() - step_metrics.step_start_ns

                    await self._record_step_metrics(
                        step_id=step_id,
                        step_metrics=step_metrics,
                        run_id=run_id,
                    )
            except Exception as e:
                self.logger.warning(f"Error during post-completion step tracking: {e}")

    @trace_method
    async def _handle_ai_response(
        self,
        valid_tool_names: list[str],
        tool_rules_solver: ToolRulesSolver,
        usage: UsageStatistics,
        content: list[TextContent | ReasoningContent | RedactedReasoningContent | OmittedReasoningContent] | None = None,
        pre_computed_assistant_message_id: str | None = None,
        step_id: str | None = None,
        initial_messages: list[Message] | None = None,
        agent_step_span: Span | None = None,
        is_final_step: bool | None = None,
        run_id: str | None = None,
        step_metrics: StepMetrics = None,
        is_approval_response: bool | None = None,
        tool_calls: list[ToolCall] = [],
        tool_call_denials: list[ToolCallDenial] = [],
        tool_returns: list[ToolReturn] = [],
        finish_reason: str | None = None,
    ) -> tuple[list[Message], bool, LettaStopReason | None]:
        """
        Handle the final AI response once streaming completes, execute / validate tool calls,
        decide whether we should keep stepping, and persist state.

        Unified approach: treats single and multi-tool calls uniformly to reduce code duplication.
        """

        # 1. Handle no-tool cases (content-only or no-op)
        if not tool_calls and not tool_call_denials and not tool_returns:
            # Case 1a: No tool call, no content (LLM no-op)
            if content is None or len(content) == 0:
                # Check if there are required-before-exit tools that haven't been called
                uncalled = tool_rules_solver.get_uncalled_required_tools(available_tools=set([t.name for t in self.agent_state.tools]))
                if uncalled:
                    heartbeat_reason = (
                        f"{NON_USER_MSG_PREFIX}ToolRuleViolated: You must call {', '.join(uncalled)} at least once to exit the loop."
                    )
                    from letta.server.rest_api.utils import create_heartbeat_system_message

                    heartbeat_msg = create_heartbeat_system_message(
                        agent_id=self.agent_state.id,
                        model=self.agent_state.llm_config.model,
                        function_call_success=True,
                        timezone=self.agent_state.timezone,
                        heartbeat_reason=heartbeat_reason,
                        run_id=run_id,
                    )
                    messages_to_persist = (initial_messages or []) + [heartbeat_msg]
                    continue_stepping, stop_reason = True, None
                else:
                    # No required tools remaining, end turn without persisting no-op
                    continue_stepping = False
                    stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
                    messages_to_persist = initial_messages or []

            # Case 1b: No tool call but has content
            else:
                continue_stepping, heartbeat_reason, stop_reason = self._decide_continuation(
                    agent_state=self.agent_state,
                    tool_call_name=None,
                    tool_rule_violated=False,
                    tool_rules_solver=tool_rules_solver,
                    is_final_step=is_final_step,
                    finish_reason=finish_reason,
                )
                assistant_message = create_letta_messages_from_llm_response(
                    agent_id=self.agent_state.id,
                    model=self.agent_state.llm_config.model,
                    function_name=None,
                    function_arguments=None,
                    tool_execution_result=None,
                    tool_call_id=None,
                    function_response=None,
                    timezone=self.agent_state.timezone,
                    continue_stepping=continue_stepping,
                    heartbeat_reason=heartbeat_reason,
                    reasoning_content=content,
                    pre_computed_assistant_message_id=pre_computed_assistant_message_id,
                    step_id=step_id,
                    run_id=run_id,
                    is_approval_response=is_approval_response,
                    force_set_request_heartbeat=False,
                    add_heartbeat_on_continue=bool(heartbeat_reason),
                )
                messages_to_persist = (initial_messages or []) + assistant_message
            return messages_to_persist, continue_stepping, stop_reason

        # 2. Check whether tool call requires approval (includes client-side tools)
        if not is_approval_response:
            # Get names of client-side tools (these are executed by client, not server)
            client_tool_names = {ct.name for ct in self.client_tools} if self.client_tools else set()

            # Tools requiring approval: requires_approval tools OR client-side tools
            requested_tool_calls = [
                t
                for t in tool_calls
                if tool_rules_solver.is_requires_approval_tool(t.function.name) or t.function.name in client_tool_names
            ]
            allowed_tool_calls = [
                t
                for t in tool_calls
                if not tool_rules_solver.is_requires_approval_tool(t.function.name) and t.function.name not in client_tool_names
            ]
            if requested_tool_calls:
                approval_messages = create_approval_request_message_from_llm_response(
                    agent_id=self.agent_state.id,
                    model=self.agent_state.llm_config.model,
                    requested_tool_calls=requested_tool_calls,
                    allowed_tool_calls=allowed_tool_calls,
                    reasoning_content=content,
                    pre_computed_assistant_message_id=pre_computed_assistant_message_id,
                    step_id=step_id,
                    run_id=run_id,
                )
                messages_to_persist = (initial_messages or []) + approval_messages
                return messages_to_persist, False, LettaStopReason(stop_reason=StopReasonType.requires_approval.value)

        result_tool_returns = []

        # 3. Handle client side tool execution
        if tool_returns:
            # Clamp client-side tool returns before persisting (JSON-aware: truncate only the 'message' field)
            try:
                cap = self._compute_tool_return_truncation_chars()
            except Exception:
                cap = 5000

            for tr in tool_returns:
                try:
                    if tr.func_response and isinstance(tr.func_response, str):
                        parsed = json.loads(tr.func_response)
                        if isinstance(parsed, dict) and "message" in parsed and isinstance(parsed["message"], str):
                            msg = parsed["message"]
                            if len(msg) > cap:
                                original_len = len(msg)
                                parsed["message"] = msg[:cap] + f"... [truncated {original_len - cap} chars]"
                                tr.func_response = json.dumps(parsed)
                                self.logger.warning(f"Truncated client-side tool return message from {original_len} to {cap} chars")
                        else:
                            # Fallback to raw string truncation if not a dict with 'message'
                            if len(tr.func_response) > cap:
                                original_len = len(tr.func_response)
                                tr.func_response = tr.func_response[:cap] + f"... [truncated {original_len - cap} chars]"
                                self.logger.warning(f"Truncated client-side tool return (raw) from {original_len} to {cap} chars")
                except json.JSONDecodeError:
                    # Non-JSON or unexpected shape; truncate as raw string
                    if tr.func_response and len(tr.func_response) > cap:
                        original_len = len(tr.func_response)
                        tr.func_response = tr.func_response[:cap] + f"... [truncated {original_len - cap} chars]"
                        self.logger.warning(f"Truncated client-side tool return (non-JSON) from {original_len} to {cap} chars")
                except Exception as e:
                    # Unexpected error; log and skip truncation for this return
                    self.logger.warning(f"Failed to truncate client-side tool return: {e}")

            continue_stepping = True
            stop_reason = None
            result_tool_returns = tool_returns

        # 4. Handle denial cases
        if tool_call_denials:
            # Convert ToolCallDenial objects to ToolReturn objects using shared helper
            # Group denials by reason to potentially batch them, but for now process individually
            for tool_call_denial in tool_call_denials:
                denial_returns = create_tool_returns_for_denials(
                    tool_calls=[tool_call_denial],
                    denial_reason=tool_call_denial.reason,
                    timezone=self.agent_state.timezone,
                )
                result_tool_returns.extend(denial_returns)

        # 5. Unified tool execution path (works for both single and multiple tools)

        # 5. Unified tool execution path (works for both single and multiple tools)
        # Note: Parallel tool calling with tool rules is validated at agent create/update time.
        # At runtime, we trust that if tool_rules exist, parallel_tool_calls=false is enforced earlier.

        # 5a. Prepare execution specs for all tools
        exec_specs = []
        for tc in tool_calls:
            call_id = tc.id or f"call_{uuid.uuid4().hex[:8]}"
            name = tc.function.name
            args = _safe_load_tool_call_str(tc.function.arguments)
            args.pop(REQUEST_HEARTBEAT_PARAM, None)
            args.pop(INNER_THOUGHTS_KWARG, None)

            # Validate against allowed tools
            tool_rule_violated = name not in valid_tool_names and not is_approval_response

            # Handle prefilled args if present
            if not tool_rule_violated:
                prefill_args = tool_rules_solver.last_prefilled_args_by_tool.get(name)
                if prefill_args:
                    target_tool = next((t for t in self.agent_state.tools if t.name == name), None)
                    provenance = tool_rules_solver.last_prefilled_args_provenance.get(name)
                    try:
                        args = merge_and_validate_prefilled_args(
                            tool=target_tool,
                            llm_args=args,
                            prefilled_args=prefill_args,
                        )
                    except ValueError as ve:
                        # Invalid prefilled args - create error result
                        error_prefix = "Invalid prefilled tool arguments from tool rules"
                        prov_suffix = f" (source={provenance})" if provenance else ""
                        err_msg = f"{error_prefix}{prov_suffix}: {str(ve)}"

                        exec_specs.append(
                            {
                                "id": call_id,
                                "name": name,
                                "args": args,
                                "violated": False,
                                "error": err_msg,
                            }
                        )
                        continue

            exec_specs.append(
                {
                    "id": call_id,
                    "name": name,
                    "args": args,
                    "violated": tool_rule_violated,
                    "error": None,
                }
            )

        # 5c. Execute tools (sequentially for single, parallel for multiple)
        async def _run_one(spec: Dict[str, Any]):
            if spec.get("error"):
                return ToolExecutionResult(status="error", func_return=spec["error"]), 0
            if spec["violated"]:
                result = _build_rule_violation_result(spec["name"], valid_tool_names, tool_rules_solver)
                return result, 0
            t0 = get_utc_timestamp_ns()
            target_tool = next((x for x in self.agent_state.tools if x.name == spec["name"]), None)
            res = await self._execute_tool(
                target_tool=target_tool,
                tool_args=spec["args"],
                agent_state=self.agent_state,
                agent_step_span=agent_step_span,
                step_id=step_id,
            )
            dt = get_utc_timestamp_ns() - t0
            return res, dt

        if len(exec_specs) == 1:
            results = [await _run_one(exec_specs[0])]
        else:
            # separate tools by parallel execution capability
            parallel_items = []
            serial_items = []

            for idx, spec in enumerate(exec_specs):
                target_tool = next((x for x in self.agent_state.tools if x.name == spec["name"]), None)
                if target_tool and target_tool.enable_parallel_execution:
                    parallel_items.append((idx, spec))
                else:
                    serial_items.append((idx, spec))

            # execute all parallel tools concurrently and all serial tools sequentially
            results = [None] * len(exec_specs)

            parallel_results = await asyncio.gather(*[_run_one(spec) for _, spec in parallel_items]) if parallel_items else []
            for (idx, _), result in zip(parallel_items, parallel_results):
                results[idx] = result

            for idx, spec in serial_items:
                results[idx] = await _run_one(spec)

        # 5d. Update metrics with execution time
        if step_metrics is not None and results:
            step_metrics.tool_execution_ns = max(dt for _, dt in results)

        # 5e. Process results and compute function responses
        function_responses: list[Optional[str]] = []
        persisted_continue_flags: list[bool] = []
        persisted_stop_reasons: list[LettaStopReason | None] = []

        for idx, spec in enumerate(exec_specs):
            tool_execution_result, _ = results[idx]
            has_prefill_error = bool(spec.get("error"))

            # Validate and format function response
            truncate = spec["name"] not in {"conversation_search", "conversation_search_date", "archival_memory_search"}
            return_char_limit = next((t.return_char_limit for t in self.agent_state.tools if t.name == spec["name"]), None)
            function_response_string = validate_function_response(
                tool_execution_result.func_return,
                return_char_limit=return_char_limit,
                truncate=truncate,
            )
            function_responses.append(function_response_string)

            # Update last function response (for tool rules)
            self.last_function_response = package_function_response(
                was_success=tool_execution_result.success_flag,
                response_string=function_response_string,
                timezone=self.agent_state.timezone,
            )

            # Register successful tool call with solver
            if not spec["violated"] and not has_prefill_error:
                tool_rules_solver.register_tool_call(spec["name"])

            # Decide continuation for this tool
            if has_prefill_error:
                cont = False
                _hb_reason = None
                sr = LettaStopReason(stop_reason=StopReasonType.invalid_tool_call.value)
            else:
                cont, _hb_reason, sr = self._decide_continuation(
                    agent_state=self.agent_state,
                    tool_call_name=spec["name"],
                    tool_rule_violated=spec["violated"],
                    tool_rules_solver=tool_rules_solver,
                    is_final_step=(is_final_step and idx == len(exec_specs) - 1),
                    finish_reason=finish_reason,
                )
            persisted_continue_flags.append(cont)
            persisted_stop_reasons.append(sr)

        # 5f. Create messages using parallel message creation (works for both single and multi)
        tool_call_specs = [{"name": s["name"], "arguments": s["args"], "id": s["id"]} for s in exec_specs]
        tool_execution_results = [res for (res, _) in results]

        # Use the parallel message creation function for both single and multiple tools
        parallel_messages = create_parallel_tool_messages_from_llm_response(
            agent_id=self.agent_state.id,
            model=self.agent_state.llm_config.model,
            tool_call_specs=tool_call_specs,
            tool_execution_results=tool_execution_results,
            function_responses=function_responses,
            timezone=self.agent_state.timezone,
            run_id=run_id,
            step_id=step_id,
            reasoning_content=content,
            pre_computed_assistant_message_id=pre_computed_assistant_message_id,
            is_approval_response=is_approval_response,
            tool_returns=result_tool_returns,
        )

        messages_to_persist: list[Message] = (initial_messages or []) + parallel_messages

        # Set run_id and step_id on all messages before persisting
        for message in messages_to_persist:
            if message.run_id is None:
                message.run_id = run_id
            if message.step_id is None:
                message.step_id = step_id

        # 5g. Aggregate continuation decisions
        aggregate_continue = any(persisted_continue_flags) if persisted_continue_flags else False
        aggregate_continue = aggregate_continue or tool_call_denials or tool_returns

        # Determine aggregate stop reason
        aggregate_stop_reason = None
        for sr in persisted_stop_reasons:
            if sr is not None:
                aggregate_stop_reason = sr

        # For parallel tool calls, always continue to allow the agent to process/summarize results
        # unless a terminal tool was called or we hit max steps
        if len(exec_specs) > 1:
            has_terminal = any(sr and sr.stop_reason == StopReasonType.tool_rule.value for sr in persisted_stop_reasons)
            is_max_steps = any(sr and sr.stop_reason == StopReasonType.max_steps.value for sr in persisted_stop_reasons)

            if not has_terminal and not is_max_steps:
                # Force continuation for parallel tool execution
                aggregate_continue = True
                aggregate_stop_reason = None
        return messages_to_persist, aggregate_continue, aggregate_stop_reason

    @trace_method
    def _decide_continuation(
        self,
        agent_state: AgentState,
        tool_call_name: Optional[str],
        tool_rule_violated: bool,
        tool_rules_solver: ToolRulesSolver,
        is_final_step: bool | None,
        finish_reason: str | None = None,
    ) -> tuple[bool, str | None, LettaStopReason | None]:
        """
        In v3 loop, we apply the following rules:

        1. Did not call a tool? Loop ends

        2. Called a tool? Loop continues. This can be:
           2a. Called tool, tool executed successfully
           2b. Called tool, tool failed to execute
           2c. Called tool + tool rule violation (did not execute)

        """
        continue_stepping = True  # Default continue
        continuation_reason: str | None = None
        stop_reason: LettaStopReason | None = None

        if tool_call_name is None:
            # No tool call – if there are required-before-exit tools uncalled, keep stepping
            # and provide explicit feedback to the model; otherwise end the loop.
            uncalled = tool_rules_solver.get_uncalled_required_tools(available_tools=set([t.name for t in agent_state.tools]))
            if uncalled and not is_final_step:
                reason = f"{NON_USER_MSG_PREFIX}ToolRuleViolated: You must call {', '.join(uncalled)} at least once to exit the loop."
                return True, reason, None
            # No required tools remaining → end turn
            # Check if the LLM hit max_tokens (finish_reason == "length")
            if finish_reason == "length":
                return False, None, LettaStopReason(stop_reason=StopReasonType.max_tokens_exceeded.value)
            return False, None, LettaStopReason(stop_reason=StopReasonType.end_turn.value)
        else:
            if tool_rule_violated:
                continue_stepping = True
                continuation_reason = f"{NON_USER_MSG_PREFIX}Continuing: tool rule violation."
            else:
                tool_rules_solver.register_tool_call(tool_call_name)

                if tool_rules_solver.is_terminal_tool(tool_call_name):
                    stop_reason = LettaStopReason(stop_reason=StopReasonType.tool_rule.value)
                    continue_stepping = False

                elif tool_rules_solver.has_children_tools(tool_call_name):
                    continue_stepping = True
                    continuation_reason = f"{NON_USER_MSG_PREFIX}Continuing: child tool rule."

                elif tool_rules_solver.is_continue_tool(tool_call_name):
                    continue_stepping = True
                    continuation_reason = f"{NON_USER_MSG_PREFIX}Continuing: continue tool rule."

                # – hard stop overrides –
                if is_final_step:
                    continue_stepping = False
                    stop_reason = LettaStopReason(stop_reason=StopReasonType.max_steps.value)
                else:
                    uncalled = tool_rules_solver.get_uncalled_required_tools(available_tools=set([t.name for t in agent_state.tools]))
                    if uncalled:
                        continue_stepping = True
                        continuation_reason = (
                            f"{NON_USER_MSG_PREFIX}Continuing, user expects these tools: [{', '.join(uncalled)}] to be called still."
                        )

                        stop_reason = None  # reset – we’re still going

            return continue_stepping, continuation_reason, stop_reason

    @trace_method
    async def _get_valid_tools(self):
        tools = self.agent_state.tools
        valid_tool_names = self.tool_rules_solver.get_allowed_tool_names(
            available_tools=set([t.name for t in tools]),
            last_function_response=self.last_function_response,
            error_on_empty=False,  # Return empty list instead of raising error
        ) or list(set(t.name for t in tools))

        # Get client tool names to filter out server tools with same name (client tools override)
        client_tool_names = {ct.name for ct in self.client_tools} if self.client_tools else set()

        # Build allowed tools from server tools, excluding those overridden by client tools
        allowed_tools = [
            enable_strict_mode(t.json_schema, strict=self.agent_state.llm_config.strict)
            for t in tools
            if t.name in set(valid_tool_names) and t.name not in client_tool_names
        ]

        # Merge client-side tools (use flat format matching enable_strict_mode output)
        if self.client_tools:
            for ct in self.client_tools:
                client_tool_schema = {
                    "name": ct.name,
                    "description": ct.description,
                    "parameters": ct.parameters or {"type": "object", "properties": {}},
                }
                allowed_tools.append(client_tool_schema)

        terminal_tool_names = {rule.tool_name for rule in self.tool_rules_solver.terminal_tool_rules}
        allowed_tools = runtime_override_tool_json_schema(
            tool_list=allowed_tools,
            response_format=self.agent_state.response_format,
            request_heartbeat=False,  # NOTE: difference for v3 (don't add request heartbeat)
            terminal_tools=terminal_tool_names,
        )
        return allowed_tools

    @trace_method
    async def compact(
        self,
        messages,
        trigger_threshold: Optional[int] = None,
        compaction_settings: Optional["CompactionSettings"] = None,
        run_id: Optional[str] = None,
        step_id: Optional[str] = None,
        use_summary_role: bool = False,
        trigger: Optional[str] = None,
        context_tokens_before: Optional[int] = None,
        messages_count_before: Optional[int] = None,
    ) -> tuple[Message, list[Message], str]:
        """Compact the current in-context messages for this agent.

        Compaction uses a summarizer LLM configuration derived from
        ``compaction_settings.model`` when provided. This mirrors how agent
        creation derives defaults from provider-specific ModelSettings, but is
        localized to summarization.

        Args:
            use_summary_role: If True, the summary message will be created with
                role=summary instead of role=user. This enables first-class
                summary message handling in the database and API responses.
            trigger: What triggered the compaction (e.g., "context_window_exceeded", "post_step_context_check").
            context_tokens_before: Token count before compaction (for stats).
            messages_count_before: Message count before compaction (for stats).
        """
        # Determine compaction settings: passed-in > agent's > global defaults
        effective_compaction_settings = compaction_settings or self.agent_state.compaction_settings

        result = await compact_messages(
            actor=self.actor,
            agent_id=self.agent_state.id,
            agent_llm_config=self.agent_state.llm_config,
            messages=messages,
            timezone=self.agent_state.timezone,
            compaction_settings=effective_compaction_settings,
            agent_model_handle=self.agent_state.model,
            agent_tags=self.agent_state.tags,
            tools=self.agent_state.tools,
            trigger_threshold=trigger_threshold,
            run_id=run_id,
            step_id=step_id,
            use_summary_role=use_summary_role,
            trigger=trigger,
            context_tokens_before=context_tokens_before,
            messages_count_before=messages_count_before,
        )

        # Update the agent's context token estimate
        self.context_token_estimate = result.context_token_estimate

        return result.summary_message, result.compacted_messages, result.summary_text