From a4041879a432a9ab6f46d507e5705ef01c2d8fe7 Mon Sep 17 00:00:00 2001 From: Charles Packer Date: Tue, 23 Sep 2025 17:49:59 -0700 Subject: [PATCH] feat: add new agent loop (squash rebase of OSS PR) (#4815) * feat: squash rebase of OSS PR * fix: revert changes that weren't on manual rebase * fix: caught another one * fix: disable force * chore: drop print * fix: just stage-api && just publish-api * fix: make agent_type consistently an arg in the client * fix: patch multi-modal support * chore: put in todo stub * fix: disable hardcoding for tests * fix: patch validate agent sync (#4882) patch validate agent sync * fix: strip bad merge diff * fix: revert unrelated diff * fix: react_v2 naming -> letta_v1 naming * fix: strip bad merge --------- Co-authored-by: Kevin Lin --- fern/openapi.json | 65 +- letta/adapters/letta_llm_request_adapter.py | 78 ++ letta/adapters/letta_llm_stream_adapter.py | 177 +++- letta/agents/agent_loop.py | 6 + letta/agents/ephemeral_summary_agent.py | 2 +- letta/agents/letta_agent.py | 1 + letta/agents/letta_agent_v2.py | 1 + letta/agents/letta_agent_v3.py | 817 ++++++++++++++++++ letta/helpers/converters.py | 3 + .../anthropic_streaming_interface.py | 357 ++++++++ .../interfaces/openai_streaming_interface.py | 671 +++++++++++++- letta/llm_api/anthropic_client.py | 65 +- letta/llm_api/bedrock_client.py | 5 +- letta/llm_api/deepseek_client.py | 4 +- letta/llm_api/google_vertex_client.py | 59 +- letta/llm_api/groq_client.py | 4 +- letta/llm_api/llm_client_base.py | 6 +- letta/llm_api/openai_client.py | 302 ++++++- letta/llm_api/xai_client.py | 4 +- letta/memory.py | 1 + letta/prompts/system_prompts/__init__.py | 2 + letta/prompts/system_prompts/letta_v1.py | 5 + letta/prompts/system_prompts/letta_v1.txt | 3 + letta/schemas/agent.py | 1 + letta/schemas/enums.py | 1 + letta/schemas/letta_message_content.py | 66 +- letta/schemas/memory.py | 2 +- letta/schemas/message.py | 440 +++++++--- letta/schemas/openai/responses_request.py | 64 ++ 
letta/schemas/providers/ollama.py | 150 +++- letta/server/rest_api/routers/v1/agents.py | 5 +- letta/server/rest_api/routers/v1/tools.py | 3 +- letta/server/rest_api/utils.py | 160 ++-- letta/server/server.py | 5 + letta/services/agent_manager.py | 9 + .../services/helpers/agent_manager_helper.py | 4 + letta/services/summarizer/summarizer.py | 4 +- 37 files changed, 3315 insertions(+), 237 deletions(-) create mode 100644 letta/agents/letta_agent_v3.py create mode 100644 letta/prompts/system_prompts/letta_v1.py create mode 100644 letta/prompts/system_prompts/letta_v1.txt create mode 100644 letta/schemas/openai/responses_request.py diff --git a/fern/openapi.json b/fern/openapi.json index 6e6f53ff..ec1df9a2 100644 --- a/fern/openapi.json +++ b/fern/openapi.json @@ -13259,6 +13259,7 @@ "agentType": { "type": "string", "enum": [ + "letta_v1_agent", "memgpt_agent", "memgpt_v2_agent", "react_agent", @@ -15606,6 +15607,7 @@ "enum": [ "memgpt_agent", "memgpt_v2_agent", + "letta_v1_agent", "react_agent", "workflow_agent", "split_thread_agent", @@ -23361,6 +23363,9 @@ }, { "$ref": "#/components/schemas/OmittedReasoningContent" + }, + { + "$ref": "#/components/schemas/SummarizedReasoningContent" } ], "discriminator": { @@ -23370,6 +23375,7 @@ "omitted_reasoning": "#/components/schemas/OmittedReasoningContent", "reasoning": "#/components/schemas/ReasoningContent", "redacted_reasoning": "#/components/schemas/RedactedReasoningContent", + "summarized_reasoning": "#/components/schemas/SummarizedReasoningContent", "text": "#/components/schemas/TextContent", "tool_call": "#/components/schemas/ToolCallContent", "tool_return": "#/components/schemas/ToolReturnContent" @@ -23938,7 +23944,8 @@ } }, "type": "object", - "title": "OmittedReasoningContent" + "title": "OmittedReasoningContent", + "description": "A placeholder for reasoning content we know is present, but isn't returned by the provider (e.g. 
OpenAI GPT-5 on ChatCompletions)" }, "Organization": { "properties": { @@ -24850,7 +24857,8 @@ }, "type": "object", "required": ["is_native", "reasoning"], - "title": "ReasoningContent" + "title": "ReasoningContent", + "description": "Sent via the Anthropic Messages API" }, "ReasoningMessage": { "properties": { @@ -24991,7 +24999,8 @@ }, "type": "object", "required": ["data"], - "title": "RedactedReasoningContent" + "title": "RedactedReasoningContent", + "description": "Sent via the Anthropic Messages API" }, "RequiredBeforeExitToolRule": { "properties": { @@ -26748,6 +26757,56 @@ "title": "StreamableHTTPServerConfig", "description": "Configuration for an MCP server using Streamable HTTP\n\nAuthentication can be provided in multiple ways:\n1. Using auth_header + auth_token: Will add a specific header with the token\n Example: auth_header=\"Authorization\", auth_token=\"Bearer abc123\"\n\n2. Using the custom_headers dict: For more complex authentication scenarios\n Example: custom_headers={\"X-API-Key\": \"abc123\", \"X-Custom-Header\": \"value\"}" }, + "SummarizedReasoningContent": { + "properties": { + "type": { + "type": "string", + "const": "summarized_reasoning", + "title": "Type", + "description": "Indicates this is a summarized reasoning step.", + "default": "summarized_reasoning" + }, + "id": { + "type": "string", + "title": "Id", + "description": "The unique identifier for this reasoning step." + }, + "summary": { + "items": { + "$ref": "#/components/schemas/SummarizedReasoningContentPart" + }, + "type": "array", + "title": "Summary", + "description": "Summaries of the reasoning content." + }, + "encrypted_content": { + "type": "string", + "title": "Encrypted Content", + "description": "The encrypted reasoning content." 
+ } + }, + "type": "object", + "required": ["id", "summary"], + "title": "SummarizedReasoningContent", + "description": "The style of reasoning content returned by the OpenAI Responses API" + }, + "SummarizedReasoningContentPart": { + "properties": { + "index": { + "type": "integer", + "title": "Index", + "description": "The index of the summary part." + }, + "text": { + "type": "string", + "title": "Text", + "description": "The text of the summary part." + } + }, + "type": "object", + "required": ["index", "text"], + "title": "SummarizedReasoningContentPart" + }, "SupervisorManager": { "properties": { "manager_type": { diff --git a/letta/adapters/letta_llm_request_adapter.py b/letta/adapters/letta_llm_request_adapter.py index 705965aa..07519e8a 100644 --- a/letta/adapters/letta_llm_request_adapter.py +++ b/letta/adapters/letta_llm_request_adapter.py @@ -110,3 +110,81 @@ class LettaLLMRequestAdapter(LettaLLMAdapter): ), label="create_provider_trace", ) + + +class SimpleLettaLLMRequestAdapter(LettaLLMRequestAdapter): + """Simplifying assumptions: + + - No inner thoughts in kwargs + - No forced tool calls + - Content native as assistant message + """ + + async def invoke_llm( + self, + request_data: dict, + messages: list, + tools: list, + use_assistant_message: bool, + requires_approval_tools: list[str] = [], + step_id: str | None = None, + actor: str | None = None, + ) -> AsyncGenerator[LettaMessage | None, None]: + """ + Execute a blocking LLM request and yield the response. + + This adapter: + 1. Makes a blocking request to the LLM + 2. Converts the response to chat completion format + 3. Extracts reasoning and tool call information + 4. Updates all instance variables + 5. 
Yields nothing (blocking mode doesn't stream) + """ + # Store request data + self.request_data = request_data + + # Make the blocking LLM request + self.response_data = await self.llm_client.request_async(request_data, self.llm_config) + self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns() + + # Convert response to chat completion format + self.chat_completions_response = self.llm_client.convert_response_to_chat_completion(self.response_data, messages, self.llm_config) + + # Extract reasoning content from the response + if self.chat_completions_response.choices[0].message.reasoning_content: + self.reasoning_content = [ + ReasoningContent( + reasoning=self.chat_completions_response.choices[0].message.reasoning_content, + is_native=True, + signature=self.chat_completions_response.choices[0].message.reasoning_content_signature, + ) + ] + elif self.chat_completions_response.choices[0].message.omitted_reasoning_content: + self.reasoning_content = [OmittedReasoningContent()] + else: + # logger.info("No reasoning content found.") + self.reasoning_content = None + + if self.chat_completions_response.choices[0].message.content: + # NOTE: big difference - 'content' goes into 'content' + # Reasoning placed into content for legacy reasons + self.content = [TextContent(text=self.chat_completions_response.choices[0].message.content)] + else: + self.content = None + + # Extract tool call + if self.chat_completions_response.choices[0].message.tool_calls: + self.tool_call = self.chat_completions_response.choices[0].message.tool_calls[0] + else: + self.tool_call = None + + # Extract usage statistics + self.usage.step_count = 1 + self.usage.completion_tokens = self.chat_completions_response.usage.completion_tokens + self.usage.prompt_tokens = self.chat_completions_response.usage.prompt_tokens + self.usage.total_tokens = self.chat_completions_response.usage.total_tokens + + self.log_provider_trace(step_id=step_id, actor=actor) + + yield None + return diff --git 
a/letta/adapters/letta_llm_stream_adapter.py b/letta/adapters/letta_llm_stream_adapter.py index 565cf455..84f337f9 100644 --- a/letta/adapters/letta_llm_stream_adapter.py +++ b/letta/adapters/letta_llm_stream_adapter.py @@ -1,13 +1,18 @@ import asyncio -from typing import AsyncGenerator +from typing import AsyncGenerator, List from letta.adapters.letta_llm_adapter import LettaLLMAdapter from letta.helpers.datetime_helpers import get_utc_timestamp_ns -from letta.interfaces.anthropic_streaming_interface import AnthropicStreamingInterface -from letta.interfaces.openai_streaming_interface import OpenAIStreamingInterface +from letta.interfaces.anthropic_streaming_interface import AnthropicStreamingInterface, SimpleAnthropicStreamingInterface +from letta.interfaces.openai_streaming_interface import ( + OpenAIStreamingInterface, + SimpleOpenAIResponsesStreamingInterface, + SimpleOpenAIStreamingInterface, +) from letta.llm_api.llm_client_base import LLMClientBase from letta.schemas.enums import ProviderType from letta.schemas.letta_message import LettaMessage +from letta.schemas.letta_message_content import SummarizedReasoningContent, TextContent from letta.schemas.llm_config import LLMConfig from letta.schemas.provider_trace import ProviderTraceCreate from letta.schemas.usage import LettaUsageStatistics @@ -60,6 +65,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter): requires_approval_tools=requires_approval_tools, ) elif self.llm_config.model_endpoint_type == ProviderType.openai: + # For non-v1 agents, always use Chat Completions streaming interface self.interface = OpenAIStreamingInterface( use_assistant_message=use_assistant_message, is_openai_proxy=self.llm_config.provider_name == "lmstudio_openai", @@ -168,3 +174,168 @@ class LettaLLMStreamAdapter(LettaLLMAdapter): ), label="create_provider_trace", ) + + +class SimpleLettaLLMStreamAdapter(LettaLLMStreamAdapter): + """ + Adapter for handling streaming LLM requests with immediate token yielding. 
+ + This adapter supports real-time streaming of tokens from the LLM, providing + minimal time-to-first-token (TTFT) latency. It uses specialized streaming + interfaces for different providers (OpenAI, Anthropic) to handle their + specific streaming formats. + """ + + async def invoke_llm( + self, + request_data: dict, + messages: list, + tools: list, + use_assistant_message: bool, # NOTE: not used + requires_approval_tools: list[str] = [], + step_id: str | None = None, + actor: User | None = None, + ) -> AsyncGenerator[LettaMessage, None]: + """ + Execute a streaming LLM request and yield tokens/chunks as they arrive. + + This adapter: + 1. Makes a streaming request to the LLM + 2. Yields chunks immediately for minimal TTFT + 3. Accumulates response data through the streaming interface + 4. Updates all instance variables after streaming completes + """ + # Store request data + self.request_data = request_data + + # Instantiate streaming interface + if self.llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]: + # NOTE: different + self.interface = SimpleAnthropicStreamingInterface( + requires_approval_tools=requires_approval_tools, + ) + elif self.llm_config.model_endpoint_type == ProviderType.openai: + # Decide interface based on payload shape + use_responses = "input" in request_data and "messages" not in request_data + # No support for Responses API proxy + is_proxy = self.llm_config.provider_name == "lmstudio_openai" + if use_responses and not is_proxy: + self.interface = SimpleOpenAIResponsesStreamingInterface( + is_openai_proxy=False, + messages=messages, + tools=tools, + requires_approval_tools=requires_approval_tools, + ) + else: + self.interface = SimpleOpenAIStreamingInterface( + is_openai_proxy=self.llm_config.provider_name == "lmstudio_openai", + messages=messages, + tools=tools, + requires_approval_tools=requires_approval_tools, + model=self.llm_config.model, + ) + else: + raise ValueError(f"Streaming not supported for 
provider {self.llm_config.model_endpoint_type}") + + # Extract optional parameters + # ttft_span = kwargs.get('ttft_span', None) + + # Start the streaming request + stream = await self.llm_client.stream_async(request_data, self.llm_config) + + # Process the stream and yield chunks immediately for TTFT + async for chunk in self.interface.process(stream): # TODO: add ttft span + # Yield each chunk immediately as it arrives + yield chunk + + # After streaming completes, extract the accumulated data + self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns() + + # Extract tool call from the interface + try: + self.tool_call = self.interface.get_tool_call_object() + except ValueError as e: + # No tool call, handle upstream + self.tool_call = None + + # Extract reasoning content from the interface + # TODO this should probably just be called "content"? + # self.reasoning_content = self.interface.get_reasoning_content() + + # Extract non-reasoning content (eg text) + self.content: List[TextContent | SummarizedReasoningContent] = self.interface.get_content() + + # Extract usage statistics + # Some providers don't provide usage in streaming, use fallback if needed + if hasattr(self.interface, "input_tokens") and hasattr(self.interface, "output_tokens"): + # Handle cases where tokens might not be set (e.g., LMStudio) + input_tokens = self.interface.input_tokens + output_tokens = self.interface.output_tokens + + # Fallback to estimated values if not provided + if not input_tokens and hasattr(self.interface, "fallback_input_tokens"): + input_tokens = self.interface.fallback_input_tokens + if not output_tokens and hasattr(self.interface, "fallback_output_tokens"): + output_tokens = self.interface.fallback_output_tokens + + self.usage = LettaUsageStatistics( + step_count=1, + completion_tokens=output_tokens or 0, + prompt_tokens=input_tokens or 0, + total_tokens=(input_tokens or 0) + (output_tokens or 0), + ) + else: + # Default usage statistics if not available + self.usage 
= LettaUsageStatistics(step_count=1, completion_tokens=0, prompt_tokens=0, total_tokens=0) + + # Store any additional data from the interface + self.message_id = self.interface.letta_message_id + + # Log request and response data + self.log_provider_trace(step_id=step_id, actor=actor) + + def log_provider_trace(self, step_id: str | None, actor: User | None) -> None: + """ + Log provider trace data for telemetry purposes in a fire-and-forget manner. + + Creates an async task to log the request/response data without blocking + the main execution flow. For streaming adapters, this includes the final + tool call and reasoning content collected during streaming. + + Args: + step_id: The step ID associated with this request for logging purposes + actor: The user associated with this request for logging purposes + """ + if step_id is None or actor is None or not settings.track_provider_trace: + return + + safe_create_task( + self.telemetry_manager.create_provider_trace_async( + actor=actor, + provider_trace_create=ProviderTraceCreate( + request_json=self.request_data, + response_json={ + "content": { + "tool_call": self.tool_call.model_dump_json() if self.tool_call else None, + # "reasoning": [content.model_dump_json() for content in self.reasoning_content], + # NOTE: different + # TODO potentially split this into both content and reasoning? 
+ "content": [content.model_dump_json() for content in self.content], + }, + "id": self.interface.message_id, + "model": self.interface.model, + "role": "assistant", + # "stop_reason": "", + # "stop_sequence": None, + "type": "message", + "usage": { + "input_tokens": self.usage.prompt_tokens, + "output_tokens": self.usage.completion_tokens, + }, + }, + step_id=step_id, # Use original step_id for telemetry + organization_id=actor.organization_id, + ), + ), + label="create_provider_trace", + ) diff --git a/letta/agents/agent_loop.py b/letta/agents/agent_loop.py index 507dc4d1..667819c5 100644 --- a/letta/agents/agent_loop.py +++ b/letta/agents/agent_loop.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from letta.agents.base_agent_v2 import BaseAgentV2 from letta.agents.letta_agent_v2 import LettaAgentV2 +from letta.agents.letta_agent_v3 import LettaAgentV3 from letta.groups.sleeptime_multi_agent_v3 import SleeptimeMultiAgentV3 from letta.schemas.agent import AgentState from letta.schemas.enums import AgentType @@ -17,6 +18,11 @@ class AgentLoop: def load(agent_state: AgentState, actor: "User") -> BaseAgentV2: if agent_state.enable_sleeptime and agent_state.agent_type != AgentType.voice_convo_agent: return SleeptimeMultiAgentV3(agent_state=agent_state, actor=actor, group=agent_state.multi_agent_group) + elif agent_state.agent_type == AgentType.letta_v1_agent: + return LettaAgentV3( + agent_state=agent_state, + actor=actor, + ) else: return LettaAgentV2( agent_state=agent_state, diff --git a/letta/agents/ephemeral_summary_agent.py b/letta/agents/ephemeral_summary_agent.py index 55d610c2..af56235c 100644 --- a/letta/agents/ephemeral_summary_agent.py +++ b/letta/agents/ephemeral_summary_agent.py @@ -84,7 +84,7 @@ class EphemeralSummaryAgent(BaseAgent): timezone=agent_state.timezone, ) - request_data = llm_client.build_request_data(messages, agent_state.llm_config, tools=[]) + request_data = llm_client.build_request_data(agent_state.agent_type, messages, 
agent_state.llm_config, tools=[]) response_data = await llm_client.request_async(request_data, agent_state.llm_config) response = llm_client.convert_response_to_chat_completion(response_data, messages, agent_state.llm_config) summary = response.choices[0].message.content.strip() diff --git a/letta/agents/letta_agent.py b/letta/agents/letta_agent.py index f5441eb0..228de580 100644 --- a/letta/agents/letta_agent.py +++ b/letta/agents/letta_agent.py @@ -1622,6 +1622,7 @@ class LettaAgent(BaseAgent): return ( llm_client.build_request_data( + agent_state.agent_type, in_context_messages, agent_state.llm_config, allowed_tools, diff --git a/letta/agents/letta_agent_v2.py b/letta/agents/letta_agent_v2.py index dc8a6680..fada233f 100644 --- a/letta/agents/letta_agent_v2.py +++ b/letta/agents/letta_agent_v2.py @@ -406,6 +406,7 @@ class LettaAgentV2(BaseAgentV2): for llm_request_attempt in range(summarizer_settings.max_summarizer_retries + 1): try: request_data = self.llm_client.build_request_data( + agent_type=self.agent_state.agent_type, messages=messages, llm_config=self.agent_state.llm_config, tools=valid_tools, diff --git a/letta/agents/letta_agent_v3.py b/letta/agents/letta_agent_v3.py new file mode 100644 index 00000000..0f0f61fb --- /dev/null +++ b/letta/agents/letta_agent_v3.py @@ -0,0 +1,817 @@ +import uuid +from typing import AsyncGenerator, Optional + +from opentelemetry.trace import Span + +from letta.adapters.letta_llm_adapter import LettaLLMAdapter +from letta.adapters.letta_llm_request_adapter import LettaLLMRequestAdapter, SimpleLettaLLMRequestAdapter +from letta.adapters.letta_llm_stream_adapter import SimpleLettaLLMStreamAdapter +from letta.agents.helpers import ( + _build_rule_violation_result, + _load_last_function_response, + _maybe_get_approval_messages, + _prepare_in_context_messages_no_persist_async, + _safe_load_tool_call_str, + generate_step_id, +) +from letta.agents.letta_agent_v2 import LettaAgentV2 +from letta.constants import DEFAULT_MAX_STEPS, 
NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM +from letta.errors import ContextWindowExceededError, LLMError +from letta.helpers import ToolRulesSolver +from letta.helpers.datetime_helpers import get_utc_timestamp_ns +from letta.helpers.tool_execution_helper import enable_strict_mode +from letta.local_llm.constants import INNER_THOUGHTS_KWARG +from letta.otel.tracing import trace_method +from letta.schemas.agent import AgentState +from letta.schemas.letta_message import LettaMessage, MessageType +from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, TextContent +from letta.schemas.letta_response import LettaResponse +from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType +from letta.schemas.message import Message, MessageCreate +from letta.schemas.openai.chat_completion_response import ToolCall, UsageStatistics +from letta.schemas.step import StepProgression +from letta.schemas.step_metrics import StepMetrics +from letta.schemas.tool_execution_result import ToolExecutionResult +from letta.server.rest_api.utils import create_approval_request_message_from_llm_response, create_letta_messages_from_llm_response +from letta.services.helpers.tool_parser_helper import runtime_override_tool_json_schema +from letta.settings import settings, summarizer_settings +from letta.system import package_function_response +from letta.utils import log_telemetry, validate_function_response + + +class LettaAgentV3(LettaAgentV2): + """ + Similar to V2, but stripped down / simplified, while also generalized: + * Supports non-tool returns + * No inner thoughts in kwargs + * No heartbeats (loops happen on tool calls) + + TODOs: + * Support tool rules + * Support Gemini / OpenAI client + """ + + @trace_method + async def step( + self, + input_messages: list[MessageCreate], + max_steps: int = DEFAULT_MAX_STEPS, + run_id: str | None = None, + use_assistant_message: bool = True, # NOTE: not used + 
include_return_message_types: list[MessageType] | None = None, + request_start_timestamp_ns: int | None = None, + ) -> LettaResponse: + """ + Execute the agent loop in blocking mode, returning all messages at once. + + Args: + input_messages: List of new messages to process + max_steps: Maximum number of agent steps to execute + run_id: Optional job/run ID for tracking + use_assistant_message: Whether to use assistant message format + include_return_message_types: Filter for which message types to return + request_start_timestamp_ns: Start time for tracking request duration + + Returns: + LettaResponse: Complete response with all messages and metadata + """ + self._initialize_state() + request_span = self._request_checkpoint_start(request_start_timestamp_ns=request_start_timestamp_ns) + + in_context_messages, input_messages_to_persist = await _prepare_in_context_messages_no_persist_async( + input_messages, self.agent_state, self.message_manager, self.actor + ) + in_context_messages = in_context_messages + input_messages_to_persist + response_letta_messages = [] + for i in range(max_steps): + response = self._step( + messages=in_context_messages + self.response_messages, + input_messages_to_persist=input_messages_to_persist, + # TODO need to support non-streaming adapter too + llm_adapter=LettaLLMRequestAdapter(llm_client=self.llm_client, llm_config=self.agent_state.llm_config), + run_id=run_id, + # use_assistant_message=use_assistant_message, + include_return_message_types=include_return_message_types, + request_start_timestamp_ns=request_start_timestamp_ns, + ) + + async for chunk in response: + response_letta_messages.append(chunk) + + if not self.should_continue: + break + + input_messages_to_persist = [] + + # Rebuild context window after stepping + if not self.agent_state.message_buffer_autoclear: + await self.summarize_conversation_history( + in_context_messages=in_context_messages, + new_letta_messages=self.response_messages, + 
total_tokens=self.usage.total_tokens, + force=False, + ) + + if self.stop_reason is None: + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value) + + result = LettaResponse(messages=response_letta_messages, stop_reason=self.stop_reason, usage=self.usage) + if run_id: + if self.job_update_metadata is None: + self.job_update_metadata = {} + self.job_update_metadata["result"] = result.model_dump(mode="json") + + await self._request_checkpoint_finish( + request_span=request_span, request_start_timestamp_ns=request_start_timestamp_ns, run_id=run_id + ) + return result + + @trace_method + async def stream( + self, + input_messages: list[MessageCreate], + max_steps: int = DEFAULT_MAX_STEPS, + stream_tokens: bool = False, + run_id: str | None = None, + use_assistant_message: bool = True, # NOTE: not used + include_return_message_types: list[MessageType] | None = None, + request_start_timestamp_ns: int | None = None, + ) -> AsyncGenerator[str, None]: + """ + Execute the agent loop in streaming mode, yielding chunks as they become available. + If stream_tokens is True, individual tokens are streamed as they arrive from the LLM, + providing the lowest latency experience, otherwise each complete step (reasoning + + tool call + tool return) is yielded as it completes. + + Args: + input_messages: List of new messages to process + max_steps: Maximum number of agent steps to execute + stream_tokens: Whether to stream back individual tokens. Not all llm + providers offer native token streaming functionality; in these cases, + this api streams back steps rather than individual tokens. 
+ run_id: Optional job/run ID for tracking + use_assistant_message: Whether to use assistant message format + include_return_message_types: Filter for which message types to return + request_start_timestamp_ns: Start time for tracking request duration + + Yields: + str: JSON-formatted SSE data chunks for each completed step + """ + self._initialize_state() + request_span = self._request_checkpoint_start(request_start_timestamp_ns=request_start_timestamp_ns) + first_chunk = True + + if stream_tokens: + llm_adapter = SimpleLettaLLMStreamAdapter( + llm_client=self.llm_client, + llm_config=self.agent_state.llm_config, + ) + else: + llm_adapter = SimpleLettaLLMRequestAdapter( + llm_client=self.llm_client, + llm_config=self.agent_state.llm_config, + ) + + try: + in_context_messages, input_messages_to_persist = await _prepare_in_context_messages_no_persist_async( + input_messages, self.agent_state, self.message_manager, self.actor + ) + in_context_messages = in_context_messages + input_messages_to_persist + for i in range(max_steps): + response = self._step( + messages=in_context_messages + self.response_messages, + input_messages_to_persist=input_messages_to_persist, + llm_adapter=llm_adapter, + run_id=run_id, + # use_assistant_message=use_assistant_message, + include_return_message_types=include_return_message_types, + request_start_timestamp_ns=request_start_timestamp_ns, + ) + async for chunk in response: + if first_chunk: + request_span = self._request_checkpoint_ttft(request_span, request_start_timestamp_ns) + yield f"data: {chunk.model_dump_json()}\n\n" + first_chunk = False + + if not self.should_continue: + break + + input_messages_to_persist = [] + + if not self.agent_state.message_buffer_autoclear: + await self.summarize_conversation_history( + in_context_messages=in_context_messages, + new_letta_messages=self.response_messages, + total_tokens=self.usage.total_tokens, + force=False, + ) + + except: + if self.stop_reason and not first_chunk: + yield f"data: 
{self.stop_reason.model_dump_json()}\n\n" + raise + + if run_id: + letta_messages = Message.to_letta_messages_from_list( + self.response_messages, + use_assistant_message=False, # NOTE: set to false + reverse=False, + # text_is_assistant_message=(self.agent_state.agent_type == AgentType.react_agent), + text_is_assistant_message=True, + ) + result = LettaResponse(messages=letta_messages, stop_reason=self.stop_reason, usage=self.usage) + if self.job_update_metadata is None: + self.job_update_metadata = {} + self.job_update_metadata["result"] = result.model_dump(mode="json") + + await self._request_checkpoint_finish( + request_span=request_span, request_start_timestamp_ns=request_start_timestamp_ns, run_id=run_id + ) + for finish_chunk in self.get_finish_chunks_for_stream(self.usage, self.stop_reason): + yield f"data: {finish_chunk}\n\n" + + @trace_method + async def _step( + self, + messages: list[Message], + llm_adapter: LettaLLMAdapter, + input_messages_to_persist: list[Message] | None = None, + run_id: str | None = None, + # use_assistant_message: bool = True, + include_return_message_types: list[MessageType] | None = None, + request_start_timestamp_ns: int | None = None, + remaining_turns: int = -1, + dry_run: bool = False, + ) -> AsyncGenerator[LettaMessage | dict, None]: + """ + Execute a single agent step (one LLM call and tool execution). + + This is the core execution method that all public methods (step, stream_steps, + stream_tokens) funnel through. It handles the complete flow of making an LLM + request, processing the response, executing tools, and persisting messages. 
+ + Args: + messages: Current in-context messages + llm_adapter: Adapter for LLM interaction (blocking or streaming) + input_messages_to_persist: New messages to persist after execution + run_id: Optional job/run ID for tracking + include_return_message_types: Filter for which message types to yield + request_start_timestamp_ns: Start time for tracking request duration + remaining_turns: Number of turns remaining (for max_steps enforcement) + dry_run: If true, only build and return the request without executing + + Yields: + LettaMessage or dict: Chunks for streaming mode, or request data for dry_run + """ + step_progression = StepProgression.START + # TODO(@caren): clean this up + tool_call, content, agent_step_span, first_chunk, step_id, logged_step, step_start_ns, step_metrics = ( + None, + None, + None, + None, + None, + None, + None, + None, + ) + try: + self.last_function_response = _load_last_function_response(messages) + valid_tools = await self._get_valid_tools() + approval_request, approval_response = _maybe_get_approval_messages(messages) + if approval_request and approval_response: + tool_call = approval_request.tool_calls[0] + content = approval_request.content + step_id = approval_request.step_id + step_metrics = await self.step_manager.get_step_metrics_async(step_id=step_id, actor=self.actor) + else: + # Check for job cancellation at the start of each step + if run_id and await self._check_run_cancellation(run_id): + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.cancelled.value) + self.logger.info(f"Agent execution cancelled for run {run_id}") + return + + step_id = generate_step_id() + step_progression, logged_step, step_metrics, agent_step_span = await self._step_checkpoint_start( + step_id=step_id, run_id=run_id + ) + + messages = await self._refresh_messages(messages) + force_tool_call = valid_tools[0]["name"] if len(valid_tools) == 1 else None + for llm_request_attempt in range(summarizer_settings.max_summarizer_retries + 1): + 
try: + request_data = self.llm_client.build_request_data( + agent_type=self.agent_state.agent_type, + messages=messages, + llm_config=self.agent_state.llm_config, + tools=valid_tools, + force_tool_call=force_tool_call, + ) + if dry_run: + yield request_data + return + + step_progression, step_metrics = self._step_checkpoint_llm_request_start(step_metrics, agent_step_span) + + invocation = llm_adapter.invoke_llm( + request_data=request_data, + messages=messages, + tools=valid_tools, + use_assistant_message=False, # NOTE: set to false + requires_approval_tools=self.tool_rules_solver.get_requires_approval_tools( + set([t["name"] for t in valid_tools]) + ), + step_id=step_id, + actor=self.actor, + ) + async for chunk in invocation: + if llm_adapter.supports_token_streaming(): + if include_return_message_types is None or chunk.message_type in include_return_message_types: + first_chunk = True + yield chunk + # If you've reached this point without an error, break out of retry loop + break + except ValueError as e: + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value) + raise e + except LLMError as e: + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value) + raise e + except Exception as e: + if isinstance(e, ContextWindowExceededError) and llm_request_attempt < summarizer_settings.max_summarizer_retries: + # Retry case + messages = await self.summarize_conversation_history( + in_context_messages=messages, + new_letta_messages=self.response_messages, + llm_config=self.agent_state.llm_config, + force=True, + ) + else: + raise e + + step_progression, step_metrics = self._step_checkpoint_llm_request_finish( + step_metrics, agent_step_span, llm_adapter.llm_request_finish_timestamp_ns + ) + + self._update_global_usage_stats(llm_adapter.usage) + + # Handle the AI response with the extracted data + # NOTE: in v3 loop, no tool call is OK + # if tool_call is None and llm_adapter.tool_call is None: + + 
persisted_messages, self.should_continue, self.stop_reason = await self._handle_ai_response( + tool_call=tool_call or llm_adapter.tool_call, + valid_tool_names=[tool["name"] for tool in valid_tools], + agent_state=self.agent_state, + tool_rules_solver=self.tool_rules_solver, + usage=UsageStatistics( + completion_tokens=self.usage.completion_tokens, + prompt_tokens=self.usage.prompt_tokens, + total_tokens=self.usage.total_tokens, + ), + # reasoning_content=reasoning_content or llm_adapter.reasoning_content, + content=content or llm_adapter.content, + pre_computed_assistant_message_id=llm_adapter.message_id, + step_id=step_id, + initial_messages=input_messages_to_persist, + agent_step_span=agent_step_span, + is_final_step=(remaining_turns == 0), + run_id=run_id, + step_metrics=step_metrics, + is_approval=approval_response.approve if approval_response is not None else False, + is_denial=(approval_response.approve == False) if approval_response is not None else False, + denial_reason=approval_response.denial_reason if approval_response is not None else None, + ) + + new_message_idx = len(input_messages_to_persist) if input_messages_to_persist else 0 + self.response_messages.extend(persisted_messages[new_message_idx:]) + + if llm_adapter.supports_token_streaming(): + # Stream the tool return if a tool was actually executed. + # In the normal streaming path, the tool call is surfaced via the streaming interface + # (llm_adapter.tool_call), so don't rely solely on the local `tool_call` variable. 
+ has_tool_return = any(m.role == "tool" for m in persisted_messages) + if persisted_messages[-1].role != "approval" and has_tool_return: + tool_return = [msg for msg in persisted_messages if msg.role == "tool"][-1].to_letta_messages()[0] + if include_return_message_types is None or tool_return.message_type in include_return_message_types: + yield tool_return + else: + filter_user_messages = [m for m in persisted_messages[new_message_idx:] if m.role != "user"] + letta_messages = Message.to_letta_messages_from_list( + filter_user_messages, + use_assistant_message=False, # NOTE: set to false + reverse=False, + # text_is_assistant_message=(self.agent_state.agent_type == AgentType.react_agent), + text_is_assistant_message=True, + ) + for message in letta_messages: + if include_return_message_types is None or message.message_type in include_return_message_types: + yield message + + # Persist approval responses immediately to prevent agent from getting into a bad state + if ( + len(input_messages_to_persist) == 1 + and input_messages_to_persist[0].role == "approval" + and persisted_messages[0].role == "approval" + and persisted_messages[1].role == "tool" + ): + self.agent_state.message_ids = self.agent_state.message_ids + [m.id for m in persisted_messages[:2]] + await self.agent_manager.update_message_ids_async( + agent_id=self.agent_state.id, message_ids=self.agent_state.message_ids, actor=self.actor + ) + step_progression, step_metrics = await self._step_checkpoint_finish(step_metrics, agent_step_span, logged_step) + except Exception as e: + import traceback + + self.logger.error(f"Error during step processing: {e}") + self.logger.error(f"Error traceback: {traceback.format_exc()}") + # self.logger.error(f"Error during step processing: {e}") + self.job_update_metadata = {"error": str(e)} + + # This indicates we failed after we decided to stop stepping, which indicates a bug with our flow. 
+ if not self.stop_reason: + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value) + elif self.stop_reason.stop_reason in (StopReasonType.end_turn, StopReasonType.max_steps, StopReasonType.tool_rule): + self.logger.error("Error occurred during step processing, with valid stop reason: %s", self.stop_reason.stop_reason) + elif self.stop_reason.stop_reason not in ( + StopReasonType.no_tool_call, + StopReasonType.invalid_tool_call, + StopReasonType.invalid_llm_response, + StopReasonType.llm_api_error, + ): + self.logger.error("Error occurred during step processing, with unexpected stop reason: %s", self.stop_reason.stop_reason) + raise e + finally: + self.logger.debug("Running cleanup for agent loop run: %s", run_id) + self.logger.info("Running final update. Step Progression: %s", step_progression) + try: + if step_progression == StepProgression.FINISHED: + if not self.should_continue: + if self.stop_reason is None: + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value) + if logged_step and step_id: + await self.step_manager.update_step_stop_reason(self.actor, step_id, self.stop_reason.stop_reason) + return + if step_progression < StepProgression.STEP_LOGGED: + # Error occurred before step was fully logged + import traceback + + if logged_step: + await self.step_manager.update_step_error_async( + actor=self.actor, + step_id=step_id, # Use original step_id for telemetry + error_type=type(e).__name__ if "e" in locals() else "Unknown", + error_message=str(e) if "e" in locals() else "Unknown error", + error_traceback=traceback.format_exc(), + stop_reason=self.stop_reason, + ) + if step_progression <= StepProgression.STREAM_RECEIVED: + if first_chunk and settings.track_errored_messages and input_messages_to_persist: + for message in input_messages_to_persist: + message.is_err = True + message.step_id = step_id + await self.message_manager.create_many_messages_async( + input_messages_to_persist, + actor=self.actor, + 
project_id=self.agent_state.project_id, + template_id=self.agent_state.template_id, + ) + elif step_progression <= StepProgression.LOGGED_TRACE: + if self.stop_reason is None: + self.logger.error("Error in step after logging step") + self.stop_reason = LettaStopReason(stop_reason=StopReasonType.error.value) + if logged_step: + await self.step_manager.update_step_stop_reason(self.actor, step_id, self.stop_reason.stop_reason) + else: + self.logger.error("Invalid StepProgression value") + + # Do tracking for failure cases. Can consolidate with success conditions later. + if settings.track_stop_reason: + await self._log_request(request_start_timestamp_ns, None, self.job_update_metadata, is_error=True, run_id=run_id) + + # Record partial step metrics on failure (capture whatever timing data we have) + if logged_step and step_metrics and step_progression < StepProgression.FINISHED: + # Calculate total step time up to the failure point + step_metrics.step_ns = get_utc_timestamp_ns() - step_metrics.step_start_ns + + await self._record_step_metrics( + step_id=step_id, + step_metrics=step_metrics, + run_id=run_id, + ) + except Exception as e: + self.logger.error(f"Error during post-completion step tracking: {e}") + + @trace_method + async def _handle_ai_response( + self, + tool_call: Optional[ToolCall], # NOTE: should only be None for react agents + valid_tool_names: list[str], + agent_state: AgentState, + tool_rules_solver: ToolRulesSolver, + usage: UsageStatistics, + # reasoning_content: list[TextContent | ReasoningContent | RedactedReasoningContent | OmittedReasoningContent] | None = None, + content: list[TextContent | ReasoningContent | RedactedReasoningContent | OmittedReasoningContent] | None = None, + pre_computed_assistant_message_id: str | None = None, + step_id: str | None = None, + initial_messages: list[Message] | None = None, + agent_step_span: Span | None = None, + is_final_step: bool | None = None, + run_id: str | None = None, + step_metrics: StepMetrics = 
None, + is_approval: bool | None = None, + is_denial: bool | None = None, + denial_reason: str | None = None, + ) -> tuple[list[Message], bool, LettaStopReason | None]: + """ + Handle the final AI response once streaming completes, execute / validate the + tool call, decide whether we should keep stepping, and persist state. + """ + if tool_call is None: + # NOTE: in v3 loop, no tool call is OK + tool_call_id = None + else: + tool_call_id: str = tool_call.id or f"call_{uuid.uuid4().hex[:8]}" + + if is_denial: + continue_stepping = True + stop_reason = None + tool_call_messages = create_letta_messages_from_llm_response( + agent_id=agent_state.id, + model=agent_state.llm_config.model, + function_name=tool_call.function.name, + function_arguments={}, + tool_execution_result=ToolExecutionResult(status="error"), + tool_call_id=tool_call_id, + function_call_success=False, + function_response=f"Error: request to call tool denied. User reason: {denial_reason}", + timezone=agent_state.timezone, + actor=self.actor, + continue_stepping=continue_stepping, + # NOTE: we may need to change this to not have a "heartbeat" prefix for v3? + heartbeat_reason=f"{NON_USER_MSG_PREFIX}Continuing: user denied request to call tool.", + reasoning_content=None, + pre_computed_assistant_message_id=None, + step_id=step_id, + is_approval_response=True, + force_set_request_heartbeat=False, + add_heartbeat_on_continue=False, + ) + messages_to_persist = (initial_messages or []) + tool_call_messages + persisted_messages = await self.message_manager.create_many_messages_async( + messages_to_persist, + actor=self.actor, + project_id=agent_state.project_id, + template_id=agent_state.template_id, + ) + return persisted_messages, continue_stepping, stop_reason + + # 0. If there's no tool call, we can early exit + if tool_call is None: + # TODO could just hardcode the line here instead of calling the function... 
+ continue_stepping, heartbeat_reason, stop_reason = self._decide_continuation( + # agent_state=agent_state, + # request_heartbeat=False, + tool_call_name=None, + tool_rule_violated=False, + tool_rules_solver=tool_rules_solver, + is_final_step=is_final_step, + ) + assistant_message = create_letta_messages_from_llm_response( + agent_id=agent_state.id, + model=agent_state.llm_config.model, + function_name=None, + function_arguments=None, + tool_execution_result=None, + tool_call_id=None, + function_call_success=None, + function_response=None, + timezone=agent_state.timezone, + actor=self.actor, + continue_stepping=continue_stepping, + heartbeat_reason=heartbeat_reason, + # NOTE: should probably rename this to `content`? + reasoning_content=content, + pre_computed_assistant_message_id=pre_computed_assistant_message_id, + step_id=step_id, + is_approval_response=is_approval or is_denial, + force_set_request_heartbeat=False, + add_heartbeat_on_continue=False, + ) + messages_to_persist = (initial_messages or []) + assistant_message + + else: + # 1. 
Parse and validate the tool-call envelope + tool_call_name: str = tool_call.function.name + + tool_args = _safe_load_tool_call_str(tool_call.function.arguments) + # NOTE: these are failsafes - for v3, we should eventually be able to remove these + # request_heartbeat: bool = _pop_heartbeat(tool_args) + tool_args.pop(REQUEST_HEARTBEAT_PARAM, None) + tool_args.pop(INNER_THOUGHTS_KWARG, None) + + log_telemetry( + self.logger, + "_handle_ai_response execute tool start", + tool_name=tool_call_name, + tool_args=tool_args, + tool_call_id=tool_call_id, + # request_heartbeat=request_heartbeat, + ) + + if not is_approval and tool_rules_solver.is_requires_approval_tool(tool_call_name): + approval_message = create_approval_request_message_from_llm_response( + agent_id=agent_state.id, + model=agent_state.llm_config.model, + function_name=tool_call_name, + function_arguments=tool_args, + tool_call_id=tool_call_id, + actor=self.actor, + # continue_stepping=request_heartbeat, + continue_stepping=True, + # reasoning_content=reasoning_content, + reasoning_content=content, + pre_computed_assistant_message_id=pre_computed_assistant_message_id, + step_id=step_id, + ) + messages_to_persist = (initial_messages or []) + [approval_message] + continue_stepping = False + stop_reason = LettaStopReason(stop_reason=StopReasonType.requires_approval.value) + else: + # 2. 
Execute the tool (or synthesize an error result if disallowed) + tool_rule_violated = tool_call_name not in valid_tool_names and not is_approval + if tool_rule_violated: + tool_execution_result = _build_rule_violation_result(tool_call_name, valid_tool_names, tool_rules_solver) + else: + # Track tool execution time + tool_start_time = get_utc_timestamp_ns() + tool_execution_result = await self._execute_tool( + tool_name=tool_call_name, + tool_args=tool_args, + agent_state=agent_state, + agent_step_span=agent_step_span, + step_id=step_id, + ) + tool_end_time = get_utc_timestamp_ns() + + # Store tool execution time in metrics + step_metrics.tool_execution_ns = tool_end_time - tool_start_time + + log_telemetry( + self.logger, + "_handle_ai_response execute tool finish", + tool_execution_result=tool_execution_result, + tool_call_id=tool_call_id, + ) + + # 3. Prepare the function-response payload + truncate = tool_call_name not in {"conversation_search", "conversation_search_date", "archival_memory_search"} + return_char_limit = next( + (t.return_char_limit for t in agent_state.tools if t.name == tool_call_name), + None, + ) + function_response_string = validate_function_response( + tool_execution_result.func_return, + return_char_limit=return_char_limit, + truncate=truncate, + ) + self.last_function_response = package_function_response( + was_success=tool_execution_result.success_flag, + response_string=function_response_string, + timezone=agent_state.timezone, + ) + + # 4. Decide whether to keep stepping (focal section simplified) + continue_stepping, heartbeat_reason, stop_reason = self._decide_continuation( + # agent_state=agent_state, + # request_heartbeat=request_heartbeat, + tool_call_name=tool_call_name, + tool_rule_violated=tool_rule_violated, + tool_rules_solver=tool_rules_solver, + is_final_step=is_final_step, + ) + + # 5. 
Create messages (step was already created at the beginning) + tool_call_messages = create_letta_messages_from_llm_response( + agent_id=agent_state.id, + model=agent_state.llm_config.model, + function_name=tool_call_name, + function_arguments=tool_args, + tool_execution_result=tool_execution_result, + tool_call_id=tool_call_id, + function_call_success=tool_execution_result.success_flag, + function_response=function_response_string, + timezone=agent_state.timezone, + actor=self.actor, + continue_stepping=continue_stepping, + # heartbeat_reason=heartbeat_reason, + heartbeat_reason=None, + # reasoning_content=reasoning_content, + reasoning_content=content, + pre_computed_assistant_message_id=pre_computed_assistant_message_id, + step_id=step_id, + is_approval_response=is_approval or is_denial, + force_set_request_heartbeat=False, + add_heartbeat_on_continue=False, + ) + messages_to_persist = (initial_messages or []) + tool_call_messages + + persisted_messages = await self.message_manager.create_many_messages_async( + messages_to_persist, actor=self.actor, project_id=agent_state.project_id, template_id=agent_state.template_id + ) + + if run_id: + await self.job_manager.add_messages_to_job_async( + job_id=run_id, + message_ids=[m.id for m in persisted_messages if m.role != "user"], + actor=self.actor, + ) + + return persisted_messages, continue_stepping, stop_reason + + @trace_method + def _decide_continuation( + self, + # agent_state: AgentState, + # request_heartbeat: bool, + tool_call_name: Optional[str], + tool_rule_violated: bool, + tool_rules_solver: ToolRulesSolver, + is_final_step: bool | None, + ) -> tuple[bool, str | None, LettaStopReason | None]: + """ + In v3 loop, we apply the following rules: + + 1. Did not call a tool? Loop ends + + 2. Called a tool? Loop continues. This can be: + 2a. Called tool, tool executed successfully + 2b. Called tool, tool failed to execute + 2c. 
Called tool + tool rule violation (did not execute) + + """ + continuation_reason: str | None = None + stop_reason: LettaStopReason | None = None + + if tool_call_name is None: + # No tool call? End loop + return False, None, LettaStopReason(stop_reason=StopReasonType.end_turn.value) + + else: + # If we have a tool call, we continue stepping + return True, None, None + + # TODO support tool rules + # I think we can just uncomment the bellow? + if tool_rule_violated: + continue_stepping = True + continuation_reason = f"{NON_USER_MSG_PREFIX}Continuing: tool rule violation." + else: + tool_rules_solver.register_tool_call(tool_call_name) + + if tool_rules_solver.is_terminal_tool(tool_call_name): + if continue_stepping: + stop_reason = LettaStopReason(stop_reason=StopReasonType.tool_rule.value) + continue_stepping = False + + elif tool_rules_solver.has_children_tools(tool_call_name): + continue_stepping = True + continuation_reason = f"{NON_USER_MSG_PREFIX}Continuing: child tool rule." + + elif tool_rules_solver.is_continue_tool(tool_call_name): + continue_stepping = True + continuation_reason = f"{NON_USER_MSG_PREFIX}Continuing: continue tool rule." + + # – hard stop overrides – + if is_final_step: + continue_stepping = False + stop_reason = LettaStopReason(stop_reason=StopReasonType.max_steps.value) + else: + uncalled = tool_rules_solver.get_uncalled_required_tools(available_tools=set([t.name for t in agent_state.tools])) + if not continue_stepping and uncalled: + continue_stepping = True + continuation_reason = ( + f"{NON_USER_MSG_PREFIX}Continuing, user expects these tools: [{', '.join(uncalled)}] to be called still." 
+ ) + + stop_reason = None # reset – we’re still going + + return continue_stepping, continuation_reason, stop_reason + + @trace_method + async def _get_valid_tools(self): + tools = self.agent_state.tools + valid_tool_names = self.tool_rules_solver.get_allowed_tool_names( + available_tools=set([t.name for t in tools]), + last_function_response=self.last_function_response, + error_on_empty=False, # Return empty list instead of raising error + ) or list(set(t.name for t in tools)) + allowed_tools = [enable_strict_mode(t.json_schema) for t in tools if t.name in set(valid_tool_names)] + terminal_tool_names = {rule.tool_name for rule in self.tool_rules_solver.terminal_tool_rules} + allowed_tools = runtime_override_tool_json_schema( + tool_list=allowed_tools, + response_format=self.agent_state.response_format, + request_heartbeat=False, # NOTE: difference for v3 (don't add request heartbeat) + terminal_tools=terminal_tool_names, + ) + return allowed_tools diff --git a/letta/helpers/converters.py b/letta/helpers/converters.py index 522ee031..6ded1937 100644 --- a/letta/helpers/converters.py +++ b/letta/helpers/converters.py @@ -16,6 +16,7 @@ from letta.schemas.letta_message_content import ( OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, + SummarizedReasoningContent, TextContent, ToolCallContent, ToolReturnContent, @@ -270,6 +271,8 @@ def deserialize_message_content(data: Optional[List[Dict]]) -> List[MessageConte content = RedactedReasoningContent(**item) elif content_type == MessageContentType.omitted_reasoning: content = OmittedReasoningContent(**item) + elif content_type == MessageContentType.summarized_reasoning: + content = SummarizedReasoningContent(**item) else: # Skip invalid content continue diff --git a/letta/interfaces/anthropic_streaming_interface.py b/letta/interfaces/anthropic_streaming_interface.py index 0c394287..1ddf7a02 100644 --- a/letta/interfaces/anthropic_streaming_interface.py +++ 
b/letta/interfaces/anthropic_streaming_interface.py @@ -23,6 +23,7 @@ from anthropic.types.beta import ( BetaThinkingDelta, BetaToolUseBlock, ) +from letta_client.types import assistant_message from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG from letta.local_llm.constants import INNER_THOUGHTS_KWARG @@ -522,3 +523,359 @@ class AnthropicStreamingInterface: self.tool_call_buffer = [] self.anthropic_mode = None + + +class SimpleAnthropicStreamingInterface: + """ + A simpler version of AnthropicStreamingInterface that doesn't handle send_message parsing on inner_thoughts_in_kwargs + """ + + def __init__( + self, + requires_approval_tools: list = [], + ): + self.json_parser: JSONParser = PydanticJSONParser() + + # Premake IDs for database writes + self.letta_message_id = Message.generate_id() + + self.anthropic_mode = None + self.message_id = None + self.accumulated_inner_thoughts = [] + self.tool_call_id = None + self.tool_call_name = None + self.accumulated_tool_call_args = "" + self.previous_parse = {} + + # usage trackers + self.input_tokens = 0 + self.output_tokens = 0 + self.model = None + + # reasoning object trackers + self.reasoning_messages = [] + + # assistant object trackers + self.assistant_messages: list[AssistantMessage] = [] + + # Buffer to hold tool call messages until inner thoughts are complete + self.tool_call_buffer = [] + self.inner_thoughts_complete = False + + # Buffer to handle partial XML tags across chunks + self.partial_tag_buffer = "" + + self.requires_approval_tools = requires_approval_tools + + def get_tool_call_object(self) -> Optional[ToolCall]: + """Useful for agent loop""" + if not self.tool_call_name: + return None + + # hack for tool rules + try: + tool_input = json.loads(self.accumulated_tool_call_args) + except json.JSONDecodeError as e: + # Attempt to use OptimisticJSONParser to handle incomplete/malformed JSON + try: + tool_input = self.json_parser.parse(self.accumulated_tool_call_args) + except: + 
logger.warning( + f"Failed to decode tool call arguments for tool_call_id={self.tool_call_id}, " + f"name={self.tool_call_name}. Raw input: {self.accumulated_tool_call_args!r}. Error: {e}" + ) + raise e + if "id" in tool_input and tool_input["id"].startswith("toolu_") and "function" in tool_input: + arguments = str(json.dumps(tool_input["function"]["arguments"], indent=2)) + else: + arguments = str(json.dumps(tool_input, indent=2)) + return ToolCall(id=self.tool_call_id, function=FunctionCall(arguments=arguments, name=self.tool_call_name)) + + def get_reasoning_content(self) -> list[TextContent | ReasoningContent | RedactedReasoningContent]: + def _process_group( + group: list[ReasoningMessage | HiddenReasoningMessage | AssistantMessage], + group_type: str, + ) -> TextContent | ReasoningContent | RedactedReasoningContent: + if group_type == "reasoning": + reasoning_text = "".join(chunk.reasoning for chunk in group).strip() + is_native = any(chunk.source == "reasoner_model" for chunk in group) + signature = next((chunk.signature for chunk in group if chunk.signature is not None), None) + if is_native: + return ReasoningContent(is_native=is_native, reasoning=reasoning_text, signature=signature) + else: + return TextContent(text=reasoning_text) + elif group_type == "redacted": + redacted_text = "".join(chunk.hidden_reasoning for chunk in group if chunk.hidden_reasoning is not None) + return RedactedReasoningContent(data=redacted_text) + elif group_type == "text": + concat = "" + for chunk in group: + if isinstance(chunk.content, list): + concat += "".join([c.text for c in chunk.content]) + else: + concat += chunk.content + return TextContent(text=concat) + else: + raise ValueError("Unexpected group type") + + merged = [] + current_group = [] + current_group_type = None # "reasoning" or "redacted" + + for msg in self.reasoning_messages: + # Determine the type of the current message + if isinstance(msg, HiddenReasoningMessage): + msg_type = "redacted" + elif 
isinstance(msg, ReasoningMessage): + msg_type = "reasoning" + elif isinstance(msg, AssistantMessage): + msg_type = "text" + else: + raise ValueError("Unexpected message type") + + # Initialize group type if not set + if current_group_type is None: + current_group_type = msg_type + + # If the type changes, process the current group + if msg_type != current_group_type: + merged.append(_process_group(current_group, current_group_type)) + current_group = [] + current_group_type = msg_type + + current_group.append(msg) + + # Process the final group, if any. + if current_group: + merged.append(_process_group(current_group, current_group_type)) + + return merged + + def get_content(self) -> list[TextContent | ReasoningContent | RedactedReasoningContent]: + return self.get_reasoning_content() + # concat = "" + # for msg in self.assistant_messages: + # if isinstance(msg.content, list): + # concat += "".join([c.text for c in msg.content]) + # else: + # concat += msg.content + # return [TextContent(text=concat)] + + async def process( + self, + stream: AsyncStream[BetaRawMessageStreamEvent], + ttft_span: Optional["Span"] = None, + ) -> AsyncGenerator[LettaMessage | LettaStopReason, None]: + prev_message_type = None + message_index = 0 + event = None + try: + async with stream: + async for event in stream: + try: + async for message in self._process_event(event, ttft_span, prev_message_type, message_index): + new_message_type = message.message_type + if new_message_type != prev_message_type: + if prev_message_type != None: + message_index += 1 + prev_message_type = new_message_type + # print(f"Yielding message: {message}") + yield message + except asyncio.CancelledError as e: + import traceback + + logger.info("Cancelled stream attempt but overriding %s: %s", e, traceback.format_exc()) + async for message in self._process_event(event, ttft_span, prev_message_type, message_index): + new_message_type = message.message_type + if new_message_type != prev_message_type: + if 
prev_message_type != None: + message_index += 1 + prev_message_type = new_message_type + yield message + + # Don't raise the exception here + continue + + except Exception as e: + import traceback + + logger.error("Error processing stream: %s\n%s", e, traceback.format_exc()) + if ttft_span: + ttft_span.add_event( + name="stop_reason", + attributes={"stop_reason": StopReasonType.error.value, "error": str(e), "stacktrace": traceback.format_exc()}, + ) + yield LettaStopReason(stop_reason=StopReasonType.error) + raise e + finally: + logger.info("AnthropicStreamingInterface: Stream processing complete.") + + async def _process_event( + self, + event: BetaRawMessageStreamEvent, + ttft_span: Optional["Span"] = None, + prev_message_type: Optional[str] = None, + message_index: int = 0, + ) -> AsyncGenerator[LettaMessage | LettaStopReason, None]: + """Process a single event from the Anthropic stream and yield any resulting messages. + + Args: + event: The event to process + + Yields: + Messages generated from processing this event + """ + if isinstance(event, BetaRawContentBlockStartEvent): + content = event.content_block + + if isinstance(content, BetaTextBlock): + self.anthropic_mode = EventMode.TEXT + # TODO: Can capture citations, etc. 
+ + elif isinstance(content, BetaToolUseBlock): + self.anthropic_mode = EventMode.TOOL_USE + self.tool_call_id = content.id + self.tool_call_name = content.name + + if prev_message_type and prev_message_type != "tool_call_message": + message_index += 1 + + if self.tool_call_name in self.requires_approval_tools: + tool_call_msg = ApprovalRequestMessage( + id=self.letta_message_id, + tool_call=ToolCallDelta(name=self.tool_call_name, tool_call_id=self.tool_call_id), + date=datetime.now(timezone.utc).isoformat(), + ) + else: + tool_call_msg = ToolCallMessage( + id=self.letta_message_id, + tool_call=ToolCallDelta(name=self.tool_call_name, tool_call_id=self.tool_call_id), + date=datetime.now(timezone.utc).isoformat(), + ) + prev_message_type = tool_call_msg.message_type + yield tool_call_msg + + elif isinstance(content, BetaThinkingBlock): + self.anthropic_mode = EventMode.THINKING + # TODO: Can capture signature, etc. + + elif isinstance(content, BetaRedactedThinkingBlock): + self.anthropic_mode = EventMode.REDACTED_THINKING + + if prev_message_type and prev_message_type != "hidden_reasoning_message": + message_index += 1 + + hidden_reasoning_message = HiddenReasoningMessage( + id=self.letta_message_id, + state="redacted", + hidden_reasoning=content.data, + date=datetime.now(timezone.utc).isoformat(), + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + + self.reasoning_messages.append(hidden_reasoning_message) + prev_message_type = hidden_reasoning_message.message_type + yield hidden_reasoning_message + + elif isinstance(event, BetaRawContentBlockDeltaEvent): + delta = event.delta + + if isinstance(delta, BetaTextDelta): + # Safety check + if not self.anthropic_mode == EventMode.TEXT: + raise RuntimeError(f"Streaming integrity failed - received BetaTextDelta object while not in TEXT EventMode: {delta}") + + if prev_message_type and prev_message_type != "assistant_message": + message_index += 1 + + assistant_msg = AssistantMessage( + 
id=self.letta_message_id, + # content=[TextContent(text=delta.text)], + content=delta.text, + date=datetime.now(timezone.utc).isoformat(), + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + # self.assistant_messages.append(assistant_msg) + self.reasoning_messages.append(assistant_msg) + prev_message_type = assistant_msg.message_type + yield assistant_msg + + elif isinstance(delta, BetaInputJSONDelta): + if not self.anthropic_mode == EventMode.TOOL_USE: + raise RuntimeError( + f"Streaming integrity failed - received BetaInputJSONDelta object while not in TOOL_USE EventMode: {delta}" + ) + + self.accumulated_tool_call_args += delta.partial_json + + if self.tool_call_name in self.requires_approval_tools: + tool_call_msg = ApprovalRequestMessage( + id=self.letta_message_id, + tool_call=ToolCallDelta(name=self.tool_call_name, tool_call_id=self.tool_call_id, arguments=delta.partial_json), + date=datetime.now(timezone.utc).isoformat(), + ) + else: + tool_call_msg = ToolCallMessage( + id=self.letta_message_id, + tool_call=ToolCallDelta(name=self.tool_call_name, tool_call_id=self.tool_call_id, arguments=delta.partial_json), + date=datetime.now(timezone.utc).isoformat(), + ) + + yield tool_call_msg + + elif isinstance(delta, BetaThinkingDelta): + # Safety check + if not self.anthropic_mode == EventMode.THINKING: + raise RuntimeError( + f"Streaming integrity failed - received BetaThinkingBlock object while not in THINKING EventMode: {delta}" + ) + + if prev_message_type and prev_message_type != "reasoning_message": + message_index += 1 + reasoning_message = ReasoningMessage( + id=self.letta_message_id, + source="reasoner_model", + reasoning=delta.thinking, + date=datetime.now(timezone.utc).isoformat(), + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + self.reasoning_messages.append(reasoning_message) + prev_message_type = reasoning_message.message_type + yield reasoning_message + + elif isinstance(delta, 
BetaSignatureDelta): + # Safety check + if not self.anthropic_mode == EventMode.THINKING: + raise RuntimeError( + f"Streaming integrity failed - received BetaSignatureDelta object while not in THINKING EventMode: {delta}" + ) + + if prev_message_type and prev_message_type != "reasoning_message": + message_index += 1 + reasoning_message = ReasoningMessage( + id=self.letta_message_id, + source="reasoner_model", + reasoning="", + date=datetime.now(timezone.utc).isoformat(), + signature=delta.signature, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + self.reasoning_messages.append(reasoning_message) + prev_message_type = reasoning_message.message_type + yield reasoning_message + + elif isinstance(event, BetaRawMessageStartEvent): + self.message_id = event.message.id + self.input_tokens += event.message.usage.input_tokens + self.output_tokens += event.message.usage.output_tokens + self.model = event.message.model + + elif isinstance(event, BetaRawMessageDeltaEvent): + self.output_tokens += event.usage.output_tokens + + elif isinstance(event, BetaRawMessageStopEvent): + # Don't do anything here! We don't want to stop the stream. 
+ pass + + elif isinstance(event, BetaRawContentBlockStopEvent): + self.anthropic_mode = None diff --git a/letta/interfaces/openai_streaming_interface.py b/letta/interfaces/openai_streaming_interface.py index 0ae0a4be..37d166e7 100644 --- a/letta/interfaces/openai_streaming_interface.py +++ b/letta/interfaces/openai_streaming_interface.py @@ -5,6 +5,28 @@ from typing import Optional from openai import AsyncStream from openai.types.chat.chat_completion_chunk import ChatCompletionChunk +from openai.types.responses import ( + ResponseCompletedEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseCreatedEvent, + ResponseFunctionCallArgumentsDeltaEvent, + ResponseFunctionCallArgumentsDoneEvent, + ResponseFunctionToolCall, + ResponseInProgressEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseOutputMessage, + ResponseOutputText, + ResponseReasoningItem, + ResponseReasoningSummaryPartAddedEvent, + ResponseReasoningSummaryPartDoneEvent, + ResponseReasoningSummaryTextDeltaEvent, + ResponseReasoningSummaryTextDoneEvent, + ResponseTextDeltaEvent, + ResponseTextDoneEvent, +) +from openai.types.responses.response_stream_event import ResponseStreamEvent from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG from letta.llm_api.openai_client import is_openai_reasoning_model @@ -19,7 +41,12 @@ from letta.schemas.letta_message import ( ToolCallDelta, ToolCallMessage, ) -from letta.schemas.letta_message_content import OmittedReasoningContent, TextContent +from letta.schemas.letta_message_content import ( + OmittedReasoningContent, + SummarizedReasoningContent, + SummarizedReasoningContentPart, + TextContent, +) from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType from letta.schemas.message import Message from letta.schemas.openai.chat_completion_response import FunctionCall, ToolCall @@ -427,3 +454,645 @@ class OpenAIStreamingInterface: prev_message_type = tool_call_msg.message_type 
yield tool_call_msg self.function_id_buffer = None + + +class SimpleOpenAIStreamingInterface: + """ + Encapsulates the logic for streaming responses from OpenAI. + This class handles parsing of partial tokens, pre-execution messages, + and detection of tool call events. + """ + + def __init__( + self, + is_openai_proxy: bool = False, + messages: Optional[list] = None, + tools: Optional[list] = None, + requires_approval_tools: list = [], + model: str = None, + ): + # Premake IDs for database writes + self.letta_message_id = Message.generate_id() + + self.message_id = None + self.model = model + + # Token counters (from OpenAI usage) + self.input_tokens = 0 + self.output_tokens = 0 + + # Fallback token counters (using tiktoken cl200k-base) + self.fallback_input_tokens = 0 + self.fallback_output_tokens = 0 + + # Store messages and tools for fallback counting + self.is_openai_proxy = is_openai_proxy + self.messages = messages or [] + self.tools = tools or [] + + # Buffers to hold accumulating tools + self.tool_call_name = "" + self.tool_call_args = "" + self.tool_call_id = "" + + self.content_messages = [] + self.emitted_hidden_reasoning = False # Track if we've emitted hidden reasoning message + + self.requires_approval_tools = requires_approval_tools + + def get_content(self) -> list[TextContent | OmittedReasoningContent]: + shown_omitted = False + concat_content = "" + merged_messages = [] + for msg in self.content_messages: + if isinstance(msg, HiddenReasoningMessage) and not shown_omitted: + merged_messages.append(OmittedReasoningContent()) + shown_omitted = True + elif isinstance(msg, AssistantMessage): + if isinstance(msg.content, list): + concat_content += "".join([c.text for c in msg.content]) + else: + concat_content += msg.content + merged_messages.append(TextContent(text=concat_content)) + return merged_messages + + def get_tool_call_object(self) -> ToolCall: + """Useful for agent loop""" + if not self.tool_call_name: + raise ValueError("No tool call name 
available") + if not self.tool_call_args: + raise ValueError("No tool call arguments available") + if not self.tool_call_id: + raise ValueError("No tool call ID available") + + return ToolCall( + id=self.tool_call_id, + function=FunctionCall(arguments=self.tool_call_args, name=self.tool_call_name), + ) + + async def process( + self, + stream: AsyncStream[ChatCompletionChunk], + ttft_span: Optional["Span"] = None, + ) -> AsyncGenerator[LettaMessage | LettaStopReason, None]: + """ + Iterates over the OpenAI stream, yielding SSE events. + It also collects tokens and detects if a tool call is triggered. + """ + # Fallback input token counting - this should only be required for non-OpenAI providers using the OpenAI client (e.g. LMStudio) + if self.is_openai_proxy: + if self.messages: + # Convert messages to dict format for token counting + message_dicts = [msg.to_openai_dict() if hasattr(msg, "to_openai_dict") else msg for msg in self.messages] + message_dicts = [m for m in message_dicts if m is not None] + self.fallback_input_tokens = num_tokens_from_messages(message_dicts) # fallback to gpt-4 cl100k-base + + if self.tools: + # Convert tools to dict format for token counting + tool_dicts = [tool["function"] if isinstance(tool, dict) and "function" in tool else tool for tool in self.tools] + self.fallback_input_tokens += num_tokens_from_functions(tool_dicts) + + prev_message_type = None + message_index = 0 + try: + async with stream: + # For reasoning models, emit a hidden reasoning message before the first chunk + if not self.emitted_hidden_reasoning and is_openai_reasoning_model(self.model): + self.emitted_hidden_reasoning = True + hidden_message = HiddenReasoningMessage( + id=self.letta_message_id, + date=datetime.now(timezone.utc), + state="omitted", + hidden_reasoning=None, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + self.content_messages.append(hidden_message) + prev_message_type = hidden_message.message_type + message_index 
+= 1 # Increment for the next message + yield hidden_message + + async for chunk in stream: + try: + async for message in self._process_chunk(chunk, ttft_span, prev_message_type, message_index): + new_message_type = message.message_type + if new_message_type != prev_message_type: + if prev_message_type != None: + message_index += 1 + prev_message_type = new_message_type + yield message + except asyncio.CancelledError as e: + import traceback + + logger.info("Cancelled stream attempt but overriding %s: %s", e, traceback.format_exc()) + async for message in self._process_chunk(chunk, ttft_span, prev_message_type, message_index): + new_message_type = message.message_type + if new_message_type != prev_message_type: + if prev_message_type != None: + message_index += 1 + prev_message_type = new_message_type + yield message + + # Don't raise the exception here + continue + + except Exception as e: + import traceback + + logger.error("Error processing stream: %s\n%s", e, traceback.format_exc()) + if ttft_span: + ttft_span.add_event( + name="stop_reason", + attributes={"stop_reason": StopReasonType.error.value, "error": str(e), "stacktrace": traceback.format_exc()}, + ) + yield LettaStopReason(stop_reason=StopReasonType.error) + raise e + finally: + logger.info("OpenAIStreamingInterface: Stream processing complete.") + + async def _process_chunk( + self, + chunk: ChatCompletionChunk, + ttft_span: Optional["Span"] = None, + prev_message_type: Optional[str] = None, + message_index: int = 0, + ) -> AsyncGenerator[LettaMessage | LettaStopReason, None]: + if not self.model or not self.message_id: + self.model = chunk.model + self.message_id = chunk.id + + # track usage + if chunk.usage: + self.input_tokens += chunk.usage.prompt_tokens + self.output_tokens += chunk.usage.completion_tokens + + if chunk.choices: + choice = chunk.choices[0] + message_delta = choice.delta + + if message_delta.content is not None and message_delta.content != "": + assistant_msg = AssistantMessage( + 
id=self.letta_message_id, + content=[TextContent(text=message_delta.content)], + date=datetime.now(timezone.utc).isoformat(), + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + self.content_messages.append(assistant_msg) + prev_message_type = assistant_msg.message_type + message_index += 1 # Increment for the next message + yield assistant_msg + + if message_delta.tool_calls is not None and len(message_delta.tool_calls) > 0: + tool_call = message_delta.tool_calls[0] + + # For OpenAI reasoning models, emit a hidden reasoning message before the first tool call + # if not self.emitted_hidden_reasoning and is_openai_reasoning_model(self.model): + # self.emitted_hidden_reasoning = True + # if prev_message_type and prev_message_type != "hidden_reasoning_message": + # message_index += 1 + # hidden_message = HiddenReasoningMessage( + # id=self.letta_message_id, + # date=datetime.now(timezone.utc), + # state="omitted", + # hidden_reasoning=None, + # otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + # ) + # self.content_messages.append(hidden_message) + # prev_message_type = hidden_message.message_type + # message_index += 1 # Increment for the next message + # yield hidden_message + + if not tool_call.function.name and not tool_call.function.arguments and not tool_call.id: + # No chunks to process, exit + return + + if tool_call.function.name: + self.tool_call_name += tool_call.function.name + if tool_call.function.arguments: + self.tool_call_args += tool_call.function.arguments + if tool_call.id: + self.tool_call_id += tool_call.id + + if self.requires_approval_tools: + tool_call_msg = ApprovalRequestMessage( + id=self.letta_message_id, + date=datetime.now(timezone.utc), + tool_call=ToolCallDelta( + name=tool_call.function.name, + arguments=tool_call.function.arguments, + tool_call_id=tool_call.id, + ), + # name=name, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + else: + 
tool_call_msg = ToolCallMessage( + id=self.letta_message_id, + date=datetime.now(timezone.utc), + tool_call=ToolCallDelta( + name=tool_call.function.name, + arguments=tool_call.function.arguments, + tool_call_id=tool_call.id, + ), + # name=name, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + ) + prev_message_type = tool_call_msg.message_type + message_index += 1 # Increment for the next message + yield tool_call_msg + + +class SimpleOpenAIResponsesStreamingInterface: + """ + Encapsulates the logic for streaming responses from OpenAI Responses API. + """ + + def __init__( + self, + is_openai_proxy: bool = False, + messages: Optional[list] = None, + tools: Optional[list] = None, + requires_approval_tools: list = [], + model: str = None, + ): + self.is_openai_proxy = is_openai_proxy + self.messages = messages + self.tools = tools + self.requires_approval_tools = requires_approval_tools + # We need to store the name for approvals + self.tool_call_name = None + # ID responses used + self.message_id = None + + # Premake IDs for database writes + self.letta_message_id = Message.generate_id() + self.model = model + self.final_response = None + + def get_content(self) -> list[TextContent | SummarizedReasoningContent]: + """This includes both SummarizedReasoningContent and TextContent""" + if self.final_response is None: + raise ValueError("No final response available") + + content = [] + for response in self.final_response.output: + if isinstance(response, ResponseReasoningItem): + # TODO consider cleaning up our representation to not require indexing + letta_summary = [SummarizedReasoningContentPart(index=i, text=part.text) for i, part in enumerate(response.summary)] + content.append( + SummarizedReasoningContent( + id=response.id, + summary=letta_summary, + encrypted_content=response.encrypted_content, + ) + ) + elif isinstance(response, ResponseOutputMessage): + if len(response.content) == 1: + content.append( + TextContent( + 
text=response.content[0].text, + ) + ) + else: + raise ValueError(f"Got {len(response.content)} content parts, expected 1") + + return content + + def get_tool_call_object(self) -> ToolCall: + """Useful for agent loop""" + if self.final_response is None: + raise ValueError("No final response available") + + tool_calls = [] + for response in self.final_response.output: + # TODO make sure this shouldn't be ResponseCustomToolCall? + if isinstance(response, ResponseFunctionToolCall): + tool_calls.append( + ToolCall( + id=response.call_id, + function=FunctionCall( + name=response.name, + arguments=response.arguments, + ), + ) + ) + + if len(tool_calls) == 0: + raise ValueError("No tool calls available") + if len(tool_calls) > 1: + raise ValueError(f"Got {len(tool_calls)} tool calls, expected 1") + + return tool_calls[0] + + async def process( + self, + stream: AsyncStream[ResponseStreamEvent], + ttft_span: Optional["Span"] = None, + ) -> AsyncGenerator[LettaMessage | LettaStopReason, None]: + """ + Iterates over the OpenAI stream, yielding SSE events. + It also collects tokens and detects if a tool call is triggered. + """ + # Fallback input token counting - this should only be required for non-OpenAI providers using the OpenAI client (e.g. 
LMStudio) + if self.is_openai_proxy: + raise NotImplementedError("OpenAI proxy is not supported for OpenAI Responses API") + + prev_message_type = None + message_index = 0 + try: + async with stream: + async for event in stream: + try: + async for message in self._process_event(event, ttft_span, prev_message_type, message_index): + new_message_type = message.message_type + if new_message_type != prev_message_type: + if prev_message_type != None: + message_index += 1 + prev_message_type = new_message_type + yield message + except asyncio.CancelledError as e: + import traceback + + logger.info("Cancelled stream attempt but overriding %s: %s", e, traceback.format_exc()) + async for message in self._process_event(event, ttft_span, prev_message_type, message_index): + new_message_type = message.message_type + if new_message_type != prev_message_type: + if prev_message_type != None: + message_index += 1 + prev_message_type = new_message_type + yield message + + # Don't raise the exception here + continue + + except Exception as e: + import traceback + + logger.error("Error processing stream: %s\n%s", e, traceback.format_exc()) + if ttft_span: + ttft_span.add_event( + name="stop_reason", + attributes={"stop_reason": StopReasonType.error.value, "error": str(e), "stacktrace": traceback.format_exc()}, + ) + yield LettaStopReason(stop_reason=StopReasonType.error) + raise e + finally: + logger.info("OpenAIStreamingInterface: Stream processing complete.") + + async def _process_event( + self, + event: ResponseStreamEvent, + ttft_span: Optional["Span"] = None, + prev_message_type: Optional[str] = None, + message_index: int = 0, + ) -> AsyncGenerator[LettaMessage | LettaStopReason, None]: + if isinstance(event, ResponseCreatedEvent): + # No-op, just had the input events + return + # or yield None? 
+ + elif isinstance(event, ResponseInProgressEvent): + # No-op, just an indicator that we've started + return + + elif isinstance(event, ResponseOutputItemAddedEvent): + new_event_item = event.item + + # New "item" was added, can be reasoning, tool call, or content + if isinstance(new_event_item, ResponseReasoningItem): + # Look for summary delta, or encrypted_content + summary = new_event_item.summary + content = new_event_item.content # NOTE: always none + encrypted_content = new_event_item.encrypted_content + # TODO change to summarize reasoning message, but we need to figure out the streaming indices of summary problem + concat_summary = "".join([s.text for s in summary]) + if concat_summary != "": + yield ReasoningMessage( + id=self.letta_message_id, + date=datetime.now(timezone.utc).isoformat(), + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + source="reasoner_model", + reasoning=concat_summary, + ) + else: + return + + elif isinstance(new_event_item, ResponseFunctionToolCall): + # Look for call_id, name, and possibly arguments (though likely always empty string) + call_id = new_event_item.call_id + name = new_event_item.name + arguments = new_event_item.arguments + # cache for approval if/elses + self.tool_call_name = name + if self.tool_call_name and self.tool_call_name in self.requires_approval_tools: + yield ApprovalRequestMessage( + id=self.letta_message_id, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + date=datetime.now(timezone.utc), + tool_call=ToolCallDelta( + name=name, + arguments=arguments if arguments != "" else None, + tool_call_id=call_id, + ), + ) + else: + yield ToolCallMessage( + id=self.letta_message_id, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + date=datetime.now(timezone.utc), + tool_call=ToolCallDelta( + name=name, + arguments=arguments if arguments != "" else None, + tool_call_id=call_id, + ), + ) + + elif isinstance(new_event_item, 
ResponseOutputMessage):
+                # Look for content (may be empty list []), or contain ResponseOutputText
+                if len(new_event_item.content) > 0:
+                    for content_item in new_event_item.content:
+                        if isinstance(content_item, ResponseOutputText):
+                            # Add this as an AssistantMessage part
+                            yield AssistantMessage(
+                                id=self.letta_message_id,
+                                otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
+                                date=datetime.now(timezone.utc),
+                                content=content_item.text,
+                            )
+                else:
+                    return
+
+            else:
+                # Other types we don't handle, ignore
+                return
+
+        # Reasoning summary is streaming in
+        # TODO / FIXME return a SummaryReasoning type
+        elif isinstance(event, ResponseReasoningSummaryPartAddedEvent):
+            # This means the part got added, but likely no content yet (likely empty string)
+            summary_index = event.summary_index
+            part = event.part
+
+            # If this is a follow-up summary part (summary_index is 0-based, so index >= 1), add leading newlines as a separator
+            if summary_index > 0:
+                summary_text = "\n\n" + part.text
+            else:
+                summary_text = part.text
+
+            yield ReasoningMessage(
+                id=self.letta_message_id,
+                date=datetime.now(timezone.utc).isoformat(),
+                otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
+                source="reasoner_model",
+                reasoning=summary_text,
+            )
+
+        # Reasoning summary streaming
+        elif isinstance(event, ResponseReasoningSummaryTextDeltaEvent):
+            # NOTE: the summary is a list with indices
+            summary_index = event.summary_index
+            delta = event.delta
+            if delta != "":
+                summary_index = event.summary_index
+                # Check if we need to instantiate a fresh new part
+                # NOTE: we can probably use the part added and part done events, but this is safer
+                # TODO / FIXME return a SummaryReasoning type
+                yield ReasoningMessage(
+                    id=self.letta_message_id,
+                    date=datetime.now(timezone.utc).isoformat(),
+                    otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
+                    source="reasoner_model",
+                    reasoning=delta,
+                )
+            else:
+                return
+
+        # Reasoning summary streaming
+        elif isinstance(event, 
ResponseReasoningSummaryTextDoneEvent): + # NOTE: is this inclusive of the deltas? + # If not, we should add it to the rolling + summary_index = event.summary_index + text = event.text + return + + # Reasoning summary streaming + elif isinstance(event, ResponseReasoningSummaryPartDoneEvent): + # NOTE: this one is definitely inclusive, so can skip + summary_index = event.summary_index + # text = event + return + + # Assistant message streaming + elif isinstance(event, ResponseContentPartAddedEvent): + part = event.part + if isinstance(part, ResponseOutputText): + # Append to running + return # TODO + else: + # TODO handle + return + + # Assistant message streaming + elif isinstance(event, ResponseTextDeltaEvent): + delta = event.delta + if delta != "": + # Append to running + yield AssistantMessage( + id=self.letta_message_id, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + date=datetime.now(timezone.utc), + content=delta, + ) + else: + return + + # Assistant message streaming + elif isinstance(event, ResponseTextDoneEvent): + # NOTE: inclusive, can skip + text = event.text + return + + # Assistant message done + elif isinstance(event, ResponseContentPartDoneEvent): + # NOTE: inclusive, can skip + part = event.part + return + + # Function calls + elif isinstance(event, ResponseFunctionCallArgumentsDeltaEvent): + # only includes delta on args + delta = event.delta + + if self.tool_call_name and self.tool_call_name in self.requires_approval_tools: + yield ApprovalRequestMessage( + id=self.letta_message_id, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + date=datetime.now(timezone.utc), + tool_call=ToolCallDelta( + name=None, + arguments=delta, + tool_call_id=None, + ), + ) + else: + yield ToolCallMessage( + id=self.letta_message_id, + otid=Message.generate_otid_from_id(self.letta_message_id, message_index), + date=datetime.now(timezone.utc), + tool_call=ToolCallDelta( + name=None, + arguments=delta, + 
tool_call_id=None, + ), + ) + + # Function calls + elif isinstance(event, ResponseFunctionCallArgumentsDoneEvent): + # NOTE: inclusive + full_args = event.arguments + return + + # Generic + elif isinstance(event, ResponseOutputItemDoneEvent): + # Inclusive, so skip + return + + # Generic finish + elif isinstance(event, ResponseCompletedEvent): + # NOTE we can "rebuild" the final state of the stream using the values in here, instead of relying on the accumulators + self.final_response = event.response + self.model = event.response.model + self.input_tokens = event.response.usage.input_tokens + self.output_tokens = event.response.usage.output_tokens + self.message_id = event.response.id + return + + else: + logger.debug(f"Unhandled event: {event}") + return + + +""" +ResponseCreatedEvent(response=Response(id='resp_0ad9f0876b2555790068c7b783d17c8192a1a12ecc0b83d381', created_at=1757919107.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-2025-08-07', object='response', output=[], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=Reasoning(effort='high', generate_summary=None, summary='detailed'), safety_identifier=None, service_tier='auto', status='in_progress', text=ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='medium'), top_logprobs=0, truncation='disabled', usage=None, user=None, store=True), sequence_number=0, type='response.created') +ResponseInProgressEvent(response=Response(id='resp_0ad9f0876b2555790068c7b783d17c8192a1a12ecc0b83d381', created_at=1757919107.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-2025-08-07', object='response', output=[], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, max_output_tokens=None, max_tool_calls=None, 
previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=Reasoning(effort='high', generate_summary=None, summary='detailed'), safety_identifier=None, service_tier='auto', status='in_progress', text=ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='medium'), top_logprobs=0, truncation='disabled', usage=None, user=None, store=True), sequence_number=1, type='response.in_progress') +ResponseOutputItemAddedEvent(item=ResponseReasoningItem(id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', summary=[], type='reasoning', content=None, encrypted_content='gAAAAABox7eEiOVncSJVTjHrczwPKD0bueuhRmgzj6sBTQPnyB5TTE4T3CCoxXALshB1mkOnz48dkd8OkkqFSjZ90OmFi1uVZ9LdJQxoibXj2qUetqhwO_Lm8tcy5Yi4DHrqqhMPbGnDOuJr38PyI_Jx5BDPzJlPbDeU6a99Eg531W7nfSVCzwihekQxlcV9X0xYAvSaigCgbu75sSkx4mopcYDeBTxTjYtpJIAH4C-ygv_MyEeqTJqGdGoQ1NjmF6QJECIXir6llkHlvUHhGeAH6bUabUw7SDBk7gJnMAwDUOZVfp0GyWHRVbDfLCrP7G5nkz98iaEl9LFOcTolsrqxYI_e7k2rIejhfvvSEwgvhCOidNjjKNr3Jujt2ALJ6kGgG3fyWu81cLMobRTL6H0iQ2uT8u9XqZ2eiwHwImexRytC1sSDPK9LaBih46J66HVBKQTeRqMA7m379U8o-qLESN6AiS0PoiJvBpT3F89qJSl3rG19NwzJpPC99Ni1Dzgbr6VPqVmYBqJ5pRt98P-zcW4G72xNr1BLWgCGlCiuuNOxvn2fxPmdHt6S4422oNYb8mNkKeL7p0-6QB9C6L4WPrXUmCOr2_9-dcd1YIplHNQd7BGcbrotZIOj_kTgOvkbQa72ihDV6lNFg8w0_WO2JqubjxP4Ss22-hhtODP6dtuhWjAX5vhIS1j0lFlCRjnQsdC6j7nWhq8ymoPVrmoTE9Ej-evsvTnKO1QVXDKPrKd0y-fMmuvMghHCmhqJ5IiYT1xPX6X83HEXwZs2YY5aHHZkKcbgScAhcv0d1Rv4dp18XHzHUkM=', status=None), output_index=0, sequence_number=2, type='response.output_item.added') +ResponseReasoningSummaryPartAddedEvent(item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, part=Part(text='', type='summary_text'), sequence_number=3, summary_index=0, type='response.reasoning_summary_part.added') +ResponseReasoningSummaryTextDeltaEvent(delta='**Analy', item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, sequence_number=4, summary_index=0, type='response.reasoning_summary_text.delta', obfuscation='JdVJEL6G1') 
+ResponseReasoningSummaryTextDeltaEvent(delta='zing', item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, sequence_number=5, summary_index=0, type='response.reasoning_summary_text.delta', obfuscation='3g4DefV5mIyG') +ResponseReasoningSummaryTextDeltaEvent(delta=' r', item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, sequence_number=6, summary_index=0, type='response.reasoning_summary_text.delta', obfuscation='dCErh1m4eFG18w') +ResponseReasoningSummaryTextDeltaEvent(delta=' things', item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, sequence_number=214, summary_index=1, type='response.reasoning_summary_text.delta', obfuscation='hPD6t2pv9') +ResponseReasoningSummaryTextDeltaEvent(delta='!', item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, sequence_number=215, summary_index=1, type='response.reasoning_summary_text.delta', obfuscation='g1Sjo96fgHE4LQa') +ResponseReasoningSummaryTextDoneEvent(item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, sequence_number=216, summary_index=1, text='**Clarifying letter counts**\n\nI realize this task is straightforward: I can provide both answers. If the user is counting uppercase R\'s, the answer would be 0. For a case-insensitive count, it\'s 3. It\'s good to give both for clarity. I should keep it brief; a concise response would be: "If you\'re asking about uppercase \'R\', there are 0. If counting \'r\' regardless of case, there are 3." This way, I cover all bases without overcomplicating things!', type='response.reasoning_summary_text.done') +ResponseReasoningSummaryPartDoneEvent(item_id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', output_index=0, part=Part(text='**Clarifying letter counts**\n\nI realize this task is straightforward: I can provide both answers. If the user is counting uppercase R\'s, the answer would be 0. For a case-insensitive count, it\'s 3. 
It\'s good to give both for clarity. I should keep it brief; a concise response would be: "If you\'re asking about uppercase \'R\', there are 0. If counting \'r\' regardless of case, there are 3." This way, I cover all bases without overcomplicating things!', type='summary_text'), sequence_number=217, summary_index=1, type='response.reasoning_summary_part.done') +ResponseOutputItemDoneEvent(item=ResponseReasoningItem(id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', summary=[Summary(text='**Analyzing riddle ambiguity**\n\nI’m thinking about a puzzle that mixes cases to mislead, but the answer should be \'3\'. There are gotcha riddles about counting letters, like asking how many R\'s are in "Strawberry." If it\'s capitalized, the answer differs. The typical trick is that there are no capital R\'s in "strawberry." Since the user asked about uppercase R\'s but quoted the lowercase version, it\'s confusing. I should clarify if they mean uppercase \'R\' or any \'r\'.', type='summary_text'), Summary(text='**Clarifying letter counts**\n\nI realize this task is straightforward: I can provide both answers. If the user is counting uppercase R\'s, the answer would be 0. For a case-insensitive count, it\'s 3. It\'s good to give both for clarity. I should keep it brief; a concise response would be: "If you\'re asking about uppercase \'R\', there are 0. If counting \'r\' regardless of case, there are 3." 
This way, I cover all bases without overcomplicating things!', type='summary_text')], type='reasoning', content=None, encrypted_content='gAAAAABox7eQs7K1F8qaKB_jhBgufrTLqEXk96f-M9YyeTQ7tvQO730WOtTZtmuQ7XLiAekqxt4yrNQEmgYZhS7qQx-5oq30NlfHezcgkYqmvBqFhGtJkg_Ea6eO9WVMYaXK6nbxXvyK-HS73GvF8AN6NE72rITUE0fdlT6_VeU_OLBSDtVJXUMqbr6V4MOllzXRklbIOJCemZWRax0tenrxaVBrR4IbGoXoFbz5q2Lt8-Xc4NtuUShrzv8AU8Lm46KGvZeX2bWtS0d7-x3in6HJNk4gAFmepYh-cNbk_Qd8UVMvARb2nBjK7jHTB6IP1fDbVYYMUvX6ox8q2jPdHA7ZFRF-YFDXUyX6lwvhLGhVodqyQ4IdmZv1sJ78mvLUpuEdJrHSapA83SN6oaqpoD5cO174UKxyZnrhwQCyxPQ__lS5ZaUgnsIfgtuF5_cKATDxJFBrVo-0SwPHJZdtiCD1CYaVgUKr6uBDtUk32WCDOSJbFK5ClYM1W41x7mBLUWwBJVJ4PZVz3Cc6lR6EMa4a3SAMtIRzMY3869ox6WwDUV8TAYpSMdsb_VW3aezj0hXhnGYUrfmrmtYJEmxy36kV9GsHoBSLwXwNYbjTnP-Pni_AqlCQgZWKTI9KzJ8Zi95l617XwDJ6PzaHt2D6OSX2pmiVPwMGjZDIR6o21fBw3ZwI9TGkJitwL5O9Xlc6PQfYnk-oAVt17OZet6tXQe8LA3wq-9BQXY-88OQRrIGsnFjuGKOmaEXXDmaT1u9lGwOfSdKtU-4X67iDmy5e--lKYZrbWEVy2aoMcwMh2gTsPl-nS_fLzPdOlIgXv4DKCFf_E93LjjdyVoSFctm928rqY_qayqvP5kGx4UjPGiIxFD7tI3lEGMMFA8P0h6nE6NnZgb7pgMtgsqF17SdBKAXFLF8JtuaZulzoBdJJ_2Skq0FO7X8xynq_hhIDdwK2QU9PEfaX7h0j-kGYwVuWs8C_zispG--pHveDqPE1j9GUVrRN9W72-qNHnXRMPEDan1jq4WFN4VknDVwbnK9HR_suKJOTKGZF0MJACtaL4_FyvGfqANLky3cfWeMLpYmXec2Buo-4x8XwlRASCyvK6KXnz7K-M0SuvtoEqTBw0Pa4PBO683OtssZ-ujqMgnzFy3tTKpAabGq-Tz3Dn5fxbYgZONpE6jdTEQxBhkkvplReda3GATlskQHrQtn5Q_tvYwOIQu3iFiP9uoTtfCVQ_Tm4CIGxcEDqWnVaP1fOe8LKHwCvPf7bm046YI3oL-2do70oBJEch0JRKiI3ijrqHzXpI6e-bam9inNnzKxq0HMornRJh37HMDtME0nbXvrNSTu7k7pldDJtQ7SVIoey_PnLirAL9WdfM0HTdsAVmHgXp8u6Ta3_aob-vdrYs19TnGAh6Hp5DqC47wrDeg4RqqSWTM5PLdj4kfdmzkBB90zLMTdR_7Xq7ox64NXfaOXkyLSdFNgz3vmMGyyI3RDeDfVN8tLWfmAWKnooXp866vmdkdWp2IGiq8VWFOe20oaugm8CtT54XLlL6Hh_nipMZy_4pLTVZSsSNd1lvUn-xPMu7WD3NMEdk5b61juYsa77CLHj71vzbPVfHhmOtxqQ_Iqeh4sgPhY0FKRhblvs6yIXy__Ab9MKMYz1Cba1qAr-m9_JGNR1PzUPb7CfS-gbwBwqGoNy5ig1ir-GccsA9hB0UPORaobOGkklDI6B-aEjf8DkzEGdzXQLpJkWwv4cjJAeU1oA9R0hNAwR_STZDvmjkos7j0opRUl-qOez4pBeoRcR8T6V3uqO6OD4j0WNMkLekAGi_BE9tt25v2ClWVSeBE7M9TjlG4uOJwp_IHJZRM3Vwun
Jt9L6ZXALck5sEdIG1EYiAgSophCMqfqUUS2cG7QkDOH-N_jGQisoKRqWJKouERgIHT9TK5ZDeL3WVQL3a-6-HH3y-Lv8UJC1-F_V15FZTgAK3SxUqeHHts6EvDKEqde9QxxTPWwhMOk6dBxNQ0jxfKn9pzNNXhasVIHnk5zn1wWJkm8P3B5sG6Oxwpsxu6ywbY4AOFjBRwHGnnO06CykNaB6uR3KxIlDo2pdidOChI1uZrqYAEDKhjGHcKUQOlgq83wz4dLciiioDYPHfexfSl91QQaQZWrAIN77AbT6e9wxXaZZNQ4Jwo9JpQNjRkoBu2_4tW317nzLj31ayK-5w07imhOBh3ziD8yx3MC7AxuIbsAWo_scZgq8h7OxwRBih9NyiYMePLTLPOPahjDQvl-4XFj4NVNNnXKsiLrxPwtxmMREZraJxcmrSzDFiYDnqkibHXQ3eYyykcjCY3kWRCszoAEYhI3a2qsfbyePgPlfynf3_8rCsb2qaiXmu93lLqrRRg0ktRXtBb3lJVlpVGezUD6Itc_BDZQJAfC0PJbf_AoLfxIVw9-Pj5p5ssxuyybJn0thiqR5CnzcK_TO4jA2PJkjdfK5zLZbyNSYp4NKUpaL1u0jxiuD_vJ30qt3hJugsTv8EvCLdtoNwuvBwjhqplPPZ8_TWCVsowYm3n9LEYWCK-EEk6D2H4_Z8gQYNWz0O735CSiAVpSZpChBRwkfOhlerp8o6k8NJmf7VEqVCE5_iwrKqllB0o8hNLPDSlzQ97EacKz6wsLBorlqTRvGvRrJqwQHwybQLkJlCinqZV9XF52kc0c9GqdKdF-aPxv5VNoPenEBDo6EpAnDyM-TRxzsWtQ71kzRQgLIi-tvO9fTA2MExrF4tv_m1CULjF2jIoeG8RZPC4zhHVd9lvyflhCVSLflF6GR2qzSQua2zqqMsfM4qYGdW83in2U5KDWc7yD7FVi_IM5F1_AKeUaPQ_9MbwCkUO8zdDSQ-eVxY051PGiKHNKTP982Legft29skJqqDZv57Oju9wtI9PmmoeozaBPv4-spuuczsMsVbl6aRLs8xQsPQoke-MUMuelF1kGIqJnMktKiN8AGB8CoU_XzBjGSV-8yJj7tCBYquF66tj5wyn5tsVWwHsi8sl-IRMrVsza1LY0mVx-6ljo97j3WME1LuCTTNF5GOZMHUfRUXgHW5aENuENS9LhsqymVK8sAeQVMVVijC1Gnq2I0ddKLwodsrzCReaqLKx4y3Q4NB0Rom76UzyODd3vzDxjUS9k-IvRbzyXYC0YO-WsngpJr8sKZ8eQqJuBSE3rjT6CEx6-Ldxf8ad-iT6rh-nJRMn27jtHaUgQdZoexMDS1yons8r-MfUYayTaAGeIiimpuCj1A-f3zpQgqqehRkxoEJmjcGLe0oRI5H-kXEk8_LZt45nCiD86HnSCBqRasFNV0lAhWy2UF2cuu0AQuixUDRRgJU5ilWuDTcnJAo-Y4T7wh06xUGuCa50mLAszVnldO-JFrYGYE5UsWTe7qSNOSNsLIJqVoR4WLJJp-FaDFpiir14v1llvh1OumR03aDCA4gOQzeFNzfkUIQNRq0sU1ReZcxLUnlNjHWFqSBfB53rerSV8mdauA91EweO3cOJ1iTUFnAST_QPB2da03hINiRWd8jSkkiUdha-t6iajgOA2w_YlP2cyZ2b-L-cVhBFx0r1VSHocASSSTK1vU1vrPwtXJdMHq6c_EcMSirybtLzpIM3WR-z1wbr2gYvPF2KR_DvsybXE3DsX4qKInsykvBLmg-0RYWcFivmBgAGcIgYLjuCaWbpi5wYi_hNbPBJw07WpxN4QOS9_CaOn0AQh0NnqgPg9DH_am9mpOutvWMKWOqMcNKaRACDCpQkGDhX8yfF6W4EihLKam0vmiYYYtnFQ19Xl59cXf8gVcbNOnElOuA3gK_4PMCYHL66tPUdhKreBlboULKLm0xgYMf3lRrPh803TG0x5
L0oYAGzXcGUZIs0AtX2wmkfYSSivFsqSThLY-q2VHtiNBigEZRIWr1lfNzLFYzNipiajvFAfB1EpDpfRkjnnoV5n656y11uFcyySyiskKxZqZryqfb3HPfn8VlK3baKLMk5a0i1CZp5LswGErlk2qgwaSYSWOcHmt6z1GfJOKzrGkHFTWMzzg', status=None), output_index=0, sequence_number=218, type='response.output_item.done') +ResponseOutputItemAddedEvent(item=ResponseOutputMessage(id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', content=[], role='assistant', status='in_progress', type='message'), output_index=1, sequence_number=219, type='response.output_item.added') +ResponseContentPartAddedEvent(content_index=0, item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', output_index=1, part=ResponseOutputText(annotations=[], text='', type='output_text', logprobs=[]), sequence_number=220, type='response.content_part.added') +ResponseTextDeltaEvent(content_index=0, delta='Upper', item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', logprobs=[], output_index=1, sequence_number=221, type='response.output_text.delta', obfuscation='a8XGRatycGS') +ResponseTextDeltaEvent(content_index=0, delta=' ', item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', logprobs=[], output_index=1, sequence_number=234, type='response.output_text.delta', obfuscation='Ljhu9qR46fiOkfr') +... +ResponseTextDeltaEvent(content_index=0, delta='3', item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', logprobs=[], output_index=1, sequence_number=235, type='response.output_text.delta', obfuscation='5auIEi4JmSFDF72') +ResponseTextDeltaEvent(content_index=0, delta='.', item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', logprobs=[], output_index=1, sequence_number=236, type='response.output_text.delta', obfuscation='I78DIGKqtD2P6H2') +ResponseTextDoneEvent(content_index=0, item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', logprobs=[], output_index=1, sequence_number=237, text='Uppercase R: 0. 
Counting r regardless of case: 3.', type='response.output_text.done') +ResponseContentPartDoneEvent(content_index=0, item_id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', output_index=1, part=ResponseOutputText(annotations=[], text='Uppercase R: 0. Counting r regardless of case: 3.', type='output_text', logprobs=[]), sequence_number=238, type='response.content_part.done') +ResponseOutputItemDoneEvent(item=ResponseOutputMessage(id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', content=[ResponseOutputText(annotations=[], text='Uppercase R: 0. Counting r regardless of case: 3.', type='output_text', logprobs=[])], role='assistant', status='completed', type='message'), output_index=1, sequence_number=239, type='response.output_item.done') +ResponseCompletedEvent(response=Response(id='resp_0ad9f0876b2555790068c7b783d17c8192a1a12ecc0b83d381', created_at=1757919107.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-2025-08-07', object='response', output=[ResponseReasoningItem(id='rs_0ad9f0876b2555790068c7b78439888192a40c50a09625bb26', summary=[Summary(text='**Analyzing riddle ambiguity**\n\nI’m thinking about a puzzle that mixes cases to mislead, but the answer should be \'3\'. There are gotcha riddles about counting letters, like asking how many R\'s are in "Strawberry." If it\'s capitalized, the answer differs. The typical trick is that there are no capital R\'s in "strawberry." Since the user asked about uppercase R\'s but quoted the lowercase version, it\'s confusing. I should clarify if they mean uppercase \'R\' or any \'r\'.', type='summary_text'), Summary(text='**Clarifying letter counts**\n\nI realize this task is straightforward: I can provide both answers. If the user is counting uppercase R\'s, the answer would be 0. For a case-insensitive count, it\'s 3. It\'s good to give both for clarity. I should keep it brief; a concise response would be: "If you\'re asking about uppercase \'R\', there are 0. 
If counting \'r\' regardless of case, there are 3." This way, I cover all bases without overcomplicating things!', type='summary_text')], type='reasoning', content=None, encrypted_content='gAAAAABox7eRIRNnSmrunATD6UBi-Hm77E5JggsaXTKrNH-6ZkwIcosPQPf4vVdjR3ywdcYr4pr2Od3C0ADYSUpyR35tyusZq8A8yR-EmpgA-7otyIGLk5zzZy3AqKv2zZElkvgcr8PEKpYpC8VS6AO4Qg3g_gvBD8eV8j2O_FtGTIQ5MKS_Q0_gf9BCJtkh-PgYjL-0bEXsmCfgPa37BogC4nYh42b5hc7vge3ZH_RmR3irxWontsGaUIkOxR8_oK3RGKvkLfR24QYd4U8BIiZk3G58cR1UDRmtvfHwM4E7W6mpog-dFe9D-V96q1OWBGsNObyHxJcoSNGLxHkxWRvGnq3aWts_Lh-srgJ50rIa19pnOzfXePfdNxdXy7dYXD0D1uiBibpX5nneKUr1C0QmQdwS_nW16pr1oNKZ2fVkZJDTn31rOR3WfvtY9gL7tKo_CMnJ8jT3YKhZxFHG9PhHEoA5OsE_QC-3To54meckPExJqrVJ-h3u_5S4lHK9xu8buzIv4WM92X91zeX98A3g_YkqqvoTUmkFyMoIr8PVxM6Cmg4JtooT9bL2FAVUo6MV2_tlX07hNNH-hWSgqZHMVdx3_cTDAfKW3cAbwaG16ApgK_VUUc7rIfygHxgxtW-YeZpbETlvdNrDIDhzhuPQqPB86DQFh9O262o3cvBHok7V0WVqq-KXH5mH-eio7MhZJ46Ri4qklU9Xn77Tw4zl1cw029FuDKwF0_KsFZ8Omayi5iWJoZFjzqhATR_qt2J3nr368skIHDQ1tSa1vUAJt4UM7A4Un9KG2syCydoAmVQYoRgc5niiWU9FFouzulKW_cfyLrJDlVN1EfaUx2xVzaJO-LhdimhDiP4CKk5DsvEuuhTDn9RkO19cz7eJdrt_wGthYRlcJ-5bSFsSG1UV4VlovcLjuqApc5Fsis9kRo0jkar53HM7rmI7t9uN3TcTCQWGpbDvi-OQblbdvNFZh8wy-BaC0SFtOwcVkhwR2CDCf-7FuB5HOJnzmSOtKDZoFrA9gspNZjXoV6LCKmKIGj_tRLaI9jsn9iZZ7Bdtv2SLw7blE53f4OesXbsC0evl9GzlJfIsiaO1I5pEGCT2sWitWyHrbQLJTWeUBi6SoeULpujVp_w25xJonbCD9HAV51bD6rmAI9LEj0bYOBJ1RmtESAqZpV2wj68i-tv5ejdQ-YXOXSuy4DwInYsALmGMRhFFf0tKhNLHMVdCOij0zo4fU24EhmfxMRZifapm4fDBe2bswE10_LJI2DhzLv_NwQfHMQ0qEDOZQss74qaggBnsr4N-OK6egO3RJYCFddDFUa9vwxYIBHjlqb2p7tX4YpugHQ0ZmDYpUAwRzUmcwYaLjs9lzskQzzpOCeKXmwksWWOax-aWkkw9ic17PTAqne84_LMSNnY4mPOYU4sQ0DxdfNX_2iGVrWSkP3XcLUut1OH6Mah-yWaioJbLoXIpxbngW-IAm3Uxafha94fOHSaMymRYG8ZIbKHvg6n3tud08gBfiiJON5CLCovoKkAeGC3-NQQC58341osMVKSRF6SEpsHMGd97lMdWTlkB3v29m-xf8nuCOqgk4Ig5gIodter_BWs2BEXLiw5ISDBvdl26FVUUoBOpexXFwf99wTroDPK85UYlH5W9m51FlSwfgm0Vg5N9nzivMGClDy_jvNDyI5UHnQjVuqnTAcK0nF6RJn-lnO6hT60qq9hRsqa84iUMqOmQZxXv1KbS0exoqfrqps6ILqifM93r87HVzrrCShWFB1A7hfJHoVqRq-PLsO2iv-V6v9S5_nF
wYGG8srrNUuNgzLvLB9J7hlN6fPL4f6vWIJ9sBVpPukR-Pr_I8q8hZicr3YVshIuB544w-srUH4OvRx5v5pz9Jfcm2hHZZjO2yaVDAWKQ_PQk0xj43b-pyrjpAznYdG1QMmFcTfjqVDal97EQeMjIAIjlah0BOqhzHtT0dBjYLyBXZwzO7ii7z-6-jQk5FIDX-RqrdHm8D41dTHx-W7LMvNwpW6ueir7HVoYdIZAP0qaSU-Nf3oJTK8wGhRsh4G3PBsrGbamsfK7c2-AYi_f6kcvXWE4G7ch6c5H6cVqrriil8AcjUZ422dIkFIJHfhbPeAIFy6zuDm9ZnZvgjyqI_mnnK0hlLzfSgJFV6QRAYdkmiviit4qIwEOobM6zYeYPdb09Y15MDLcCOM1KpCecaSJDZm4PrnvP3F0nUpYHvVygA1C-CPenmCjeC_AqWMJ_BVXQyIcVx31fxZCvBIkEskI9Wm6qfJkN8IYw00_X4PnpV-u9d6poChA2smfOsFaeHqoNE_RmPO_QTqHE4m6xBH2RueVnIt4QZ2NVOFyZUI4vBEOsNOXYQpw8tkzONR3FcRRsp2qWNXfmTVdrkVR_-oQpUSlkhQKKo9thNq6SpDezaMUpWjMpi1lgaIZUbSU0WUq3A2EtpWW1yJQjuA2rosQYh2zgILAEtyYgu0Qh5qqsKQB_7oyv3LOB5JHVYa94H1xqHwk9XVfOM6Eeszb1-FYZ3ibagpOiIzPPrGhZA1FIfdVDLk3ulDR7l3-NZZD48SkbkxlnJqbjksgtoM-0AAPVV7q4OSH9MBHK29yVJRahzoFei9toYhD2qN3Mo-HVbWPOo89wJ8LKnwTTF02RUcA4xstjuD5B4IEGF2fMprohnlYVpULejRkkga3Mt6wdjLHzJY4WHkSaGfrDChgMfRpAhtPYQ4sSf4FVFaeT6up-1pU3o-n56zibIwHDfmB_rXEXHLIpaUBEDo7-X8TXZ8SvS-isKwmExJxDtjUI_pglFcThIfigVOJvyemEQ11iLmcoIw6vj5Zge3xzxR7pJgiHbGXhbUpYIJyvrol7NIBZwW_AhgE0WJEjrq9ffdoE9OB311ZZbES2q-ghlfGKgyFrrZNgpY_mYCjd8yx5APWvBYoj-w1WxL42Q0bE3DSyBM9JOwb8t1SPNNduz01MVsbj6_zbya7KDW4pHGiU-4Dh-YU8q9ndeuIezb7km6vQn6zjOfLLPXSkIH99RgAn-eNMPdk5CZWXm16nqgpL1ZtxivXhPItlq2p5akhj64_nreXLe2bKscR7syMZ_9xRC1u9EdomxyuJx6HAB-Jo7_AatJcYeI0BNLiGjnflLnbqwP0jH9_6Q2ucC9oNoNNtiyzq-Wy7zW9Q9eDCL8zVfKVAwNkyvzKSra8EJ6u-ukskCAXmN09_WUXQC00H7foIKOhhn4LXT9LoVgFblMsVjm_bBzXQuEA11Bc3RAJHUyLlpH9K-vz1Zebn-1AUDSlEQENIkzW6TpnoumA1m728tvaF8byNOqKgfRdftIixRHmKYUPrgKXrJErEz6P_n2MJvOvvCVH03o_Dpoh19PY6Rcvv1t56SaUCzdEyTcsVP9JRNh26HckesWjb2IfJsDuGrjlX5V5FabPImAKVRGzwNW5lJLwB59OBGkS4xXxI_vwzFiwrP6Pb2DPVgw3-Epe017D0atbZVs6Oik-14Q9uLrxBz4X2EV0HK_nnkg3mndj2LDBEXtCFky6sIrWer4W3i4Ksrfe5oxGiV02tjNNzFSqHg_z9QX43kTbcBePuYDlMRJ2DwmBykJUXdLcT4j9FlQ9BwOSAKHNaE35j-YZkASDYKqRn5SL9zC71C2qyJVDQ-5cw9GRaFZfLDKO6ySv7yZb367UpQ1uUUzqsyivAYA8jqez7LV0Yxz_hq5mBKE-NdHf-EU9uHHg1zkB73pk1wFOqE5siD0fjr7IkU3R3OcTsNSXEMa63jfeiODcSEoKwcOB8gxG-3Xwh1ue
QO6sGvP7Z6sWBfPeWlmA662QytXV7njzFerjuXVRLbCfUg1v26xoPdh2jCKN_GZXroctfpV5LuOGfXd6xjgEpDq4CxNLFmNfVAZBKMQ-Fxk_szAtGpOB3lPcJTdy73VelN_L-adhUGmJmETqqK77CFTYze80l1c_lzWn6zNvS6T5HmLaNFdf5m-Rl_DSEijvJiqZrkY-Ff_R3FthqM4NZDrxwkkX99uXbkEqXjReJ', status=None), ResponseOutputMessage(id='msg_0ad9f0876b2555790068c7b790e5388192aa7d4d442882790a', content=[ResponseOutputText(annotations=[], text='Uppercase R: 0. Counting r regardless of case: 3.', type='output_text', logprobs=[])], role='assistant', status='completed', type='message')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=Reasoning(effort='high', generate_summary=None, summary='detailed'), safety_identifier=None, service_tier='default', status='completed', text=ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='medium'), top_logprobs=0, truncation='disabled', usage=ResponseUsage(input_tokens=19, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=598, output_tokens_details=OutputTokensDetails(reasoning_tokens=576), total_tokens=617), user=None, store=True), sequence_number=240, type='response.completed') +""" diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py index a6d274f3..724b41d5 100644 --- a/letta/llm_api/anthropic_client.py +++ b/letta/llm_api/anthropic_client.py @@ -10,7 +10,7 @@ from anthropic.types.beta.message_create_params import MessageCreateParamsNonStr from anthropic.types.beta.messages import BetaMessageBatch from anthropic.types.beta.messages.batch_create_params import Request -from letta.constants import FUNC_FAILED_HEARTBEAT_MESSAGE, REQ_HEARTBEAT_MESSAGE +from letta.constants import FUNC_FAILED_HEARTBEAT_MESSAGE, REQ_HEARTBEAT_MESSAGE, REQUEST_HEARTBEAT_PARAM from letta.errors import ( ContextWindowExceededError, ErrorCode, @@ -31,6 +31,7 @@ from 
letta.llm_api.llm_client_base import LLMClientBase from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION from letta.log import get_logger from letta.otel.tracing import trace_method +from letta.schemas.agent import AgentType from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.schemas.openai.chat_completion_request import Tool as OpenAITool @@ -114,6 +115,7 @@ class AnthropicClient(LLMClientBase): try: requests = { agent_id: self.build_request_data( + agent_type=agent_llm_config_mapping[agent_id].agent_type, messages=agent_messages_mapping[agent_id], llm_config=agent_llm_config_mapping[agent_id], tools=agent_tools_mapping[agent_id], @@ -175,6 +177,7 @@ class AnthropicClient(LLMClientBase): @trace_method def build_request_data( self, + agent_type: AgentType, # if react, use native content + strip heartbeats messages: List[PydanticMessage], llm_config: LLMConfig, tools: Optional[List[dict]] = None, @@ -222,8 +225,9 @@ class AnthropicClient(LLMClientBase): # Special case for summarization path tools_for_request = None tool_choice = None - elif self.is_reasoning_model(llm_config) and llm_config.enable_reasoner: + elif self.is_reasoning_model(llm_config) and llm_config.enable_reasoner or agent_type == AgentType.letta_v1_agent: # NOTE: reasoning models currently do not allow for `any` + # NOTE: react agents should always have auto on, since the precense/absense of tool calls controls chaining tool_choice = {"type": "auto", "disable_parallel_tool_use": True} tools_for_request = [OpenAITool(function=f) for f in tools] elif force_tool_call is not None: @@ -270,6 +274,9 @@ class AnthropicClient(LLMClientBase): messages=messages[1:], inner_thoughts_xml_tag=inner_thoughts_xml_tag, put_inner_thoughts_in_kwargs=bool(llm_config.put_inner_thoughts_in_kwargs), + # if react, use native content + strip heartbeats + native_content=agent_type == AgentType.letta_v1_agent, + 
strip_request_heartbeat=agent_type == AgentType.letta_v1_agent, ) # Ensure first message is user @@ -279,9 +286,19 @@ class AnthropicClient(LLMClientBase): # Handle alternating messages data["messages"] = merge_tool_results_into_user_messages(data["messages"]) - # Strip heartbeat pings if extended thinking - if llm_config.enable_reasoner: - data["messages"] = merge_heartbeats_into_tool_responses(data["messages"]) + if agent_type == AgentType.letta_v1_agent: + # Both drop heartbeats in the payload + data["messages"] = drop_heartbeats(data["messages"]) + # And drop heartbeats in the tools + for tool in data["tools"]: + tool["input_schema"]["properties"].pop(REQUEST_HEARTBEAT_PARAM, None) + if REQUEST_HEARTBEAT_PARAM in tool["input_schema"]["required"]: + tool["input_schema"]["required"].remove(REQUEST_HEARTBEAT_PARAM) + + else: + # Strip heartbeat pings if extended thinking + if llm_config.enable_reasoner: + data["messages"] = merge_heartbeats_into_tool_responses(data["messages"]) # Prefix fill # https://docs.anthropic.com/en/api/messages#body-messages @@ -716,6 +733,44 @@ def is_heartbeat(message: dict, is_ping: bool = False) -> bool: return False +def drop_heartbeats(messages: List[dict]): + cleaned_messages = [] + + # Loop through messages + # For messages with role 'user' and len(content) > 1, + # Check if content[0].type == 'tool_result' + # If so, iterate over content[1:] and while content.type == 'text' and is_heartbeat(content.text), + # merge into content[0].content + + for message in messages: + if "role" in message and "content" in message and message["role"] == "user": + content_parts = message["content"] + + if isinstance(content_parts, str): + if is_heartbeat({"role": "user", "content": content_parts}): + continue + elif isinstance(content_parts, list) and len(content_parts) == 1 and "text" in content_parts[0]: + if is_heartbeat({"role": "user", "content": content_parts[0]["text"]}): + continue # skip + else: + cleaned_parts = [] + # Drop all the parts 
+ for content_part in content_parts: + if "text" in content_part and is_heartbeat({"role": "user", "content": content_part["text"]}): + continue # skip + else: + cleaned_parts.append(content_part) + + if len(cleaned_parts) == 0: + continue + else: + message["content"] = cleaned_parts + + cleaned_messages.append(message) + + return cleaned_messages + + def merge_heartbeats_into_tool_responses(messages: List[dict]): """For extended thinking mode, we don't want anything other than tool responses in-between assistant actions diff --git a/letta/llm_api/bedrock_client.py b/letta/llm_api/bedrock_client.py index 0d26e0f5..4b0de30d 100644 --- a/letta/llm_api/bedrock_client.py +++ b/letta/llm_api/bedrock_client.py @@ -6,7 +6,7 @@ from aioboto3.session import Session from letta.llm_api.anthropic_client import AnthropicClient from letta.log import get_logger from letta.otel.tracing import trace_method -from letta.schemas.enums import ProviderCategory +from letta.schemas.enums import AgentType, ProviderCategory from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.services.provider_manager import ProviderManager @@ -65,12 +65,13 @@ class BedrockClient(AnthropicClient): @trace_method def build_request_data( self, + agent_type: AgentType, messages: List[PydanticMessage], llm_config: LLMConfig, tools: Optional[List[dict]] = None, force_tool_call: Optional[str] = None, ) -> dict: - data = super().build_request_data(messages, llm_config, tools, force_tool_call) + data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call) # remove disallowed fields if "tool_choice" in data: del data["tool_choice"]["disable_parallel_tool_use"] diff --git a/letta/llm_api/deepseek_client.py b/letta/llm_api/deepseek_client.py index a0037b1e..3dd2d0df 100644 --- a/letta/llm_api/deepseek_client.py +++ b/letta/llm_api/deepseek_client.py @@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_chunk import 
ChatCompletionChunk from letta.llm_api.openai_client import OpenAIClient from letta.otel.tracing import trace_method +from letta.schemas.enums import AgentType from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.schemas.openai.chat_completion_request import ( @@ -331,6 +332,7 @@ class DeepseekClient(OpenAIClient): @trace_method def build_request_data( self, + agent_type: AgentType, messages: List[PydanticMessage], llm_config: LLMConfig, tools: Optional[List[dict]] = None, @@ -339,7 +341,7 @@ class DeepseekClient(OpenAIClient): # Override put_inner_thoughts_in_kwargs to False for DeepSeek llm_config.put_inner_thoughts_in_kwargs = False - data = super().build_request_data(messages, llm_config, tools, force_tool_call) + data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call) def add_functions_to_system_message(system_message: ChatMessage): system_message.content += f" {''.join(json.dumps(f) for f in tools)} " diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py index 22ef8644..18be59b2 100644 --- a/letta/llm_api/google_vertex_client.py +++ b/letta/llm_api/google_vertex_client.py @@ -34,6 +34,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash from letta.local_llm.utils import count_tokens from letta.log import get_logger from letta.otel.tracing import trace_method +from letta.schemas.agent import AgentType from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.schemas.openai.chat_completion_request import Tool @@ -274,6 +275,7 @@ class GoogleVertexClient(LLMClientBase): @trace_method def build_request_data( self, + agent_type: AgentType, # if react, use native content + strip heartbeats messages: List[PydanticMessage], llm_config: LLMConfig, tools: List[dict], @@ -282,6 +284,9 @@ class GoogleVertexClient(LLMClientBase): """ Constructs a 
request object in the expected data format for this client. """ + # NOTE: forcing inner thoughts in kwargs off + if agent_type == AgentType.letta_v1_agent: + llm_config.put_inner_thoughts_in_kwargs = False if tools: tool_objs = [Tool(type="function", function=t) for t in tools] @@ -293,7 +298,11 @@ class GoogleVertexClient(LLMClientBase): tool_names = [] contents = self.add_dummy_model_messages( - PydanticMessage.to_google_dicts_from_list(messages), + PydanticMessage.to_google_dicts_from_list( + messages, + put_inner_thoughts_in_kwargs=False if agent_type == AgentType.letta_v1_agent else True, + native_content=True if agent_type == AgentType.letta_v1_agent else False, + ), ) request_data = { @@ -312,16 +321,42 @@ class GoogleVertexClient(LLMClientBase): request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0]) del request_data["config"]["tools"] elif tools: - tool_config = ToolConfig( - function_calling_config=FunctionCallingConfig( - # ANY mode forces the model to predict only function calls - mode=FunctionCallingConfigMode.ANY, - # Provide the list of tools (though empty should also work, it seems not to) - allowed_function_names=tool_names, + if agent_type == AgentType.letta_v1_agent: + # don't require tools + tool_call_mode = FunctionCallingConfigMode.AUTO + tool_config = ToolConfig( + function_calling_config=FunctionCallingConfig( + mode=tool_call_mode, + ) ) - ) + else: + # require tools + tool_call_mode = FunctionCallingConfigMode.ANY + tool_config = ToolConfig( + function_calling_config=FunctionCallingConfig( + mode=tool_call_mode, + # Provide the list of tools (though empty should also work, it seems not to) + allowed_function_names=tool_names, + ) + ) + request_data["config"]["tool_config"] = tool_config.model_dump() + # https://ai.google.dev/gemini-api/docs/thinking#set-budget + # 2.5 Pro + # - Default: dynamic thinking + # - Dynamic thinking that cannot be disabled + # - Range: -1 (for dynamic), or 128-32768 + # 2.5 
Flash + # - Default: dynamic thinking + # - Dynamic thinking that *can* be disabled + # - Range: -1, 0, or 0-24576 + # 2.5 Flash Lite + # - Default: no thinking + # - Dynamic thinking that *can* be disabled + # - Range: -1, 0, or 512-24576 + # TODO when using v3 agent loop, properly support the native thinking in Gemini + # Add thinking_config for flash # If enable_reasoner is False, set thinking_budget to 0 # Otherwise, use the value from max_reasoning_tokens @@ -410,8 +445,10 @@ class GoogleVertexClient(LLMClientBase): function_args = function_call.args assert isinstance(function_args, dict), function_args - # NOTE: this also involves stripping the inner monologue out of the function + # TODO this is kind of funky - really, we should be passing 'native_content' as a kwarg to fork behavior + inner_thoughts = response_message.text if llm_config.put_inner_thoughts_in_kwargs: + # NOTE: this also involves stripping the inner monologue out of the function from letta.local_llm.constants import INNER_THOUGHTS_KWARG_VERTEX assert INNER_THOUGHTS_KWARG_VERTEX in function_args, ( @@ -420,7 +457,9 @@ class GoogleVertexClient(LLMClientBase): inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG_VERTEX) assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}" else: - inner_thoughts = None + pass + # inner_thoughts = None + # inner_thoughts = response_message.text # Google AI API doesn't generate tool call IDs openai_response_message = Message( diff --git a/letta/llm_api/groq_client.py b/letta/llm_api/groq_client.py index 25d7aaaf..79544988 100644 --- a/letta/llm_api/groq_client.py +++ b/letta/llm_api/groq_client.py @@ -8,6 +8,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk from letta.llm_api.openai_client import OpenAIClient from letta.otel.tracing import trace_method from letta.schemas.embedding_config import EmbeddingConfig +from letta.schemas.enums import AgentType from letta.schemas.llm_config 
import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.settings import model_settings @@ -23,12 +24,13 @@ class GroqClient(OpenAIClient): @trace_method def build_request_data( self, + agent_type: AgentType, messages: List[PydanticMessage], llm_config: LLMConfig, tools: Optional[List[dict]] = None, force_tool_call: Optional[str] = None, ) -> dict: - data = super().build_request_data(messages, llm_config, tools, force_tool_call) + data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call) # Groq validation - these fields are not supported and will cause 400 errors # https://console.groq.com/docs/openai diff --git a/letta/llm_api/llm_client_base.py b/letta/llm_api/llm_client_base.py index ccbfc3b8..f0b2d0fe 100644 --- a/letta/llm_api/llm_client_base.py +++ b/letta/llm_api/llm_client_base.py @@ -9,7 +9,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk from letta.errors import LLMError from letta.otel.tracing import log_event, trace_method from letta.schemas.embedding_config import EmbeddingConfig -from letta.schemas.enums import ProviderCategory +from letta.schemas.enums import AgentType, ProviderCategory from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message from letta.schemas.openai.chat_completion_response import ChatCompletionResponse @@ -40,6 +40,7 @@ class LLMClientBase: @trace_method def send_llm_request( self, + agent_type: AgentType, messages: List[Message], llm_config: LLMConfig, tools: Optional[List[dict]] = None, # TODO: change to Tool object @@ -52,7 +53,7 @@ class LLMClientBase: If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over. Otherwise returns a ChatCompletionResponse. 
""" - request_data = self.build_request_data(messages, llm_config, tools, force_tool_call) + request_data = self.build_request_data(agent_type, messages, llm_config, tools, force_tool_call) try: log_event(name="llm_request_sent", attributes=request_data) @@ -120,6 +121,7 @@ class LLMClientBase: @abstractmethod def build_request_data( self, + agent_type: AgentType, messages: List[Message], llm_config: LLMConfig, tools: List[dict], diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index 71b83d7d..bc640d58 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -4,10 +4,13 @@ from typing import List, Optional import openai from openai import AsyncOpenAI, AsyncStream, OpenAI +from openai.types import Reasoning from openai.types.chat.chat_completion import ChatCompletion from openai.types.chat.chat_completion_chunk import ChatCompletionChunk +from openai.types.responses import ResponseTextConfigParam +from openai.types.responses.response_stream_event import ResponseStreamEvent -from letta.constants import LETTA_MODEL_ENDPOINT +from letta.constants import LETTA_MODEL_ENDPOINT, REQUEST_HEARTBEAT_PARAM from letta.errors import ( ContextWindowExceededError, ErrorCode, @@ -26,6 +29,7 @@ from letta.llm_api.llm_client_base import LLMClientBase from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST from letta.log import get_logger from letta.otel.tracing import trace_method +from letta.schemas.agent import AgentType from letta.schemas.embedding_config import EmbeddingConfig from letta.schemas.letta_message_content import MessageContentType from letta.schemas.llm_config import LLMConfig @@ -39,6 +43,7 @@ from letta.schemas.openai.chat_completion_request import ( cast_message_to_subtype, ) from letta.schemas.openai.chat_completion_response import ChatCompletionResponse +from letta.schemas.openai.responses_request import ResponsesRequest from 
letta.settings import model_settings logger = get_logger(__name__) @@ -117,6 +122,11 @@ def requires_auto_tool_choice(llm_config: LLMConfig) -> bool: return False +def use_responses_api(llm_config: LLMConfig) -> bool: + # TODO can opt in all reasoner models to use the Responses API + return is_openai_5_model(llm_config.model) + + class OpenAIClient(LLMClientBase): def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict: api_key, _, _ = self.get_byok_overrides(llm_config) @@ -188,9 +198,155 @@ class OpenAIClient(LLMClientBase): def supports_structured_output(self, llm_config: LLMConfig) -> bool: return supports_structured_output(llm_config) + @trace_method + def build_request_data_responses( + self, + agent_type: AgentType, # if react, use native content + strip heartbeats + messages: List[PydanticMessage], + llm_config: LLMConfig, + tools: Optional[List[dict]] = None, # Keep as dict for now as per base class + force_tool_call: Optional[str] = None, + ) -> dict: + """ + Constructs a request object in the expected data format for the OpenAI Responses API. + """ + if llm_config.put_inner_thoughts_in_kwargs: + raise ValueError("Inner thoughts in kwargs are not supported for the OpenAI Responses API") + + openai_messages_list = PydanticMessage.to_openai_responses_dicts_from_list(messages) + # Add multi-modal support for Responses API by rewriting user messages + # into input_text/input_image parts. 
+ openai_messages_list = fill_image_content_in_responses_input(openai_messages_list, messages) + + if llm_config.model: + model = llm_config.model + else: + logger.warning(f"Model type not set in llm_config: {llm_config.model_dump_json(indent=4)}") + model = None + + # Default to auto, unless there's a forced tool call coming from above + tool_choice = None + if tools: # only set tool_choice if tools exist + tool_choice = ( + "auto" + if force_tool_call is None + else ToolFunctionChoice(type="function", function=ToolFunctionChoiceFunctionCall(name=force_tool_call)) + ) + + # Convert the tools from the ChatCompletions style to the Responses style + if tools: + # Get proper typing + typed_tools: List[OpenAITool] = [OpenAITool(type="function", function=f) for f in tools] + + # Strip request heartbeat + # TODO relax this? + if agent_type == AgentType.letta_v1_agent: + new_tools = [] + for tool in typed_tools: + # Remove request_heartbeat from the properties if it exists + if tool.function.parameters and "properties" in tool.function.parameters: + tool.function.parameters["properties"].pop(REQUEST_HEARTBEAT_PARAM, None) + # Also remove from required list if present + if "required" in tool.function.parameters and REQUEST_HEARTBEAT_PARAM in tool.function.parameters["required"]: + tool.function.parameters["required"].remove(REQUEST_HEARTBEAT_PARAM) + new_tools.append(tool.model_copy(deep=True)) + typed_tools = new_tools + + # Convert to strict mode + if supports_structured_output(llm_config): + for tool in typed_tools: + try: + structured_output_version = convert_to_structured_output(tool.function.model_dump()) + tool.function = FunctionSchema(**structured_output_version) + except ValueError as e: + logger.warning(f"Failed to convert tool function to structured output, tool={tool}, error={e}") + + # Finally convert to a Responses-friendly dict + responses_tools = [ + { + "type": "function", + "name": t.function.name, + "description": t.function.description, + "parameters": 
t.function.parameters, + "strict": True, + } + for t in typed_tools + ] + + else: + # Finally convert to a Responses-friendly dict + responses_tools = [ + { + "type": "function", + "name": t.function.name, + "description": t.function.description, + "parameters": t.function.parameters, + # "strict": True, + } + for t in typed_tools + ] + else: + responses_tools = None + + # Prepare the request payload + data = ResponsesRequest( + # Responses specific + store=False, + include=["reasoning.encrypted_content"], + # More or less generic to ChatCompletions API + model=model, + input=openai_messages_list, + tools=responses_tools, + tool_choice=tool_choice, + max_output_tokens=llm_config.max_tokens, + temperature=llm_config.temperature if supports_temperature_param(model) else None, + ) + + # Add verbosity control for GPT-5 models + if supports_verbosity_control(model) and llm_config.verbosity: + # data.verbosity = llm_config.verbosity + # https://cookbook.openai.com/examples/gpt-5/gpt-5_new_params_and_tools + data.text = ResponseTextConfigParam(verbosity=llm_config.verbosity) + + # Add reasoning effort control for reasoning models + if is_openai_reasoning_model(model) and llm_config.reasoning_effort: + # data.reasoning_effort = llm_config.reasoning_effort + data.reasoning = Reasoning( + effort=llm_config.reasoning_effort, + # NOTE: hardcoding summary level, could put in llm_config? + summary="detailed", + ) + + # TODO I don't see this in Responses? 
+ # Add frequency penalty + # if llm_config.frequency_penalty is not None: + # data.frequency_penalty = llm_config.frequency_penalty + + # Add parallel tool calling + if tools and supports_parallel_tool_calling(model): + data.parallel_tool_calls = False + + # always set user id for openai requests + if self.actor: + data.user = self.actor.id + + if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT: + if not self.actor: + # override user id for inference.letta.com + import uuid + + data.user = str(uuid.UUID(int=0)) + + data.model = "memgpt-openai" + + request_data = data.model_dump(exclude_unset=True) + # print("responses request data", request_data) + return request_data + @trace_method def build_request_data( self, + agent_type: AgentType, # if react, use native content + strip heartbeats messages: List[PydanticMessage], llm_config: LLMConfig, tools: Optional[List[dict]] = None, # Keep as dict for now as per base class @@ -199,6 +355,20 @@ class OpenAIClient(LLMClientBase): """ Constructs a request object in the expected data format for the OpenAI API. 
""" + # Shortcut for GPT-5 to use Responses API, but only for letta_v1_agent + if use_responses_api(llm_config) and agent_type == AgentType.letta_v1_agent: + return self.build_request_data_responses( + agent_type=agent_type, + messages=messages, + llm_config=llm_config, + tools=tools, + force_tool_call=force_tool_call, + ) + + if agent_type == AgentType.letta_v1_agent: + # Safety hard override in case it got set somewhere by accident + llm_config.put_inner_thoughts_in_kwargs = False + if tools and llm_config.put_inner_thoughts_in_kwargs: # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first # TODO(fix) @@ -236,7 +406,7 @@ class OpenAIClient(LLMClientBase): # TODO: This vllm checking is very brittle and is a patch at most tool_choice = None if tools: # only set tool_choice if tools exist - if self.requires_auto_tool_choice(llm_config): + if self.requires_auto_tool_choice(llm_config) or agent_type == AgentType.letta_v1_agent: tool_choice = "auto" else: # only set if tools is non-Null @@ -283,6 +453,20 @@ class OpenAIClient(LLMClientBase): data.model = "memgpt-openai" + # For some reason, request heartbeats are still leaking into here... 
+ # So strip them manually for v3 + if agent_type == AgentType.letta_v1_agent: + new_tools = [] + for tool in data.tools: + # Remove request_heartbeat from the properties if it exists + if tool.function.parameters and "properties" in tool.function.parameters: + tool.function.parameters["properties"].pop(REQUEST_HEARTBEAT_PARAM, None) + # Also remove from required list if present + if "required" in tool.function.parameters and REQUEST_HEARTBEAT_PARAM in tool.function.parameters["required"]: + tool.function.parameters["required"].remove(REQUEST_HEARTBEAT_PARAM) + new_tools.append(tool.model_copy(deep=True)) + data.tools = new_tools + if data.tools is not None and len(data.tools) > 0: # Convert to structured output style (which has 'strict' and no optionals) for tool in data.tools: @@ -293,6 +477,14 @@ class OpenAIClient(LLMClientBase): except ValueError as e: logger.warning(f"Failed to convert tool function to structured output, tool={tool}, error={e}") request_data = data.model_dump(exclude_unset=True) + + # If Ollama + # if llm_config.handle.startswith("ollama/") and llm_config.enable_reasoner: + # Sadly, reasoning via the OpenAI proxy on Ollama only works for Harmony/gpt-oss + # Ollama's OpenAI layer simply looks for the presence of 'reasoining' or 'reasoning_effort' + # If set, then in the backend "medium" thinking is turned on + # request_data["reasoning_effort"] = "medium" + return request_data @trace_method @@ -301,8 +493,13 @@ class OpenAIClient(LLMClientBase): Performs underlying synchronous request to OpenAI API and returns raw response dict. 
""" client = OpenAI(**self._prepare_client_kwargs(llm_config)) - response: ChatCompletion = client.chat.completions.create(**request_data) - return response.model_dump() + # Route based on payload shape: Responses uses 'input', Chat Completions uses 'messages' + if "input" in request_data and "messages" not in request_data: + resp = client.responses.create(**request_data) + return resp.model_dump() + else: + response: ChatCompletion = client.chat.completions.create(**request_data) + return response.model_dump() @trace_method async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict: @@ -311,8 +508,13 @@ class OpenAIClient(LLMClientBase): """ kwargs = await self._prepare_client_kwargs_async(llm_config) client = AsyncOpenAI(**kwargs) - response: ChatCompletion = await client.chat.completions.create(**request_data) - return response.model_dump() + # Route based on payload shape: Responses uses 'input', Chat Completions uses 'messages' + if "input" in request_data and "messages" not in request_data: + resp = await client.responses.create(**request_data) + return resp.model_dump() + else: + response: ChatCompletion = await client.chat.completions.create(**request_data) + return response.model_dump() def is_reasoning_model(self, llm_config: LLMConfig) -> bool: return is_openai_reasoning_model(llm_config.model) @@ -328,6 +530,10 @@ class OpenAIClient(LLMClientBase): Converts raw OpenAI response dict into the ChatCompletionResponse Pydantic model. Handles potential extraction of inner thoughts if they were added via kwargs. """ + + if "object" in response_data and response_data["object"] == "response": + raise NotImplementedError("Responses API is not supported for non-streaming") + # OpenAI's response structure directly maps to ChatCompletionResponse # We just need to instantiate the Pydantic model for validation and type safety. 
chat_completion_response = ChatCompletionResponse(**response_data) @@ -345,15 +551,36 @@ class OpenAIClient(LLMClientBase): return chat_completion_response @trace_method - async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]: + async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk | ResponseStreamEvent]: """ Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator. """ kwargs = await self._prepare_client_kwargs_async(llm_config) client = AsyncOpenAI(**kwargs) - response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create( - **request_data, stream=True, stream_options={"include_usage": True} - ) + + # Route based on payload shape: Responses uses 'input', Chat Completions uses 'messages' + if "input" in request_data and "messages" not in request_data: + response_stream: AsyncStream[ResponseStreamEvent] = await client.responses.create( + **request_data, + stream=True, + # stream_options={"include_usage": True}, + ) + else: + response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create( + **request_data, + stream=True, + stream_options={"include_usage": True}, + ) + return response_stream + + @trace_method + async def stream_async_responses(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ResponseStreamEvent]: + """ + Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator. 
+ """ + kwargs = await self._prepare_client_kwargs_async(llm_config) + client = AsyncOpenAI(**kwargs) + response_stream: AsyncStream[ResponseStreamEvent] = await client.responses.create(**request_data, stream=True) return response_stream @trace_method @@ -562,3 +789,58 @@ def fill_image_content_in_messages(openai_message_list: List[dict], pydantic_mes new_message_list.append({"role": "user", "content": message_content}) return new_message_list + + +def fill_image_content_in_responses_input(openai_message_list: List[dict], pydantic_message_list: List[PydanticMessage]) -> List[dict]: + """ + Rewrite user messages in the Responses API input to embed multi-modal parts inside + the message's content array (not as top-level items). + + Expected structure for Responses API input messages: + { "type": "message", "role": "user", "content": [ + {"type": "input_text", "text": "..."}, + {"type": "input_image", "image_url": {"url": "data:;base64,", "detail": "auto"}} + ] } + + Non-user items are left unchanged. 
+ """ + user_msgs = [m for m in pydantic_message_list if getattr(m, "role", None) == "user"] + user_idx = 0 + + rewritten: List[dict] = [] + for item in openai_message_list: + if isinstance(item, dict) and item.get("role") == "user": + if user_idx >= len(user_msgs): + rewritten.append(item) + continue + + pm = user_msgs[user_idx] + user_idx += 1 + + # Only rewrite if the pydantic message actually contains multiple parts or images + if not isinstance(pm.content, list) or (len(pm.content) == 1 and pm.content[0].type == MessageContentType.text): + rewritten.append(item) + continue + + parts: List[dict] = [] + for content in pm.content: + if content.type == MessageContentType.text: + parts.append({"type": "input_text", "text": content.text}) + elif content.type == MessageContentType.image: + # For Responses API, image_url is a string and detail is required + data_url = f"data:{content.source.media_type};base64,{content.source.data}" + parts.append( + {"type": "input_image", "image_url": data_url, "detail": getattr(content.source, "detail", None) or "auto"} + ) + else: + # Skip unsupported content types for Responses input + continue + + # Update message content to include multi-modal parts (EasyInputMessageParam style) + new_item = dict(item) + new_item["content"] = parts + rewritten.append(new_item) + else: + rewritten.append(item) + + return rewritten diff --git a/letta/llm_api/xai_client.py b/letta/llm_api/xai_client.py index b9d37a95..199edcc0 100644 --- a/letta/llm_api/xai_client.py +++ b/letta/llm_api/xai_client.py @@ -8,6 +8,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk from letta.llm_api.openai_client import OpenAIClient from letta.otel.tracing import trace_method from letta.schemas.embedding_config import EmbeddingConfig +from letta.schemas.enums import AgentType from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage from letta.settings import model_settings @@ -23,12 +24,13 @@ 
class XAIClient(OpenAIClient): @trace_method def build_request_data( self, + agent_type: AgentType, messages: List[PydanticMessage], llm_config: LLMConfig, tools: Optional[List[dict]] = None, force_tool_call: Optional[str] = None, ) -> dict: - data = super().build_request_data(messages, llm_config, tools, force_tool_call) + data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call) # Specific bug for the mini models (as of Apr 14, 2025) # 400 - {'code': 'Client specified an invalid argument', 'error': 'Argument not supported on this model: presencePenalty'} diff --git a/letta/memory.py b/letta/memory.py index c6a03c14..cd9b4b87 100644 --- a/letta/memory.py +++ b/letta/memory.py @@ -91,6 +91,7 @@ def summarize_messages( # TODO: we can just directly call the LLM here? if llm_client: response = llm_client.send_llm_request( + agent_type=agent_state.agent_type, messages=message_sequence, llm_config=llm_config_no_inner_thoughts, ) diff --git a/letta/prompts/system_prompts/__init__.py b/letta/prompts/system_prompts/__init__.py index 1ceaeb8f..07d23ee6 100644 --- a/letta/prompts/system_prompts/__init__.py +++ b/letta/prompts/system_prompts/__init__.py @@ -1,4 +1,5 @@ from letta.prompts.system_prompts import ( + letta_v1, memgpt_chat, memgpt_generate_tool, memgpt_v2_chat, @@ -17,6 +18,7 @@ SYSTEM_PROMPTS = { "memgpt_v2_chat": memgpt_v2_chat.PROMPT, "sleeptime_v2": sleeptime_v2.PROMPT, "react": react.PROMPT, + "letta_v1": letta_v1.PROMPT, "workflow": workflow.PROMPT, "memgpt_chat": memgpt_chat.PROMPT, "sleeptime_doc_ingest": sleeptime_doc_ingest.PROMPT, diff --git a/letta/prompts/system_prompts/letta_v1.py b/letta/prompts/system_prompts/letta_v1.py new file mode 100644 index 00000000..38bae134 --- /dev/null +++ b/letta/prompts/system_prompts/letta_v1.py @@ -0,0 +1,5 @@ +PROMPT = r""" + +You are a helpful assistant. 
+ +""" diff --git a/letta/prompts/system_prompts/letta_v1.txt b/letta/prompts/system_prompts/letta_v1.txt new file mode 100644 index 00000000..4b5d94e0 --- /dev/null +++ b/letta/prompts/system_prompts/letta_v1.txt @@ -0,0 +1,3 @@ + +Your are a helpful assistant. + diff --git a/letta/schemas/agent.py b/letta/schemas/agent.py index c82e658f..a49f8d83 100644 --- a/letta/schemas/agent.py +++ b/letta/schemas/agent.py @@ -31,6 +31,7 @@ class AgentType(str, Enum): memgpt_agent = "memgpt_agent" # the OG set of memgpt tools memgpt_v2_agent = "memgpt_v2_agent" # memgpt style tools, but refreshed + letta_v1_agent = "letta_v1_agent" # simplification of the memgpt loop, no heartbeats or forced tool calls react_agent = "react_agent" # basic react agent, no memory tools workflow_agent = "workflow_agent" # workflow with auto-clearing message buffer split_thread_agent = "split_thread_agent" diff --git a/letta/schemas/enums.py b/letta/schemas/enums.py index cbf50c89..8e099918 100644 --- a/letta/schemas/enums.py +++ b/letta/schemas/enums.py @@ -28,6 +28,7 @@ class AgentType(str, Enum): memgpt_agent = "memgpt_agent" # the OG set of memgpt tools memgpt_v2_agent = "memgpt_v2_agent" # memgpt style tools, but refreshed + letta_v1_agent = "letta_v1_agent" # simplification of the memgpt loop, no heartbeats or forced tool calls react_agent = "react_agent" # basic react agent, no memory tools workflow_agent = "workflow_agent" # workflow with auto-clearing message buffer split_thread_agent = "split_thread_agent" diff --git a/letta/schemas/letta_message_content.py b/letta/schemas/letta_message_content.py index 8bf31110..69e073ab 100644 --- a/letta/schemas/letta_message_content.py +++ b/letta/schemas/letta_message_content.py @@ -1,6 +1,7 @@ from enum import Enum -from typing import Annotated, Literal, Optional, Union +from typing import Annotated, List, Literal, Optional, Union +from openai.types import Reasoning from pydantic import BaseModel, Field @@ -9,9 +10,13 @@ class 
MessageContentType(str, Enum): image = "image" tool_call = "tool_call" tool_return = "tool_return" + # For Anthropic extended thinking reasoning = "reasoning" redacted_reasoning = "redacted_reasoning" + # Generic "hidden" (unsavailable) reasoning omitted_reasoning = "omitted_reasoning" + # For OpenAI Responses API + summarized_reasoning = "summarized_reasoning" class MessageContent(BaseModel): @@ -207,6 +212,8 @@ class ToolReturnContent(MessageContent): class ReasoningContent(MessageContent): + """Sent via the Anthropic Messages API""" + type: Literal[MessageContentType.reasoning] = Field( default=MessageContentType.reasoning, description="Indicates this is a reasoning/intermediate step." ) @@ -220,6 +227,8 @@ class ReasoningContent(MessageContent): class RedactedReasoningContent(MessageContent): + """Sent via the Anthropic Messages API""" + type: Literal[MessageContentType.redacted_reasoning] = Field( default=MessageContentType.redacted_reasoning, description="Indicates this is a redacted thinking step." ) @@ -227,6 +236,8 @@ class RedactedReasoningContent(MessageContent): class OmittedReasoningContent(MessageContent): + """A placeholder for reasoning content we know is present, but isn't returned by the provider (e.g. OpenAI GPT-5 on ChatCompletions)""" + type: Literal[MessageContentType.omitted_reasoning] = Field( default=MessageContentType.omitted_reasoning, description="Indicates this is an omitted reasoning step." 
) @@ -234,9 +245,60 @@ class OmittedReasoningContent(MessageContent): # tokens: int = Field(..., description="The reasoning token count for intermediate reasoning content.") +class SummarizedReasoningContentPart(BaseModel): + index: int = Field(..., description="The index of the summary part.") + text: str = Field(..., description="The text of the summary part.") + + +class SummarizedReasoningContent(MessageContent): + """The style of reasoning content returned by the OpenAI Responses API""" + + # TODO consider expanding ReasoningContent to support this superset? + # Or alternatively, rename `ReasoningContent` to `AnthropicReasoningContent`, + # and rename this one to `OpenAIReasoningContent`? + + # NOTE: I think the argument for putting thie in ReasoningContent as an additional "summary" field is that it keeps the + # rendering and GET / listing code a lot simpler, you just need to know how to render "TextContent" and "ReasoningContent" + # vs breaking out into having to know how to render additional types + # NOTE: I think the main issue is that we need to track provenance of which provider the reasoning came from + # so that we don't attempt eg to put Anthropic encrypted reasoning into a GPT-5 responses payload + type: Literal[MessageContentType.summarized_reasoning] = Field( + default=MessageContentType.summarized_reasoning, description="Indicates this is a summarized reasoning step." + ) + + # OpenAI requires holding a string + id: str = Field(..., description="The unique identifier for this reasoning step.") # NOTE: I don't think this is actually needed? 
+ # OpenAI returns a list of summary objects, each a string + # Straying a bit from the OpenAI schema so that we can enforce ordering on the deltas that come out + # summary: List[str] = Field(..., description="Summaries of the reasoning content.") + summary: List[SummarizedReasoningContentPart] = Field(..., description="Summaries of the reasoning content.") + encrypted_content: str = Field(default=None, description="The encrypted reasoning content.") + + # Temporary stop-gap until the SDKs are updated + def to_reasoning_content(self) -> Optional[ReasoningContent]: + # Merge the summary parts with a '\n' join + parts = [s.text for s in self.summary if s.text != ""] + if not parts or len(parts) == 0: + return None + else: + combined_summary = "\n\n".join(parts) + return ReasoningContent( + is_native=True, + reasoning=combined_summary, + signature=self.encrypted_content, + ) + + LettaMessageContentUnion = Annotated[ Union[ - TextContent, ImageContent, ToolCallContent, ToolReturnContent, ReasoningContent, RedactedReasoningContent, OmittedReasoningContent + TextContent, + ImageContent, + ToolCallContent, + ToolReturnContent, + ReasoningContent, + RedactedReasoningContent, + OmittedReasoningContent, + SummarizedReasoningContent, ], Field(discriminator="type"), ] diff --git a/letta/schemas/memory.py b/letta/schemas/memory.py index 934ced64..78f5404c 100644 --- a/letta/schemas/memory.py +++ b/letta/schemas/memory.py @@ -271,7 +271,7 @@ class Memory(BaseModel, validate_assignment=True): raw_type = self.agent_type.value if hasattr(self.agent_type, "value") else (self.agent_type or "") norm_type = raw_type.lower() is_react = norm_type in ("react_agent", "workflow_agent") - is_line_numbered = norm_type in ("sleeptime_agent", "memgpt_v2_agent") + is_line_numbered = norm_type in ("sleeptime_agent", "memgpt_v2_agent", "letta_v1_agent") # Memory blocks (not for react/workflow). Always include wrapper for preview/tests. 
if not is_react: diff --git a/letta/schemas/message.py b/letta/schemas/message.py index d66f3b70..dcfda42f 100644 --- a/letta/schemas/message.py +++ b/letta/schemas/message.py @@ -11,9 +11,10 @@ from enum import Enum from typing import Annotated, Any, Dict, List, Literal, Optional, Union from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall as OpenAIToolCall, Function as OpenAIFunction +from openai.types.responses import ResponseReasoningItem from pydantic import BaseModel, Field, field_validator, model_validator -from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG, TOOL_CALL_ID_MAX_LEN +from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG, REQUEST_HEARTBEAT_PARAM, TOOL_CALL_ID_MAX_LEN from letta.helpers.datetime_helpers import get_utc_time, is_utc_datetime from letta.helpers.json_helpers import json_dumps from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_VERTEX @@ -38,6 +39,7 @@ from letta.schemas.letta_message_content import ( OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, + SummarizedReasoningContent, TextContent, ToolReturnContent, get_letta_message_content_union_str_json_schema, @@ -239,6 +241,7 @@ class Message(BaseMessage): assistant_message_tool_kwarg: str = DEFAULT_MESSAGE_TOOL_KWARG, reverse: bool = True, include_err: Optional[bool] = None, + text_is_assistant_message: bool = False, ) -> List[LettaMessage]: if use_assistant_message: message_ids_to_remove = [] @@ -270,6 +273,7 @@ class Message(BaseMessage): assistant_message_tool_kwarg=assistant_message_tool_kwarg, reverse=reverse, include_err=include_err, + text_is_assistant_message=text_is_assistant_message, ) ] @@ -280,12 +284,14 @@ class Message(BaseMessage): assistant_message_tool_kwarg: str = DEFAULT_MESSAGE_TOOL_KWARG, reverse: bool = True, include_err: Optional[bool] = None, + text_is_assistant_message: bool = False, ) -> List[LettaMessage]: """Convert 
message object (in DB format) to the style used by the original Letta API""" + messages = [] if self.role == MessageRole.assistant: if self.content: - messages.extend(self._convert_reasoning_messages()) + messages.extend(self._convert_reasoning_messages(text_is_assistant_message=text_is_assistant_message)) if self.tool_calls is not None: messages.extend( self._convert_tool_call_messages( @@ -303,7 +309,7 @@ class Message(BaseMessage): messages.append(self._convert_system_message()) elif self.role == MessageRole.approval: if self.content: - messages.extend(self._convert_reasoning_messages()) + messages.extend(self._convert_reasoning_messages(text_is_assistant_message=text_is_assistant_message)) if self.tool_calls is not None: tool_calls = self._convert_tool_call_messages() assert len(tool_calls) == 1 @@ -324,30 +330,33 @@ class Message(BaseMessage): return messages[::-1] if reverse else messages - def _convert_reasoning_messages(self, current_message_count: int = 0) -> List[LettaMessage]: + def _convert_reasoning_messages( + self, + current_message_count: int = 0, + text_is_assistant_message: bool = False, # For v3 loop, set to True + ) -> List[LettaMessage]: messages = [] - # Check for ReACT-style COT inside of TextContent - if len(self.content) == 1 and isinstance(self.content[0], TextContent): + + for content_part in self.content: otid = Message.generate_otid_from_id(self.id, current_message_count + len(messages)) - messages.append( - ReasoningMessage( - id=self.id, - date=self.created_at, - reasoning=self.content[0].text, - name=self.name, - otid=otid, - sender_id=self.sender_id, - step_id=self.step_id, - is_err=self.is_err, - ) - ) - # Otherwise, we may have a list of multiple types - else: - # TODO we can probably collapse these two cases into a single loop - for content_part in self.content: - otid = Message.generate_otid_from_id(self.id, current_message_count + len(messages)) - if isinstance(content_part, TextContent): - # COT + + if isinstance(content_part, 
TextContent): + if text_is_assistant_message: + # .content is assistant message + messages.append( + AssistantMessage( + id=self.id, + date=self.created_at, + content=content_part.text, + name=self.name, + otid=otid, + sender_id=self.sender_id, + step_id=self.step_id, + is_err=self.is_err, + ) + ) + else: + # .content is COT messages.append( ReasoningMessage( id=self.id, @@ -360,54 +369,96 @@ class Message(BaseMessage): is_err=self.is_err, ) ) - elif isinstance(content_part, ReasoningContent): - # "native" COT + + elif isinstance(content_part, ReasoningContent): + # "native" COT + messages.append( + ReasoningMessage( + id=self.id, + date=self.created_at, + reasoning=content_part.reasoning, + source="reasoner_model", # TODO do we want to tag like this? + signature=content_part.signature, + name=self.name, + otid=otid, + step_id=self.step_id, + is_err=self.is_err, + ) + ) + + elif isinstance(content_part, SummarizedReasoningContent): + # TODO remove the cast and just return the native type + casted_content_part = content_part.to_reasoning_content() + if casted_content_part is not None: messages.append( ReasoningMessage( id=self.id, date=self.created_at, - reasoning=content_part.reasoning, + reasoning=casted_content_part.reasoning, source="reasoner_model", # TODO do we want to tag like this? 
- signature=content_part.signature, + signature=casted_content_part.signature, name=self.name, otid=otid, step_id=self.step_id, is_err=self.is_err, ) ) - elif isinstance(content_part, RedactedReasoningContent): - # "native" redacted/hidden COT - messages.append( - HiddenReasoningMessage( - id=self.id, - date=self.created_at, - state="redacted", - hidden_reasoning=content_part.data, - name=self.name, - otid=otid, - sender_id=self.sender_id, - step_id=self.step_id, - is_err=self.is_err, - ) + + elif isinstance(content_part, RedactedReasoningContent): + # "native" redacted/hidden COT + messages.append( + HiddenReasoningMessage( + id=self.id, + date=self.created_at, + state="redacted", + hidden_reasoning=content_part.data, + name=self.name, + otid=otid, + sender_id=self.sender_id, + step_id=self.step_id, + is_err=self.is_err, ) - elif isinstance(content_part, OmittedReasoningContent): - # Special case for "hidden reasoning" models like o1/o3 - # NOTE: we also have to think about how to return this during streaming - messages.append( - HiddenReasoningMessage( - id=self.id, - date=self.created_at, - state="omitted", - name=self.name, - otid=otid, - step_id=self.step_id, - is_err=self.is_err, - ) + ) + + elif isinstance(content_part, OmittedReasoningContent): + # Special case for "hidden reasoning" models like o1/o3 + # NOTE: we also have to think about how to return this during streaming + messages.append( + HiddenReasoningMessage( + id=self.id, + date=self.created_at, + state="omitted", + name=self.name, + otid=otid, + step_id=self.step_id, + is_err=self.is_err, ) - else: - warnings.warn(f"Unrecognized content part in assistant message: {content_part}") + ) + + else: + warnings.warn(f"Unrecognized content part in assistant message: {content_part}") + return messages + def _convert_assistant_message( + self, + ) -> AssistantMessage: + if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent): + text_content = self.content[0].text + else: + 
raise ValueError(f"Invalid assistant message (no text object on message): {self.content}") + + return AssistantMessage( + id=self.id, + date=self.created_at, + content=text_content, + name=self.name, + otid=self.otid, + sender_id=self.sender_id, + step_id=self.step_id, + # is_err=self.is_err, + ) + def _convert_tool_call_messages( self, current_message_count: int = 0, @@ -746,8 +797,13 @@ class Message(BaseMessage): max_tool_id_length: int = TOOL_CALL_ID_MAX_LEN, put_inner_thoughts_in_kwargs: bool = False, use_developer_message: bool = False, + # if true, then treat the content field as AssistantMessage + native_content: bool = False, + strip_request_heartbeat: bool = False, ) -> dict | None: """Go from Message class to ChatCompletion message object""" + assert not (native_content and put_inner_thoughts_in_kwargs), "native_content and put_inner_thoughts_in_kwargs cannot both be true" + if self.role == "approval" and self.tool_calls is None: return None @@ -789,10 +845,21 @@ class Message(BaseMessage): elif self.role == "assistant" or self.role == "approval": assert self.tool_calls is not None or text_content is not None - openai_message = { - "content": None if (put_inner_thoughts_in_kwargs and self.tool_calls is not None) else text_content, - "role": "assistant", - } + + # if native content, then put it directly inside the content + if native_content: + openai_message = { + # TODO support listed content (if it's possible for role assistant?) 
+ # "content": self.content, + "content": text_content, # here content is not reasoning, it's assistant message + "role": "assistant", + } + # otherwise, if inner_thoughts_in_kwargs, hold it for the tool calls + else: + openai_message = { + "content": None if (put_inner_thoughts_in_kwargs and self.tool_calls is not None) else text_content, + "role": "assistant", + } if self.tool_calls is not None: if put_inner_thoughts_in_kwargs: @@ -807,6 +874,11 @@ class Message(BaseMessage): ] else: openai_message["tool_calls"] = [tool_call.model_dump() for tool_call in self.tool_calls] + + if strip_request_heartbeat: + for tool_call_dict in openai_message["tool_calls"]: + tool_call_dict.pop(REQUEST_HEARTBEAT_PARAM, None) + if max_tool_id_length: for tool_call_dict in openai_message["tool_calls"]: tool_call_dict["id"] = tool_call_dict["id"][:max_tool_id_length] @@ -858,10 +930,116 @@ class Message(BaseMessage): result = [m for m in result if m is not None] return result + def to_openai_responses_dicts( + self, + max_tool_id_length: int = TOOL_CALL_ID_MAX_LEN, + ) -> List[dict]: + """Go from Message class to ChatCompletion message object""" + + if self.role == "approval" and self.tool_calls is None: + return [] + + message_dicts = [] + + if self.role == "system": + assert len(self.content) == 1 and isinstance(self.content[0], TextContent), vars(self) + message_dicts.append( + { + "role": "developer", + "content": self.content[0].text, + } + ) + + elif self.role == "user": + # TODO do we need to do a swap to placeholder text here for images? 
+ assert all([isinstance(c, TextContent) or isinstance(c, ImageContent) for c in self.content]), vars(self) + + user_dict = { + "role": self.role.value if hasattr(self.role, "value") else self.role, + # TODO support multi-modal + "content": self.content[0].text, + } + + # Optional field, do not include if null or invalid + if self.name is not None: + if bool(re.match(r"^[^\s<|\\/>]+$", self.name)): + user_dict["name"] = self.name + else: + warnings.warn(f"Using OpenAI with invalid 'name' field (name={self.name} role={self.role}).") + + message_dicts.append(user_dict) + + elif self.role == "assistant" or self.role == "approval": + assert self.tool_calls is not None or (self.content is not None and len(self.content) > 0) + + # A few things may be in here, firstly reasoning content, secondly assistant messages, thirdly tool calls + # TODO check if OpenAI Responses is capable of R->A->T like Anthropic? + + if self.content is not None: + for content_part in self.content: + if isinstance(content_part, SummarizedReasoningContent): + message_dicts.append( + { + "type": "reasoning", + "id": content_part.id, + "summary": [{"type": "summary_text", "text": s.text} for s in content_part.summary], + "encrypted_content": content_part.encrypted_content, + } + ) + elif isinstance(content_part, TextContent): + message_dicts.append( + { + "role": "assistant", + "content": content_part.text, + } + ) + # else skip + + if self.tool_calls is not None: + for tool_call in self.tool_calls: + message_dicts.append( + { + "type": "function_call", + "call_id": tool_call.id[:max_tool_id_length] if max_tool_id_length else tool_call.id, + "name": tool_call.function.name, + "arguments": tool_call.function.arguments, + "status": "completed", # TODO check if needed? 
+ } + ) + + elif self.role == "tool": + assert self.tool_call_id is not None, vars(self) + assert len(self.content) == 1 and isinstance(self.content[0], TextContent), vars(self) + message_dicts.append( + { + "type": "function_call_output", + "call_id": self.tool_call_id[:max_tool_id_length] if max_tool_id_length else self.tool_call_id, + "output": self.content[0].text, + } + ) + + else: + raise ValueError(self.role) + + return message_dicts + + @staticmethod + def to_openai_responses_dicts_from_list( + messages: List[Message], + max_tool_id_length: int = TOOL_CALL_ID_MAX_LEN, + ) -> List[dict]: + result = [] + for message in messages: + result.extend(message.to_openai_responses_dicts(max_tool_id_length=max_tool_id_length)) + return result + def to_anthropic_dict( self, inner_thoughts_xml_tag="thinking", put_inner_thoughts_in_kwargs: bool = False, + # if true, then treat the content field as AssistantMessage + native_content: bool = False, + strip_request_heartbeat: bool = False, ) -> dict | None: """ Convert to an Anthropic message dictionary @@ -869,6 +1047,8 @@ class Message(BaseMessage): Args: inner_thoughts_xml_tag (str): The XML tag to wrap around inner thoughts """ + assert not (native_content and put_inner_thoughts_in_kwargs), "native_content and put_inner_thoughts_in_kwargs cannot both be true" + if self.role == "approval" and self.tool_calls is None: return None @@ -929,43 +1109,76 @@ class Message(BaseMessage): } elif self.role == "assistant" or self.role == "approval": - assert self.tool_calls is not None or text_content is not None + # assert self.tool_calls is not None or text_content is not None, vars(self) + assert self.tool_calls is not None or len(self.content) > 0 anthropic_message = { "role": "assistant", } content = [] - # COT / reasoning / thinking - if self.content is not None and len(self.content) >= 1: - for content_part in self.content: - if isinstance(content_part, ReasoningContent): - content.append( - { - "type": "thinking", - 
"thinking": content_part.reasoning, - "signature": content_part.signature, - } - ) - if isinstance(content_part, RedactedReasoningContent): - content.append( - { - "type": "redacted_thinking", - "data": content_part.data, - } - ) - if isinstance(content_part, TextContent): - content.append( - { - "type": "text", - "text": content_part.text, - } - ) - elif text_content is not None: - content.append( - { - "type": "text", - "text": add_xml_tag(string=text_content, xml_tag=inner_thoughts_xml_tag), - } - ) + if native_content: + # No special handling for TextContent + if self.content is not None: + for content_part in self.content: + # TextContent, ImageContent, ToolCallContent, ToolReturnContent, ReasoningContent, RedactedReasoningContent, OmittedReasoningContent + if isinstance(content_part, ReasoningContent): + content.append( + { + "type": "thinking", + "thinking": content_part.reasoning, + "signature": content_part.signature, + } + ) + elif isinstance(content_part, RedactedReasoningContent): + content.append( + { + "type": "redacted_thinking", + "data": content_part.data, + } + ) + elif isinstance(content_part, TextContent): + content.append( + { + "type": "text", + "text": content_part.text, + } + ) + else: + # Skip unsupported types eg OmmitedReasoningContent + pass + + else: + # COT / reasoning / thinking + if self.content is not None and len(self.content) >= 1: + for content_part in self.content: + if isinstance(content_part, ReasoningContent): + content.append( + { + "type": "thinking", + "thinking": content_part.reasoning, + "signature": content_part.signature, + } + ) + if isinstance(content_part, RedactedReasoningContent): + content.append( + { + "type": "redacted_thinking", + "data": content_part.data, + } + ) + if isinstance(content_part, TextContent): + content.append( + { + "type": "text", + "text": content_part.text, + } + ) + elif text_content is not None: + content.append( + { + "type": "text", + "text": add_xml_tag(string=text_content, 
xml_tag=inner_thoughts_xml_tag), + } + ) # Tool calling if self.tool_calls is not None: for tool_call in self.tool_calls: @@ -978,6 +1191,9 @@ class Message(BaseMessage): else: tool_call_input = parse_json(tool_call.function.arguments) + if strip_request_heartbeat: + tool_call_input.pop(REQUEST_HEARTBEAT_PARAM, None) + content.append( { "type": "tool_use", @@ -987,8 +1203,6 @@ class Message(BaseMessage): } ) - # If the only content was text, unpack it back into a singleton - # TODO support multi-modal anthropic_message["content"] = content elif self.role == "tool": @@ -1016,21 +1230,34 @@ class Message(BaseMessage): messages: List[Message], inner_thoughts_xml_tag: str = "thinking", put_inner_thoughts_in_kwargs: bool = False, + # if true, then treat the content field as AssistantMessage + native_content: bool = False, + strip_request_heartbeat: bool = False, ) -> List[dict]: result = [ m.to_anthropic_dict( inner_thoughts_xml_tag=inner_thoughts_xml_tag, put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs, + native_content=native_content, + strip_request_heartbeat=strip_request_heartbeat, ) for m in messages ] result = [m for m in result if m is not None] return result - def to_google_dict(self, put_inner_thoughts_in_kwargs: bool = True) -> dict | None: + def to_google_dict( + self, + put_inner_thoughts_in_kwargs: bool = True, + # if true, then treat the content field as AssistantMessage + native_content: bool = False, + strip_request_heartbeat: bool = False, + ) -> dict | None: """ Go from Message class to Google AI REST message object """ + assert not (native_content and put_inner_thoughts_in_kwargs), "native_content and put_inner_thoughts_in_kwargs cannot both be true" + if self.role == "approval" and self.tool_calls is None: return None @@ -1088,7 +1315,12 @@ class Message(BaseMessage): # NOTE: Google AI API doesn't allow non-null content + function call # To get around this, just two a two part message, inner thoughts first then parts = [] - if not 
put_inner_thoughts_in_kwargs and text_content is not None: + + if native_content and text_content is not None: + # TODO support multi-part assistant content + parts.append({"text": text_content}) + + elif not put_inner_thoughts_in_kwargs and text_content is not None: # NOTE: ideally we do multi-part for CoT / inner thoughts + function call, but Google AI API doesn't allow it raise NotImplementedError parts.append({"text": text_content}) @@ -1110,6 +1342,9 @@ class Message(BaseMessage): assert len(self.tool_calls) == 1 function_args[INNER_THOUGHTS_KWARG_VERTEX] = text_content + if strip_request_heartbeat: + function_args.pop(REQUEST_HEARTBEAT_PARAM, None) + parts.append( { "functionCall": { @@ -1119,8 +1354,9 @@ class Message(BaseMessage): } ) else: - assert text_content is not None - parts.append({"text": text_content}) + if not native_content: + assert text_content is not None + parts.append({"text": text_content}) google_ai_message["parts"] = parts elif self.role == "tool": @@ -1171,10 +1407,12 @@ class Message(BaseMessage): def to_google_dicts_from_list( messages: List[Message], put_inner_thoughts_in_kwargs: bool = True, + native_content: bool = False, ): result = [ m.to_google_dict( put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs, + native_content=native_content, ) for m in messages ] diff --git a/letta/schemas/openai/responses_request.py b/letta/schemas/openai/responses_request.py new file mode 100644 index 00000000..aeeefa23 --- /dev/null +++ b/letta/schemas/openai/responses_request.py @@ -0,0 +1,64 @@ +from typing import Any, Dict, Iterable, List, Literal, Optional, Union + +from openai import NOT_GIVEN +from openai.types import Metadata, Reasoning, ResponsesModel + +# from openai._types import Headers, Query, Body +from openai.types.responses import ( + ResponseIncludable, + ResponseInputParam, + ResponsePromptParam, + ResponseTextConfigParam, + ToolParam, + response_create_params, +) + +# import httpx +from pydantic import BaseModel, Field + + 
+class ResponsesRequest(BaseModel): + background: Optional[bool] = Field(default=NOT_GIVEN) + include: Optional[List[ResponseIncludable]] = Field(default=NOT_GIVEN) + input: Optional[Union[str, ResponseInputParam]] = Field(default=NOT_GIVEN) + instructions: Optional[str] = Field(default=NOT_GIVEN) + max_output_tokens: Optional[int] = Field(default=NOT_GIVEN) + max_tool_calls: Optional[int] = Field(default=NOT_GIVEN) + metadata: Optional[Metadata] = Field(default=NOT_GIVEN) + model: Optional[ResponsesModel] = Field(default=NOT_GIVEN) + parallel_tool_calls: Optional[bool] = Field(default=NOT_GIVEN) + previous_response_id: Optional[str] = Field(default=NOT_GIVEN) + prompt: Optional[ResponsePromptParam] = Field(default=NOT_GIVEN) + prompt_cache_key: Optional[str] = Field(default=NOT_GIVEN) + reasoning: Optional[Reasoning] = Field(default=NOT_GIVEN) + safety_identifier: Optional[str] = Field(default=NOT_GIVEN) + service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = Field(default=NOT_GIVEN) + store: Optional[bool] = Field(default=NOT_GIVEN) + stream: Optional[Literal[False]] = Field(default=NOT_GIVEN) + stream_options: Optional[response_create_params.StreamOptions] = Field(default=NOT_GIVEN) + temperature: Optional[float] = Field(default=NOT_GIVEN) + text: Optional[ResponseTextConfigParam] = Field(default=NOT_GIVEN) + tool_choice: Optional[response_create_params.ToolChoice] = Field(default=NOT_GIVEN) + tools: Optional[Iterable[ToolParam]] = Field(default=NOT_GIVEN) + top_logprobs: Optional[int] = Field(default=NOT_GIVEN) + top_p: Optional[float] = Field(default=NOT_GIVEN) + truncation: Optional[Literal["auto", "disabled"]] = Field(default=NOT_GIVEN) + user: Optional[str] = Field(default=NOT_GIVEN) + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ # extra_headers: Headers | None = (None,) + # extra_query: Query | None = (None,) + # extra_body: Body | None = (None,) + # timeout: float | httpx.Timeout | None | NotGiven = (NOT_GIVEN,) + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Custom model_dump that properly serializes complex OpenAI types for JSON compatibility.""" + # Force JSON mode to ensure full serialization of complex OpenAI types + # This prevents SerializationIterator objects from being created + kwargs["mode"] = "json" + + # Get the JSON-serialized dump + data = super().model_dump(**kwargs) + + # The API expects dicts, which JSON mode provides + return data diff --git a/letta/schemas/providers/ollama.py b/letta/schemas/providers/ollama.py index ba0f7940..b5e4801a 100644 --- a/letta/schemas/providers/ollama.py +++ b/letta/schemas/providers/ollama.py @@ -3,7 +3,7 @@ from typing import Literal import aiohttp from pydantic import Field -from letta.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_EMBEDDING_CHUNK_SIZE, DEFAULT_EMBEDDING_DIM, OLLAMA_API_PREFIX +from letta.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_EMBEDDING_CHUNK_SIZE from letta.log import get_logger from letta.schemas.embedding_config import EmbeddingConfig from letta.schemas.enums import ProviderCategory, ProviderType @@ -27,82 +27,163 @@ class OllamaProvider(OpenAIProvider): ..., description="Default prompt formatter (aka model wrapper) to use on a /completions style API." 
) - async def list_llm_models_async(self) -> list[LLMConfig]: - """List available LLM Models from Ollama + @property + def raw_base_url(self) -> str: + """Base URL for native Ollama /api endpoints (no trailing /v1).""" + if self.base_url.endswith("/v1"): + return self.base_url[: -len("/v1")] + return self.base_url - https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models""" - endpoint = f"{self.base_url}/api/tags" + @property + def openai_compat_base_url(self) -> str: + """Base URL with /v1 appended for OpenAI-compatible clients if ever needed. + + Note: We do not use OpenAI chat completions for Ollama, but expose this + helper to clarify intent and avoid duplicating logic elsewhere. + """ + return self.base_url if self.base_url.endswith("/v1") else f"{self.base_url.rstrip('/')}" + "/v1" + + async def list_llm_models_async(self) -> list[LLMConfig]: + """List available LLM Models from Ollama. + + Note: Older Ollama versions do not expose a "capabilities" field on /api/show. + We therefore avoid filtering on capabilities and instead infer support from + /api/show model_info (falling back to safe defaults). 
+ + https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models + """ + endpoint = f"{self.raw_base_url}/api/tags" async with aiohttp.ClientSession() as session: async with session.get(endpoint) as response: if response.status != 200: - raise Exception(f"Failed to list Ollama models: {response.text}") + # aiohttp: .text() is async + error_text = await response.text() + raise Exception(f"Failed to list Ollama models: {response.status} - {error_text}") response_json = await response.json() - configs = [] - for model in response_json.get("models", []): - model_name = model["name"] - model_details = await self._get_model_details_async(model_name) - if not model_details or "completion" not in model_details.get("capabilities", []): + configs: list[LLMConfig] = [] + for m in response_json.get("models", []): + model_name = m.get("name") + if not model_name: continue - context_window = None - model_info = model_details.get("model_info", {}) - if architecture := model_info.get("general.architecture"): - if context_length := model_info.get(f"{architecture}.context_length"): - context_window = int(context_length) + # Use /api/show to check capabilities, specifically tools support + details = await self._get_model_details_async(model_name) + if not details: + # If details cannot be fetched, skip to avoid tool errors later + continue + caps = details.get("capabilities") or [] + if not isinstance(caps, list): + caps = [] + if "tools" not in [str(c).lower() for c in caps]: + # Only include models that declare tools support + continue + # Derive context window from /api/show model_info if available + context_window = None + model_info = details.get("model_info", {}) if isinstance(details, dict) else {} + architecture = model_info.get("general.architecture") if isinstance(model_info, dict) else None + if architecture: + ctx_len = model_info.get(f"{architecture}.context_length") + if ctx_len is not None: + try: + context_window = int(ctx_len) + except Exception: + 
context_window = None if context_window is None: - logger.warning(f"Ollama model {model_name} has no context window, using default {DEFAULT_CONTEXT_WINDOW}") + logger.warning(f"Ollama model {model_name} has no context window in /api/show, using default {DEFAULT_CONTEXT_WINDOW}") context_window = DEFAULT_CONTEXT_WINDOW + # === Capability stubs === + # Compute support flags from /api/show capabilities. These are not + # yet plumbed through LLMConfig, but are captured here for later use. + caps_lower = [str(c).lower() for c in caps] + supports_tools = "tools" in caps_lower + supports_thinking = "thinking" in caps_lower + supports_vision = "vision" in caps_lower + supports_completion = "completion" in caps_lower + _ = (supports_tools, supports_thinking, supports_vision, supports_completion) + configs.append( + # Legacy Ollama using raw generate + # LLMConfig( + # model=model_name, + # model_endpoint_type="ollama", + # model_endpoint=self.openai_compat_base_url, + # model_wrapper=self.default_prompt_formatter, + # context_window=context_window, + # # Ollama specific + # handle=self.get_handle(model_name), + # provider_name=self.name, + # provider_category=self.provider_category, + # ) + # New "trust Ollama" version w/ pure OpenAI proxy LLMConfig( model=model_name, - model_endpoint_type=ProviderType.ollama, - model_endpoint=f"{self.base_url}{OLLAMA_API_PREFIX}", - model_wrapper=self.default_prompt_formatter, + model_endpoint_type="openai", + model_endpoint=self.openai_compat_base_url, + # model_wrapper=self.default_prompt_formatter, context_window=context_window, handle=self.get_handle(model_name), provider_name=self.name, provider_category=self.provider_category, + # put_inner_thoughts_in_kwargs=True, + # enable_reasoner=supports_thinking, ) ) return configs async def list_embedding_models_async(self) -> list[EmbeddingConfig]: - """List available embedding models from Ollama + """List available embedding models from Ollama. 
+ + We infer embedding support via model_info.*.embedding_length when available. https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models """ - endpoint = f"{self.base_url}/api/tags" + endpoint = f"{self.raw_base_url}/api/tags" async with aiohttp.ClientSession() as session: async with session.get(endpoint) as response: if response.status != 200: - raise Exception(f"Failed to list Ollama models: {response.text}") + error_text = await response.text() + raise Exception(f"Failed to list Ollama models: {response.status} - {error_text}") response_json = await response.json() - configs = [] + configs: list[EmbeddingConfig] = [] for model in response_json.get("models", []): model_name = model["name"] model_details = await self._get_model_details_async(model_name) - if not model_details or "embedding" not in model_details.get("capabilities", []): + + if not model_details: + continue + + # Filter to true embedding models via capabilities + caps = model_details.get("capabilities") or [] + if not isinstance(caps, list): + caps = [] + if "embedding" not in [str(c).lower() for c in caps]: continue embedding_dim = None model_info = model_details.get("model_info", {}) - if architecture := model_info.get("general.architecture"): - if embedding_length := model_info.get(f"{architecture}.embedding_length"): - embedding_dim = int(embedding_length) + architecture = model_info.get("general.architecture") + if architecture: + embedding_length = model_info.get(f"{architecture}.embedding_length") + if embedding_length is not None: + try: + embedding_dim = int(embedding_length) + except Exception: + pass if not embedding_dim: - logger.warning(f"Ollama model {model_name} has no embedding dimension, using default {DEFAULT_EMBEDDING_DIM}") - embedding_dim = DEFAULT_EMBEDDING_DIM + # Skip models without a reported embedding dimension to avoid DB dimension mismatches + continue configs.append( EmbeddingConfig( embedding_model=model_name, - 
embedding_endpoint_type=ProviderType.ollama, - embedding_endpoint=f"{self.base_url}{OLLAMA_API_PREFIX}", + # Use OpenAI-compatible proxy for embeddings + embedding_endpoint_type=ProviderType.openai, + embedding_endpoint=self.openai_compat_base_url, embedding_dim=embedding_dim, embedding_chunk_size=DEFAULT_EMBEDDING_CHUNK_SIZE, handle=self.get_handle(model_name, is_embedding=True), @@ -112,11 +193,12 @@ class OllamaProvider(OpenAIProvider): async def _get_model_details_async(self, model_name: str) -> dict | None: """Get detailed information for a specific model from /api/show.""" - endpoint = f"{self.base_url}/api/show" + endpoint = f"{self.raw_base_url}/api/show" payload = {"name": model_name} try: - async with aiohttp.ClientSession() as session: + timeout = aiohttp.ClientTimeout(total=2.0) + async with aiohttp.ClientSession(timeout=timeout) as session: async with session.post(endpoint, json=payload) as response: if response.status != 200: error_text = await response.text() diff --git a/letta/server/rest_api/routers/v1/agents.py b/letta/server/rest_api/routers/v1/agents.py index 3ae2c1f6..a5ac046e 100644 --- a/letta/server/rest_api/routers/v1/agents.py +++ b/letta/server/rest_api/routers/v1/agents.py @@ -30,7 +30,7 @@ from letta.log import get_logger from letta.orm.errors import NoResultFound from letta.otel.context import get_ctx_attributes from letta.otel.metric_registry import MetricRegistry -from letta.schemas.agent import AgentState, CreateAgent, UpdateAgent +from letta.schemas.agent import AgentState, AgentType, CreateAgent, UpdateAgent from letta.schemas.agent_file import AgentFileSchema from letta.schemas.block import Block, BlockUpdate from letta.schemas.enums import JobType @@ -438,6 +438,8 @@ async def create_agent( """ Create an agent. 
""" + # TODO remove + # agent.agent_type = AgentType.letta_v1_agent try: actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) return await server.create_agent_async(agent, actor=actor) @@ -1653,6 +1655,7 @@ async def send_message_async( """ MetricRegistry().user_message_counter.add(1, get_ctx_attributes()) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) + # Create a new job use_lettuce = headers.experimental_params.message_async and settings.temporal_endpoint is not None run = Run( diff --git a/letta/server/rest_api/routers/v1/tools.py b/letta/server/rest_api/routers/v1/tools.py index 5c7200d9..08656ce6 100644 --- a/letta/server/rest_api/routers/v1/tools.py +++ b/letta/server/rest_api/routers/v1/tools.py @@ -27,7 +27,7 @@ from letta.log import get_logger from letta.orm.errors import UniqueConstraintViolationError from letta.orm.mcp_oauth import OAuthSessionStatus from letta.prompts.gpt_system import get_system_text -from letta.schemas.enums import MessageRole, ToolType +from letta.schemas.enums import AgentType, MessageRole, ToolType from letta.schemas.letta_message import ToolReturnMessage from letta.schemas.letta_message_content import TextContent from letta.schemas.mcp import UpdateSSEMCPServer, UpdateStdioMCPServer, UpdateStreamableHTTPMCPServer @@ -1165,6 +1165,7 @@ async def generate_tool_from_prompt( }, } request_data = llm_client.build_request_data( + AgentType.letta_v1_agent, input_messages, llm_config, tools=[tool], diff --git a/letta/server/rest_api/utils.py b/letta/server/rest_api/utils.py index bf664728..3e9be813 100644 --- a/letta/server/rest_api/utils.py +++ b/letta/server/rest_api/utils.py @@ -27,7 +27,13 @@ from letta.otel.metric_registry import MetricRegistry from letta.otel.tracing import tracer from letta.schemas.agent import AgentState from letta.schemas.enums import MessageRole -from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, 
RedactedReasoningContent, TextContent +from letta.schemas.letta_message_content import ( + OmittedReasoningContent, + ReasoningContent, + RedactedReasoningContent, + SummarizedReasoningContent, + TextContent, +) from letta.schemas.llm_config import LLMConfig from letta.schemas.message import ApprovalCreate, Message, MessageCreate, ToolReturn from letta.schemas.tool_execution_result import ToolExecutionResult @@ -216,75 +222,119 @@ def create_approval_request_message_from_llm_response( def create_letta_messages_from_llm_response( agent_id: str, model: str, - function_name: str, - function_arguments: Dict, - tool_execution_result: ToolExecutionResult, - tool_call_id: str, - function_call_success: bool, + function_name: Optional[str], + function_arguments: Optional[Dict], + tool_execution_result: Optional[ToolExecutionResult], + tool_call_id: Optional[str], + function_call_success: Optional[bool], function_response: Optional[str], timezone: str, actor: User, continue_stepping: bool = False, heartbeat_reason: Optional[str] = None, - reasoning_content: Optional[List[Union[TextContent, ReasoningContent, RedactedReasoningContent, OmittedReasoningContent]]] = None, + reasoning_content: Optional[ + List[Union[TextContent, ReasoningContent, RedactedReasoningContent, OmittedReasoningContent | SummarizedReasoningContent]] + ] = None, pre_computed_assistant_message_id: Optional[str] = None, llm_batch_item_id: Optional[str] = None, step_id: str | None = None, is_approval_response: bool | None = None, + # force set request_heartbeat, useful for v2 loop to ensure matching tool rules + force_set_request_heartbeat: bool = True, + add_heartbeat_on_continue: bool = True, ) -> List[Message]: messages = [] - if not is_approval_response: - # Construct the tool call with the assistant's message - # Force set request_heartbeat in tool_args to calculated continue_stepping - function_arguments[REQUEST_HEARTBEAT_PARAM] = continue_stepping - tool_call = OpenAIToolCall( - id=tool_call_id, - 
function=OpenAIFunction( - name=function_name, - arguments=json.dumps(function_arguments), - ), - type="function", - ) - # TODO: Use ToolCallContent instead of tool_calls - # TODO: This helps preserve ordering - assistant_message = Message( - role=MessageRole.assistant, - content=reasoning_content if reasoning_content else [], - agent_id=agent_id, - model=model, - tool_calls=[tool_call], - tool_call_id=tool_call_id, - created_at=get_utc_time(), - batch_item_id=llm_batch_item_id, - ) - if pre_computed_assistant_message_id: - assistant_message.id = pre_computed_assistant_message_id - messages.append(assistant_message) + if not is_approval_response: # Skip approval responses (omit them) + if function_name is not None: + # Construct the tool call with the assistant's message + # Force set request_heartbeat in tool_args to calculated continue_stepping + if force_set_request_heartbeat: + function_arguments[REQUEST_HEARTBEAT_PARAM] = continue_stepping + tool_call = OpenAIToolCall( + id=tool_call_id, + function=OpenAIFunction( + name=function_name, + arguments=json.dumps(function_arguments), + ), + type="function", + ) + # TODO: Use ToolCallContent instead of tool_calls + # TODO: This helps preserve ordering + + # Safeguard against empty text messages + content = [] + if reasoning_content: + for content_part in reasoning_content: + if isinstance(content_part, TextContent) and content_part.text == "": + continue + content.append(content_part) + + assistant_message = Message( + role=MessageRole.assistant, + content=content, + agent_id=agent_id, + model=model, + tool_calls=[tool_call], + tool_call_id=tool_call_id, + created_at=get_utc_time(), + batch_item_id=llm_batch_item_id, + ) + else: + # Safeguard against empty text messages + content = [] + if reasoning_content: + for content_part in reasoning_content: + if isinstance(content_part, TextContent) and content_part.text == "": + continue + content.append(content_part) + + # Should only hit this if using react agents + if 
content and len(content) > 0: + assistant_message = Message( + role=MessageRole.assistant, + # NOTE: weird that this is called "reasoning_content" here, since it's not + content=content, + agent_id=agent_id, + model=model, + tool_calls=None, + tool_call_id=None, + created_at=get_utc_time(), + batch_item_id=llm_batch_item_id, + ) + else: + assistant_message = None + + if assistant_message: + if pre_computed_assistant_message_id: + assistant_message.id = pre_computed_assistant_message_id + messages.append(assistant_message) # TODO: Use ToolReturnContent instead of TextContent # TODO: This helps preserve ordering - tool_message = Message( - role=MessageRole.tool, - content=[TextContent(text=package_function_response(function_call_success, function_response, timezone))], - agent_id=agent_id, - model=model, - tool_calls=[], - tool_call_id=tool_call_id, - created_at=get_utc_time(), - name=function_name, - batch_item_id=llm_batch_item_id, - tool_returns=[ - ToolReturn( - status=tool_execution_result.status, - stderr=tool_execution_result.stderr, - stdout=tool_execution_result.stdout, - # func_return=tool_execution_result.func_return, - ) - ], - ) - messages.append(tool_message) + if tool_execution_result is not None: + tool_message = Message( + role=MessageRole.tool, + content=[TextContent(text=package_function_response(function_call_success, function_response, timezone))], + agent_id=agent_id, + model=model, + tool_calls=[], + tool_call_id=tool_call_id, + created_at=get_utc_time(), + name=function_name, + batch_item_id=llm_batch_item_id, + tool_returns=[ + ToolReturn( + status=tool_execution_result.status, + stderr=tool_execution_result.stderr, + stdout=tool_execution_result.stdout, + # func_return=tool_execution_result.func_return, + ) + ], + ) + messages.append(tool_message) - if continue_stepping: + if continue_stepping and add_heartbeat_on_continue: + # TODO skip this for react agents, instead we just force looping heartbeat_system_message = 
create_heartbeat_system_message( agent_id=agent_id, model=model, diff --git a/letta/server/server.py b/letta/server/server.py index b82fa115..4f30704d 100644 --- a/letta/server/server.py +++ b/letta/server/server.py @@ -753,6 +753,10 @@ class SyncServer(Server): ) if not return_message_object: + # Get agent state to determine if it's a react agent + agent_state = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor) + text_is_assistant_message = agent_state.agent_type == AgentType.letta_v1_agent + records = Message.to_letta_messages_from_list( messages=records, use_assistant_message=use_assistant_message, @@ -760,6 +764,7 @@ class SyncServer(Server): assistant_message_tool_kwarg=assistant_message_tool_kwarg, reverse=reverse, include_err=include_err, + text_is_assistant_message=text_is_assistant_message, ) if reverse: diff --git a/letta/services/agent_manager.py b/letta/services/agent_manager.py index 4c40131e..ead5199e 100644 --- a/letta/services/agent_manager.py +++ b/letta/services/agent_manager.py @@ -352,6 +352,15 @@ class AgentManager: tool_names |= calculate_base_tools(is_v2=True) elif agent_create.agent_type == AgentType.react_agent: pass # no default tools + elif agent_create.agent_type == AgentType.letta_v1_agent: + tool_names |= calculate_base_tools(is_v2=True) + # Remove `send_message` if it exists + tool_names.discard("send_message") + # NOTE: also overwriting inner_thoughts_in_kwargs to force False + agent_create.llm_config.put_inner_thoughts_in_kwargs = False + # NOTE: also overwrite initial message sequence to empty by default + if agent_create.initial_message_sequence is None: + agent_create.initial_message_sequence = [] elif agent_create.agent_type == AgentType.workflow_agent: pass # no default tools else: diff --git a/letta/services/helpers/agent_manager_helper.py b/letta/services/helpers/agent_manager_helper.py index bd317697..98d8159f 100644 --- a/letta/services/helpers/agent_manager_helper.py +++ 
b/letta/services/helpers/agent_manager_helper.py @@ -207,6 +207,10 @@ def derive_system_message(agent_type: AgentType, enable_sleeptime: Optional[bool elif agent_type == AgentType.react_agent: system = gpt_system.get_system_text("react") + # Letta v1 + elif agent_type == AgentType.letta_v1_agent: + system = gpt_system.get_system_text("letta_v1") + # Workflow elif agent_type == AgentType.workflow_agent: system = gpt_system.get_system_text("workflow") diff --git a/letta/services/summarizer/summarizer.py b/letta/services/summarizer/summarizer.py index 58bea6b0..de2492f5 100644 --- a/letta/services/summarizer/summarizer.py +++ b/letta/services/summarizer/summarizer.py @@ -10,7 +10,7 @@ from letta.llm_api.llm_client import LLMClient from letta.log import get_logger from letta.otel.tracing import trace_method from letta.prompts import gpt_summarize -from letta.schemas.enums import MessageRole +from letta.schemas.enums import AgentType, MessageRole from letta.schemas.letta_message_content import TextContent from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message, MessageCreate @@ -383,7 +383,7 @@ async def simple_summary(messages: List[Message], llm_config: LLMConfig, actor: {"role": "user", "content": summary_transcript}, ] input_messages_obj = [simple_message_wrapper(msg) for msg in input_messages] - request_data = llm_client.build_request_data(input_messages_obj, llm_config, tools=[]) + request_data = llm_client.build_request_data(AgentType.letta_v1_agent, input_messages_obj, llm_config, tools=[]) # NOTE: we should disable the inner_thoughts_in_kwargs here, because we don't use it # I'm leaving it commented it out for now for safety but is fine assuming the var here is a copy not a reference