From 264171f32798082c094f8c8f3ed26a3f43b489fb Mon Sep 17 00:00:00 2001
From: Charles Packer
Date: Tue, 2 Sep 2025 16:21:18 -0700
Subject: [PATCH] fix: patch streaming hidden reasoning event [LET-4167]
 (#4367)

* fix: patch streaming hidden reasoning event

* fix: patch reasoning_effort not getting passed to openai
---
 .../interfaces/openai_streaming_interface.py  | 27 ++++++++++++++++++-
 letta/llm_api/openai_client.py                |  4 +++
 .../schemas/openai/chat_completion_request.py |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/letta/interfaces/openai_streaming_interface.py b/letta/interfaces/openai_streaming_interface.py
index 8a4d968f..10c6ed78 100644
--- a/letta/interfaces/openai_streaming_interface.py
+++ b/letta/interfaces/openai_streaming_interface.py
@@ -10,7 +10,14 @@ from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
 from letta.llm_api.openai_client import is_openai_reasoning_model
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.log import get_logger
-from letta.schemas.letta_message import AssistantMessage, LettaMessage, ReasoningMessage, ToolCallDelta, ToolCallMessage
+from letta.schemas.letta_message import (
+    AssistantMessage,
+    HiddenReasoningMessage,
+    LettaMessage,
+    ReasoningMessage,
+    ToolCallDelta,
+    ToolCallMessage,
+)
 from letta.schemas.letta_message_content import OmittedReasoningContent, TextContent
 from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
 from letta.schemas.message import Message
@@ -40,6 +47,7 @@ class OpenAIStreamingInterface:
         self.use_assistant_message = use_assistant_message
         self.assistant_message_tool_name = DEFAULT_MESSAGE_TOOL
         self.assistant_message_tool_kwarg = DEFAULT_MESSAGE_TOOL_KWARG
+        self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
 
         self.optimistic_json_parser: OptimisticJSONParser = OptimisticJSONParser()
         self.function_args_reader = JSONInnerThoughtsExtractor(wait_for_first_key=put_inner_thoughts_in_kwarg)
@@ -76,6 +84,7 @@ class OpenAIStreamingInterface:
         self.tool_call_name: str | None = None
         self.tool_call_id: str | None = None
         self.reasoning_messages = []
+        self.emitted_hidden_reasoning = False  # Track if we've emitted hidden reasoning message
 
     def get_reasoning_content(self) -> list[TextContent | OmittedReasoningContent]:
         content = "".join(self.reasoning_messages).strip()
@@ -186,6 +195,22 @@
             if message_delta.tool_calls is not None and len(message_delta.tool_calls) > 0:
                 tool_call = message_delta.tool_calls[0]
 
+                # For OpenAI reasoning models, emit a hidden reasoning message before the first tool call
+                if not self.emitted_hidden_reasoning and is_openai_reasoning_model(self.model) and not self.put_inner_thoughts_in_kwarg:
+                    self.emitted_hidden_reasoning = True
+                    if prev_message_type and prev_message_type != "hidden_reasoning_message":
+                        message_index += 1
+                    hidden_message = HiddenReasoningMessage(
+                        id=self.letta_message_id,
+                        date=datetime.now(timezone.utc),
+                        state="omitted",
+                        hidden_reasoning=None,
+                        otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
+                    )
+                    yield hidden_message
+                    prev_message_type = hidden_message.message_type
+                    message_index += 1  # Increment for the next message
+
                 if tool_call.function.name:
                     # If we're waiting for the first key, then we should hold back the name
                     # ie add it to a buffer instead of returning it as a chunk
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 15fc1d71..a7e4e100 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -221,6 +221,10 @@
         if supports_verbosity_control(model) and llm_config.verbosity:
             data.verbosity = llm_config.verbosity
 
+        # Add reasoning effort control for reasoning models
+        if is_openai_reasoning_model(model) and llm_config.reasoning_effort:
+            data.reasoning_effort = llm_config.reasoning_effort
+
         if llm_config.frequency_penalty is not None:
             data.frequency_penalty = llm_config.frequency_penalty
 
diff --git a/letta/schemas/openai/chat_completion_request.py b/letta/schemas/openai/chat_completion_request.py
index 26d3a4ca..35ddf702 100644
--- a/letta/schemas/openai/chat_completion_request.py
+++ b/letta/schemas/openai/chat_completion_request.py
@@ -136,6 +136,7 @@ class ChatCompletionRequest(BaseModel):
     parallel_tool_calls: Optional[bool] = None
     instructions: Optional[str] = None
     verbosity: Optional[Literal["low", "medium", "high"]] = None  # For verbosity control in GPT-5 models
+    reasoning_effort: Optional[Literal["minimal", "low", "medium", "high"]] = None  # For reasoning effort control in reasoning models
 
     # function-calling related
     tools: Optional[List[Tool]] = None