From 264171f32798082c094f8c8f3ed26a3f43b489fb Mon Sep 17 00:00:00 2001
From: Charles Packer
Date: Tue, 2 Sep 2025 16:21:18 -0700
Subject: [PATCH] fix: patch streaming hidden reasoning event [LET-4167]
 (#4367)

* fix: patch streaming hidden reasoning event

* fix: patch reasoning_effort not getting passed to openai
---
 .../interfaces/openai_streaming_interface.py  | 27 ++++++++++++++++++-
 letta/llm_api/openai_client.py                |  4 +++
 .../schemas/openai/chat_completion_request.py |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/letta/interfaces/openai_streaming_interface.py b/letta/interfaces/openai_streaming_interface.py
index 8a4d968f..10c6ed78 100644
--- a/letta/interfaces/openai_streaming_interface.py
+++ b/letta/interfaces/openai_streaming_interface.py
@@ -10,7 +10,14 @@ from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
 from letta.llm_api.openai_client import is_openai_reasoning_model
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.log import get_logger
-from letta.schemas.letta_message import AssistantMessage, LettaMessage, ReasoningMessage, ToolCallDelta, ToolCallMessage
+from letta.schemas.letta_message import (
+    AssistantMessage,
+    HiddenReasoningMessage,
+    LettaMessage,
+    ReasoningMessage,
+    ToolCallDelta,
+    ToolCallMessage,
+)
 from letta.schemas.letta_message_content import OmittedReasoningContent, TextContent
 from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
 from letta.schemas.message import Message
@@ -40,6 +47,7 @@ class OpenAIStreamingInterface:
         self.use_assistant_message = use_assistant_message
         self.assistant_message_tool_name = DEFAULT_MESSAGE_TOOL
         self.assistant_message_tool_kwarg = DEFAULT_MESSAGE_TOOL_KWARG
+        self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
 
         self.optimistic_json_parser: OptimisticJSONParser = OptimisticJSONParser()
         self.function_args_reader = JSONInnerThoughtsExtractor(wait_for_first_key=put_inner_thoughts_in_kwarg)
@@ -76,6 +84,7 @@ class OpenAIStreamingInterface:
         self.tool_call_name: str | None = None
         self.tool_call_id: str | None = None
         self.reasoning_messages = []
+        self.emitted_hidden_reasoning = False  # Track if we've emitted hidden reasoning message
 
     def get_reasoning_content(self) -> list[TextContent | OmittedReasoningContent]:
         content = "".join(self.reasoning_messages).strip()
@@ -186,6 +195,22 @@
             if message_delta.tool_calls is not None and len(message_delta.tool_calls) > 0:
                 tool_call = message_delta.tool_calls[0]
 
+                # For OpenAI reasoning models, emit a hidden reasoning message before the first tool call
+                if not self.emitted_hidden_reasoning and is_openai_reasoning_model(self.model) and not self.put_inner_thoughts_in_kwarg:
+                    self.emitted_hidden_reasoning = True
+                    if prev_message_type and prev_message_type != "hidden_reasoning_message":
+                        message_index += 1
+                    hidden_message = HiddenReasoningMessage(
+                        id=self.letta_message_id,
+                        date=datetime.now(timezone.utc),
+                        state="omitted",
+                        hidden_reasoning=None,
+                        otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
+                    )
+                    yield hidden_message
+                    prev_message_type = hidden_message.message_type
+                    message_index += 1  # Increment for the next message
+
                 if tool_call.function.name:
                     # If we're waiting for the first key, then we should hold back the name
                     # ie add it to a buffer instead of returning it as a chunk
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 15fc1d71..a7e4e100 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -221,6 +221,10 @@
         if supports_verbosity_control(model) and llm_config.verbosity:
             data.verbosity = llm_config.verbosity
 
+        # Add reasoning effort control for reasoning models
+        if is_openai_reasoning_model(model) and llm_config.reasoning_effort:
+            data.reasoning_effort = llm_config.reasoning_effort
+
         if llm_config.frequency_penalty is not None:
             data.frequency_penalty = llm_config.frequency_penalty
 
diff --git a/letta/schemas/openai/chat_completion_request.py b/letta/schemas/openai/chat_completion_request.py
index 26d3a4ca..35ddf702 100644
--- a/letta/schemas/openai/chat_completion_request.py
+++ b/letta/schemas/openai/chat_completion_request.py
@@ -136,6 +136,7 @@ class ChatCompletionRequest(BaseModel):
     parallel_tool_calls: Optional[bool] = None
     instructions: Optional[str] = None
     verbosity: Optional[Literal["low", "medium", "high"]] = None  # For verbosity control in GPT-5 models
+    reasoning_effort: Optional[Literal["minimal", "low", "medium", "high"]] = None  # For reasoning effort control in reasoning models
 
     # function-calling related
     tools: Optional[List[Tool]] = None