From 2944e5206cbc1e1d4e3e4c522ada5252afcf8878 Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Tue, 8 Apr 2025 19:06:39 -0700
Subject: [PATCH] fix: Fix streaming when there are child tool rules (#1638)

---
 letta/llm_api/anthropic.py     | 11 ++++++++---
 letta/llm_api/llm_api_tools.py |  2 ++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/letta/llm_api/anthropic.py b/letta/llm_api/anthropic.py
index ca6a9156..e3b9b34b 100644
--- a/letta/llm_api/anthropic.py
+++ b/letta/llm_api/anthropic.py
@@ -25,6 +25,7 @@ from letta.llm_api.aws_bedrock import get_bedrock_client
 from letta.llm_api.helpers import add_inner_thoughts_to_functions
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
+from letta.log import get_logger
 from letta.schemas.message import Message as _Message
 from letta.schemas.message import MessageRole as _MessageRole
 from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, Tool
@@ -44,6 +45,8 @@ from letta.settings import model_settings
 from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
 from letta.tracing import log_event
 
+logger = get_logger(__name__)
+
 BASE_URL = "https://api.anthropic.com/v1"
 
 
@@ -620,9 +623,9 @@ def _prepare_anthropic_request(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
     # if true, prefix fill the generation with the thinking tag
-    prefix_fill: bool = True,
+    prefix_fill: bool = False,
     # if true, put COT inside the tool calls instead of inside the content
-    put_inner_thoughts_in_kwargs: bool = False,
+    put_inner_thoughts_in_kwargs: bool = True,
     bedrock: bool = False,
     # extended thinking related fields
     # https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
@@ -634,7 +637,9 @@ def _prepare_anthropic_request(
         assert (
             max_reasoning_tokens is not None and max_reasoning_tokens < data.max_tokens
         ), "max tokens must be greater than thinking budget"
-        assert not put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs"
+        if put_inner_thoughts_in_kwargs:
+            logger.warning("Extended thinking not compatible with put_inner_thoughts_in_kwargs")
+            put_inner_thoughts_in_kwargs = False
         # assert not prefix_fill, "extended thinking not compatible with prefix_fill"
         # Silently disable prefix_fill for now
         prefix_fill = False
diff --git a/letta/llm_api/llm_api_tools.py b/letta/llm_api/llm_api_tools.py
index fe941e03..3d069f1b 100644
--- a/letta/llm_api/llm_api_tools.py
+++ b/letta/llm_api/llm_api_tools.py
@@ -322,6 +322,7 @@ def create(
 
         # Force tool calling
         tool_call = None
+        llm_config.put_inner_thoughts_in_kwargs = True
         if functions is None:
             # Special case for summarization path
             tools = None
@@ -356,6 +357,7 @@ def create(
         if stream:  # Client requested token streaming
             assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)
 
+            stream_interface.inner_thoughts_in_kwargs = True
             response = anthropic_chat_completions_process_stream(
                 chat_completion_request=chat_completion_request,
                 put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,