fix: Fix streaming when there are child tool rules (#1638)

2025-04-08 19:06:39 -07:00
parent 4c2fea1f8d
commit 2944e5206c
2 changed files with 10 additions and 3 deletions
--- a/letta/llm_api/anthropic.py
+++ b/letta/llm_api/anthropic.py
@@ -25,6 +25,7 @@ from letta.llm_api.aws_bedrock import get_bedrock_client
 from letta.llm_api.helpers import add_inner_thoughts_to_functions
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
+from letta.log import get_logger
 from letta.schemas.message import Message as _Message
 from letta.schemas.message import MessageRole as _MessageRole
 from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, Tool
@@ -44,6 +45,8 @@ from letta.settings import model_settings
 from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
 from letta.tracing import log_event

+logger = get_logger(__name__)
+
 BASE_URL = "https://api.anthropic.com/v1"


@@ -620,9 +623,9 @@ def _prepare_anthropic_request(
    data: ChatCompletionRequest,
    inner_thoughts_xml_tag: Optional[str] = "thinking",
    # if true, prefix fill the generation with the thinking tag
-    prefix_fill: bool = True,
+    prefix_fill: bool = False,
    # if true, put COT inside the tool calls instead of inside the content
-    put_inner_thoughts_in_kwargs: bool = False,
+    put_inner_thoughts_in_kwargs: bool = True,
    bedrock: bool = False,
    # extended thinking related fields
    # https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
@@ -634,7 +637,9 @@ def _prepare_anthropic_request(
        assert (
            max_reasoning_tokens is not None and max_reasoning_tokens < data.max_tokens
        ), "max tokens must be greater than thinking budget"
-        assert not put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs"
+        if put_inner_thoughts_in_kwargs:
+            logger.warning("Extended thinking not compatible with put_inner_thoughts_in_kwargs")
+            put_inner_thoughts_in_kwargs = False
        # assert not prefix_fill, "extended thinking not compatible with prefix_fill"
        # Silently disable prefix_fill for now
        prefix_fill = False
--- a/letta/llm_api/llm_api_tools.py
+++ b/letta/llm_api/llm_api_tools.py
@@ -322,6 +322,7 @@ def create(

        # Force tool calling
        tool_call = None
+        llm_config.put_inner_thoughts_in_kwargs = True
        if functions is None:
            # Special case for summarization path
            tools = None
@@ -356,6 +357,7 @@ def create(
        if stream:  # Client requested token streaming
            assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)

+            stream_interface.inner_thoughts_in_kwargs = True
            response = anthropic_chat_completions_process_stream(
                chat_completion_request=chat_completion_request,
                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,