From 2944e5206cbc1e1d4e3e4c522ada5252afcf8878 Mon Sep 17 00:00:00 2001 From: Matthew Zhou Date: Tue, 8 Apr 2025 19:06:39 -0700 Subject: [PATCH] fix: Fix streaming when there are child tool rules (#1638) --- letta/llm_api/anthropic.py | 11 ++++++++--- letta/llm_api/llm_api_tools.py | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/letta/llm_api/anthropic.py b/letta/llm_api/anthropic.py index ca6a9156..e3b9b34b 100644 --- a/letta/llm_api/anthropic.py +++ b/letta/llm_api/anthropic.py @@ -25,6 +25,7 @@ from letta.llm_api.aws_bedrock import get_bedrock_client from letta.llm_api.helpers import add_inner_thoughts_to_functions from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages +from letta.log import get_logger from letta.schemas.message import Message as _Message from letta.schemas.message import MessageRole as _MessageRole from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, Tool @@ -44,6 +45,8 @@ from letta.settings import model_settings from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface from letta.tracing import log_event +logger = get_logger(__name__) + BASE_URL = "https://api.anthropic.com/v1" @@ -620,9 +623,9 @@ def _prepare_anthropic_request( data: ChatCompletionRequest, inner_thoughts_xml_tag: Optional[str] = "thinking", # if true, prefix fill the generation with the thinking tag - prefix_fill: bool = True, + prefix_fill: bool = False, # if true, put COT inside the tool calls instead of inside the content - put_inner_thoughts_in_kwargs: bool = False, + put_inner_thoughts_in_kwargs: bool = True, bedrock: bool = False, # extended thinking related fields # https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking @@ -634,7 +637,9 @@ def _prepare_anthropic_request( assert ( max_reasoning_tokens is not None and max_reasoning_tokens < data.max_tokens ), "max tokens must be greater than thinking budget" - assert not put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs" + if put_inner_thoughts_in_kwargs: + logger.warning("Extended thinking not compatible with put_inner_thoughts_in_kwargs") + put_inner_thoughts_in_kwargs = False # assert not prefix_fill, "extended thinking not compatible with prefix_fill" # Silently disable prefix_fill for now prefix_fill = False diff --git a/letta/llm_api/llm_api_tools.py b/letta/llm_api/llm_api_tools.py index fe941e03..3d069f1b 100644 --- a/letta/llm_api/llm_api_tools.py +++ b/letta/llm_api/llm_api_tools.py @@ -322,6 +322,7 @@ def create( # Force tool calling tool_call = None + llm_config.put_inner_thoughts_in_kwargs = True if functions is None: # Special case for summarization path tools = None @@ -356,6 +357,7 @@ def create( if stream: # Client requested token streaming assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface) + stream_interface.inner_thoughts_in_kwargs = True response = anthropic_chat_completions_process_stream( chat_completion_request=chat_completion_request, put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,