From 89321ff29a2b0a873f83d318558fde1c3e8d7324 Mon Sep 17 00:00:00 2001 From: cthomas Date: Fri, 3 Oct 2025 15:52:00 -0700 Subject: [PATCH] feat: handle flaky reasoning in v2 tests (#5133) --- letta/llm_api/google_vertex_client.py | 6 +- letta/schemas/message.py | 64 ++++++------ tests/integration_test_send_message_v2.py | 119 ++++++++++------------ 3 files changed, 89 insertions(+), 100 deletions(-) diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py index 687ac937..de7c4d81 100644 --- a/letta/llm_api/google_vertex_client.py +++ b/letta/llm_api/google_vertex_client.py @@ -563,9 +563,9 @@ class GoogleVertexClient(LLMClientBase): ) else: openai_response_message.content = inner_thoughts - if response_message.thought_signature: - thought_signature = base64.b64encode(response_message.thought_signature).decode("utf-8") - openai_response_message.reasoning_content_signature = thought_signature + if response_message.thought_signature: + thought_signature = base64.b64encode(response_message.thought_signature).decode("utf-8") + openai_response_message.reasoning_content_signature = thought_signature # Google AI API uses different finish reason strings than OpenAI # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null diff --git a/letta/schemas/message.py b/letta/schemas/message.py index 1d24e6de..7334f9d1 100644 --- a/letta/schemas/message.py +++ b/letta/schemas/message.py @@ -302,12 +302,6 @@ class Message(BaseMessage): if self.role == MessageRole.assistant: if self.content: messages.extend(self._convert_reasoning_messages(text_is_assistant_message=text_is_assistant_message)) - for i in range(len(messages) - 1, -1, -1): - if i > 0 and messages[i].message_type == messages[i - 1].message_type: - if messages[i].message_type == MessageType.reasoning_message: - messages[i - 1].reasoning = messages[i - 1].reasoning + messages.pop(i).reasoning - elif messages[i].message_type == MessageType.assistant_message: - messages[i - 1].content = messages[i - 1].content + messages.pop(i).content if self.tool_calls is not None: messages.extend( @@ -361,19 +355,22 @@ class Message(BaseMessage): if isinstance(content_part, TextContent): if text_is_assistant_message: # .content is assistant message - messages.append( - AssistantMessage( - id=self.id, - date=self.created_at, - content=content_part.text, - name=self.name, - otid=otid, - sender_id=self.sender_id, - step_id=self.step_id, - is_err=self.is_err, - run_id=self.run_id, + if messages and messages[-1].message_type == MessageType.assistant_message: + messages[-1].content += content_part.text + else: + messages.append( + AssistantMessage( + id=self.id, + date=self.created_at, + content=content_part.text, + name=self.name, + otid=otid, + sender_id=self.sender_id, + step_id=self.step_id, + is_err=self.is_err, + run_id=self.run_id, + ) ) - ) else: # .content is COT messages.append( @@ -392,20 +389,23 @@ class Message(BaseMessage): elif isinstance(content_part, ReasoningContent): # "native" COT - messages.append( - ReasoningMessage( - id=self.id, - date=self.created_at, - reasoning=content_part.reasoning, - source="reasoner_model", # TODO do we want to tag like this? - signature=content_part.signature, - name=self.name, - otid=otid, - step_id=self.step_id, - is_err=self.is_err, - run_id=self.run_id, + if messages and messages[-1].message_type == MessageType.reasoning_message: + messages[-1].reasoning += content_part.reasoning + else: + messages.append( + ReasoningMessage( + id=self.id, + date=self.created_at, + reasoning=content_part.reasoning, + source="reasoner_model", # TODO do we want to tag like this? + signature=content_part.signature, + name=self.name, + otid=otid, + step_id=self.step_id, + is_err=self.is_err, + run_id=self.run_id, + ) ) - ) elif isinstance(content_part, SummarizedReasoningContent): # TODO remove the cast and just return the native type @@ -1409,7 +1409,7 @@ class Message(BaseMessage): "name": content.name, "args": content.input, }, - "thought_signature": content.signature, + # "thought_signature": content.signature, } ) else: diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py index eca63086..1ea9d9ac 100644 --- a/tests/integration_test_send_message_v2.py +++ b/tests/integration_test_send_message_v2.py @@ -6,7 +6,7 @@ import time import uuid from contextlib import contextmanager from http.server import BaseHTTPRequestHandler, HTTPServer -from typing import Any, Dict, List +from typing import Any, Dict, List, Tuple from unittest.mock import patch import httpx @@ -115,19 +115,10 @@ def assert_greeting_response( msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping")) ] - expected_message_count = get_expected_message_count(llm_config, streaming=streaming, from_db=from_db) - try: - assert len(messages) == expected_message_count - except: - # Reasoning summary in responses API when effort is high is still flaky, so don't throw if missing - if ( - LLMConfig.is_openai_reasoning_model(llm_config) - or LLMConfig.is_google_vertex_reasoning_model(llm_config) - or LLMConfig.is_google_ai_reasoning_model(llm_config) - ): - assert len(messages) == expected_message_count - 1 - else: - raise + expected_message_count_min, expected_message_count_max = get_expected_message_count_range( + llm_config, streaming=streaming, from_db=from_db + ) + assert expected_message_count_min <= len(messages) <= expected_message_count_max # User message if loaded from db index = 0 @@ -139,26 +130,14 @@ def assert_greeting_response( # Reasoning message if reasoning enabled otid_suffix = 0 try: - if ( - (LLMConfig.is_openai_reasoning_model(llm_config) and llm_config.reasoning_effort == "high") - or LLMConfig.is_anthropic_reasoning_model(llm_config) - or LLMConfig.is_google_vertex_reasoning_model(llm_config) - or LLMConfig.is_google_ai_reasoning_model(llm_config) - ): + if is_reasoner_model(llm_config): assert isinstance(messages[index], ReasoningMessage) assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 otid_suffix += 1 except: - # Reasoning summary in responses API when effort is high is still flaky, so don't throw if missing - if ( - LLMConfig.is_openai_reasoning_model(llm_config) - or LLMConfig.is_google_vertex_reasoning_model(llm_config) - or LLMConfig.is_google_ai_reasoning_model(llm_config) - ): - pass - else: - raise + # Reasoning is non-deterministic, so don't throw if missing + pass # Assistant message assert isinstance(messages[index], AssistantMessage) @@ -196,15 +175,10 @@ def assert_tool_call_response( msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping")) ] - expected_message_count = get_expected_message_count(llm_config, tool_call=True, streaming=streaming, from_db=from_db) - try: - assert len(messages) == expected_message_count - except: - # Reasoning summary in responses API when effort is high is still flaky, so don't throw if missing - if LLMConfig.is_openai_reasoning_model(llm_config): - assert len(messages) == expected_message_count - 1 - else: - raise + expected_message_count_min, expected_message_count_max = get_expected_message_count_range( + llm_config, tool_call=True, streaming=streaming, from_db=from_db + ) + assert expected_message_count_min <= len(messages) <= expected_message_count_max # User message if loaded from db index = 0 @@ -216,19 +190,14 @@ def assert_tool_call_response( # Reasoning message if reasoning enabled otid_suffix = 0 try: - if ( - LLMConfig.is_openai_reasoning_model(llm_config) and llm_config.reasoning_effort == "high" - ) or LLMConfig.is_anthropic_reasoning_model(llm_config): + if is_reasoner_model(llm_config): assert isinstance(messages[index], ReasoningMessage) assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 otid_suffix += 1 except: - # Reasoning summary in responses API when effort is high is still flaky, so don't throw if missing - if LLMConfig.is_openai_reasoning_model(llm_config): - pass - else: - raise + # Reasoning is non-deterministic, so don't throw if missing + pass # Assistant message if llm_config.model_endpoint_type == "anthropic": @@ -248,6 +217,18 @@ def assert_tool_call_response( assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 + # Reasoning message if reasoning enabled + otid_suffix = 0 + try: + if is_reasoner_model(llm_config): + assert isinstance(messages[index], ReasoningMessage) + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) + index += 1 + otid_suffix += 1 + except: + # Reasoning is non-deterministic, so don't throw if missing + pass + # Assistant message assert isinstance(messages[index], AssistantMessage) assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) @@ -312,42 +293,41 @@ async def wait_for_run_completion(client: AsyncLetta, run_id: str, timeout: floa time.sleep(interval) -def get_expected_message_count(llm_config: LLMConfig, tool_call: bool = False, streaming: bool = False, from_db: bool = False) -> int: +def get_expected_message_count_range( + llm_config: LLMConfig, tool_call: bool = False, streaming: bool = False, from_db: bool = False +) -> Tuple[int, int]: """ - Returns the expected number of messages for a given LLM configuration. + Returns the expected range of number of messages for a given LLM configuration. Uses range to account for possible variations in the number of reasoning messages. Greeting: ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | gpt-4o | gpt-o3 (med effort) | gpt-5 (high effort) | sonnet-3-5 | sonnet-3.7-thinking | flash-2.5-thinking | | ------------------------ | ------------------------ | ------------------------ | ------------------------ | ------------------------ | ------------------------ | - | AssistantMessage | AssistantMessage | ReasoningMessage | AssistantMessage | ReasoningMessage | AssistantMessage | - | | | AssistantMessage | | AssistantMessage | | + | AssistantMessage | AssistantMessage | ReasoningMessage | AssistantMessage | ReasoningMessage | ReasoningMessage | + | | | AssistantMessage | | AssistantMessage | AssistantMessage | Tool Call: ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | gpt-4o | gpt-o3 (med effort) | gpt-5 (high effort) | sonnet-3-5 | sonnet-3.7-thinking | flash-2.5-thinking | | ------------------------ | ------------------------ | ------------------------ | ------------------------ | ------------------------ | ------------------------ | - | ToolCallMessage | ToolCallMessage | ReasoningMessage | AssistantMessage | ReasoningMessage | ToolCallMessage | - | ToolReturnMessage | ToolReturnMessage | ToolCallMessage | ToolCallMessage | AssistantMessage | ToolReturnMessage | - | AssistantMessage | AssistantMessage | ToolReturnMessage | ToolReturnMessage | ToolCallMessage | AssistantMessage | - | | | AssistantMessage | AssistantMessage | ToolReturnMessage | | - | | | | | AssistantMessage | | + | ToolCallMessage | ToolCallMessage | ReasoningMessage | AssistantMessage | ReasoningMessage | ReasoningMessage | + | ToolReturnMessage | ToolReturnMessage | ToolCallMessage | ToolCallMessage | AssistantMessage | ToolCallMessage | + | AssistantMessage | AssistantMessage | ToolReturnMessage | ToolReturnMessage | ToolCallMessage | ToolReturnMessage | + | | | ReasoningMessage | AssistantMessage | ToolReturnMessage | ReasoningMessage | + | | | AssistantMessage | | AssistantMessage | AssistantMessage | """ - is_reasoner_model = ( - (LLMConfig.is_openai_reasoning_model(llm_config) and llm_config.reasoning_effort == "high") - or LLMConfig.is_anthropic_reasoning_model(llm_config) - or LLMConfig.is_google_vertex_reasoning_model(llm_config) - or LLMConfig.is_google_ai_reasoning_model(llm_config) - ) - # assistant message expected_message_count = 1 + expected_range = 0 - if is_reasoner_model: + if is_reasoner_model(llm_config): # reasoning message - expected_message_count += 1 + expected_range += 1 + if tool_call and not LLMConfig.is_anthropic_reasoning_model(llm_config): + # reasoning message for additional turn, only for openai and google models + expected_range += 1 if tool_call: # tool call and tool return messages @@ -364,7 +344,16 @@ def get_expected_message_count(llm_config: LLMConfig, tool_call: bool = False, s # stop reason and usage statistics expected_message_count += 2 - return expected_message_count + return expected_message_count, expected_message_count + expected_range + + +def is_reasoner_model(llm_config: LLMConfig) -> bool: + return ( + (LLMConfig.is_openai_reasoning_model(llm_config) and llm_config.reasoning_effort == "high") + or LLMConfig.is_anthropic_reasoning_model(llm_config) + or LLMConfig.is_google_vertex_reasoning_model(llm_config) + or LLMConfig.is_google_ai_reasoning_model(llm_config) + ) # ------------------------------