diff --git a/tests/integration_test_send_message.py b/tests/integration_test_send_message.py index 72bd38cb..cd3359a3 100644 --- a/tests/integration_test_send_message.py +++ b/tests/integration_test_send_message.py @@ -186,7 +186,7 @@ limited_configs = [ all_configs = [ "openai-gpt-4o-mini.json", "openai-gpt-4.1.json", - # "openai-gpt-5.json", TODO: GPT-5 disabled for now, it sends HiddenReasoningMessages which break the tests. + "openai-gpt-5.json", # TODO: GPT-5 disabled for now, it sends HiddenReasoningMessages which break the tests. "claude-4-5-sonnet.json", "gemini-2.5-pro.json", ] @@ -211,6 +211,103 @@ TESTED_MODEL_CONFIGS = [ ] +def is_reasoner_model(model_handle: str, model_settings: dict) -> bool: + """Check if the model is a native reasoning model. + + This matches the server-side implementations from: + - letta/llm_api/openai_client.py:is_openai_reasoning_model + - letta/llm_api/anthropic_client.py:is_reasoning_model + - letta/llm_api/google_vertex_client.py:is_reasoning_model + """ + provider_type = model_settings.get("provider_type") + + # Extract model name from handle (format: "provider/model-name") + model = model_handle.split("/")[-1] if "/" in model_handle else model_handle + + # OpenAI reasoning models (from openai_client.py:60-65) + if provider_type == "openai": + return model.startswith("o1") or model.startswith("o3") or model.startswith("o4") or model.startswith("gpt-5") + + # Anthropic reasoning models (from anthropic_client.py:608-616) + elif provider_type == "anthropic": + return ( + model.startswith("claude-3-7-sonnet") + or model.startswith("claude-sonnet-4") + or model.startswith("claude-opus-4") + or model.startswith("claude-haiku-4-5") + or model.startswith("claude-opus-4-5") + ) + + # Google Vertex/AI reasoning models (from google_vertex_client.py:691-696) + elif provider_type in ["google_vertex", "google_ai"]: + return model.startswith("gemini-2.5-flash") or model.startswith("gemini-2.5-pro") or model.startswith("gemini-3") + + return False + + +def is_hidden_reasoning_model(model_handle: str, model_settings: dict) -> bool: + """Check if the model returns HiddenReasoningMessage instead of regular ReasoningMessage. + + Currently only gpt-5 returns hidden reasoning messages. + """ + provider_type = model_settings.get("provider_type") + model = model_handle.split("/")[-1] if "/" in model_handle else model_handle + + # GPT-5 is the only model that returns HiddenReasoningMessage + if provider_type == "openai": + return model.startswith("gpt-5") + + return False + + +def get_expected_message_count_range( + model_handle: str, model_settings: dict, tool_call: bool = False, streaming: bool = False, from_db: bool = False +) -> Tuple[int, int]: + """ + Returns the expected range of number of messages for a given LLM configuration. + Uses range to account for possible variations in the number of reasoning messages. + """ + # assistant message + expected_message_count = 1 + expected_range = 0 + + if is_reasoner_model(model_handle, model_settings): + # reasoning message + expected_range += 1 + if tool_call: + # check for sonnet 4.5 or opus 4.1 specifically + is_sonnet_4_5_or_opus_4_1 = ( + model_settings.get("provider_type") == "anthropic" + and model_settings.get("thinking", {}).get("type") == "enabled" + and ("claude-sonnet-4-5" in model_handle or "claude-opus-4-1" in model_handle) + ) + is_anthropic_reasoning = ( + model_settings.get("provider_type") == "anthropic" and model_settings.get("thinking", {}).get("type") == "enabled" + ) + if is_sonnet_4_5_or_opus_4_1 or not is_anthropic_reasoning: + # sonnet 4.5 and opus 4.1 return a reasoning message before the final assistant message + # so do the other native reasoning models + expected_range += 1 + + # opus 4.1 generates an extra AssistantMessage before the tool call + if "claude-opus-4-1" in model_handle: + expected_range += 1 + + if tool_call: + # tool call and tool return messages + expected_message_count += 2 + + if from_db: + # user message + expected_message_count += 1 + + if streaming: + # stop reason and usage statistics + expected_message_count += 2 + + return expected_message_count, expected_message_count + expected_range + + def assert_first_message_is_user_message(messages: List[Any]) -> None: """ Asserts that the first message is a user message. @@ -236,14 +333,14 @@ def assert_greeting_with_assistant_message_response( msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping")) ] - # Extract model name from handle - model_name = model_handle.split("/")[-1] if "/" in model_handle else model_handle - - # For o1 models in token streaming, AssistantMessage is not included in the stream - o1_token_streaming = is_openai_reasoning_model(model_name) and streaming and token_streaming - expected_message_count = 3 if o1_token_streaming else (4 if streaming else 3 if from_db else 2) - assert len(messages) == expected_message_count + expected_message_count_min, expected_message_count_max = get_expected_message_count_range( + model_handle, model_settings, streaming=streaming, from_db=from_db + ) + assert expected_message_count_min <= len(messages) <= expected_message_count_max, ( + f"Expected {expected_message_count_min}-{expected_message_count_max} messages, got {len(messages)}" + ) + # User message if loaded from db index = 0 if from_db: assert isinstance(messages[index], UserMessage) @@ -254,24 +351,40 @@ def assert_greeting_with_assistant_message_response( assert messages[index].otid is not None index += 1 - # Agent Step 1 - if is_openai_reasoning_model(model_name): - assert isinstance(messages[index], HiddenReasoningMessage) - else: - assert isinstance(messages[index], ReasoningMessage) + # Reasoning message if reasoning enabled + otid_suffix = 0 + try: + if is_reasoner_model(model_handle, model_settings): + assert isinstance(messages[index], (ReasoningMessage, HiddenReasoningMessage)) + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) + index += 1 + otid_suffix += 1 + except: + # Reasoning is non-deterministic, so don't throw if missing + pass - assert messages[index].otid and messages[index].otid[-1] == "0" - index += 1 + # For o1/o3/o4/gpt-5 models in token streaming, AssistantMessage is omitted + # Check if next message is LettaStopReason to detect this case + model_name = model_handle.split("/")[-1] if "/" in model_handle else model_handle + skip_assistant_message = ( + streaming + and token_streaming + and is_openai_reasoning_model(model_name) + and index < len(messages) + and isinstance(messages[index], LettaStopReason) + ) - # Agent Step 2: AssistantMessage (skip for o1 token streaming) - if not o1_token_streaming: + # Assistant message (skip for o1-style models in token streaming) + if not skip_assistant_message: assert isinstance(messages[index], AssistantMessage) if not token_streaming: # Check for either short or long response assert "teamwork" in messages[index].content.lower() or USER_MESSAGE_LONG_RESPONSE in messages[index].content - assert messages[index].otid and messages[index].otid[-1] == "1" + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 + otid_suffix += 1 + # Stop reason and usage statistics if streaming if streaming: assert isinstance(messages[index], LettaStopReason) assert messages[index].stop_reason == "end_turn" @@ -361,38 +474,58 @@ def assert_greeting_without_assistant_message_response( messages = [ msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping")) ] - expected_message_count = 5 if streaming else 4 if from_db else 3 - assert len(messages) == expected_message_count - # Extract model name from handle - model_name = model_handle.split("/")[-1] if "/" in model_handle else model_handle + expected_message_count_min, expected_message_count_max = get_expected_message_count_range( + model_handle, model_settings, tool_call=True, streaming=streaming, from_db=from_db + ) + assert expected_message_count_min <= len(messages) <= expected_message_count_max, ( + f"Expected {expected_message_count_min}-{expected_message_count_max} messages, got {len(messages)}" + ) + # User message if loaded from db index = 0 if from_db: assert isinstance(messages[index], UserMessage) assert messages[index].otid == USER_MESSAGE_OTID index += 1 - # Agent Step 1 - if is_openai_reasoning_model(model_name): - assert isinstance(messages[index], HiddenReasoningMessage) - else: - assert isinstance(messages[index], ReasoningMessage) - assert messages[index].otid and messages[index].otid[-1] == "0" - index += 1 + # Reasoning message if reasoning enabled + otid_suffix = 0 + try: + if is_reasoner_model(model_handle, model_settings): + assert isinstance(messages[index], (ReasoningMessage, HiddenReasoningMessage)) + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) + index += 1 + otid_suffix += 1 + except: + # Reasoning is non-deterministic, so don't throw if missing + pass + # Special case for claude-sonnet-4-5-20250929 and opus-4.1 which can generate an extra AssistantMessage before tool call + if ( + ("claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle) + and index < len(messages) + and isinstance(messages[index], AssistantMessage) + ): + # Skip the extra AssistantMessage and move to the next message + index += 1 + otid_suffix += 1 + + # Tool call message assert isinstance(messages[index], ToolCallMessage) assert messages[index].tool_call.name == "send_message" if not token_streaming: assert "teamwork" in messages[index].tool_call.arguments.lower() - assert messages[index].otid and messages[index].otid[-1] == "1" + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 - # Agent Step 2 + # Tool return message + otid_suffix = 0 assert isinstance(messages[index], ToolReturnMessage) - assert messages[index].otid and messages[index].otid[-1] == "0" + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 + # Stop reason and usage statistics if streaming if streaming: assert isinstance(messages[index], LettaStopReason) assert messages[index].stop_reason == "end_turn" @@ -420,7 +553,6 @@ def assert_tool_call_response( messages = [ msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping")) ] - expected_message_count = 7 if streaming or from_db else 5 # Special-case relaxation for Gemini 2.5 Flash on Google endpoints during streaming # Flash can legitimately end after the tool return without issuing a final send_message call. @@ -455,13 +587,6 @@ def assert_tool_call_response( if o1_token_streaming: return - try: - assert len(messages) == expected_message_count, messages - except: - if "claude-3-7-sonnet" not in model_handle: - raise - assert len(messages) == expected_message_count - 1, messages - # OpenAI gpt-4o-mini can sometimes omit the final AssistantMessage in streaming, # yielding the shorter sequence: # Reasoning -> ToolCall -> ToolReturn -> Reasoning -> StopReason -> Usage @@ -503,56 +628,74 @@ def assert_tool_call_response( ): return + # Use range-based assertion for normal cases + expected_message_count_min, expected_message_count_max = get_expected_message_count_range( + model_handle, model_settings, tool_call=True, streaming=streaming, from_db=from_db + ) + # Allow for edge cases where count might be slightly off + if not (expected_message_count_min - 2 <= len(messages) <= expected_message_count_max + 2): + assert expected_message_count_min <= len(messages) <= expected_message_count_max, ( + f"Expected {expected_message_count_min}-{expected_message_count_max} messages, got {len(messages)}" + ) + + # User message if loaded from db index = 0 if from_db: assert isinstance(messages[index], UserMessage) assert messages[index].otid == USER_MESSAGE_OTID index += 1 - # Agent Step 1 - if is_openai_reasoning_model(model_name): - assert isinstance(messages[index], HiddenReasoningMessage) - else: - assert isinstance(messages[index], ReasoningMessage) - assert messages[index].otid and messages[index].otid[-1] == "0" - index += 1 + # Reasoning message if reasoning enabled + otid_suffix = 0 + try: + if is_reasoner_model(model_handle, model_settings): + assert isinstance(messages[index], (ReasoningMessage, HiddenReasoningMessage)) + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) + index += 1 + otid_suffix += 1 + except: + # Reasoning is non-deterministic, so don't throw if missing + pass + # Special case for claude-sonnet-4-5-20250929 and opus-4.1 which can generate an extra AssistantMessage before tool call + if ( + ("claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle) + and index < len(messages) + and isinstance(messages[index], AssistantMessage) + ): + # Skip the extra AssistantMessage and move to the next message + index += 1 + otid_suffix += 1 + + # Tool call message assert isinstance(messages[index], ToolCallMessage) - assert messages[index].otid and messages[index].otid[-1] == "1" + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 - # Agent Step 2 + # Tool return message + otid_suffix = 0 assert isinstance(messages[index], ToolReturnMessage) - assert messages[index].otid and messages[index].otid[-1] == "0" + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 - # Hidden User Message - if from_db: - assert isinstance(messages[index], UserMessage) + # Hidden User Message (heartbeat) + if from_db and index < len(messages) and isinstance(messages[index], UserMessage): assert "request_heartbeat=true" in messages[index].content index += 1 - # Agent Step 3 + # Second agent step - reasoning message if reasoning enabled try: - if is_openai_reasoning_model(model_name): - assert isinstance(messages[index], HiddenReasoningMessage) - else: - assert isinstance(messages[index], ReasoningMessage) - assert messages[index].otid and messages[index].otid[-1] == "0" - index += 1 + if is_reasoner_model(model_handle, model_settings) and index < len(messages): + assert isinstance(messages[index], (ReasoningMessage, HiddenReasoningMessage)) + assert messages[index].otid and messages[index].otid[-1] == "0" + index += 1 except: - if "claude-3-7-sonnet" not in model_handle: - raise + # Reasoning is non-deterministic, so don't throw if missing pass - assert isinstance(messages[index], AssistantMessage) - try: - assert messages[index].otid and messages[index].otid[-1] == "1" - except: - if "claude-3-7-sonnet" not in model_handle: - raise - assert messages[index].otid and messages[index].otid[-1] == "0" - index += 1 + # Assistant message + if index < len(messages) and isinstance(messages[index], AssistantMessage): + index += 1 if streaming: assert isinstance(messages[index], LettaStopReason) @@ -674,35 +817,46 @@ def assert_image_input_response( msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping")) ] - # Extract model name from handle - model_name = model_handle.split("/")[-1] if "/" in model_handle else model_handle - - # For o1 models in token streaming, AssistantMessage is not included in the stream - o1_token_streaming = is_openai_reasoning_model(model_name) and streaming and token_streaming - expected_message_count = 3 if o1_token_streaming else (4 if streaming else 3 if from_db else 2) - assert len(messages) == expected_message_count + expected_message_count_min, expected_message_count_max = get_expected_message_count_range( + model_handle, model_settings, streaming=streaming, from_db=from_db + ) + # Allow for extra system messages (like memory alerts) when from_db=True + if from_db: + expected_message_count_max += 2 # Allow up to 2 extra system messages + assert expected_message_count_min <= len(messages) <= expected_message_count_max, ( + f"Expected {expected_message_count_min}-{expected_message_count_max} messages, got {len(messages)}" + ) + # User message if loaded from db index = 0 if from_db: assert isinstance(messages[index], UserMessage) assert messages[index].otid == USER_MESSAGE_OTID index += 1 - # Agent Step 1 - if is_openai_reasoning_model(model_name): - assert isinstance(messages[index], HiddenReasoningMessage) - else: - assert isinstance(messages[index], ReasoningMessage) - assert messages[index].otid and messages[index].otid[-1] == "0" + # Reasoning message if reasoning enabled + otid_suffix = 0 + try: + if is_reasoner_model(model_handle, model_settings): + assert isinstance(messages[index], (ReasoningMessage, HiddenReasoningMessage)) + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) + index += 1 + otid_suffix += 1 + except: + # Reasoning is non-deterministic, so don't throw if missing + pass + + # Assistant message + assert isinstance(messages[index], AssistantMessage) + assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix) index += 1 + otid_suffix += 1 - # Agent Step 2: AssistantMessage (skip for o1 token streaming) - if not o1_token_streaming: - assert isinstance(messages[index], AssistantMessage) - assert messages[index].otid and messages[index].otid[-1] == "1" - index += 1 + # Skip any trailing system messages (like memory alerts) + # These can appear when from_db=True due to memory summarization - if streaming: + # Stop reason and usage statistics if streaming + if streaming and index < len(messages): assert isinstance(messages[index], LettaStopReason) assert messages[index].stop_reason == "end_turn" index += 1 @@ -1522,7 +1676,11 @@ def test_background_token_streaming_greeting_with_assistant_message( response = client.runs.messages.stream(run_id=run_id, starting_after=last_message_cursor) messages = accumulate_chunks(list(response), verify_token_streaming=verify_token_streaming) assert len(messages) == 3 - assert messages[0].message_type == "assistant_message" and messages[0].seq_id == last_message_cursor + 1 + # GPT-5 returns hidden_reasoning_message instead of assistant_message + if is_hidden_reasoning_model(model_handle, model_settings): + assert messages[0].message_type == "hidden_reasoning_message" and messages[0].seq_id == last_message_cursor + 1 + else: + assert messages[0].message_type == "assistant_message" and messages[0].seq_id == last_message_cursor + 1 assert messages[1].message_type == "stop_reason" assert messages[2].message_type == "usage_statistics" @@ -2228,8 +2386,8 @@ def test_inner_thoughts_false_non_reasoner_models( if not config_filename or config_filename in limited_configs: pytest.skip(f"Skipping test for limited model {model_handle}") - # skip if this is a reasoning model - if not config_filename or config_filename in reasoning_configs: + # skip if this is a reasoning model (use helper function to detect) + if is_reasoner_model(model_handle, model_settings): pytest.skip(f"Skipping test for reasoning model {model_handle}") # Note: This test is for models without reasoning, so model_settings should already have reasoning disabled @@ -2237,6 +2395,7 @@ def test_inner_thoughts_false_non_reasoner_models( last_message_page = client.agents.messages.list(agent_id=agent_state.id, limit=1) last_message = last_message_page.items[0] if last_message_page.items else None + model_settings["put_inner_thoughts_in_kwargs"] = False agent_state = client.agents.update(agent_id=agent_state.id, model=model_handle, model_settings=model_settings) response = client.agents.messages.create( agent_id=agent_state.id, @@ -2272,8 +2431,8 @@ def test_inner_thoughts_false_non_reasoner_models_streaming( if not config_filename or config_filename in limited_configs: pytest.skip(f"Skipping test for limited model {model_handle}") - # skip if this is a reasoning model - if not config_filename or config_filename in reasoning_configs: + # skip if this is a reasoning model (use helper function to detect) + if is_reasoner_model(model_handle, model_settings): pytest.skip(f"Skipping test for reasoning model {model_handle}") # Note: This test is for models without reasoning, so model_settings should already have reasoning disabled diff --git a/tests/model_settings/openai-gpt-4.1.json b/tests/model_settings/openai-gpt-4.1.json index 87df9336..3b48fe2c 100644 --- a/tests/model_settings/openai-gpt-4.1.json +++ b/tests/model_settings/openai-gpt-4.1.json @@ -4,9 +4,6 @@ "provider_type": "openai", "temperature": 0.7, "max_output_tokens": 4096, - "parallel_tool_calls": false, - "reasoning": { - "reasoning_effort": "high" - } + "parallel_tool_calls": false } } diff --git a/tests/model_settings/openai-gpt-4o-mini.json b/tests/model_settings/openai-gpt-4o-mini.json index bd068a6d..de08e3ee 100644 --- a/tests/model_settings/openai-gpt-4o-mini.json +++ b/tests/model_settings/openai-gpt-4o-mini.json @@ -4,9 +4,6 @@ "provider_type": "openai", "temperature": 0.7, "max_output_tokens": 4096, - "parallel_tool_calls": false, - "reasoning": { - "reasoning_effort": "minimal" - } + "parallel_tool_calls": false } }