From ad42c886b70970089687dcda5f025b777f933328 Mon Sep 17 00:00:00 2001
From: cthomas <caren@letta.com>
Date: Tue, 30 Sep 2025 20:18:07 -0700
Subject: [PATCH] feat: add new agent loop tests to ci (#5049)

---
 tests/integration_test_send_message_v2.py | 100 ++++++++++++++++------
 1 file changed, 76 insertions(+), 24 deletions(-)

diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py
index dbabbd0f..129df1fa 100644
--- a/tests/integration_test_send_message_v2.py
+++ b/tests/integration_test_send_message_v2.py
@@ -46,8 +46,11 @@ logger = get_logger(__name__)
 
 
 all_configs = [
+    "openai-gpt-4o-mini.json",
+    "openai-o3.json",
     "claude-3-5-sonnet.json",
     "claude-3-7-sonnet-extended.json",
+    "gemini-2.5-flash.json",
 ]
 
 
@@ -109,9 +112,8 @@ def assert_greeting_response(
         msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping"))
     ]
 
-    is_reasoner_model = LLMConfig.is_openai_reasoning_model(llm_config) or LLMConfig.is_anthropic_reasoning_model(llm_config)
-    expected_message_count = 3 if streaming else 2 if from_db else 1
-    assert len(messages) == expected_message_count + (1 if is_reasoner_model else 0)
+    expected_message_count = get_expected_message_count(llm_config, streaming=streaming, from_db=from_db)
+    assert len(messages) == expected_message_count
 
     # User message if loaded from db
     index = 0
@@ -122,7 +124,7 @@ def assert_greeting_response(
 
     # Reasoning message if reasoning enabled
     otid_suffix = 0
-    if is_reasoner_model:
+    if LLMConfig.is_openai_reasoning_model(llm_config) or LLMConfig.is_anthropic_reasoning_model(llm_config):
         if LLMConfig.is_openai_reasoning_model(llm_config):
             assert isinstance(messages[index], HiddenReasoningMessage)
         else:
@@ -168,9 +170,8 @@ def assert_tool_call_response(
         msg for msg in messages if not (isinstance(msg, LettaPing) or (hasattr(msg, "message_type") and msg.message_type == "ping"))
     ]
 
-    is_reasoner_model = LLMConfig.is_openai_reasoning_model(llm_config) or LLMConfig.is_anthropic_reasoning_model(llm_config)
-    expected_message_count = 6 if streaming else 5 if from_db else 4
-    assert len(messages) == expected_message_count + (1 if is_reasoner_model else 0)
+    expected_message_count = get_expected_message_count(llm_config, tool_call=True, streaming=streaming, from_db=from_db)
+    assert len(messages) == expected_message_count
 
     # User message if loaded from db
     index = 0
@@ -181,7 +182,7 @@ def assert_tool_call_response(
 
     # Reasoning message if reasoning enabled
     otid_suffix = 0
-    if is_reasoner_model:
+    if LLMConfig.is_openai_reasoning_model(llm_config) or LLMConfig.is_anthropic_reasoning_model(llm_config):
         if LLMConfig.is_openai_reasoning_model(llm_config):
             assert isinstance(messages[index], HiddenReasoningMessage)
         else:
@@ -191,10 +192,11 @@ def assert_tool_call_response(
         otid_suffix += 1
 
     # Assistant message
-    assert isinstance(messages[index], AssistantMessage)
-    assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
-    index += 1
-    otid_suffix += 1
+    if llm_config.model_endpoint_type == "anthropic":
+        assert isinstance(messages[index], AssistantMessage)
+        assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
+        index += 1
+        otid_suffix += 1
 
     # Tool call message
     assert isinstance(messages[index], ToolCallMessage)
@@ -207,16 +209,13 @@ def assert_tool_call_response(
     assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
     index += 1
 
-    # Reasoning message if reasoning enabled
+    # Reasoning message if reasoning enabled for openai models
     otid_suffix = 0
-    # if is_reasoner_model:
-    #     if LLMConfig.is_openai_reasoning_model(llm_config):
-    #         assert isinstance(messages[index], HiddenReasoningMessage)
-    #     else:
-    #         assert isinstance(messages[index], ReasoningMessage)
-    #     assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
-    #     index += 1
-    #     otid_suffix += 1
+    if LLMConfig.is_openai_reasoning_model(llm_config):
+        assert isinstance(messages[index], HiddenReasoningMessage)
+        assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
+        index += 1
+        otid_suffix += 1
 
     # Assistant message
     assert isinstance(messages[index], AssistantMessage)
@@ -283,6 +282,59 @@ async def wait_for_run_completion(client: AsyncLetta, run_id: str, timeout: floa
         time.sleep(interval)
 
 
+def get_expected_message_count(llm_config: LLMConfig, tool_call: bool = False, streaming: bool = False, from_db: bool = False) -> int:
+    """
+    Returns the expected number of messages for a given LLM configuration.
+
+    Greeting:
+    ---------------------------------------------------------------------------------------------------------------------------------------
+    | gpt-4o                   |  gpt-o3                  |  sonnet-3-5              |  sonnet-3.7-thinking     |  flash-2.5-thinking      |
+    | ------------------------ | ------------------------ | ------------------------ | ------------------------ | ------------------------ |
+    | AssistantMessage         |  HiddenReasoningMessage  |  AssistantMessage        |  ReasoningMessage        |  AssistantMessage        |
+    |                          |  AssistantMessage        |                          |  AssistantMessage        |                          |
+
+
+    Tool Call:
+    ---------------------------------------------------------------------------------------------------------------------------------------
+    | gpt-4o                   |  gpt-o3                  |  sonnet-3-5              |  sonnet-3.7-thinking     |  flash-2.5-thinking      |
+    | ------------------------ | ------------------------ | ------------------------ | ------------------------ | ------------------------ |
+    | ToolCallMessage          |  HiddenReasoningMessage  |  AssistantMessage        |  ReasoningMessage        |  ToolCallMessage         |
+    | ToolReturnMessage        |  ToolCallMessage         |  ToolCallMessage         |  AssistantMessage        |  ToolReturnMessage       |
+    | AssistantMessage         |  ToolReturnMessage       |  ToolReturnMessage       |  ToolCallMessage         |  AssistantMessage        |
+    |                          |  HiddenReasoningMessage  |  AssistantMessage        |  ToolReturnMessage       |                          |
+    |                          |  AssistantMessage        |                          |  AssistantMessage        |                          |
+
+    """
+    is_reasoner_model = LLMConfig.is_openai_reasoning_model(llm_config) or LLMConfig.is_anthropic_reasoning_model(llm_config)
+
+    # assistant message
+    expected_message_count = 1
+
+    if is_reasoner_model:
+        # reasoning message
+        expected_message_count += 1
+
+    if tool_call:
+        # tool call and tool return messages
+        expected_message_count += 2
+        if llm_config.model_endpoint_type == "anthropic":
+            # anthropic models return an assistant message first before the tool call message
+            expected_message_count += 1
+        if LLMConfig.is_openai_reasoning_model(llm_config):
+            # openai reasoning models return an additional reasoning message before final assistant message
+            expected_message_count += 1
+
+    if from_db:
+        # user message
+        expected_message_count += 1
+
+    if streaming:
+        # stop reason and usage statistics
+        expected_message_count += 2
+
+    return expected_message_count
+
+
 # ------------------------------
 # Fixtures
 # ------------------------------
@@ -476,6 +528,6 @@ async def test_tool_call(
     messages_from_db = await client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id)
     assert_tool_call_response(messages_from_db, from_db=True, llm_config=llm_config)
 
-    # assert run_id is not None
-    # run = await client.runs.retrieve(run_id=run_id)
-    # assert run.status == JobStatus.completed
+    assert run_id is not None
+    run = await client.runs.retrieve(run_id=run_id)
+    assert run.status == JobStatus.completed