feat: Add model integration testing (#587)

This commit is contained in:
Matthew Zhou
2025-01-10 12:28:12 -10:00
committed by GitHub
parent e2b4c76df0
commit 5264349e43
4 changed files with 128 additions and 139 deletions

View File

@@ -264,6 +264,7 @@ def convert_google_ai_response_to_chatcompletion(
"""
try:
choices = []
index = 0
for candidate in response_json["candidates"]:
content = candidate["content"]
@@ -272,86 +273,87 @@ def convert_google_ai_response_to_chatcompletion(
parts = content["parts"]
# TODO support parts / multimodal
assert len(parts) == 1, f"Multi-part not yet supported:\n{parts}"
response_message = parts[0]
# TODO support parallel tool calling natively
# TODO Alternative here is to throw away everything else except for the first part
for response_message in parts:
# Convert the actual message style to OpenAI style
if "functionCall" in response_message and response_message["functionCall"] is not None:
function_call = response_message["functionCall"]
assert isinstance(function_call, dict), function_call
function_name = function_call["name"]
assert isinstance(function_name, str), function_name
function_args = function_call["args"]
assert isinstance(function_args, dict), function_args
# Convert the actual message style to OpenAI style
if "functionCall" in response_message and response_message["functionCall"] is not None:
function_call = response_message["functionCall"]
assert isinstance(function_call, dict), function_call
function_name = function_call["name"]
assert isinstance(function_name, str), function_name
function_args = function_call["args"]
assert isinstance(function_args, dict), function_args
# NOTE: this also involves stripping the inner monologue out of the function
if pull_inner_thoughts_from_args:
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
# NOTE: this also involves stripping the inner monologue out of the function
if pull_inner_thoughts_from_args:
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
else:
inner_thoughts = None
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
tool_calls=[
ToolCall(
id=get_tool_call_id(),
type="function",
function=FunctionCall(
name=function_name,
arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
),
)
],
)
assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
else:
inner_thoughts = None
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
tool_calls=[
ToolCall(
id=get_tool_call_id(),
type="function",
function=FunctionCall(
name=function_name,
arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
),
)
],
# Inner thoughts are the content by default
inner_thoughts = response_message["text"]
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
)
# Google AI API uses different finish reason strings than OpenAI
# OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
# see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
# Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
# see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
finish_reason = candidate["finishReason"]
if finish_reason == "STOP":
openai_finish_reason = (
"function_call"
if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
else "stop"
)
elif finish_reason == "MAX_TOKENS":
openai_finish_reason = "length"
elif finish_reason == "SAFETY":
openai_finish_reason = "content_filter"
elif finish_reason == "RECITATION":
openai_finish_reason = "content_filter"
else:
raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")
choices.append(
Choice(
finish_reason=openai_finish_reason,
index=index,
message=openai_response_message,
)
)
index += 1
else:
# Inner thoughts are the content by default
inner_thoughts = response_message["text"]
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
)
# Google AI API uses different finish reason strings than OpenAI
# OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
# see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
# Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
# see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
finish_reason = candidate["finishReason"]
if finish_reason == "STOP":
openai_finish_reason = (
"function_call"
if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
else "stop"
)
elif finish_reason == "MAX_TOKENS":
openai_finish_reason = "length"
elif finish_reason == "SAFETY":
openai_finish_reason = "content_filter"
elif finish_reason == "RECITATION":
openai_finish_reason = "content_filter"
else:
raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")
choices.append(
Choice(
finish_reason=openai_finish_reason,
index=candidate["index"],
message=openai_response_message,
)
)
if len(choices) > 1:
raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")
# if len(choices) > 1:
# raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")
# NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist?
# "usageMetadata": {

View File

@@ -173,14 +173,14 @@ def check_agent_uses_external_tool(filename: str) -> LettaResponse:
My name is Letta.
I am a personal assistant who answers a user's questions about a website `example.com`. When a user asks me a question about `example.com`, I will use a tool called {tool.name} which will search `example.com` and answer the relevant question.
I am a personal assistant who uses a tool called {tool.name} to star a desired github repo.
Dont forget - inner monologue / inner thoughts should always be different than the contents of send_message! send_message is how you communicate with the user, whereas inner thoughts are your own personal inner thoughts.
"""
agent_state = setup_agent(client, filename, memory_persona_str=persona, tool_ids=[tool.id])
response = client.user_message(agent_id=agent_state.id, message="What's on the example.com website?")
response = client.user_message(agent_id=agent_state.id, message="Please star the repo with owner=letta-ai and repo=letta")
# Basic checks
assert_sanity_checks(response)

View File

@@ -7,3 +7,7 @@ filterwarnings =
markers =
local_sandbox: mark test as part of local sandbox tests
e2b_sandbox: mark test as part of E2B sandbox tests
openai_basic: Tests for OpenAI endpoints
anthropic_basic: Tests for Anthropic endpoints
azure_basic: Tests for Azure endpoints
gemini_basic: Tests for Gemini endpoints

View File

@@ -2,15 +2,15 @@ import functools
import os
import time
import pytest
from tests.helpers.endpoints_helper import (
check_agent_archival_memory_insert,
check_agent_archival_memory_retrieval,
check_agent_edit_core_memory,
check_agent_recall_chat_memory,
check_agent_summarize_memory_simple,
check_agent_uses_external_tool,
check_first_response_is_valid_for_llm_endpoint,
check_response_contains_keyword,
run_embedding_endpoint,
)
@@ -84,6 +84,7 @@ def retry_until_success(max_attempts=10, sleep_time_seconds=4):
# ======================================================================================================================
# OPENAI TESTS
# ======================================================================================================================
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -92,23 +93,16 @@ def test_openai_gpt_4o_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_returns_keyword():
    """Verify that GPT-4o can be prompted into emitting a specific keyword in its reply."""
    config_path = os.path.join(llm_config_dir, "openai-gpt-4o.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_uses_external_tool():
def test_openai_gpt_4o_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_recall_chat_memory():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -117,6 +111,7 @@ def test_openai_gpt_4o_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -125,6 +120,7 @@ def test_openai_gpt_4o_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_archival_memory_insert():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -133,6 +129,7 @@ def test_openai_gpt_4o_archival_memory_insert():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_edit_core_memory():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -141,13 +138,7 @@ def test_openai_gpt_4o_edit_core_memory():
print(f"Got successful response from client: \n\n{response}")
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_summarize_memory():
    """Verify that a GPT-4o-backed agent passes the simple memory-summarization check."""
    response = check_agent_summarize_memory_simple(os.path.join(llm_config_dir, "openai-gpt-4o.json"))
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_embedding_endpoint_openai():
filename = os.path.join(embedding_config_dir, "openai_embed.json")
@@ -157,6 +148,8 @@ def test_embedding_endpoint_openai():
# ======================================================================================================================
# AZURE TESTS
# ======================================================================================================================
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -164,21 +157,17 @@ def test_azure_gpt_4o_mini_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_azure_gpt_4o_mini_returns_keyword():
    """Verify that Azure GPT-4o-mini can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_azure_gpt_4o_mini_uses_external_tool():
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_recall_chat_memory():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_recall_chat_memory(filename)
@@ -186,6 +175,8 @@ def test_azure_gpt_4o_mini_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_archival_memory_retrieval(filename)
@@ -193,6 +184,8 @@ def test_azure_gpt_4o_mini_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_edit_core_memory():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_edit_core_memory(filename)
@@ -200,6 +193,8 @@ def test_azure_gpt_4o_mini_edit_core_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_embedding_endpoint():
filename = os.path.join(embedding_config_dir, "azure_embed.json")
run_embedding_endpoint(filename)
@@ -239,6 +234,8 @@ def test_embedding_endpoint_ollama():
# ======================================================================================================================
# ANTHROPIC TESTS
# ======================================================================================================================
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -246,21 +243,17 @@ def test_claude_haiku_3_5_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_claude_haiku_3_5_returns_keyword():
    """Verify that Claude 3.5 Haiku can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_claude_haiku_3_5_uses_external_tool():
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_recall_chat_memory():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_recall_chat_memory(filename)
@@ -268,6 +261,8 @@ def test_claude_haiku_3_5_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_archival_memory_retrieval(filename)
@@ -275,6 +270,8 @@ def test_claude_haiku_3_5_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_edit_core_memory():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_edit_core_memory(filename)
@@ -292,15 +289,7 @@ def test_groq_llama31_70b_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_groq_llama31_70b_returns_keyword():
    """Verify that Groq Llama 3.1 70B can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "groq.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_groq_llama31_70b_uses_external_tool():
def test_groq_llama31_70b_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "groq.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
@@ -332,6 +321,8 @@ def test_groq_llama31_70b_edit_core_memory():
# ======================================================================================================================
# GEMINI TESTS
# ======================================================================================================================
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -339,21 +330,17 @@ def test_gemini_pro_15_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_gemini_pro_15_returns_keyword():
    """Verify that Gemini 1.5 Pro can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "gemini-pro.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_gemini_pro_15_uses_external_tool():
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_recall_chat_memory():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_recall_chat_memory(filename)
@@ -361,6 +348,8 @@ def test_gemini_pro_15_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_archival_memory_retrieval(filename)
@@ -368,6 +357,8 @@ def test_gemini_pro_15_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_edit_core_memory():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_edit_core_memory(filename)
@@ -385,15 +376,7 @@ def test_together_llama_3_70b_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_returns_keyword():
    """Verify that Together's Llama 3 70B can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "together-llama-3-70b.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_uses_external_tool():
def test_together_llama_3_70b_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response