fix: Fix summarizer for Anthropic and add integration tests (#2046)

2024-11-15 16:46:12 -08:00
parent cf35b9c4cd
commit 2d26365e42
16 changed files with 218 additions and 68 deletions
--- a/.github/workflows/integration_tests.yml
+++ b/.github/workflows/integration_tests.yml
@@ -0,0 +1,75 @@
+name: Integration Tests
+
+env:
+  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+  COMPOSIO_API_KEY: ${{ secrets.COMPOSIO_API_KEY }}
+  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+  GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
+  AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
+  AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  run-integration-tests:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15  # cancel the job if it runs longer than 15 minutes
+    strategy:
+      fail-fast: false  # keep running the remaining suites when one fails
+      matrix:
+        integration_test_suite:
+          - "integration_test_summarizer.py"
+    services:
+      qdrant:
+        image: qdrant/qdrant
+        ports:
+          - 6333:6333
+      postgres:
+        image: pgvector/pgvector:pg17
+        ports:
+          - 5432:5432
+        env:
+          POSTGRES_HOST_AUTH_METHOD: trust  # no password required, so the psql call in "Migrate database" runs without one
+          POSTGRES_DB: postgres
+          POSTGRES_USER: postgres
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python, Poetry, and Dependencies
+        uses: packetcoders/action-setup-cache-python-poetry@main  # NOTE(review): tracks a branch; consider pinning to a commit SHA
+        with:
+          python-version: "3.12"
+          poetry-version: "1.8.2"
+          install-args: "-E dev -E postgres -E milvus -E external-tools -E tests"
+      - name: Migrate database
+        env:
+          LETTA_PG_PORT: 5432
+          LETTA_PG_USER: postgres
+          LETTA_PG_PASSWORD: postgres
+          LETTA_PG_DB: postgres
+          LETTA_PG_HOST: localhost
+        run: |
+          psql -h localhost -U postgres -d postgres -c 'CREATE EXTENSION IF NOT EXISTS vector'
+          poetry run alembic upgrade head
+      - name: Run integration tests
+        env:
+          LETTA_PG_PORT: 5432
+          LETTA_PG_USER: postgres
+          LETTA_PG_PASSWORD: postgres
+          LETTA_PG_DB: postgres
+          LETTA_PG_HOST: localhost
+          LETTA_SERVER_PASS: test_server_token
+        run: |
+          poetry run pytest -s -vv tests/${{ matrix.integration_test_suite }}
--- a/.github/workflows/test_anthropic.yml
+++ b/.github/workflows/test_anthropic.yml
@@ -29,7 +29,7 @@ jobs:
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_valid_first_message
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_claude_haiku_3_5_returns_valid_first_message
        echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -38,7 +38,7 @@ jobs:
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_keyword
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_claude_haiku_3_5_returns_keyword
        echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -47,7 +47,7 @@ jobs:
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_uses_external_tool
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_claude_haiku_3_5_uses_external_tool
        echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -56,7 +56,7 @@ jobs:
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_recall_chat_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_claude_haiku_3_5_recall_chat_memory
        echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -65,7 +65,7 @@ jobs:
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_archival_memory_retrieval
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_claude_haiku_3_5_archival_memory_retrieval
        echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -74,7 +74,7 @@ jobs:
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_edit_core_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_claude_haiku_3_5_edit_core_memory
        echo "TEST_CORE_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

--- a/.github/workflows/test_azure.yml
+++ b/.github/workflows/test_azure.yml
@@ -31,7 +31,7 @@ jobs:
        AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
        AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_azure_gpt_4o_mini_returns_valid_first_message
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_azure_gpt_4o_mini_returns_valid_first_message
        echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -41,7 +41,7 @@ jobs:
        AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
        AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_azure_gpt_4o_mini_returns_keyword
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_azure_gpt_4o_mini_returns_keyword
        echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -51,7 +51,7 @@ jobs:
        AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
        AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_azure_gpt_4o_mini_uses_external_tool
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_azure_gpt_4o_mini_uses_external_tool
        echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -61,7 +61,7 @@ jobs:
        AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
        AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_azure_gpt_4o_mini_recall_chat_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_azure_gpt_4o_mini_recall_chat_memory
        echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -71,7 +71,7 @@ jobs:
        AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
        AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_azure_gpt_4o_mini_archival_memory_retrieval
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_azure_gpt_4o_mini_archival_memory_retrieval
        echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -81,7 +81,7 @@ jobs:
        AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
        AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_azure_gpt_4o_mini_edit_core_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_azure_gpt_4o_mini_edit_core_memory
        echo "TEST_CORE_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

--- a/.github/workflows/test_groq.yml
+++ b/.github/workflows/test_groq.yml
@@ -29,7 +29,7 @@ jobs:
      env:
        GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_groq_llama31_70b_returns_valid_first_message
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_groq_llama31_70b_returns_valid_first_message
        echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -38,7 +38,7 @@ jobs:
      env:
        GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_groq_llama31_70b_returns_keyword
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_groq_llama31_70b_returns_keyword
        echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -47,7 +47,7 @@ jobs:
      env:
        GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_groq_llama31_70b_uses_external_tool
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_groq_llama31_70b_uses_external_tool
        echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -56,7 +56,7 @@ jobs:
      env:
        GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_groq_llama31_70b_recall_chat_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_groq_llama31_70b_recall_chat_memory
        echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -65,7 +65,7 @@ jobs:
      env:
        GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_groq_llama31_70b_archival_memory_retrieval
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_groq_llama31_70b_archival_memory_retrieval
        echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

@@ -74,7 +74,7 @@ jobs:
      env:
        GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_groq_llama31_70b_edit_core_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_groq_llama31_70b_edit_core_memory
        echo "TEST_CORE_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
      continue-on-error: true

--- a/.github/workflows/test_memgpt_hosted.yml
+++ b/.github/workflows/test_memgpt_hosted.yml
@@ -23,9 +23,9 @@ jobs:

    - name: Test LLM endpoint
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_llm_endpoint_letta_hosted
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_llm_endpoint_letta_hosted
      continue-on-error: true

    - name: Test embedding endpoint
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_embedding_endpoint_letta_hosted
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_embedding_endpoint_letta_hosted
--- a/.github/workflows/test_ollama.yml
+++ b/.github/workflows/test_ollama.yml
@@ -34,11 +34,11 @@ jobs:

    - name: Test LLM endpoint
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_llm_endpoint_ollama
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_llm_endpoint_ollama

    - name: Test embedding endpoint
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_embedding_endpoint_ollama
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_embedding_endpoint_ollama

    - name: Test provider
      run: |
--- a/.github/workflows/test_openai.yml
+++ b/.github/workflows/test_openai.yml
@@ -29,53 +29,53 @@ jobs:
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_returns_valid_first_message
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_returns_valid_first_message

    - name: Test model sends message with keyword
      id: test_keyword_message
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_returns_keyword
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_returns_keyword

    - name: Test model uses external tool correctly
      id: test_external_tool
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_uses_external_tool
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_uses_external_tool

    - name: Test model recalls chat memory
      id: test_chat_memory
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_recall_chat_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_recall_chat_memory

    - name: Test model uses 'archival_memory_search' to find secret
      id: test_archival_memory_search
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_archival_memory_retrieval
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_archival_memory_retrieval

    - name: Test model uses 'archival_memory_insert' to insert archival memories
      id: test_archival_memory_insert
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_archival_memory_insert
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_archival_memory_insert

    - name: Test model can edit core memories
      id: test_core_memory
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_openai_gpt_4o_edit_core_memory
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_openai_gpt_4o_edit_core_memory

    - name: Test embedding endpoint
      id: test_embedding_endpoint
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_embedding_endpoint_openai
+        poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_embedding_endpoint_openai
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -131,4 +131,4 @@ jobs:
          LETTA_SERVER_PASS: test_server_token
          PYTHONPATH: ${{ github.workspace }}:${{ env.PYTHONPATH }}
        run: |
-          poetry run pytest -s -vv -k "not test_agent_tool_graph.py and not test_tool_rule_solver.py and not test_local_client.py and not test_o1_agent.py and not test_cli.py and not test_tools.py and not test_concurrent_connections.py and not test_quickstart and not test_endpoints and not test_storage and not test_server and not test_openai_client and not test_providers and not test_client.py" tests
+          poetry run pytest -s -vv -k "not integration_test_summarizer.py and not test_agent_tool_graph.py and not test_tool_rule_solver.py and not test_local_client.py and not test_o1_agent.py and not test_cli.py and not test_tools.py and not test_concurrent_connections.py and not test_quickstart and not test_model_letta_perfomance and not test_storage and not test_server and not test_openai_client and not test_providers and not test_client.py" tests
--- a/.gitignore
+++ b/.gitignore
@@ -1018,3 +1018,8 @@ pgdata/
 letta/.pytest_cache/
 memgpy/pytest.ini
 **/**/pytest_cache
+
+
+# local sandbox venvs
+letta/services/tool_sandbox_env/*
+tests/test_tool_sandbox/*
--- a/examples/tool_rule_usage.py
+++ b/examples/tool_rule_usage.py
@@ -9,7 +9,7 @@ from tests.helpers.endpoints_helper import (
    setup_agent,
 )
 from tests.helpers.utils import cleanup
-from tests.test_endpoints import llm_config_dir
+from tests.test_model_letta_perfomance import llm_config_dir

 """
 This example shows how you can constrain tool calls in your agent.
--- a/letta/agent.py
+++ b/letta/agent.py
@@ -48,6 +48,7 @@ from letta.schemas.tool_rule import TerminalToolRule
 from letta.schemas.usage import LettaUsageStatistics
 from letta.services.source_manager import SourceManager
 from letta.services.user_manager import UserManager
+from letta.streaming_interface import StreamingRefreshCLIInterface
 from letta.system import (
    get_heartbeat,
    get_initial_boot_messages,
@@ -229,7 +230,7 @@ class BaseAgent(ABC):
 class Agent(BaseAgent):
    def __init__(
        self,
-        interface: Optional[AgentInterface],
+        interface: Optional[Union[AgentInterface, StreamingRefreshCLIInterface]],
        # agents can be created from providing agent_state
        agent_state: AgentState,
        tools: List[Tool],
--- a/letta/llm_api/anthropic.py
+++ b/letta/llm_api/anthropic.py
@@ -242,26 +242,28 @@ def convert_anthropic_response_to_chatcompletion(
    finish_reason = remap_finish_reason(response_json["stop_reason"])

    if isinstance(response_json["content"], list):
-        # inner mono + function call
-        # TODO relax asserts
-        assert len(response_json["content"]) == 2, response_json
-        assert response_json["content"][0]["type"] == "text", response_json
-        assert response_json["content"][1]["type"] == "tool_use", response_json
-        content = strip_xml_tags(string=response_json["content"][0]["text"], tag=inner_thoughts_xml_tag)
-        tool_calls = [
-            ToolCall(
-                id=response_json["content"][1]["id"],
-                type="function",
-                function=FunctionCall(
-                    name=response_json["content"][1]["name"],
-                    arguments=json.dumps(response_json["content"][1]["input"], indent=2),
-                ),
-            )
-        ]
+        if len(response_json["content"]) > 1:
+            # inner mono + function call
+            assert len(response_json["content"]) == 2, response_json
+            assert response_json["content"][0]["type"] == "text", response_json
+            assert response_json["content"][1]["type"] == "tool_use", response_json
+            content = strip_xml_tags(string=response_json["content"][0]["text"], tag=inner_thoughts_xml_tag)
+            tool_calls = [
+                ToolCall(
+                    id=response_json["content"][1]["id"],
+                    type="function",
+                    function=FunctionCall(
+                        name=response_json["content"][1]["name"],
+                        arguments=json.dumps(response_json["content"][1]["input"], indent=2),
+                    ),
+                )
+            ]
+        else:
+            # Just inner mono
+            content = strip_xml_tags(string=response_json["content"][0]["text"], tag=inner_thoughts_xml_tag)
+            tool_calls = None
    else:
-        # just inner mono
-        content = strip_xml_tags(string=response_json["content"], tag=inner_thoughts_xml_tag)
-        tool_calls = None
+        raise RuntimeError("Unexpected type for content in response_json.")

    assert response_json["role"] == "assistant", response_json
    choice = Choice(
--- a/tests/configs/llm_model_configs/claude-3-5-haiku.json
+++ b/tests/configs/llm_model_configs/claude-3-5-haiku.json
@@ -1,6 +1,6 @@
 {
    "context_window": 200000,
-    "model": "claude-3-opus-20240229",
+    "model": "claude-3-5-haiku-20241022",
    "model_endpoint_type": "anthropic",
    "model_endpoint": "https://api.anthropic.com/v1",
    "model_wrapper": null,
--- a/tests/integration_test_summarizer.py
+++ b/tests/integration_test_summarizer.py
@@ -0,0 +1,68 @@
+import json
+import os
+import uuid
+
+import pytest
+
+from letta import create_client
+from letta.agent import Agent
+from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.llm_config import LLMConfig
+from letta.streaming_interface import StreamingRefreshCLIInterface
+from tests.helpers.endpoints_helper import EMBEDDING_CONFIG_PATH
+from tests.helpers.utils import cleanup
+
+# constants
+LLM_CONFIG_DIR = "tests/configs/llm_model_configs"
+SUMMARY_KEY_PHRASE = "The following is a summary"
+
+
+@pytest.mark.parametrize(
+    "config_filename",
+    [
+        "openai-gpt-4o.json",
+        "azure-gpt-4o-mini.json",
+        "claude-3-5-haiku.json",
+        # "groq.json", TODO: Support groq, rate limiting currently makes it impossible to test
+        # "gemini-pro.json", TODO: Gemini is broken
+    ],
+)
+def test_summarizer(config_filename):
+    """Step an agent through two user messages, force a summarization, and check messages[1] holds the summary."""
+    agent_name = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"integration-test-summarizer-{config_filename}"))
+
+    # Load the LLM and embedding configs; `with` closes both files once they are parsed
+    filename = os.path.join(LLM_CONFIG_DIR, config_filename)
+    with open(filename, "r") as llm_file, open(EMBEDDING_CONFIG_PATH) as embedding_file:
+        llm_config = LLMConfig(**json.load(llm_file))
+        embedding_config = EmbeddingConfig(**json.load(embedding_file))
+
+    # Create client and clean up agents
+    client = create_client()
+    client.set_default_llm_config(llm_config)
+    client.set_default_embedding_config(embedding_config)
+    cleanup(client=client, agent_uuid=agent_name)
+
+    # Create agent
+    agent_state = client.create_agent(name=agent_name, llm_config=llm_config, embedding_config=embedding_config)
+    tools = [client.get_tool(client.get_tool_id(name=tool_name)) for tool_name in agent_state.tools]
+    letta_agent = Agent(interface=StreamingRefreshCLIInterface(), agent_state=agent_state, tools=tools, first_message_verify_mono=False)
+
+    # Make conversation
+    messages = [
+        "Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible.",
+        "Octopuses have three hearts, and two of them stop beating when they swim.",
+    ]
+
+    for m in messages:
+        letta_agent.step_user_message(
+            user_message_str=m,
+            first_message=False,
+            skip_verify=False,
+            stream=False,
+            ms=client.server.ms,
+        )
+
+    # Invoke a summarize
+    letta_agent.summarize_messages_inplace(preserve_last_N_messages=False)
+    assert SUMMARY_KEY_PHRASE in letta_agent.messages[1]["content"], f"Test failed for config: {config_filename}"
--- a/tests/test_agent_tool_graph.py
+++ b/tests/test_agent_tool_graph.py
@@ -1,4 +1,3 @@
-import os
 import uuid

 import pytest
@@ -13,12 +12,11 @@ from tests.helpers.endpoints_helper import (
    setup_agent,
 )
 from tests.helpers.utils import cleanup
-from tests.test_endpoints import llm_config_dir

 # Generate uuid for agent name for this example
 namespace = uuid.NAMESPACE_DNS
 agent_uuid = str(uuid.uuid5(namespace, "test_agent_tool_graph"))
-config_file = os.path.join(llm_config_dir, "openai-gpt-4o.json")
+config_file = "tests/configs/llm_model_configs/openai-gpt-4o.json"

 """Contrived tools for this test case"""

--- a/tests/test_model_letta_perfomance.py
+++ b/tests/test_model_letta_perfomance.py
@@ -59,6 +59,7 @@ def retry_until_threshold(threshold=0.5, max_attempts=10, sleep_time_seconds=4):
 # ======================================================================================================================
 # OPENAI TESTS
 # ======================================================================================================================
+@retry_until_threshold(threshold=0.75, max_attempts=4)
 def test_openai_gpt_4o_returns_valid_first_message():
    filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
    response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -205,44 +206,44 @@ def test_embedding_endpoint_ollama():
 # ======================================================================================================================
 # ANTHROPIC TESTS
 # ======================================================================================================================
-def test_claude_opus_3_returns_valid_first_message():
-    filename = os.path.join(llm_config_dir, "claude-3-opus.json")
+def test_claude_haiku_3_5_returns_valid_first_message():
+    filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_first_response_is_valid_for_llm_endpoint(filename)
    # Log out successful response
    print(f"Got successful response from client: \n\n{response}")


-def test_claude_opus_3_returns_keyword():
+def test_claude_haiku_3_5_returns_keyword():
    keyword = "banana"
-    filename = os.path.join(llm_config_dir, "claude-3-opus.json")
+    filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_response_contains_keyword(filename, keyword=keyword)
    # Log out successful response
    print(f"Got successful response from client: \n\n{response}")


-def test_claude_opus_3_uses_external_tool():
-    filename = os.path.join(llm_config_dir, "claude-3-opus.json")
+def test_claude_haiku_3_5_uses_external_tool():
+    filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_agent_uses_external_tool(filename)
    # Log out successful response
    print(f"Got successful response from client: \n\n{response}")


-def test_claude_opus_3_recall_chat_memory():
-    filename = os.path.join(llm_config_dir, "claude-3-opus.json")
+def test_claude_haiku_3_5_recall_chat_memory():
+    filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_agent_recall_chat_memory(filename)
    # Log out successful response
    print(f"Got successful response from client: \n\n{response}")


-def test_claude_opus_3_archival_memory_retrieval():
-    filename = os.path.join(llm_config_dir, "claude-3-opus.json")
+def test_claude_haiku_3_5_archival_memory_retrieval():
+    filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_agent_archival_memory_retrieval(filename)
    # Log out successful response
    print(f"Got successful response from client: \n\n{response}")


-def test_claude_opus_3_edit_core_memory():
-    filename = os.path.join(llm_config_dir, "claude-3-opus.json")
+def test_claude_haiku_3_5_edit_core_memory():
+    filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_agent_edit_core_memory(filename)
    # Log out successful response
    print(f"Got successful response from client: \n\n{response}")