From c76cecb8cb3ecf3d84f50b6994441e1cfbe4f2c7 Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Mon, 7 Oct 2024 11:39:54 -0700
Subject: [PATCH] test: Add complex e2e tests for anthropic opus-3 model
 (#1837)

Co-authored-by: Matt Zhou <mattzhou@Matts-MacBook-Pro.local>
---
 .github/workflows/test_anthropic.yml          | 78 ++++++++++++++++++-
 .github/workflows/test_openai.yml             |  2 +-
 .../{anthropic.json => claude-3-opus.json}    |  0
 tests/test_endpoints.py                       | 45 ++++++++++-
 4 files changed, 116 insertions(+), 9 deletions(-)
 rename configs/llm_model_configs/{anthropic.json => claude-3-opus.json} (100%)

diff --git a/.github/workflows/test_anthropic.yml b/.github/workflows/test_anthropic.yml
index f8273447..2c4391ad 100644
--- a/.github/workflows/test_anthropic.yml
+++ b/.github/workflows/test_anthropic.yml
@@ -1,4 +1,4 @@
-name: Endpoint (Anthropic)
+name: Anthropic Claude Opus 3 Capabilities Test
 
 env:
   ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -22,10 +22,80 @@ jobs:
       with:
         python-version: "3.12"
         poetry-version: "1.8.2"
-        install-args: "-E dev"
+        install-args: "-E dev -E external-tools"
 
-    - name: Test LLM endpoint
+    - name: Test first message contains expected function call and inner monologue
+      id: test_first_message
       env:
         ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       run: |
-        poetry run pytest -s -vv tests/test_endpoints.py::test_llm_endpoint_anthropic
+        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_valid_first_message && rc=0 || rc=$?
+        echo "TEST_FIRST_MESSAGE_EXIT_CODE=$rc" >> "$GITHUB_ENV"
+      continue-on-error: true  # "&& rc=0 || rc=$?" stops bash -e from aborting before the exit code is recorded
+
+    - name: Test model sends message with keyword
+      id: test_keyword_message
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      run: |
+        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_keyword && rc=0 || rc=$?
+        echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$rc" >> "$GITHUB_ENV"
+      continue-on-error: true  # "&& rc=0 || rc=$?" stops bash -e from aborting before the exit code is recorded
+
+    - name: Test model uses external tool correctly
+      id: test_external_tool
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      run: |
+        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_uses_external_tool && rc=0 || rc=$?
+        echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$rc" >> "$GITHUB_ENV"
+      continue-on-error: true  # "&& rc=0 || rc=$?" stops bash -e from aborting before the exit code is recorded
+
+    - name: Test model recalls chat memory
+      id: test_chat_memory
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      run: |
+        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_recall_chat_memory && rc=0 || rc=$?
+        echo "TEST_CHAT_MEMORY_EXIT_CODE=$rc" >> "$GITHUB_ENV"
+      continue-on-error: true  # "&& rc=0 || rc=$?" stops bash -e from aborting before the exit code is recorded
+
+    - name: Test model uses 'archival_memory_search' to find secret
+      id: test_archival_memory
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      run: |
+        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_archival_memory_retrieval && rc=0 || rc=$?
+        echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$rc" >> "$GITHUB_ENV"
+      continue-on-error: true  # "&& rc=0 || rc=$?" stops bash -e from aborting before the exit code is recorded
+
+    - name: Test model can edit core memories
+      id: test_core_memory
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      run: |
+        poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_edit_core_memory && rc=0 || rc=$?
+        echo "TEST_CORE_MEMORY_EXIT_CODE=$rc" >> "$GITHUB_ENV"
+      continue-on-error: true  # "&& rc=0 || rc=$?" stops bash -e from aborting before the exit code is recorded
+
+    - name: Summarize test results
+      if: always()
+      run: |
+        echo "Test Results Summary:"
+        echo "Test first message: $([[ ${TEST_FIRST_MESSAGE_EXIT_CODE:-1} -eq 0 ]] && echo ✅ || echo ❌)"
+        echo "Test model sends message with keyword: $([[ ${TEST_KEYWORD_MESSAGE_EXIT_CODE:-1} -eq 0 ]] && echo ✅ || echo ❌)"
+        echo "Test model uses external tool: $([[ ${TEST_EXTERNAL_TOOL_EXIT_CODE:-1} -eq 0 ]] && echo ✅ || echo ❌)"
+        echo "Test model recalls chat memory: $([[ ${TEST_CHAT_MEMORY_EXIT_CODE:-1} -eq 0 ]] && echo ✅ || echo ❌)"
+        echo "Test model uses 'archival_memory_search' to find secret: $([[ ${TEST_ARCHIVAL_MEMORY_EXIT_CODE:-1} -eq 0 ]] && echo ✅ || echo ❌)"
+        echo "Test model can edit core memories: $([[ ${TEST_CORE_MEMORY_EXIT_CODE:-1} -eq 0 ]] && echo ✅ || echo ❌)"
+
+        # Check if any test failed; an exit code that was never recorded defaults to 1 (an empty value would compare equal to 0)
+        if [[ ${TEST_FIRST_MESSAGE_EXIT_CODE:-1} -ne 0 || \
+              ${TEST_KEYWORD_MESSAGE_EXIT_CODE:-1} -ne 0 || \
+              ${TEST_EXTERNAL_TOOL_EXIT_CODE:-1} -ne 0 || \
+              ${TEST_CHAT_MEMORY_EXIT_CODE:-1} -ne 0 || \
+              ${TEST_ARCHIVAL_MEMORY_EXIT_CODE:-1} -ne 0 || \
+              ${TEST_CORE_MEMORY_EXIT_CODE:-1} -ne 0 ]]; then
+          echo "Some tests failed."
+          exit 78
+        fi
diff --git a/.github/workflows/test_openai.yml b/.github/workflows/test_openai.yml
index 0d51e8b5..f5957998 100644
--- a/.github/workflows/test_openai.yml
+++ b/.github/workflows/test_openai.yml
@@ -107,6 +107,6 @@ jobs:
               $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \
               $TEST_CORE_MEMORY_EXIT_CODE -ne 0 || \
               $TEST_EMBEDDING_ENDPOINT_EXIT_CODE -ne 0 ]]; then
-          echo "Some tests failed, setting neutral status."
+          echo "Some tests failed."
           exit 78
         fi
diff --git a/configs/llm_model_configs/anthropic.json b/configs/llm_model_configs/claude-3-opus.json
similarity index 100%
rename from configs/llm_model_configs/anthropic.json
rename to configs/llm_model_configs/claude-3-opus.json
diff --git a/tests/test_endpoints.py b/tests/test_endpoints.py
index de751096..a6cd16a3 100644
--- a/tests/test_endpoints.py
+++ b/tests/test_endpoints.py
@@ -100,10 +100,47 @@ def test_embedding_endpoint_ollama():
 # ======================================================================================================================
 # ANTHROPIC TESTS
 # ======================================================================================================================
-def test_llm_endpoint_anthropic():
-    filename = os.path.join(llm_config_dir, "anthropic.json")
-    check_first_response_is_valid_for_llm_endpoint(filename)
-    check_first_response_is_valid_for_llm_endpoint(filename)
+def test_claude_opus_3_returns_valid_first_message():
+    """Call check_first_response_is_valid_for_llm_endpoint on claude-3-opus.json and print what it returns."""
+    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
+    response = check_first_response_is_valid_for_llm_endpoint(config_path)
+    print(f"Got successful response from client: \n\n{response}")
+
+
+def test_claude_opus_3_returns_keyword():
+    """Call check_response_contains_keyword on claude-3-opus.json with keyword "banana" and print what it returns."""
+    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
+    keyword = "banana"
+    response = check_response_contains_keyword(config_path, keyword=keyword)
+    print(f"Got successful response from client: \n\n{response}")
+
+
+def test_claude_opus_3_uses_external_tool():
+    """Call check_agent_uses_external_tool on claude-3-opus.json and print what it returns."""
+    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
+    response = check_agent_uses_external_tool(config_path)
+    print(f"Got successful response from client: \n\n{response}")
+
+
+def test_claude_opus_3_recall_chat_memory():
+    """Call check_agent_recall_chat_memory on claude-3-opus.json and print what it returns."""
+    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
+    response = check_agent_recall_chat_memory(config_path)
+    print(f"Got successful response from client: \n\n{response}")
+
+
+def test_claude_opus_3_archival_memory_retrieval():
+    """Call check_agent_archival_memory_retrieval on claude-3-opus.json and print what it returns."""
+    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
+    response = check_agent_archival_memory_retrieval(config_path)
+    print(f"Got successful response from client: \n\n{response}")
+
+
+def test_claude_opus_3_edit_core_memory():
+    """Call check_agent_edit_core_memory on claude-3-opus.json and print what it returns."""
+    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
+    response = check_agent_edit_core_memory(config_path)
+    print(f"Got successful response from client: \n\n{response}")
 
 
 # ======================================================================================================================