From c76cecb8cb3ecf3d84f50b6994441e1cfbe4f2c7 Mon Sep 17 00:00:00 2001 From: Matthew Zhou Date: Mon, 7 Oct 2024 11:39:54 -0700 Subject: [PATCH] test: Add complex e2e tests for anthropic opus-3 model (#1837) Co-authored-by: Matt Zhou --- .github/workflows/test_anthropic.yml | 78 ++++++++++++++++++- .github/workflows/test_openai.yml | 2 +- .../{anthropic.json => claude-3-opus.json} | 0 tests/test_endpoints.py | 45 ++++++++++- 4 files changed, 116 insertions(+), 9 deletions(-) rename configs/llm_model_configs/{anthropic.json => claude-3-opus.json} (100%) diff --git a/.github/workflows/test_anthropic.yml b/.github/workflows/test_anthropic.yml index f8273447..2c4391ad 100644 --- a/.github/workflows/test_anthropic.yml +++ b/.github/workflows/test_anthropic.yml @@ -1,4 +1,4 @@ -name: Endpoint (Anthropic) +name: Anthropic Claude Opus 3 Capabilities Test env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} @@ -22,10 +22,80 @@ jobs: with: python-version: "3.12" poetry-version: "1.8.2" - install-args: "-E dev" + install-args: "-E dev -E external-tools" - - name: Test LLM endpoint + - name: Test first message contains expected function call and inner monologue + id: test_first_message env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | - poetry run pytest -s -vv tests/test_endpoints.py::test_llm_endpoint_anthropic + poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_valid_first_message + echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model sends message with keyword + id: test_keyword_message + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_keyword + echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model uses external tool correctly + id: test_external_tool + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_uses_external_tool + echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model recalls chat memory + id: test_chat_memory + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_recall_chat_memory + echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model uses 'archival_memory_search' to find secret + id: test_archival_memory + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_archival_memory_retrieval + echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model can edit core memories + id: test_core_memory + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_edit_core_memory + echo "TEST_CORE_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Summarize test results + if: always() + run: | + echo "Test Results Summary:" + echo "Test first message: $([[ $TEST_FIRST_MESSAGE_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)" + echo "Test model sends message with keyword: $([[ $TEST_KEYWORD_MESSAGE_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)" + echo "Test model uses external tool: $([[ $TEST_EXTERNAL_TOOL_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)" + echo "Test model recalls chat memory: $([[ $TEST_CHAT_MEMORY_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)" + echo "Test model uses 'archival_memory_search' to find secret: $([[ $TEST_ARCHIVAL_MEMORY_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)" + echo "Test model can edit core memories: $([[ $TEST_CORE_MEMORY_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)" + + # Check if any test failed + if [[ $TEST_FIRST_MESSAGE_EXIT_CODE -ne 0 || \ + $TEST_KEYWORD_MESSAGE_EXIT_CODE -ne 0 || \ + $TEST_EXTERNAL_TOOL_EXIT_CODE -ne 0 || \ + $TEST_CHAT_MEMORY_EXIT_CODE -ne 0 || \ + $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \ + $TEST_CORE_MEMORY_EXIT_CODE -ne 0 ]]; then + echo "Some tests failed." + exit 78 + fi diff --git a/.github/workflows/test_openai.yml b/.github/workflows/test_openai.yml index 0d51e8b5..f5957998 100644 --- a/.github/workflows/test_openai.yml +++ b/.github/workflows/test_openai.yml @@ -107,6 +107,6 @@ jobs: $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \ $TEST_CORE_MEMORY_EXIT_CODE -ne 0 || \ $TEST_EMBEDDING_ENDPOINT_EXIT_CODE -ne 0 ]]; then - echo "Some tests failed, setting neutral status." + echo "Some tests failed." exit 78 fi diff --git a/configs/llm_model_configs/anthropic.json b/configs/llm_model_configs/claude-3-opus.json similarity index 100% rename from configs/llm_model_configs/anthropic.json rename to configs/llm_model_configs/claude-3-opus.json diff --git a/tests/test_endpoints.py b/tests/test_endpoints.py index de751096..a6cd16a3 100644 --- a/tests/test_endpoints.py +++ b/tests/test_endpoints.py @@ -100,10 +100,47 @@ def test_embedding_endpoint_ollama(): # ====================================================================================================================== # ANTHROPIC TESTS # ====================================================================================================================== -def test_llm_endpoint_anthropic(): - filename = os.path.join(llm_config_dir, "anthropic.json") - check_first_response_is_valid_for_llm_endpoint(filename) - check_first_response_is_valid_for_llm_endpoint(filename) +def test_claude_opus_3_returns_valid_first_message(): + filename = os.path.join(llm_config_dir, "claude-3-opus.json") + response = check_first_response_is_valid_for_llm_endpoint(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_claude_opus_3_returns_keyword(): + keyword = "banana" + filename = os.path.join(llm_config_dir, "claude-3-opus.json") + response = check_response_contains_keyword(filename, keyword=keyword) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_claude_opus_3_uses_external_tool(): + filename = os.path.join(llm_config_dir, "claude-3-opus.json") + response = check_agent_uses_external_tool(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_claude_opus_3_recall_chat_memory(): + filename = os.path.join(llm_config_dir, "claude-3-opus.json") + response = check_agent_recall_chat_memory(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_claude_opus_3_archival_memory_retrieval(): + filename = os.path.join(llm_config_dir, "claude-3-opus.json") + response = check_agent_archival_memory_retrieval(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_claude_opus_3_edit_core_memory(): + filename = os.path.join(llm_config_dir, "claude-3-opus.json") + response = check_agent_edit_core_memory(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") # ======================================================================================================================