test: Add complex e2e tests for anthropic opus-3 model (#1837)

Co-authored-by: Matt Zhou <mattzhou@Matts-MacBook-Pro.local>
This commit is contained in:
Matthew Zhou
2024-10-07 11:39:54 -07:00
committed by GitHub
parent 9e4c7ad07f
commit c76cecb8cb
4 changed files with 116 additions and 9 deletions

View File

@@ -1,4 +1,4 @@
name: Endpoint (Anthropic)
name: Anthropic Claude Opus 3 Capabilities Test
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -22,10 +22,80 @@ jobs:
with:
python-version: "3.12"
poetry-version: "1.8.2"
install-args: "-E dev"
install-args: "-E dev -E external-tools"
- name: Test LLM endpoint
- name: Test first message contains expected function call and inner monologue
id: test_first_message
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_endpoints.py::test_llm_endpoint_anthropic
poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_valid_first_message
echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model sends message with keyword
id: test_keyword_message
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_keyword
echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model uses external tool correctly
id: test_external_tool
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_uses_external_tool
echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model recalls chat memory
id: test_chat_memory
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_recall_chat_memory
echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model uses 'archival_memory_search' to find secret
id: test_archival_memory
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_archival_memory_retrieval
echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model can edit core memories
id: test_core_memory
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_edit_core_memory
echo "TEST_CORE_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Summarize test results
if: always()
run: |
echo "Test Results Summary:"
echo "Test first message: $([[ $TEST_FIRST_MESSAGE_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)"
echo "Test model sends message with keyword: $([[ $TEST_KEYWORD_MESSAGE_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)"
echo "Test model uses external tool: $([[ $TEST_EXTERNAL_TOOL_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)"
echo "Test model recalls chat memory: $([[ $TEST_CHAT_MEMORY_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)"
echo "Test model uses 'archival_memory_search' to find secret: $([[ $TEST_ARCHIVAL_MEMORY_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)"
echo "Test model can edit core memories: $([[ $TEST_CORE_MEMORY_EXIT_CODE -eq 0 ]] && echo ✅ || echo ❌)"
# Check if any test failed
if [[ $TEST_FIRST_MESSAGE_EXIT_CODE -ne 0 || \
$TEST_KEYWORD_MESSAGE_EXIT_CODE -ne 0 || \
$TEST_EXTERNAL_TOOL_EXIT_CODE -ne 0 || \
$TEST_CHAT_MEMORY_EXIT_CODE -ne 0 || \
$TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \
$TEST_CORE_MEMORY_EXIT_CODE -ne 0 ]]; then
echo "Some tests failed."
exit 78
fi

View File

@@ -107,6 +107,6 @@ jobs:
$TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \
$TEST_CORE_MEMORY_EXIT_CODE -ne 0 || \
$TEST_EMBEDDING_ENDPOINT_EXIT_CODE -ne 0 ]]; then
echo "Some tests failed, setting neutral status."
echo "Some tests failed."
exit 78
fi