test: Add complex e2e tests for anthropic opus-3 model (#1837)
Co-authored-by: Matt Zhou <mattzhou@Matts-MacBook-Pro.local>
This commit is contained in:
78
.github/workflows/test_anthropic.yml
vendored
78
.github/workflows/test_anthropic.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Endpoint (Anthropic)
|
||||
name: Anthropic Claude Opus 3 Capabilities Test
|
||||
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
@@ -22,10 +22,80 @@ jobs:
|
||||
with:
|
||||
python-version: "3.12"
|
||||
poetry-version: "1.8.2"
|
||||
install-args: "-E dev"
|
||||
install-args: "-E dev -E external-tools"
|
||||
|
||||
- name: Test first message contains expected function call and inner monologue
  id: test_first_message
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  run: |
    # The default Actions shell runs with `bash -e`, so a failing pytest would
    # abort this script before the exit code is written to GITHUB_ENV.
    # Capture the code explicitly so the summary step can read it.
    exit_code=0
    poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_valid_first_message || exit_code=$?
    echo "TEST_FIRST_MESSAGE_EXIT_CODE=$exit_code" >> "$GITHUB_ENV"
    # Re-raise the result so this step is still reported as failed.
    exit "$exit_code"
  # Let the remaining capability tests run even if this one fails.
  continue-on-error: true
|
||||
|
||||
- name: Test model sends message with keyword
  id: test_keyword_message
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  run: |
    # The default Actions shell runs with `bash -e`, so a failing pytest would
    # abort this script before the exit code is written to GITHUB_ENV.
    # Capture the code explicitly so the summary step can read it.
    exit_code=0
    poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_returns_keyword || exit_code=$?
    echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$exit_code" >> "$GITHUB_ENV"
    # Re-raise the result so this step is still reported as failed.
    exit "$exit_code"
  # Let the remaining capability tests run even if this one fails.
  continue-on-error: true
|
||||
|
||||
- name: Test model uses external tool correctly
  id: test_external_tool
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  run: |
    # The default Actions shell runs with `bash -e`, so a failing pytest would
    # abort this script before the exit code is written to GITHUB_ENV.
    # Capture the code explicitly so the summary step can read it.
    exit_code=0
    poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_uses_external_tool || exit_code=$?
    echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$exit_code" >> "$GITHUB_ENV"
    # Re-raise the result so this step is still reported as failed.
    exit "$exit_code"
  # Let the remaining capability tests run even if this one fails.
  continue-on-error: true
|
||||
|
||||
- name: Test model recalls chat memory
  id: test_chat_memory
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  run: |
    # The default Actions shell runs with `bash -e`, so a failing pytest would
    # abort this script before the exit code is written to GITHUB_ENV.
    # Capture the code explicitly so the summary step can read it.
    exit_code=0
    poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_recall_chat_memory || exit_code=$?
    echo "TEST_CHAT_MEMORY_EXIT_CODE=$exit_code" >> "$GITHUB_ENV"
    # Re-raise the result so this step is still reported as failed.
    exit "$exit_code"
  # Let the remaining capability tests run even if this one fails.
  continue-on-error: true
|
||||
|
||||
- name: Test model uses 'archival_memory_search' to find secret
  id: test_archival_memory
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  run: |
    # The default Actions shell runs with `bash -e`, so a failing pytest would
    # abort this script before the exit code is written to GITHUB_ENV.
    # Capture the code explicitly so the summary step can read it.
    exit_code=0
    poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_archival_memory_retrieval || exit_code=$?
    echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$exit_code" >> "$GITHUB_ENV"
    # Re-raise the result so this step is still reported as failed.
    exit "$exit_code"
  # Let the remaining capability tests run even if this one fails.
  continue-on-error: true
|
||||
|
||||
- name: Test model can edit core memories
  id: test_core_memory
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  run: |
    # The default Actions shell runs with `bash -e`, so a failing pytest would
    # abort this script before the exit code is written to GITHUB_ENV.
    # Capture the code explicitly so the summary step can read it.
    exit_code=0
    poetry run pytest -s -vv tests/test_endpoints.py::test_claude_opus_3_edit_core_memory || exit_code=$?
    echo "TEST_CORE_MEMORY_EXIT_CODE=$exit_code" >> "$GITHUB_ENV"
    # Re-raise the result so this step is still reported as failed.
    exit "$exit_code"
  # Let the remaining capability tests run even if this one fails.
  continue-on-error: true
|
||||
|
||||
- name: Summarize test results
  if: always()
  run: |
    # An exit-code variable is absent when its step aborted before recording it
    # or never ran. Inside [[ ]], an empty operand to -eq/-ne evaluates to 0,
    # which would be reported as a pass, so treat a missing code as a failure.
    first_message=${TEST_FIRST_MESSAGE_EXIT_CODE:-1}
    keyword_message=${TEST_KEYWORD_MESSAGE_EXIT_CODE:-1}
    external_tool=${TEST_EXTERNAL_TOOL_EXIT_CODE:-1}
    chat_memory=${TEST_CHAT_MEMORY_EXIT_CODE:-1}
    archival_memory=${TEST_ARCHIVAL_MEMORY_EXIT_CODE:-1}
    core_memory=${TEST_CORE_MEMORY_EXIT_CODE:-1}

    echo "Test Results Summary:"
    echo "Test first message: $([[ $first_message -eq 0 ]] && echo ✅ || echo ❌)"
    echo "Test model sends message with keyword: $([[ $keyword_message -eq 0 ]] && echo ✅ || echo ❌)"
    echo "Test model uses external tool: $([[ $external_tool -eq 0 ]] && echo ✅ || echo ❌)"
    echo "Test model recalls chat memory: $([[ $chat_memory -eq 0 ]] && echo ✅ || echo ❌)"
    echo "Test model uses 'archival_memory_search' to find secret: $([[ $archival_memory -eq 0 ]] && echo ✅ || echo ❌)"
    echo "Test model can edit core memories: $([[ $core_memory -eq 0 ]] && echo ✅ || echo ❌)"

    # Check if any test failed
    if [[ $first_message -ne 0 || \
          $keyword_message -ne 0 || \
          $external_tool -ne 0 || \
          $chat_memory -ne 0 || \
          $archival_memory -ne 0 || \
          $core_memory -ne 0 ]]; then
      echo "Some tests failed."
      # NOTE(review): 78 was historically the "neutral" exit code; current
      # runners are documented to treat any non-zero code as a failure.
      # Confirm that a failed job is the intended outcome here.
      exit 78
    fi
|
||||
|
||||
2
.github/workflows/test_openai.yml
vendored
2
.github/workflows/test_openai.yml
vendored
@@ -107,6 +107,6 @@ jobs:
|
||||
$TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \
|
||||
$TEST_CORE_MEMORY_EXIT_CODE -ne 0 || \
|
||||
$TEST_EMBEDDING_ENDPOINT_EXIT_CODE -ne 0 ]]; then
|
||||
echo "Some tests failed, setting neutral status."
|
||||
echo "Some tests failed."
|
||||
exit 78
|
||||
fi
|
||||
|
||||
@@ -100,10 +100,47 @@ def test_embedding_endpoint_ollama():
|
||||
# ======================================================================================================================
|
||||
# ANTHROPIC TESTS
|
||||
# ======================================================================================================================
|
||||
def test_llm_endpoint_anthropic():
    """Run the first-response check against the ``anthropic.json`` LLM config.

    The pass/fail criteria are those of
    ``check_first_response_is_valid_for_llm_endpoint``, which is defined
    outside this section.
    """
    filename = os.path.join(llm_config_dir, "anthropic.json")
    # One call is enough; a repeated identical call would only add a second
    # round-trip to the endpoint.
    check_first_response_is_valid_for_llm_endpoint(filename)
|
||||
def test_claude_opus_3_returns_valid_first_message():
    """Run the first-response check against the ``claude-3-opus.json`` config.

    The pass/fail criteria are those of
    ``check_first_response_is_valid_for_llm_endpoint`` (defined outside this
    section); whatever it returns is printed for the test log.
    """
    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
    first_response = check_first_response_is_valid_for_llm_endpoint(config_path)
    # Reaching this line means the helper did not raise; log what it returned.
    print(f"Got successful response from client: \n\n{first_response}")
|
||||
|
||||
|
||||
def test_claude_opus_3_returns_keyword():
    """Run the keyword check against the ``claude-3-opus.json`` config.

    Passes the keyword ``"banana"`` to ``check_response_contains_keyword``
    (defined outside this section) and prints whatever that helper returns.
    """
    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
    keyword_response = check_response_contains_keyword(config_path, keyword="banana")
    # Reaching this line means the helper did not raise; log what it returned.
    print(f"Got successful response from client: \n\n{keyword_response}")
|
||||
|
||||
|
||||
def test_claude_opus_3_uses_external_tool():
    """Run the external-tool check against the ``claude-3-opus.json`` config.

    The pass/fail criteria are those of ``check_agent_uses_external_tool``
    (defined outside this section); whatever it returns is printed.
    """
    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
    tool_response = check_agent_uses_external_tool(config_path)
    # Reaching this line means the helper did not raise; log what it returned.
    print(f"Got successful response from client: \n\n{tool_response}")
|
||||
|
||||
|
||||
def test_claude_opus_3_recall_chat_memory():
    """Run the chat-memory recall check against the ``claude-3-opus.json`` config.

    The pass/fail criteria are those of ``check_agent_recall_chat_memory``
    (defined outside this section); whatever it returns is printed.
    """
    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
    recall_response = check_agent_recall_chat_memory(config_path)
    # Reaching this line means the helper did not raise; log what it returned.
    print(f"Got successful response from client: \n\n{recall_response}")
|
||||
|
||||
|
||||
def test_claude_opus_3_archival_memory_retrieval():
    """Run the archival-memory retrieval check against ``claude-3-opus.json``.

    The pass/fail criteria are those of
    ``check_agent_archival_memory_retrieval`` (defined outside this section);
    whatever it returns is printed.
    """
    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
    archival_response = check_agent_archival_memory_retrieval(config_path)
    # Reaching this line means the helper did not raise; log what it returned.
    print(f"Got successful response from client: \n\n{archival_response}")
|
||||
|
||||
|
||||
def test_claude_opus_3_edit_core_memory():
    """Run the core-memory edit check against the ``claude-3-opus.json`` config.

    The pass/fail criteria are those of ``check_agent_edit_core_memory``
    (defined outside this section); whatever it returns is printed.
    """
    config_path = os.path.join(llm_config_dir, "claude-3-opus.json")
    core_memory_response = check_agent_edit_core_memory(config_path)
    # Reaching this line means the helper did not raise; log what it returned.
    print(f"Got successful response from client: \n\n{core_memory_response}")
|
||||
|
||||
|
||||
# ======================================================================================================================
|
||||
|
||||
Reference in New Issue
Block a user