feat: Add model integration testing (#587)

This commit is contained in:
Matthew Zhou
2025-01-10 12:28:12 -10:00
committed by GitHub
parent e2b4c76df0
commit 5264349e43
4 changed files with 128 additions and 139 deletions

View File

@@ -264,6 +264,7 @@ def convert_google_ai_response_to_chatcompletion(
"""
try:
choices = []
index = 0
for candidate in response_json["candidates"]:
content = candidate["content"]
@@ -272,86 +273,87 @@ def convert_google_ai_response_to_chatcompletion(
parts = content["parts"]
# TODO support parts / multimodal
assert len(parts) == 1, f"Multi-part not yet supported:\n{parts}"
response_message = parts[0]
# TODO support parallel tool calling natively
# TODO Alternative here is to throw away everything else except for the first part
for response_message in parts:
# Convert the actual message style to OpenAI style
if "functionCall" in response_message and response_message["functionCall"] is not None:
function_call = response_message["functionCall"]
assert isinstance(function_call, dict), function_call
function_name = function_call["name"]
assert isinstance(function_name, str), function_name
function_args = function_call["args"]
assert isinstance(function_args, dict), function_args
# Convert the actual message style to OpenAI style
if "functionCall" in response_message and response_message["functionCall"] is not None:
function_call = response_message["functionCall"]
assert isinstance(function_call, dict), function_call
function_name = function_call["name"]
assert isinstance(function_name, str), function_name
function_args = function_call["args"]
assert isinstance(function_args, dict), function_args
# NOTE: this also involves stripping the inner monologue out of the function
if pull_inner_thoughts_from_args:
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
# NOTE: this also involves stripping the inner monologue out of the function
if pull_inner_thoughts_from_args:
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
else:
inner_thoughts = None
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
tool_calls=[
ToolCall(
id=get_tool_call_id(),
type="function",
function=FunctionCall(
name=function_name,
arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
),
)
],
)
assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
else:
inner_thoughts = None
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
tool_calls=[
ToolCall(
id=get_tool_call_id(),
type="function",
function=FunctionCall(
name=function_name,
arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
),
)
],
# Inner thoughts are the content by default
inner_thoughts = response_message["text"]
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
)
# Google AI API uses different finish reason strings than OpenAI
# OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
# see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
# Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
# see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
finish_reason = candidate["finishReason"]
if finish_reason == "STOP":
openai_finish_reason = (
"function_call"
if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
else "stop"
)
elif finish_reason == "MAX_TOKENS":
openai_finish_reason = "length"
elif finish_reason == "SAFETY":
openai_finish_reason = "content_filter"
elif finish_reason == "RECITATION":
openai_finish_reason = "content_filter"
else:
raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")
choices.append(
Choice(
finish_reason=openai_finish_reason,
index=index,
message=openai_response_message,
)
)
index += 1
else:
# Inner thoughts are the content by default
inner_thoughts = response_message["text"]
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(
role="assistant", # NOTE: "model" -> "assistant"
content=inner_thoughts,
)
# Google AI API uses different finish reason strings than OpenAI
# OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
# see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
# Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
# see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
finish_reason = candidate["finishReason"]
if finish_reason == "STOP":
openai_finish_reason = (
"function_call"
if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
else "stop"
)
elif finish_reason == "MAX_TOKENS":
openai_finish_reason = "length"
elif finish_reason == "SAFETY":
openai_finish_reason = "content_filter"
elif finish_reason == "RECITATION":
openai_finish_reason = "content_filter"
else:
raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")
choices.append(
Choice(
finish_reason=openai_finish_reason,
index=candidate["index"],
message=openai_response_message,
)
)
if len(choices) > 1:
raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")
# if len(choices) > 1:
# raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")
# NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist?
# "usageMetadata": {

View File

@@ -173,14 +173,14 @@ def check_agent_uses_external_tool(filename: str) -> LettaResponse:
My name is Letta.
I am a personal assistant who answers a user's questions about a website `example.com`. When a user asks me a question about `example.com`, I will use a tool called {tool.name} which will search `example.com` and answer the relevant question.
I am a personal assistant who uses a tool called {tool.name} to star a desired github repo.
Dont forget - inner monologue / inner thoughts should always be different than the contents of send_message! send_message is how you communicate with the user, whereas inner thoughts are your own personal inner thoughts.
"""
agent_state = setup_agent(client, filename, memory_persona_str=persona, tool_ids=[tool.id])
response = client.user_message(agent_id=agent_state.id, message="What's on the example.com website?")
response = client.user_message(agent_id=agent_state.id, message="Please star the repo with owner=letta-ai and repo=letta")
# Basic checks
assert_sanity_checks(response)

View File

@@ -7,3 +7,7 @@ filterwarnings =
markers =
local_sandbox: mark test as part of local sandbox tests
e2b_sandbox: mark test as part of E2B sandbox tests
openai_basic: Tests for OpenAI endpoints
anthropic_basic: Tests for Anthropic endpoints
azure_basic: Tests for Azure endpoints
gemini_basic: Tests for Gemini endpoints

View File

@@ -2,15 +2,15 @@ import functools
import os
import time
import pytest
from tests.helpers.endpoints_helper import (
check_agent_archival_memory_insert,
check_agent_archival_memory_retrieval,
check_agent_edit_core_memory,
check_agent_recall_chat_memory,
check_agent_summarize_memory_simple,
check_agent_uses_external_tool,
check_first_response_is_valid_for_llm_endpoint,
check_response_contains_keyword,
run_embedding_endpoint,
)
@@ -84,6 +84,7 @@ def retry_until_success(max_attempts=10, sleep_time_seconds=4):
# ======================================================================================================================
# OPENAI TESTS
# ======================================================================================================================
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -92,23 +93,16 @@ def test_openai_gpt_4o_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_returns_keyword():
    """Verify that GPT-4o can be prompted into emitting a specific keyword in its reply."""
    config_path = os.path.join(llm_config_dir, "openai-gpt-4o.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_uses_external_tool():
def test_openai_gpt_4o_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_recall_chat_memory():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -117,6 +111,7 @@ def test_openai_gpt_4o_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -125,6 +120,7 @@ def test_openai_gpt_4o_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_archival_memory_insert():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -133,6 +129,7 @@ def test_openai_gpt_4o_archival_memory_insert():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_edit_core_memory():
filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
@@ -141,13 +138,7 @@ def test_openai_gpt_4o_edit_core_memory():
print(f"Got successful response from client: \n\n{response}")
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_openai_gpt_4o_summarize_memory():
    """Verify that a GPT-4o-backed agent passes the simple memory-summarization check."""
    response = check_agent_summarize_memory_simple(os.path.join(llm_config_dir, "openai-gpt-4o.json"))
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
@pytest.mark.openai_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_embedding_endpoint_openai():
filename = os.path.join(embedding_config_dir, "openai_embed.json")
@@ -157,6 +148,8 @@ def test_embedding_endpoint_openai():
# ======================================================================================================================
# AZURE TESTS
# ======================================================================================================================
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -164,21 +157,17 @@ def test_azure_gpt_4o_mini_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_azure_gpt_4o_mini_returns_keyword():
    """Verify that Azure GPT-4o-mini can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_azure_gpt_4o_mini_uses_external_tool():
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_recall_chat_memory():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_recall_chat_memory(filename)
@@ -186,6 +175,8 @@ def test_azure_gpt_4o_mini_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_archival_memory_retrieval(filename)
@@ -193,6 +184,8 @@ def test_azure_gpt_4o_mini_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_gpt_4o_mini_edit_core_memory():
filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
response = check_agent_edit_core_memory(filename)
@@ -200,6 +193,8 @@ def test_azure_gpt_4o_mini_edit_core_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.azure_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_azure_embedding_endpoint():
filename = os.path.join(embedding_config_dir, "azure_embed.json")
run_embedding_endpoint(filename)
@@ -239,6 +234,8 @@ def test_embedding_endpoint_ollama():
# ======================================================================================================================
# ANTHROPIC TESTS
# ======================================================================================================================
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -246,21 +243,17 @@ def test_claude_haiku_3_5_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_claude_haiku_3_5_returns_keyword():
    """Verify that Claude 3.5 Haiku can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_claude_haiku_3_5_uses_external_tool():
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_recall_chat_memory():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_recall_chat_memory(filename)
@@ -268,6 +261,8 @@ def test_claude_haiku_3_5_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_archival_memory_retrieval(filename)
@@ -275,6 +270,8 @@ def test_claude_haiku_3_5_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.anthropic_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_claude_haiku_3_5_edit_core_memory():
filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
response = check_agent_edit_core_memory(filename)
@@ -292,15 +289,7 @@ def test_groq_llama31_70b_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_groq_llama31_70b_returns_keyword():
    """Verify that Groq Llama 3.1 70B can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "groq.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_groq_llama31_70b_uses_external_tool():
def test_groq_llama31_70b_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "groq.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
@@ -332,6 +321,8 @@ def test_groq_llama31_70b_edit_core_memory():
# ======================================================================================================================
# GEMINI TESTS
# ======================================================================================================================
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
@@ -339,21 +330,17 @@ def test_gemini_pro_15_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_gemini_pro_15_returns_keyword():
    """Verify that Gemini 1.5 Pro can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "gemini-pro.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_gemini_pro_15_uses_external_tool():
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_recall_chat_memory():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_recall_chat_memory(filename)
@@ -361,6 +348,8 @@ def test_gemini_pro_15_recall_chat_memory():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_archival_memory_retrieval(filename)
@@ -368,6 +357,8 @@ def test_gemini_pro_15_archival_memory_retrieval():
print(f"Got successful response from client: \n\n{response}")
@pytest.mark.gemini_basic
@retry_until_success(max_attempts=5, sleep_time_seconds=2)
def test_gemini_pro_15_edit_core_memory():
filename = os.path.join(llm_config_dir, "gemini-pro.json")
response = check_agent_edit_core_memory(filename)
@@ -385,15 +376,7 @@ def test_together_llama_3_70b_returns_valid_first_message():
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_returns_keyword():
    """Verify that Together's Llama 3 70B can be prompted into emitting a specific keyword."""
    config_path = os.path.join(llm_config_dir, "together-llama-3-70b.json")
    response = check_response_contains_keyword(config_path, keyword="banana")
    # Success — dump the full response for debugging/inspection
    print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_uses_external_tool():
def test_together_llama_3_70b_uses_external_tool(mock_e2b_api_key_none):
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response