feat: migrate integration tests to use model_settings (#6125)

* migrate integration tests to use model_settings

* updated configs

* update v2 tests

* revert files

* add some configs and fix tests

* fix conversation_search

---------

Co-authored-by: Ari Webb <ari@letta.com>
Authored by Sarah Wooders on 2025-11-19 11:21:09 -08:00; committed by Caren Thomas
parent f9b3978460
commit ce3add4f55
23 changed files with 675 additions and 398 deletions

File diff suppressed because it is too large.


@@ -16,8 +16,6 @@ from letta_client.types import AgentState, MessageCreateParam, ToolReturnMessage
 from letta_client.types.agents import AssistantMessage, ReasoningMessage, Run, ToolCallMessage, UserMessage
 from letta_client.types.agents.letta_streaming_response import LettaPing, LettaStopReason, LettaUsageStatistics
-from letta.schemas.llm_config import LLMConfig
 logger = logging.getLogger(__name__)
@@ -35,17 +33,17 @@ all_configs = [
 ]
-def get_llm_config(filename: str, llm_config_dir: str = "tests/configs/llm_model_configs") -> LLMConfig:
-    filename = os.path.join(llm_config_dir, filename)
+def get_model_config(filename: str, model_settings_dir: str = "tests/sdk_v1/model_settings") -> Tuple[str, dict]:
+    """Load a model_settings file and return the handle and settings dict."""
+    filename = os.path.join(model_settings_dir, filename)
     with open(filename, "r") as f:
         config_data = json.load(f)
-    llm_config = LLMConfig(**config_data)
-    return llm_config
+    return config_data["handle"], config_data.get("model_settings", {})
 requested = os.getenv("LLM_CONFIG_FILE")
 filenames = [requested] if requested else all_configs
-TESTED_LLM_CONFIGS: List[LLMConfig] = [get_llm_config(fn) for fn in filenames]
+TESTED_MODEL_CONFIGS: List[Tuple[str, dict]] = [get_model_config(fn) for fn in filenames]
 def roll_dice(num_sides: int) -> int:
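For reference, a minimal sketch of what the new loader returns (the filename below is hypothetical; the JSON shape matches the model_settings files added later in this commit):

# Sketch, not part of the diff: get_model_config unpacks a model_settings JSON file.
# Given {"handle": "openai/gpt-4o-mini", "model_settings": {"provider_type": "openai", ...}}:
handle, settings = get_model_config("gpt-4o-mini.json")  # hypothetical filename
assert handle == "openai/gpt-4o-mini"
assert settings["provider_type"] == "openai"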
@@ -91,7 +89,8 @@ USER_MESSAGE_PARALLEL_TOOL_CALL: List[MessageCreateParam] = [
 def assert_greeting_response(
     messages: List[Any],
-    llm_config: LLMConfig,
+    model_handle: str,
+    model_settings: dict,
     streaming: bool = False,
     token_streaming: bool = False,
     from_db: bool = False,
@@ -106,7 +105,7 @@ def assert_greeting_response(
     ]
     expected_message_count_min, expected_message_count_max = get_expected_message_count_range(
-        llm_config, streaming=streaming, from_db=from_db
+        model_handle, model_settings, streaming=streaming, from_db=from_db
     )
     assert expected_message_count_min <= len(messages) <= expected_message_count_max
@@ -120,7 +119,7 @@ def assert_greeting_response(
     # Reasoning message if reasoning enabled
     otid_suffix = 0
     try:
-        if is_reasoner_model(llm_config):
+        if is_reasoner_model(model_handle, model_settings):
             assert isinstance(messages[index], ReasoningMessage)
             assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
             index += 1
@@ -151,7 +150,8 @@ def assert_greeting_response(
 def assert_tool_call_response(
     messages: List[Any],
-    llm_config: LLMConfig,
+    model_handle: str,
+    model_settings: dict,
     streaming: bool = False,
     from_db: bool = False,
     with_cancellation: bool = False,
@@ -172,7 +172,7 @@ def assert_tool_call_response(
     if not with_cancellation:
         expected_message_count_min, expected_message_count_max = get_expected_message_count_range(
-            llm_config, tool_call=True, streaming=streaming, from_db=from_db
+            model_handle, model_settings, tool_call=True, streaming=streaming, from_db=from_db
         )
         assert expected_message_count_min <= len(messages) <= expected_message_count_max
@@ -190,7 +190,7 @@ def assert_tool_call_response(
     # Reasoning message if reasoning enabled
     otid_suffix = 0
     try:
-        if is_reasoner_model(llm_config):
+        if is_reasoner_model(model_handle, model_settings):
             assert isinstance(messages[index], ReasoningMessage)
             assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
             index += 1
@@ -201,7 +201,7 @@ def assert_tool_call_response(
     # Special case for claude-sonnet-4-5-20250929 and opus-4.1 which can generate an extra AssistantMessage before tool call
     if (
-        (llm_config.model == "claude-sonnet-4-5-20250929" or llm_config.model.startswith("claude-opus-4-1"))
+        ("claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle)
         and index < len(messages)
         and isinstance(messages[index], AssistantMessage)
     ):
@@ -235,7 +235,7 @@ def assert_tool_call_response(
     # Reasoning message if reasoning enabled
     otid_suffix = 0
     try:
-        if is_reasoner_model(llm_config):
+        if is_reasoner_model(model_handle, model_settings):
             assert isinstance(messages[index], ReasoningMessage)
             assert messages[index].otid and messages[index].otid[-1] == str(otid_suffix)
             index += 1
@@ -373,7 +373,7 @@ async def wait_for_run_completion(client: AsyncLetta, run_id: str, timeout: float
 def get_expected_message_count_range(
-    llm_config: LLMConfig, tool_call: bool = False, streaming: bool = False, from_db: bool = False
+    model_handle: str, model_settings: dict, tool_call: bool = False, streaming: bool = False, from_db: bool = False
 ) -> Tuple[int, int]:
     """
     Returns the expected range of number of messages for a given LLM configuration. Uses range to account for possible variations in the number of reasoning messages.
@@ -402,23 +402,26 @@ def get_expected_message_count_range(
     expected_message_count = 1
     expected_range = 0
-    if is_reasoner_model(llm_config):
+    if is_reasoner_model(model_handle, model_settings):
         # reasoning message
         expected_range += 1
         if tool_call:
             # check for sonnet 4.5 or opus 4.1 specifically
             is_sonnet_4_5_or_opus_4_1 = (
-                llm_config.model_endpoint_type == "anthropic"
-                and llm_config.enable_reasoner
-                and (llm_config.model.startswith("claude-sonnet-4-5") or llm_config.model.startswith("claude-opus-4-1"))
+                model_settings.get("provider_type") == "anthropic"
+                and model_settings.get("thinking", {}).get("type") == "enabled"
+                and ("claude-sonnet-4-5" in model_handle or "claude-opus-4-1" in model_handle)
             )
-            if is_sonnet_4_5_or_opus_4_1 or not LLMConfig.is_anthropic_reasoning_model(llm_config):
+            is_anthropic_reasoning = (
+                model_settings.get("provider_type") == "anthropic" and model_settings.get("thinking", {}).get("type") == "enabled"
+            )
+            if is_sonnet_4_5_or_opus_4_1 or not is_anthropic_reasoning:
                 # sonnet 4.5 and opus 4.1 return a reasoning message before the final assistant message
                 # so do the other native reasoning models
                 expected_range += 1
             # opus 4.1 generates an extra AssistantMessage before the tool call
-            if llm_config.model.startswith("claude-opus-4-1"):
+            if "claude-opus-4-1" in model_handle:
                 expected_range += 1
     if tool_call:
@@ -436,13 +439,34 @@ def get_expected_message_count_range(
     return expected_message_count, expected_message_count + expected_range
-def is_reasoner_model(llm_config: LLMConfig) -> bool:
-    return (
-        (LLMConfig.is_openai_reasoning_model(llm_config) and llm_config.reasoning_effort == "high")
-        or LLMConfig.is_anthropic_reasoning_model(llm_config)
-        or LLMConfig.is_google_vertex_reasoning_model(llm_config)
-        or LLMConfig.is_google_ai_reasoning_model(llm_config)
-    )
+def is_reasoner_model(model_handle: str, model_settings: dict) -> bool:
+    """Check if the model is a reasoning model based on its handle and settings."""
+    # OpenAI reasoning models with high reasoning effort
+    is_openai_reasoning = (
+        model_settings.get("provider_type") == "openai"
+        and (
+            "gpt-5" in model_handle
+            or "o1" in model_handle
+            or "o3" in model_handle
+            or "o4-mini" in model_handle
+            or "gpt-4.1" in model_handle
+        )
+        and model_settings.get("reasoning", {}).get("reasoning_effort") == "high"
+    )
+    # Anthropic models with thinking enabled
+    is_anthropic_reasoning = (
+        model_settings.get("provider_type") == "anthropic" and model_settings.get("thinking", {}).get("type") == "enabled"
+    )
+    # Google Vertex models with thinking config
+    is_google_vertex_reasoning = (
+        model_settings.get("provider_type") == "google_vertex" and model_settings.get("thinking_config", {}).get("include_thoughts") is True
+    )
+    # Google AI models with thinking config
+    is_google_ai_reasoning = (
+        model_settings.get("provider_type") == "google_ai" and model_settings.get("thinking_config", {}).get("include_thoughts") is True
+    )
+    return is_openai_reasoning or is_anthropic_reasoning or is_google_vertex_reasoning or is_google_ai_reasoning
 # ------------------------------
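To make the new heuristic concrete, a few illustrative calls (a sketch based on the function above and the config shapes added in this commit; not part of the diff):

# OpenAI o-series with high reasoning effort -> treated as a reasoner
is_reasoner_model("openai/o3", {"provider_type": "openai", "reasoning": {"reasoning_effort": "high"}})  # True
# Anthropic with extended thinking enabled -> treated as a reasoner
is_reasoner_model("anthropic/claude-sonnet-4-20250514", {"provider_type": "anthropic", "thinking": {"type": "enabled"}})  # True
# gpt-5 with minimal effort fails the "high" check -> not a reasoner here
is_reasoner_model("openai/gpt-5", {"provider_type": "openai", "reasoning": {"reasoning_effort": "minimal"}})  # False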
@@ -524,9 +548,9 @@ async def agent_state(client: AsyncLetta) -> AgentState:
 @pytest.mark.parametrize(
-    "llm_config",
-    TESTED_LLM_CONFIGS,
-    ids=[c.model for c in TESTED_LLM_CONFIGS],
+    "model_config",
+    TESTED_MODEL_CONFIGS,
+    ids=[handle for handle, _ in TESTED_MODEL_CONFIGS],
 )
 @pytest.mark.parametrize("send_type", ["step", "stream_steps", "stream_tokens", "stream_tokens_background", "async"])
 @pytest.mark.asyncio(loop_scope="function")
@@ -534,12 +558,13 @@ async def test_greeting(
     disable_e2b_api_key: Any,
     client: AsyncLetta,
     agent_state: AgentState,
-    llm_config: LLMConfig,
+    model_config: Tuple[str, dict],
     send_type: str,
 ) -> None:
+    model_handle, model_settings = model_config
     last_message_page = await client.agents.messages.list(agent_id=agent_state.id, limit=1)
     last_message = last_message_page.items[0] if last_message_page.items else None
-    agent_state = await client.agents.update(agent_id=agent_state.id, llm_config=llm_config)
+    agent_state = await client.agents.update(agent_id=agent_state.id, model=model_handle, model_settings=model_settings)
     if send_type == "step":
         response = await client.agents.messages.create(
@@ -573,19 +598,19 @@ async def test_greeting(
     run_id = runs.items[0].id if runs.items else None
     assert_greeting_response(
-        messages, streaming=("stream" in send_type), token_streaming=(send_type == "stream_tokens"), llm_config=llm_config
+        messages, model_handle, model_settings, streaming=("stream" in send_type), token_streaming=(send_type == "stream_tokens")
     )
     if "background" in send_type:
         response = await client.runs.messages.stream(run_id=run_id, starting_after=0)
         messages = await accumulate_chunks(response)
         assert_greeting_response(
-            messages, streaming=("stream" in send_type), token_streaming=(send_type == "stream_tokens"), llm_config=llm_config
+            messages, model_handle, model_settings, streaming=("stream" in send_type), token_streaming=(send_type == "stream_tokens")
         )
     messages_from_db_page = await client.agents.messages.list(agent_id=agent_state.id, after=last_message.id if last_message else None)
     messages_from_db = messages_from_db_page.items
-    assert_greeting_response(messages_from_db, from_db=True, llm_config=llm_config)
+    assert_greeting_response(messages_from_db, model_handle, model_settings, from_db=True)
     assert run_id is not None
     run = await client.runs.retrieve(run_id=run_id)
@@ -593,9 +618,9 @@
 @pytest.mark.parametrize(
-    "llm_config",
-    TESTED_LLM_CONFIGS,
-    ids=[c.model for c in TESTED_LLM_CONFIGS],
+    "model_config",
+    TESTED_MODEL_CONFIGS,
+    ids=[handle for handle, _ in TESTED_MODEL_CONFIGS],
 )
 @pytest.mark.parametrize("send_type", ["step", "stream_steps", "stream_tokens", "stream_tokens_background", "async"])
 @pytest.mark.asyncio(loop_scope="function")
@@ -603,28 +628,33 @@ async def test_parallel_tool_calls(
     disable_e2b_api_key: Any,
     client: AsyncLetta,
     agent_state: AgentState,
-    llm_config: LLMConfig,
+    model_config: Tuple[str, dict],
     send_type: str,
 ) -> None:
-    if llm_config.model_endpoint_type not in ["anthropic", "openai", "google_ai", "google_vertex"]:
+    model_handle, model_settings = model_config
+    provider_type = model_settings.get("provider_type", "")
+    if provider_type not in ["anthropic", "openai", "google_ai", "google_vertex"]:
         pytest.skip("Parallel tool calling test only applies to Anthropic, OpenAI, and Gemini models.")
-    if llm_config.model in ["gpt-5", "o3"]:
+    if "gpt-5" in model_handle or "o3" in model_handle:
         pytest.skip("GPT-5 takes too long to test, o3 is bad at this task.")
-    # change llm_config to support parallel tool calling
-    # Create a copy and modify it to ensure we're not modifying the original
-    modified_llm_config = llm_config.model_copy(deep=True)
-    modified_llm_config.parallel_tool_calls = True
-    # this test was flaking so set temperature to 0.0 to avoid randomness
-    modified_llm_config.temperature = 0.0
+    # Skip Gemini models due to issues with parallel tool calling
+    if provider_type in ["google_ai", "google_vertex"]:
+        pytest.skip("Gemini models are flaky for this test so we disable them for now")
-    # IMPORTANT: Set parallel_tool_calls at BOTH the agent level and llm_config level
-    # There are two different parallel_tool_calls fields that need to be set
+    # # Update model_settings to enable parallel tool calling
+    # modified_model_settings = model_settings.copy()
+    # modified_model_settings["parallel_tool_calls"] = True
+    # IMPORTANT: Set parallel_tool_calls at BOTH the agent level and in model_settings
+    # Even though the agent-level parameter is deprecated, it may still be needed
     agent_state = await client.agents.update(
         agent_id=agent_state.id,
-        llm_config=modified_llm_config,
-        parallel_tool_calls=True, # Set at agent level as well!
+        model=model_handle,
+        model_settings=model_settings,
+        parallel_tool_calls=True, # Set at agent level as well
     )
     if send_type == "step":
@@ -696,7 +726,7 @@ async def test_parallel_tool_calls(
     # IMPORTANT: Assert that parallel tool calling is actually working
     # This test should FAIL if parallel tool calling is not working properly
     assert is_parallel, (
-        f"Parallel tool calling is NOT working for {llm_config.model_endpoint_type}! "
+        f"Parallel tool calling is NOT working for {provider_type}! "
         f"Got {len(tool_call_messages)} ToolCallMessage(s) instead of 1 with 3 parallel calls. "
         f"When using letta_v1_agent with parallel_tool_calls=True, all tool calls should be in a single message."
     )
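For orientation, the check behind is_parallel presumably looks something like the sketch below (the tool_calls attribute name is an assumption, not taken from this diff):

# Sketch: with parallel tool calling working, all three dice rolls arrive
# in a single ToolCallMessage rather than as three sequential messages.
def _looks_parallel(tool_call_messages) -> bool:
    return len(tool_call_messages) == 1 and len(tool_call_messages[0].tool_calls) == 3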
@@ -773,9 +803,9 @@ async def test_parallel_tool_calls(
 @pytest.mark.parametrize(
-    "llm_config",
-    TESTED_LLM_CONFIGS,
-    ids=[c.model for c in TESTED_LLM_CONFIGS],
+    "model_config",
+    TESTED_MODEL_CONFIGS,
+    ids=[handle for handle, _ in TESTED_MODEL_CONFIGS],
 )
 @pytest.mark.parametrize(
     ["send_type", "cancellation"],
@@ -796,20 +826,22 @@ async def test_tool_call(
     disable_e2b_api_key: Any,
     client: AsyncLetta,
     agent_state: AgentState,
-    llm_config: LLMConfig,
+    model_config: Tuple[str, dict],
     send_type: str,
     cancellation: str,
 ) -> None:
+    model_handle, model_settings = model_config
     # Skip models with OTID mismatch issues between ToolCallMessage and ToolReturnMessage
-    if llm_config.model == "gpt-5" or llm_config.model == "claude-sonnet-4-5-20250929" or llm_config.model.startswith("claude-opus-4-1"):
-        pytest.skip(f"Skipping {llm_config.model} due to OTID chain issue - messages receive incorrect OTID suffixes")
+    if "gpt-5" in model_handle or "claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle:
+        pytest.skip(f"Skipping {model_handle} due to OTID chain issue - messages receive incorrect OTID suffixes")
     last_message_page = await client.agents.messages.list(agent_id=agent_state.id, limit=1)
     last_message = last_message_page.items[0] if last_message_page.items else None
-    agent_state = await client.agents.update(agent_id=agent_state.id, llm_config=llm_config)
+    agent_state = await client.agents.update(agent_id=agent_state.id, model=model_handle, model_settings=model_settings)
     if cancellation == "with_cancellation":
-        delay = 5 if llm_config.model == "gpt-5" else 0.5 # increase delay for responses api
+        delay = 5 if "gpt-5" in model_handle else 0.5 # increase delay for responses api
         _cancellation_task = asyncio.create_task(cancel_run_after_delay(client, agent_state.id, delay=delay))
     if send_type == "step":
@@ -844,20 +876,24 @@ async def test_tool_call(
     run_id = runs.items[0].id if runs.items else None
     assert_tool_call_response(
-        messages, streaming=("stream" in send_type), llm_config=llm_config, with_cancellation=(cancellation == "with_cancellation")
+        messages, model_handle, model_settings, streaming=("stream" in send_type), with_cancellation=(cancellation == "with_cancellation")
     )
     if "background" in send_type:
         response = await client.runs.messages.stream(run_id=run_id, starting_after=0)
         messages = await accumulate_chunks(response)
         assert_tool_call_response(
-            messages, streaming=("stream" in send_type), llm_config=llm_config, with_cancellation=(cancellation == "with_cancellation")
+            messages,
+            model_handle,
+            model_settings,
+            streaming=("stream" in send_type),
+            with_cancellation=(cancellation == "with_cancellation"),
         )
     messages_from_db_page = await client.agents.messages.list(agent_id=agent_state.id, after=last_message.id if last_message else None)
     messages_from_db = messages_from_db_page.items
     assert_tool_call_response(
-        messages_from_db, from_db=True, llm_config=llm_config, with_cancellation=(cancellation == "with_cancellation")
+        messages_from_db, model_handle, model_settings, from_db=True, with_cancellation=(cancellation == "with_cancellation")
     )
     assert run_id is not None
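Taken together, the migrated pattern for any test in this file is roughly the following (a condensed sketch of the changes above, not an additional change in the commit):

@pytest.mark.parametrize("model_config", TESTED_MODEL_CONFIGS, ids=[handle for handle, _ in TESTED_MODEL_CONFIGS])
@pytest.mark.asyncio(loop_scope="function")
async def test_example(client: AsyncLetta, agent_state: AgentState, model_config: Tuple[str, dict]) -> None:
    model_handle, model_settings = model_config
    # Point the agent at the model under test via handle plus settings dict,
    # replacing the old agents.update(..., llm_config=LLMConfig(...)) call.
    agent_state = await client.agents.update(agent_id=agent_state.id, model=model_handle, model_settings=model_settings)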


@@ -0,0 +1,9 @@
{
"handle": "azure/gpt-4o-mini",
"model_settings": {
"provider_type": "azure",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false
}
}


@@ -0,0 +1,9 @@
{
"handle": "bedrock/arn:aws:bedrock:us-east-1:474668403324:inference-profile/us.anthropic.claude-sonnet-4-20250514-v1:0",
"model_settings": {
"provider_type": "bedrock",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false
}
}


@@ -0,0 +1,13 @@
{
"handle": "anthropic/claude-3-5-sonnet-20241022",
"model_settings": {
"provider_type": "anthropic",
"temperature": 1.0,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"thinking": {
"type": "enabled",
"budget_tokens": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "anthropic/claude-3-7-sonnet-20250219",
"model_settings": {
"provider_type": "anthropic",
"temperature": 1.0,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"thinking": {
"type": "enabled",
"budget_tokens": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "anthropic/claude-3-7-sonnet-20250219",
"model_settings": {
"provider_type": "anthropic",
"temperature": 1.0,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"thinking": {
"type": "enabled",
"budget_tokens": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "anthropic/claude-sonnet-4-5-20250929",
"model_settings": {
"provider_type": "anthropic",
"temperature": 1.0,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"thinking": {
"type": "enabled",
"budget_tokens": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "anthropic/claude-sonnet-4-20250514",
"model_settings": {
"provider_type": "anthropic",
"temperature": 1.0,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"thinking": {
"type": "enabled",
"budget_tokens": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "anthropic/claude-sonnet-4-20250514",
"model_settings": {
"provider_type": "anthropic",
"temperature": 1.0,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"thinking": {
"type": "enabled",
"budget_tokens": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "google_vertex/gemini-2.5-flash",
"model_settings": {
"provider_type": "google_vertex",
"temperature": 0.7,
"max_output_tokens": 65536,
"parallel_tool_calls": false,
"thinking_config": {
"include_thoughts": true,
"thinking_budget": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "google_vertex/gemini-2.5-pro",
"model_settings": {
"provider_type": "google_vertex",
"temperature": 0.7,
"max_output_tokens": 65536,
"parallel_tool_calls": false,
"thinking_config": {
"include_thoughts": true,
"thinking_budget": 1024
}
}
}


@@ -0,0 +1,13 @@
{
"handle": "google_ai/gemini-2.5-pro",
"model_settings": {
"provider_type": "google_ai",
"temperature": 0.7,
"max_output_tokens": 65536,
"parallel_tool_calls": false,
"thinking_config": {
"include_thoughts": true,
"thinking_budget": 1024
}
}
}
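Note how the Google configs interact with is_reasoner_model from the test diff above: include_thoughts is what marks them as reasoning models (illustrative snippet; the filename is assumed):

handle, settings = get_model_config("gemini-2.5-pro.json")  # hypothetical filename
assert is_reasoner_model(handle, settings)  # True: thinking_config.include_thoughts is true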


@@ -0,0 +1,9 @@
{
"handle": "groq/qwen/qwen3-32b",
"model_settings": {
"provider_type": "groq",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false
}
}


@@ -0,0 +1,9 @@
{
"handle": "ollama/qwen2.5:7b",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false
}
}


@@ -0,0 +1,12 @@
{
"handle": "openai/gpt-4.1-2025-04-14",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "high"
}
}
}


@@ -0,0 +1,12 @@
{
"handle": "openai/gpt-4o-mini",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "minimal"
}
}
}


@@ -0,0 +1,11 @@
{
"handle": "openai/gpt-5",
"model_settings": {
"provider_type": "openai",
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "minimal"
}
}
}


@@ -0,0 +1,12 @@
{
"handle": "openai/o1",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "high"
}
}
}


@@ -0,0 +1,12 @@
{
"handle": "openai/o3",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "high"
}
}
}


@@ -0,0 +1,12 @@
{
"handle": "openai/o4-mini",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "high"
}
}
}


@@ -0,0 +1,9 @@
{
"handle": "together/Qwen/Qwen2.5-72B-Instruct-Turbo",
"model_settings": {
"provider_type": "together",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false
}
}
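A quick way to sanity-check the new config files locally (a hypothetical snippet, not included in the commit):

import json
import pathlib

# Every model_settings file must expose a handle plus a settings dict,
# since get_model_config unpacks exactly those two fields.
for path in pathlib.Path("tests/sdk_v1/model_settings").glob("*.json"):
    data = json.loads(path.read_text())
    assert "handle" in data, f"{path} is missing a handle"
    assert isinstance(data.get("model_settings", {}), dict), f"{path} has malformed model_settings"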