feat(ci): self-hosted provider test for lmstudio (#9404)
* add gpu runners and prod memory_repos
* add lmstudio and vllm in model_settings
* fix llm_configs and change variable name in reusable workflow and change perms for memory_repos to admin in tf
* fix: update self-hosted provider tests to use SDK 1.0 and v2 tests
  - Update letta-client from ==0.1.324 to >=1.0.0
  - Switch ollama/vllm/lmstudio tests to integration_test_send_message_v2.py

  🤖 Generated with [Letta Code](https://letta.com)
  Co-Authored-By: Letta <noreply@letta.com>
* fix: use openai provider_type for self-hosted model settings

  ollama/vllm/lmstudio are not valid provider_type values in the SDK model_settings schema - they use openai-compatible APIs so provider_type should be openai. The provider routing is determined by the handle prefix.

  🤖 Generated with [Letta Code](https://letta.com)
  Co-Authored-By: Letta <noreply@letta.com>
* fix: enable redis for ollama/vllm/lmstudio tests

  Background streaming tests require Redis. Add use-redis: true to self-hosted provider test workflows.

  🤖 Generated with [Letta Code](https://letta.com)
  Co-Authored-By: Letta <noreply@letta.com>
* prep for lmstudio and vllm
* used lmstudio_openai client
* change tool call parser from hermes to qwen3_xml
* qwen3_xmlk -> qwen3_coder
* revert to hermes (incompatible with parallel tool calls?) and skipping vllm tests on parallel tool calls
* install uv redis extra
* remove lmstudio
* create lmstudio test
* qwen3-14b on lmstudio
* try with qwen3-4b
* actually update the model config json to use qwen3-4b
* add test_providers::test_lmstudio
* bump timeout from 60 to 120 for slow lmstudio on cpu model
* misc vllm changes

---------

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -213,7 +213,7 @@ def assert_tool_call_response(
|
|||||||
"claude-sonnet-4-5-20250929" in model_handle
|
"claude-sonnet-4-5-20250929" in model_handle
|
||||||
or "claude-opus-4-1" in model_handle
|
or "claude-opus-4-1" in model_handle
|
||||||
or model_settings.get("provider_type") == "zai"
|
or model_settings.get("provider_type") == "zai"
|
||||||
or model_handle.startswith(("ollama/", "vllm/"))
|
or model_handle.startswith(("ollama/", "vllm/", "lmstudio_openai/"))
|
||||||
)
|
)
|
||||||
if is_extra_assistant_model and index < len(messages) and isinstance(messages[index], AssistantMessage):
|
if is_extra_assistant_model and index < len(messages) and isinstance(messages[index], AssistantMessage):
|
||||||
# Skip the extra AssistantMessage and move to the next message
|
# Skip the extra AssistantMessage and move to the next message
|
||||||
@@ -444,8 +444,8 @@ def get_expected_message_count_range(
|
|||||||
if model_settings.get("provider_type") == "zai":
|
if model_settings.get("provider_type") == "zai":
|
||||||
expected_range += 1
|
expected_range += 1
|
||||||
|
|
||||||
# Self-hosted models (ollama/vllm) may emit an extra AssistantMessage with thinking content
|
# Self-hosted models (ollama/vllm/lmstudio) may emit an extra AssistantMessage with thinking content
|
||||||
if model_handle.startswith(("ollama/", "vllm/")):
|
if model_handle.startswith(("ollama/", "vllm/", "lmstudio/", "lmstudio_openai/")):
|
||||||
expected_range += 1
|
expected_range += 1
|
||||||
|
|
||||||
if tool_call:
|
if tool_call:
|
||||||
@@ -621,7 +621,7 @@ async def test_greeting(
|
|||||||
agent_id=agent_state.id,
|
agent_id=agent_state.id,
|
||||||
messages=USER_MESSAGE_FORCE_REPLY,
|
messages=USER_MESSAGE_FORCE_REPLY,
|
||||||
)
|
)
|
||||||
run = await wait_for_run_completion(client, run.id, timeout=60.0)
|
run = await wait_for_run_completion(client, run.id, timeout=120.0)
|
||||||
messages_page = await client.runs.messages.list(run_id=run.id)
|
messages_page = await client.runs.messages.list(run_id=run.id)
|
||||||
messages = [m for m in messages_page.items if m.message_type != "user_message"]
|
messages = [m for m in messages_page.items if m.message_type != "user_message"]
|
||||||
run_id = run.id
|
run_id = run.id
|
||||||
@@ -687,6 +687,12 @@ async def test_parallel_tool_calls(
|
|||||||
if provider_type in ["google_ai", "google_vertex"]:
|
if provider_type in ["google_ai", "google_vertex"]:
|
||||||
pytest.skip("Gemini models are flaky for this test so we disable them for now")
|
pytest.skip("Gemini models are flaky for this test so we disable them for now")
|
||||||
|
|
||||||
|
if model_handle.startswith("lmstudio"):
|
||||||
|
pytest.skip("LMStudio runs on CPU and times out on parallel tool call tests")
|
||||||
|
|
||||||
|
if model_handle.startswith("vllm"):
|
||||||
|
pytest.skip("vLLM Qwen3 tool call parsers incompatible with streaming parallel tool calls")
|
||||||
|
|
||||||
if model_handle.startswith("vllm"):
|
if model_handle.startswith("vllm"):
|
||||||
pytest.skip("vLLM Qwen3 tool call parsers incompatible with streaming parallel tool calls")
|
pytest.skip("vLLM Qwen3 tool call parsers incompatible with streaming parallel tool calls")
|
||||||
|
|
||||||
@@ -710,7 +716,7 @@ async def test_parallel_tool_calls(
|
|||||||
agent_id=agent_state.id,
|
agent_id=agent_state.id,
|
||||||
messages=USER_MESSAGE_PARALLEL_TOOL_CALL,
|
messages=USER_MESSAGE_PARALLEL_TOOL_CALL,
|
||||||
)
|
)
|
||||||
await wait_for_run_completion(client, run.id, timeout=60.0)
|
await wait_for_run_completion(client, run.id, timeout=120.0)
|
||||||
else:
|
else:
|
||||||
response = await client.agents.messages.stream(
|
response = await client.agents.messages.stream(
|
||||||
agent_id=agent_state.id,
|
agent_id=agent_state.id,
|
||||||
@@ -899,7 +905,7 @@ async def test_tool_call(
|
|||||||
agent_id=agent_state.id,
|
agent_id=agent_state.id,
|
||||||
messages=USER_MESSAGE_ROLL_DICE,
|
messages=USER_MESSAGE_ROLL_DICE,
|
||||||
)
|
)
|
||||||
run = await wait_for_run_completion(client, run.id, timeout=60.0)
|
run = await wait_for_run_completion(client, run.id, timeout=120.0)
|
||||||
messages_page = await client.runs.messages.list(run_id=run.id)
|
messages_page = await client.runs.messages.list(run_id=run.id)
|
||||||
messages = [m for m in messages_page.items if m.message_type != "user_message"]
|
messages = [m for m in messages_page.items if m.message_type != "user_message"]
|
||||||
run_id = run.id
|
run_id = run.id
|
||||||
@@ -1090,7 +1096,7 @@ async def test_conversation_non_streaming_raw_http(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
os.getenv("LLM_CONFIG_FILE", "").startswith(("ollama", "vllm")),
|
os.getenv("LLM_CONFIG_FILE", "").startswith(("ollama", "vllm", "lmstudio")),
|
||||||
reason="Structured output not supported on self-hosted providers in CI",
|
reason="Structured output not supported on self-hosted providers in CI",
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
9
tests/model_settings/lmstudio.json
Normal file
9
tests/model_settings/lmstudio.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"handle": "lmstudio_openai/qwen3-4b",
|
||||||
|
"model_settings": {
|
||||||
|
"provider_type": "openai",
|
||||||
|
"temperature": 0.7,
|
||||||
|
"max_output_tokens": 4096,
|
||||||
|
"parallel_tool_calls": true
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -233,6 +233,21 @@ async def test_vllm():
|
|||||||
assert len(embedding_models) == 0 # embedding models currently not supported by vLLM
|
assert len(embedding_models) == 0 # embedding models currently not supported by vLLM
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(model_settings.lmstudio_base_url is None, reason="Only run if LMSTUDIO_BASE_URL is set.")
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_lmstudio():
|
||||||
|
from letta.schemas.providers.lmstudio import LMStudioOpenAIProvider
|
||||||
|
|
||||||
|
provider = LMStudioOpenAIProvider(name="lmstudio_openai", base_url=model_settings.lmstudio_base_url)
|
||||||
|
models = await provider.list_llm_models_async()
|
||||||
|
assert len(models) > 0
|
||||||
|
assert models[0].handle == f"{provider.name}/{models[0].model}"
|
||||||
|
|
||||||
|
embedding_models = await provider.list_embedding_models_async()
|
||||||
|
assert len(embedding_models) > 0
|
||||||
|
assert embedding_models[0].handle == f"{provider.name}/{embedding_models[0].embedding_model}"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(model_settings.sglang_api_base is None, reason="Only run if SGLANG_API_BASE is set.")
|
@pytest.mark.skipif(model_settings.sglang_api_base is None, reason="Only run if SGLANG_API_BASE is set.")
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_sglang():
|
async def test_sglang():
|
||||||
|
|||||||
Reference in New Issue
Block a user