From 7eb85707b155fab64c9bab578948570b0ecf7b49 Mon Sep 17 00:00:00 2001 From: Kian Jones <11655409+kianjones9@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:56:55 -0800 Subject: [PATCH] feat(tf): gpu runners and prod memory_repos (#9283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add gpu runners and prod memory_repos * add lmstudio and vllm in model_settings * fix llm_configs and change variable name in reusable workflow and change perms for memory_repos to admin in tf * fix: update self-hosted provider tests to use SDK 1.0 and v2 tests - Update letta-client from ==0.1.324 to >=1.0.0 - Switch ollama/vllm/lmstudio tests to integration_test_send_message_v2.py 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix: use openai provider_type for self-hosted model settings ollama/vllm/lmstudio are not valid provider_type values in the SDK model_settings schema - they use openai-compatible APIs so provider_type should be openai. The provider routing is determined by the handle prefix. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix: use openai_compat_base_url for ollama/vllm/lmstudio providers When reconstructing LLMConfig from a model handle lookup, use the provider's openai_compat_base_url (which includes /v1) instead of raw base_url. This fixes 404 errors when calling ollama/vllm/lmstudio since OpenAI client expects /v1/chat/completions endpoint. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix: enable redis for ollama/vllm/lmstudio tests Background streaming tests require Redis. Add use-redis: true to self-hosted provider test workflows. 
🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * add memfs-py in prod bucket access * change ollama * change packer model defaults * self-hosted provider support * disable reasoner to match the number of messages in test case, enable parallel tool calls, and pass embedding configs * remove reasoning setting not supported for ollama * add qwen3 to extra assistant message case * lower temp * prep for lmstudio and vllm * used lmstudio_openai client * skip parallel tool calls on cpu-run provider lmstudio * revert downgrade since it's so slow already * add required flags for tool call parsing etc. * change tool call parser from hermes to qwen3_xml * qwen3_xmlk -> qwen3_coder * upgrade vllm to latest container * revert to hermes (incompatible with parallel tool calls?) and skipping vllm tests on parallel tool calls * install uv redis extra * remove lmstudio --------- Co-authored-by: Letta --- letta/services/provider_manager.py | 9 ++++--- tests/integration_test_send_message_v2.py | 33 +++++++++++++++++------ tests/model_settings/ollama.json | 6 ++--- tests/model_settings/vllm.json | 9 +++++++ 4 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 tests/model_settings/vllm.json diff --git a/letta/services/provider_manager.py b/letta/services/provider_manager.py index 2f66d140..c8a3a1ae 100644 --- a/letta/services/provider_manager.py +++ b/letta/services/provider_manager.py @@ -991,10 +991,13 @@ class ProviderManager: # Get the default max_output_tokens from the provider (provider-specific logic) max_tokens = typed_provider.get_default_max_output_tokens(model.name) - # Determine the model endpoint - use provider's base_url if set, - # otherwise use provider-specific defaults + # Determine the model endpoint - use provider's OpenAI-compatible base_url if available, + # otherwise fall back to raw base_url or provider-specific defaults - if typed_provider.base_url: + if hasattr(typed_provider, "openai_compat_base_url"): + # For providers 
like ollama/vllm/lmstudio that need /v1 appended for OpenAI compatibility + model_endpoint = typed_provider.openai_compat_base_url + elif typed_provider.base_url: model_endpoint = typed_provider.base_url elif provider.provider_type == ProviderType.chatgpt_oauth: # ChatGPT OAuth uses the ChatGPT backend API, not a generic endpoint pattern diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py index ce267e7d..7f040eb2 100644 --- a/tests/integration_test_send_message_v2.py +++ b/tests/integration_test_send_message_v2.py @@ -207,12 +207,15 @@ def assert_tool_call_response( # Reasoning is non-deterministic, so don't throw if missing pass - # Special case for claude-sonnet-4-5-20250929, opus-4.1, and zai which can generate an extra AssistantMessage before tool call - if ( - ("claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle or model_settings.get("provider_type") == "zai") - and index < len(messages) - and isinstance(messages[index], AssistantMessage) - ): + # Special case for models that can generate an extra AssistantMessage before tool call + # (claude-sonnet-4-5, opus-4.1, zai, and self-hosted models like ollama/qwen3 with thinking) + is_extra_assistant_model = ( + "claude-sonnet-4-5-20250929" in model_handle + or "claude-opus-4-1" in model_handle + or model_settings.get("provider_type") == "zai" + or model_handle.startswith(("ollama/", "vllm/")) + ) + if is_extra_assistant_model and index < len(messages) and isinstance(messages[index], AssistantMessage): # Skip the extra AssistantMessage and move to the next message index += 1 otid_suffix += 1 @@ -441,6 +444,10 @@ def get_expected_message_count_range( if model_settings.get("provider_type") == "zai": expected_range += 1 + # Self-hosted models (ollama/vllm) may emit an extra AssistantMessage with thinking content + if model_handle.startswith(("ollama/", "vllm/")): + expected_range += 1 + if tool_call: # tool call and tool return messages 
expected_message_count += 2 @@ -561,13 +568,16 @@ async def agent_state(client: AsyncLetta) -> AgentState: """ dice_tool = await client.tools.upsert_from_function(func=roll_dice) + initial_model = TESTED_MODEL_CONFIGS[0][0] if TESTED_MODEL_CONFIGS else "openai/gpt-4o" + initial_embedding = os.getenv("EMBEDDING_HANDLE", "openai/text-embedding-3-small") + agent_state_instance = await client.agents.create( agent_type="letta_v1_agent", name="test_agent", include_base_tools=False, tool_ids=[dice_tool.id], - model="openai/gpt-4o", - embedding="openai/text-embedding-3-small", + model=initial_model, + embedding=initial_embedding, tags=["test"], ) yield agent_state_instance @@ -677,6 +687,9 @@ async def test_parallel_tool_calls( if provider_type in ["google_ai", "google_vertex"]: pytest.skip("Gemini models are flaky for this test so we disable them for now") + if model_handle.startswith("vllm"): + pytest.skip("vLLM Qwen3 tool call parsers incompatible with streaming parallel tool calls") + # Update model_settings to enable parallel tool calling modified_model_settings = model_settings.copy() modified_model_settings["parallel_tool_calls"] = True @@ -1076,6 +1089,10 @@ async def test_conversation_non_streaming_raw_http( assert "assistant_message" in message_types, f"Expected assistant_message in {message_types}" +@pytest.mark.skipif( + os.getenv("LLM_CONFIG_FILE", "").startswith(("ollama", "vllm")), + reason="Structured output not supported on self-hosted providers in CI", +) @pytest.mark.parametrize( "model_handle,provider_type", [ diff --git a/tests/model_settings/ollama.json b/tests/model_settings/ollama.json index 9382a68c..bd905dac 100644 --- a/tests/model_settings/ollama.json +++ b/tests/model_settings/ollama.json @@ -1,9 +1,9 @@ { - "handle": "ollama/qwen2.5:7b", + "handle": "ollama/qwen3:8b", "model_settings": { "provider_type": "openai", - "temperature": 1.0, + "temperature": 0.7, "max_output_tokens": 4096, - "parallel_tool_calls": false + "parallel_tool_calls": true 
} } diff --git a/tests/model_settings/vllm.json b/tests/model_settings/vllm.json new file mode 100644 index 00000000..0ee9492b --- /dev/null +++ b/tests/model_settings/vllm.json @@ -0,0 +1,9 @@ +{ + "handle": "vllm/Qwen/Qwen3-32B-AWQ", + "model_settings": { + "provider_type": "openai", + "temperature": 0.7, + "max_output_tokens": 4096, + "parallel_tool_calls": true + } +}