From 7eb85707b155fab64c9bab578948570b0ecf7b49 Mon Sep 17 00:00:00 2001 From: Kian Jones <11655409+kianjones9@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:56:55 -0800 Subject: [PATCH] feat(tf): gpu runners and prod memory_repos (#9283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add gpu runners and prod memory_repos * add lmstudio and vllm in model_settings * fix llm_configs and change variable name in reusable workflow and change perms for memory_repos to admin in tf * fix: update self-hosted provider tests to use SDK 1.0 and v2 tests - Update letta-client from ==0.1.324 to >=1.0.0 - Switch ollama/vllm/lmstudio tests to integration_test_send_message_v2.py 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix: use openai provider_type for self-hosted model settings ollama/vllm/lmstudio are not valid provider_type values in the SDK model_settings schema - they use openai-compatible APIs so provider_type should be openai. The provider routing is determined by the handle prefix. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix: use openai_compat_base_url for ollama/vllm/lmstudio providers When reconstructing LLMConfig from a model handle lookup, use the provider's openai_compat_base_url (which includes /v1) instead of raw base_url. This fixes 404 errors when calling ollama/vllm/lmstudio since OpenAI client expects /v1/chat/completions endpoint. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix: enable redis for ollama/vllm/lmstudio tests Background streaming tests require Redis. Add use-redis: true to self-hosted provider test workflows. 
🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * add memfs-py in prod bucket access * change ollama * change packer model defaults * self-hosted provider support * disable reasoner to match the number of messages in test case, enable parallel tool calls, and pass embedding configs * remove reasoning setting not supported for ollama * add qwen3 to extra assistant message case * lower temp * prep for lmstudio and vllm * used lmstudio_openai client * skip parallel tool calls on cpu-run provider lmstudio * revert downgrade since it's so slow already * add required flags for tool call parsing etc. * change tool call parser from hermes to qwen3_xml * qwen3_xmlk -> qwen3_coder * upgrade vllm to latest container * revert to hermes (incompatible with parallel tool calls?) and skipping vllm tests on parallel tool calls * install uv redis extra * remove lmstudio --------- Co-authored-by: Letta --- letta/services/provider_manager.py | 9 ++++--- tests/integration_test_send_message_v2.py | 33 +++++++++++++++++------ tests/model_settings/ollama.json | 6 ++--- tests/model_settings/vllm.json | 9 +++++++ 4 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 tests/model_settings/vllm.json diff --git a/letta/services/provider_manager.py b/letta/services/provider_manager.py index 2f66d140..c8a3a1ae 100644 --- a/letta/services/provider_manager.py +++ b/letta/services/provider_manager.py @@ -991,10 +991,13 @@ class ProviderManager: # Get the default max_output_tokens from the provider (provider-specific logic) max_tokens = typed_provider.get_default_max_output_tokens(model.name) - # Determine the model endpoint - use provider's base_url if set, - # otherwise use provider-specific defaults + # Determine the model endpoint - use provider's OpenAI-compatible base_url if available, + # otherwise fall back to raw base_url or provider-specific defaults - if typed_provider.base_url: + if hasattr(typed_provider, "openai_compat_base_url"): + # For providers 
like ollama/vllm/lmstudio that need /v1 appended for OpenAI compatibility + model_endpoint = typed_provider.openai_compat_base_url + elif typed_provider.base_url: model_endpoint = typed_provider.base_url elif provider.provider_type == ProviderType.chatgpt_oauth: # ChatGPT OAuth uses the ChatGPT backend API, not a generic endpoint pattern diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py index ce267e7d..7f040eb2 100644 --- a/tests/integration_test_send_message_v2.py +++ b/tests/integration_test_send_message_v2.py @@ -207,12 +207,15 @@ def assert_tool_call_response( # Reasoning is non-deterministic, so don't throw if missing pass - # Special case for claude-sonnet-4-5-20250929, opus-4.1, and zai which can generate an extra AssistantMessage before tool call - if ( - ("claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle or model_settings.get("provider_type") == "zai") - and index < len(messages) - and isinstance(messages[index], AssistantMessage) - ): + # Special case for models that can generate an extra AssistantMessage before tool call + # (claude-sonnet-4-5, opus-4.1, zai, and self-hosted models like ollama/qwen3 with thinking) + is_extra_assistant_model = ( + "claude-sonnet-4-5-20250929" in model_handle + or "claude-opus-4-1" in model_handle + or model_settings.get("provider_type") == "zai" + or model_handle.startswith(("ollama/", "vllm/")) + ) + if is_extra_assistant_model and index < len(messages) and isinstance(messages[index], AssistantMessage): # Skip the extra AssistantMessage and move to the next message index += 1 otid_suffix += 1 @@ -441,6 +444,10 @@ def get_expected_message_count_range( if model_settings.get("provider_type") == "zai": expected_range += 1 + # Self-hosted models (ollama/vllm) may emit an extra AssistantMessage with thinking content + if model_handle.startswith(("ollama/", "vllm/")): + expected_range += 1 + if tool_call: # tool call and tool return messages 
expected_message_count += 2 @@ -561,13 +568,16 @@ async def agent_state(client: AsyncLetta) -> AgentState: """ dice_tool = await client.tools.upsert_from_function(func=roll_dice) + initial_model = TESTED_MODEL_CONFIGS[0][0] if TESTED_MODEL_CONFIGS else "openai/gpt-4o" + initial_embedding = os.getenv("EMBEDDING_HANDLE", "openai/text-embedding-3-small") + agent_state_instance = await client.agents.create( agent_type="letta_v1_agent", name="test_agent", include_base_tools=False, tool_ids=[dice_tool.id], - model="openai/gpt-4o", - embedding="openai/text-embedding-3-small", + model=initial_model, + embedding=initial_embedding, tags=["test"], ) yield agent_state_instance @@ -677,6 +687,9 @@ async def test_parallel_tool_calls( if provider_type in ["google_ai", "google_vertex"]: pytest.skip("Gemini models are flaky for this test so we disable them for now") + if model_handle.startswith("vllm"): + pytest.skip("vLLM Qwen3 tool call parsers incompatible with streaming parallel tool calls") + # Update model_settings to enable parallel tool calling modified_model_settings = model_settings.copy() modified_model_settings["parallel_tool_calls"] = True @@ -1076,6 +1089,10 @@ async def test_conversation_non_streaming_raw_http( assert "assistant_message" in message_types, f"Expected assistant_message in {message_types}" +@pytest.mark.skipif( + os.getenv("LLM_CONFIG_FILE", "").startswith(("ollama", "vllm")), + reason="Structured output not supported on self-hosted providers in CI", +) @pytest.mark.parametrize( "model_handle,provider_type", [ diff --git a/tests/model_settings/ollama.json b/tests/model_settings/ollama.json index 9382a68c..bd905dac 100644 --- a/tests/model_settings/ollama.json +++ b/tests/model_settings/ollama.json @@ -1,9 +1,9 @@ { - "handle": "ollama/qwen2.5:7b", + "handle": "ollama/qwen3:8b", "model_settings": { "provider_type": "openai", - "temperature": 1.0, + "temperature": 0.7, "max_output_tokens": 4096, - "parallel_tool_calls": false + "parallel_tool_calls": true 
} } diff --git a/tests/model_settings/vllm.json b/tests/model_settings/vllm.json new file mode 100644 index 00000000..0ee9492b --- /dev/null +++ b/tests/model_settings/vllm.json @@ -0,0 +1,9 @@ +{ + "handle": "vllm/Qwen/Qwen3-32B-AWQ", + "model_settings": { + "provider_type": "openai", + "temperature": 0.7, + "max_output_tokens": 4096, + "parallel_tool_calls": true + } +}