feat(tf): gpu runners and prod memory_repos (#9283)

* add gpu runners and prod memory_repos

* add lmstudio and vllm in model_settings

* fix llm_configs and change variable name in reusable workflow and change perms for memory_repos to admin in tf

* fix: update self-hosted provider tests to use SDK 1.0 and v2 tests

- Update letta-client from ==0.1.324 to >=1.0.0
- Switch ollama/vllm/lmstudio tests to integration_test_send_message_v2.py

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: use openai provider_type for self-hosted model settings

ollama/vllm/lmstudio are not valid provider_type values in the SDK
model_settings schema - they use openai-compatible APIs so provider_type
should be openai. The provider routing is determined by the handle prefix.

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: use openai_compat_base_url for ollama/vllm/lmstudio providers

When reconstructing LLMConfig from a model handle lookup, use the
provider's openai_compat_base_url (which includes /v1) instead of
raw base_url. This fixes 404 errors when calling ollama/vllm/lmstudio
since OpenAI client expects /v1/chat/completions endpoint.

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: enable redis for ollama/vllm/lmstudio tests

Background streaming tests require Redis. Add use-redis: true to
self-hosted provider test workflows.

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* add memfs-py in prod bucket access

* change ollama

* change packer model defaults

* self-hosted provider support

* disable reasoner to match the number of messages in test case, enable parallel tool calls, and pass embedding configs

* remove reasoning setting not supported for ollama

* add qwen3 to extra assistant message case

* lower temp

* prep for lmstudio and vllm

* used lmstudio_openai client

* skip parallel tool calls on CPU-run provider lmstudio

* revert downgrade since it's so slow already

* add required flags for tool call parsing etc.

* change tool call parser from hermes to qwen3_xml

* qwen3_xml -> qwen3_coder

* upgrade vllm to latest container

* revert to hermes (incompatible with parallel tool calls?) and skipping vllm tests on parallel tool calls

* install uv redis extra

* remove lmstudio

---------

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Kian Jones
2026-02-10 10:56:55 -08:00
committed by Caren Thomas
parent 23c94ec6d3
commit 7eb85707b1
4 changed files with 43 additions and 14 deletions

View File

@@ -207,12 +207,15 @@ def assert_tool_call_response(
# Reasoning is non-deterministic, so don't throw if missing
pass
# Special case for claude-sonnet-4-5-20250929, opus-4.1, and zai which can generate an extra AssistantMessage before tool call
if (
("claude-sonnet-4-5-20250929" in model_handle or "claude-opus-4-1" in model_handle or model_settings.get("provider_type") == "zai")
and index < len(messages)
and isinstance(messages[index], AssistantMessage)
):
# Special case for models that can generate an extra AssistantMessage before tool call
# (claude-sonnet-4-5, opus-4.1, zai, and self-hosted models like ollama/qwen3 with thinking)
is_extra_assistant_model = (
"claude-sonnet-4-5-20250929" in model_handle
or "claude-opus-4-1" in model_handle
or model_settings.get("provider_type") == "zai"
or model_handle.startswith(("ollama/", "vllm/"))
)
if is_extra_assistant_model and index < len(messages) and isinstance(messages[index], AssistantMessage):
# Skip the extra AssistantMessage and move to the next message
index += 1
otid_suffix += 1
@@ -441,6 +444,10 @@ def get_expected_message_count_range(
if model_settings.get("provider_type") == "zai":
expected_range += 1
# Self-hosted models (ollama/vllm) may emit an extra AssistantMessage with thinking content
if model_handle.startswith(("ollama/", "vllm/")):
expected_range += 1
if tool_call:
# tool call and tool return messages
expected_message_count += 2
@@ -561,13 +568,16 @@ async def agent_state(client: AsyncLetta) -> AgentState:
"""
dice_tool = await client.tools.upsert_from_function(func=roll_dice)
initial_model = TESTED_MODEL_CONFIGS[0][0] if TESTED_MODEL_CONFIGS else "openai/gpt-4o"
initial_embedding = os.getenv("EMBEDDING_HANDLE", "openai/text-embedding-3-small")
agent_state_instance = await client.agents.create(
agent_type="letta_v1_agent",
name="test_agent",
include_base_tools=False,
tool_ids=[dice_tool.id],
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
model=initial_model,
embedding=initial_embedding,
tags=["test"],
)
yield agent_state_instance
@@ -677,6 +687,9 @@ async def test_parallel_tool_calls(
if provider_type in ["google_ai", "google_vertex"]:
pytest.skip("Gemini models are flaky for this test so we disable them for now")
if model_handle.startswith("vllm"):
pytest.skip("vLLM Qwen3 tool call parsers incompatible with streaming parallel tool calls")
# Update model_settings to enable parallel tool calling
modified_model_settings = model_settings.copy()
modified_model_settings["parallel_tool_calls"] = True
@@ -1076,6 +1089,10 @@ async def test_conversation_non_streaming_raw_http(
assert "assistant_message" in message_types, f"Expected assistant_message in {message_types}"
@pytest.mark.skipif(
os.getenv("LLM_CONFIG_FILE", "").startswith(("ollama", "vllm")),
reason="Structured output not supported on self-hosted providers in CI",
)
@pytest.mark.parametrize(
"model_handle,provider_type",
[

View File

@@ -1,9 +1,9 @@
{
"handle": "ollama/qwen2.5:7b",
"handle": "ollama/qwen3:8b",
"model_settings": {
"provider_type": "openai",
"temperature": 1.0,
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": false
"parallel_tool_calls": true
}
}

View File

@@ -0,0 +1,9 @@
{
"handle": "vllm/Qwen/Qwen3-32B-AWQ",
"model_settings": {
"provider_type": "openai",
"temperature": 0.7,
"max_output_tokens": 4096,
"parallel_tool_calls": true
}
}