feat: patch model listing to actually match handle [LET-5888] (#5754)

2025-10-27 15:30:48 -07:00
parent 042a32d371
commit e7fff12da0
14 changed files with 60 additions and 82 deletions
--- a/fern/examples/agent_config.py
+++ b/fern/examples/agent_config.py
@@ -32,7 +32,7 @@ azure_openai_agent = client.agents.create(

 # anthropic
 anthropic_agent = client.agents.create(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-20250514",
    # note: anthropic does not support embeddings so you will need another provider
    embedding="openai/text-embedding-3-small",
    # optional configuration
--- a/fern/examples/data_sources.py
+++ b/fern/examples/data_sources.py
@@ -50,7 +50,7 @@ print(f"Passages in source: {passages}")
 agent = client.agents.create(
    name="my_agent",
    memory_blocks=[],
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-20250514",
    embedding=embedding_configs[0].handle,
    tags=["worker"],
 )
--- a/fern/examples/memory.py
+++ b/fern/examples/memory.py
@@ -8,7 +8,7 @@ agent = client.agents.create(
        {"label": "persona", "value": "I am a memory agent"},
        {"label": "human", "value": "Name: Bob", "limit": 10000},
    ],
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
 )
@@ -25,7 +25,7 @@ block = client.blocks.create(
 shared_block_agent = client.agents.create(
    name="shared_block_agent",
    memory_blocks=[block.id],
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
 )
--- a/fern/examples/simple_multiagent.py
+++ b/fern/examples/simple_multiagent.py
@@ -10,7 +10,7 @@ try:
        memory_blocks=[
            {"label": "persona", "value": "I am the supervisor, and I can communicate with worker agents with the tag `worker`"}
        ],
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-3-small",
        tags=["supervisor"],
        tools=["send_message_to_agents_matching_all_tags"],
@@ -28,7 +28,7 @@ try:
    worker_agent = client.agents.create(
        name="worker_agent",
        memory_blocks=[{"label": "persona", "value": f"I am the worker, my supervisor agent has ID {supervisor_agent.id}"}],
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-3-small",
        tool_ids=[tool.id],
        tags=["worker"],
--- a/fern/examples/tool_rules.py
+++ b/fern/examples/tool_rules.py
@@ -16,7 +16,7 @@ client = Letta(base_url="http://localhost:8283")
 search_agent = client.agents.create(
    name="search_agent",
    memory_blocks=[],
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
    tool_rules=[
--- a/letta/llm_api/deepseek_client.py
+++ b/letta/llm_api/deepseek_client.py
@@ -59,7 +59,7 @@ def handle_assistant_message(assistant_message: AssistantMessage) -> AssistantMe
    return assistant_message


-def map_messages_to_deepseek_format(messages: List[ChatMessage]) -> List[_Message]:
+def map_messages_to_deepseek_format(messages: List[ChatMessage]) -> List["_Message"]:
    """
    DeepSeek API has the following constraints: messages must be interleaved between user and assistant messages, ending on a user message.
    Tools are currently unstable for V3 and not supported for R1 in the API: https://api-docs.deepseek.com/guides/function_calling.
@@ -103,7 +103,7 @@ def map_messages_to_deepseek_format(messages: List[ChatMessage]) -> List[_Messag

 def build_deepseek_chat_completions_request(
    llm_config: LLMConfig,
-    messages: List[_Message],
+    messages: List["_Message"],
    user_id: Optional[str],
    functions: Optional[list],
    function_call: Optional[str],
--- a/letta/server/rest_api/app.py
+++ b/letta/server/rest_api/app.py
@@ -26,6 +26,7 @@ from letta.errors import (
    AgentFileImportError,
    AgentNotFoundForExportError,
    BedrockPermissionError,
+    HandleNotFoundError,
    LettaAgentNotFoundError,
    LettaExpiredError,
    LettaInvalidArgumentError,
@@ -369,6 +370,7 @@ def create_application() -> "FastAPI":
    app.add_exception_handler(LettaAgentNotFoundError, _error_handler_404_agent)
    app.add_exception_handler(LettaUserNotFoundError, _error_handler_404_user)
    app.add_exception_handler(AgentNotFoundForExportError, _error_handler_404)
+    app.add_exception_handler(HandleNotFoundError, _error_handler_404)

    # 410 Expired errors
    app.add_exception_handler(LettaExpiredError, _error_handler_410)
--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -430,6 +430,11 @@ class SyncServer(object):
            log_event(name="start get_cached_llm_config", attributes=config_params)
            request.llm_config = await self.get_cached_llm_config_async(actor=actor, **config_params)
            log_event(name="end get_cached_llm_config", attributes=config_params)
+            if request.model and isinstance(request.model, str):
+                assert request.llm_config.handle == request.model, (
+                    f"LLM config handle {request.llm_config.handle} does not match request handle {request.model}"
+                )
+        print("GOT LLM CONFIG", request.llm_config)

        if request.reasoning is None:
            request.reasoning = request.llm_config.enable_reasoner or request.llm_config.put_inner_thoughts_in_kwargs
@@ -1031,92 +1036,63 @@ class SyncServer(object):
        max_reasoning_tokens: Optional[int] = None,
        enable_reasoner: Optional[bool] = None,
    ) -> LLMConfig:
-        try:
-            provider_name, model_name = handle.split("/", 1)
-            provider = await self.get_provider_from_name_async(provider_name, actor)
-
-            all_llm_configs = await provider.list_llm_models_async()
-            llm_configs = [config for config in all_llm_configs if config.handle == handle]
-            if not llm_configs:
-                llm_configs = [config for config in all_llm_configs if config.model == model_name]
-            if not llm_configs:
-                available_handles = [config.handle for config in all_llm_configs]
-                raise HandleNotFoundError(handle, available_handles)
-        except ValueError as e:
-            llm_configs = [config for config in self.get_local_llm_configs() if config.handle == handle]
-            if not llm_configs:
-                llm_configs = [config for config in self.get_local_llm_configs() if config.model == model_name]
-            if not llm_configs:
-                raise e
-
-        if len(llm_configs) == 1:
-            llm_config = llm_configs[0]
-        elif len(llm_configs) > 1:
-            raise LettaInvalidArgumentError(
-                f"Multiple LLM models with name {model_name} supported by {provider_name}", argument_name="model_name"
-            )
-        else:
-            llm_config = llm_configs[0]
+        """String match the `handle` to the available configs"""
+        matched_llm_config = None
+        available_handles = []
+        for provider in self._enabled_providers:
+            llm_configs = await provider.list_llm_models_async()
+            for llm_config in llm_configs:
+                available_handles.append(llm_config.handle)
+                if llm_config.handle == handle:
+                    matched_llm_config = llm_config
+                    break
+        if not matched_llm_config:
+            raise HandleNotFoundError(handle, available_handles)

        if context_window_limit is not None:
-            if context_window_limit > llm_config.context_window:
+            if context_window_limit > matched_llm_config.context_window:
                raise LettaInvalidArgumentError(
-                    f"Context window limit ({context_window_limit}) is greater than maximum of ({llm_config.context_window})",
+                    f"Context window limit ({context_window_limit}) is greater than maximum of ({matched_llm_config.context_window})",
                    argument_name="context_window_limit",
                )
-            llm_config.context_window = context_window_limit
+            matched_llm_config.context_window = context_window_limit
        else:
-            llm_config.context_window = min(llm_config.context_window, model_settings.global_max_context_window_limit)
+            matched_llm_config.context_window = min(matched_llm_config.context_window, model_settings.global_max_context_window_limit)

        if max_tokens is not None:
-            llm_config.max_tokens = max_tokens
+            matched_llm_config.max_tokens = max_tokens
        if max_reasoning_tokens is not None:
            if not max_tokens or max_reasoning_tokens > max_tokens:
                raise LettaInvalidArgumentError(
                    f"Max reasoning tokens ({max_reasoning_tokens}) must be less than max tokens ({max_tokens})",
                    argument_name="max_reasoning_tokens",
                )
-            llm_config.max_reasoning_tokens = max_reasoning_tokens
+            matched_llm_config.max_reasoning_tokens = max_reasoning_tokens
        if enable_reasoner is not None:
-            llm_config.enable_reasoner = enable_reasoner
-            if enable_reasoner and llm_config.model_endpoint_type == "anthropic":
-                llm_config.put_inner_thoughts_in_kwargs = False
+            matched_llm_config.enable_reasoner = enable_reasoner
+            if enable_reasoner and matched_llm_config.model_endpoint_type == "anthropic":
+                matched_llm_config.put_inner_thoughts_in_kwargs = False

-        return llm_config
+        return matched_llm_config

    @trace_method
    async def get_embedding_config_from_handle_async(
        self, actor: User, handle: str, embedding_chunk_size: int = constants.DEFAULT_EMBEDDING_CHUNK_SIZE
    ) -> EmbeddingConfig:
-        try:
-            provider_name, model_name = handle.split("/", 1)
-            provider = await self.get_provider_from_name_async(provider_name, actor)
-
-            all_embedding_configs = await provider.list_embedding_models_async()
-            embedding_configs = [config for config in all_embedding_configs if config.handle == handle]
-            if not embedding_configs:
-                raise LettaInvalidArgumentError(
-                    f"Embedding model {model_name} is not supported by {provider_name}", argument_name="model_name"
-                )
-        except LettaInvalidArgumentError as e:
-            # search local configs
-            embedding_configs = [config for config in self.get_local_embedding_configs() if config.handle == handle]
-            if not embedding_configs:
-                raise e
-
-        if len(embedding_configs) == 1:
-            embedding_config = embedding_configs[0]
-        elif len(embedding_configs) > 1:
-            raise LettaInvalidArgumentError(
-                f"Multiple embedding models with name {model_name} supported by {provider_name}", argument_name="model_name"
-            )
-        else:
-            embedding_config = embedding_configs[0]
+        """String match the `handle` to the available embedding configs.
+
+        Every provider in `self._enabled_providers` is asked for its embedding
+        models and each config's `handle` is compared to `handle` for exact
+        equality.
+
+        Args:
+            actor: Not read by this lookup; kept so the signature is unchanged.
+            handle: Full embedding handle to look up.
+            embedding_chunk_size: When truthy, written onto the matched config
+                before it is returned.
+
+        Returns:
+            The matched config object (mutated in place when
+            `embedding_chunk_size` is truthy).
+
+        Raises:
+            HandleNotFoundError: No provider lists a config with this handle.
+        """
+        matched_embedding_config = None
+        available_handles = []
+        for provider in self._enabled_providers:
+            embedding_configs = await provider.list_embedding_models_async()
+            for embedding_config in embedding_configs:
+                available_handles.append(embedding_config.handle)
+                if embedding_config.handle == handle:
+                    matched_embedding_config = embedding_config
+                    # NOTE(review): this only leaves the per-provider loop; the
+                    # remaining providers are still listed and a later provider
+                    # exposing the same handle would replace this match.
+                    break
+        # Fail with the same error as the LLM handle lookup instead of hitting
+        # an AttributeError on None below.
+        if matched_embedding_config is None:
+            raise HandleNotFoundError(handle, available_handles)

        if embedding_chunk_size:
-            embedding_config.embedding_chunk_size = embedding_chunk_size
+            matched_embedding_config.embedding_chunk_size = embedding_chunk_size

-        return embedding_config
+        return matched_embedding_config

    async def get_provider_from_name_async(self, provider_name: str, actor: User) -> Provider:
        all_providers = await self.get_enabled_providers_async(actor)
--- a/tests/integration_test_batch_api_cron_jobs.py
+++ b/tests/integration_test_batch_api_cron_jobs.py
@@ -134,7 +134,7 @@ def create_failed_response(custom_id: str) -> BetaMessageBatchIndividualResponse
 # --- Test Setup Helpers --- #


-async def create_test_agent(name, actor, test_id: Optional[str] = None, model="anthropic/claude-3-5-sonnet-20241022"):
+async def create_test_agent(name, actor, test_id: Optional[str] = None, model="anthropic/claude-sonnet-4-20250514"):
    """Create a test agent with standardized configuration."""
    dummy_llm_config = LLMConfig(
        model="claude-3-7-sonnet-latest",
--- a/tests/integration_test_voice_agent.py
+++ b/tests/integration_test_voice_agent.py
@@ -225,7 +225,7 @@ def _assert_valid_chunk(chunk, idx, chunks):


@pytest.mark.asyncio(loop_scope="module")
-@pytest.mark.parametrize("model", ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet-20241022"])
+@pytest.mark.parametrize("model", ["openai/gpt-4o-mini", "anthropic/claude-sonnet-4-20250514"])
@pytest.mark.parametrize(
    "message", ["How are you?", "Use the roll_dice tool to roll a die for me", "Use the run_code tool to calculate 2+2"]
 )
--- a/tests/test_letta_agent_batch.py
+++ b/tests/test_letta_agent_batch.py
@@ -35,9 +35,9 @@ from tests.utils import create_tool_from_func

 # Model identifiers used in tests
 MODELS = {
-    "sonnet": "anthropic/claude-3-5-sonnet-20241022",
-    "haiku": "anthropic/claude-3-5-haiku-20241022",
-    "opus": "anthropic/claude-3-opus-20240229",
+    "sonnet": "anthropic/claude-sonnet-4-20250514",
+    "haiku": "anthropic/claude-haiku-4-5-20251001",
+    "opus": "anthropic/claude-opus-4-1-20250805",
 }

 # Expected message roles in batch requests
@@ -933,9 +933,9 @@ async def test_step_until_request_prepares_and_submits_batch_correctly(

    # Map of agent IDs to their expected models
    expected_models = {
-        agent_sonnet.id: "claude-3-5-sonnet-20241022",
-        agent_haiku.id: "claude-3-5-haiku-20241022",
-        agent_opus.id: "claude-3-opus-20240229",
+        agent_sonnet.id: "claude-sonnet-4-20250514",
+        agent_haiku.id: "claude-haiku-4-5-20251001",
+        agent_opus.id: "claude-opus-4-1-20250805",
    }

    # Set up spy function for the Anthropic client
--- a/tests/test_llm_clients.py
+++ b/tests/test_llm_clients.py
@@ -17,7 +17,7 @@ def llm_config():
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=32000,
-        handle="anthropic/claude-3-5-sonnet-20241022",
+        handle="anthropic/claude-sonnet-4-20250514",
        put_inner_thoughts_in_kwargs=False,
        max_tokens=4096,
        enable_reasoner=True,
--- a/tests/test_sdk_client.py
+++ b/tests/test_sdk_client.py
@@ -2331,7 +2331,7 @@ def test_create_agent(client: LettaSDKClient) -> None:
                label="human",
            )
        ],
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-ada-002",
    )
    assert agent is not None
--- a/tests/test_sonnet_nonnative_reasoning_buffering.py
+++ b/tests/test_sonnet_nonnative_reasoning_buffering.py
@@ -126,7 +126,7 @@ def detect_burst_chunks(chunks: List[Tuple[float, any]], burst_threshold: float
@pytest.mark.parametrize(
    "model,expected_buffering",
    [
-        ("anthropic/claude-3-5-sonnet-20241022", False),  # With fine-grained streaming beta, should stream better
+        ("anthropic/claude-sonnet-4-20250514", False),  # With fine-grained streaming beta, should stream better
        ("anthropic/claude-sonnet-4-20250514", False),  # Sonnet 4 should NOT show buffering (has native reasoning)
        ("openai/gpt-4.1", False),  # GPT-4.1 should NOT show buffering (uses native reasoning)
    ],