fix(core): raise LLMEmptyResponseError for empty Anthropic responses (#9624)

* fix(core): raise LLMEmptyResponseError for empty Anthropic responses Fixes LET-7679: Opus 4.6 occasionally returns empty responses (no content and no tool calls), causing silent failures with stop_reason=end_turn. Changes: - Add LLMEmptyResponseError class (subclass of LLMServerError) - Raise error in anthropic_client for empty non-streaming responses - Raise error in anthropic_streaming_interface for empty streaming responses - Pass through LLMError instances in handle_llm_error to preserve specific types - Add test for empty streaming response detection This allows clients (letta-code) to catch this specific error and implement retry logic with cache-busting modifications. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * fix(core): set invalid_llm_response stop reason for empty responses Catch LLMEmptyResponseError specifically and set stop_reason to invalid_llm_response instead of llm_api_error. This allows clients to distinguish empty responses from transient API errors. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> --------- Co-authored-by: Letta <noreply@letta.com>
2026-02-24 19:57:23 -08:00
parent 86ff216dc9
commit 3d781efd21
5 changed files with 116 additions and 6 deletions
--- a/letta/agents/letta_agent_v3.py
+++ b/letta/agents/letta_agent_v3.py
@@ -21,7 +21,7 @@ from letta.agents.helpers import (
 )
 from letta.agents.letta_agent_v2 import LettaAgentV2
 from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
-from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
+from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
 from letta.helpers import ToolRulesSolver
 from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
 from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -990,6 +990,9 @@ class LettaAgentV3(LettaAgentV2):
                    except ValueError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
                        raise e
+                    except LLMEmptyResponseError as e:
+                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
+                        raise e
                    except LLMError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
                        raise e
--- a/letta/errors.py
+++ b/letta/errors.py
@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
    while processing the request."""


+class LLMEmptyResponseError(LLMServerError):
+    """Error when LLM returns an empty response (no content and no tool calls).
+
+    This is a subclass of LLMServerError to maintain retry behavior, but allows
+    specific handling for empty response cases which may benefit from request
+    modification before retry.
+    """
+
+
 class LLMTimeoutError(LLMError):
    """Error when LLM request times out"""

--- a/letta/interfaces/anthropic_streaming_interface.py
+++ b/letta/interfaces/anthropic_streaming_interface.py
@@ -30,6 +30,7 @@ from anthropic.types.beta import (
 )

 from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
+from letta.errors import LLMEmptyResponseError
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
        self.inner_thoughts_complete = False
        self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg

+        # Track whether any content was produced (text or tool calls)
+        # Used to detect empty responses from models like Opus 4.6
+        self.has_content = False
+
        # Buffer to handle partial XML tags across chunks
        self.partial_tag_buffer = ""

@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:

            if isinstance(content, BetaTextBlock):
                self.anthropic_mode = EventMode.TEXT
+                self.has_content = True  # Track that we received text content
                # TODO: Can capture citations, etc.
            elif isinstance(content, BetaToolUseBlock):
                self.anthropic_mode = EventMode.TOOL_USE
+                self.has_content = True  # Track that we received tool use content
                self.tool_call_id = content.id
                self.tool_call_name = content.name
                self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
            # message_delta event are *cumulative*." So we assign, not accumulate.
            self.output_tokens = event.usage.output_tokens
        elif isinstance(event, BetaRawMessageStopEvent):
-            # Don't do anything here! We don't want to stop the stream.
-            pass
+            # Check if any content was produced during the stream
+            # Empty responses (no text and no tool calls) should raise an error
+            if not self.has_content:
+                raise LLMEmptyResponseError(
+                    message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+                )
        elif isinstance(event, BetaRawContentBlockStopEvent):
            # If we're exiting a tool use block and there are still buffered messages,
            # we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:

            if isinstance(content, BetaTextBlock):
                self.anthropic_mode = EventMode.TEXT
+                self.has_content = True  # Track that we received text content
                # TODO: Can capture citations, etc.

            elif isinstance(content, BetaToolUseBlock):
                self.anthropic_mode = EventMode.TOOL_USE
+                self.has_content = True  # Track that we received tool use content
                self.tool_call_id = content.id
                self.tool_call_name = content.name

@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
            self.output_tokens = event.usage.output_tokens

        elif isinstance(event, BetaRawMessageStopEvent):
-            # Don't do anything here! We don't want to stop the stream.
-            pass
+            # Check if any content was produced during the stream
+            # Empty responses (no text and no tool calls) should raise an error
+            if not self.has_content:
+                raise LLMEmptyResponseError(
+                    message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+                )

        elif isinstance(event, BetaRawContentBlockStopEvent):
            self.anthropic_mode = None
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -19,6 +19,8 @@ from letta.errors import (
    LLMAuthenticationError,
    LLMBadRequestError,
    LLMConnectionError,
+    LLMEmptyResponseError,
+    LLMError,
    LLMInsufficientCreditsError,
    LLMNotFoundError,
    LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):

    @trace_method
    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        # Pass through errors that are already LLMError instances unchanged
+        # This preserves specific error types like LLMEmptyResponseError
+        if isinstance(e, LLMError):
+            return e
+
        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None

        # make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
                response.stop_reason,
                json.dumps(response_data),
            )
-            raise LLMServerError(
+            raise LLMEmptyResponseError(
                message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
                details={
--- a/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
+++ b/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
@@ -2,6 +2,12 @@ import anthropic
 import httpx
 import openai
 import pytest
+from anthropic.types.beta import (
+    BetaMessage,
+    BetaRawMessageStartEvent,
+    BetaRawMessageStopEvent,
+    BetaUsage,
+)
 from google.genai import errors as google_errors

 from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
    ContextWindowExceededError,
    LLMBadRequestError,
    LLMConnectionError,
+    LLMEmptyResponseError,
    LLMInsufficientCreditsError,
    LLMServerError,
 )
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
    result = client.handle_llm_error(error)
    assert isinstance(result, LLMBadRequestError)
    assert not isinstance(result, LLMInsufficientCreditsError)
+
+
+@pytest.mark.asyncio
+async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
+    """LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
+
+    This tests the case where Opus 4.6 returns a response with:
+    - BetaRawMessageStartEvent (with usage tokens)
+    - BetaRawMessageStopEvent (end_turn)
+    - NO content blocks in between
+
+    This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
+    """
+
+    class FakeAsyncStream:
+        """Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
+
+        def __init__(self):
+            self.events = [
+                # Message start with some usage info
+                BetaRawMessageStartEvent(
+                    type="message_start",
+                    message=BetaMessage(
+                        id="msg_test_empty",
+                        type="message",
+                        role="assistant",
+                        content=[],  # Empty content
+                        model="claude-opus-4-6",
+                        stop_reason="end_turn",
+                        stop_sequence=None,
+                        usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
+                    ),
+                ),
+                # Message stop immediately after start - no content blocks
+                BetaRawMessageStopEvent(type="message_stop"),
+            ]
+            self.index = 0
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return None
+
+        def __aiter__(self):
+            return self
+
+        async def __anext__(self):
+            if self.index >= len(self.events):
+                raise StopAsyncIteration
+            event = self.events[self.index]
+            self.index += 1
+            return event
+
+    async def fake_stream_async(self, request_data: dict, llm_config):
+        return FakeAsyncStream()
+
+    monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
+
+    llm_client = AnthropicClient()
+    llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
+    adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
+
+    gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
+    with pytest.raises(LLMEmptyResponseError):
+        async for _ in gen:
+            pass