fix(core): raise LLMEmptyResponseError for empty Anthropic responses (#9624)

* fix(core): raise LLMEmptyResponseError for empty Anthropic responses

  Fixes LET-7679: Opus 4.6 occasionally returns empty responses (no content and no tool calls), causing silent failures with stop_reason=end_turn.

  Changes:
  - Add LLMEmptyResponseError class (subclass of LLMServerError)
  - Raise error in anthropic_client for empty non-streaming responses
  - Raise error in anthropic_streaming_interface for empty streaming responses
  - Pass through LLMError instances in handle_llm_error to preserve specific types
  - Add test for empty streaming response detection

  This allows clients (letta-code) to catch this specific error and implement retry logic with cache-busting modifications.

  🤖 Generated with [Letta Code](https://letta.com)

  Co-Authored-By: Letta <noreply@letta.com>

* fix(core): set invalid_llm_response stop reason for empty responses

  Catch LLMEmptyResponseError specifically and set stop_reason to invalid_llm_response instead of llm_api_error. This allows clients to distinguish empty responses from transient API errors.

  🤖 Generated with [Letta Code](https://letta.com)

  Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
2026-02-24 19:57:23 -08:00
parent 86ff216dc9
commit 3d781efd21
5 changed files with 116 additions and 6 deletions
--- a/letta/agents/letta_agent_v3.py
+++ b/letta/agents/letta_agent_v3.py
@@ -21,7 +21,7 @@ from letta.agents.helpers import (
 )
 from letta.agents.letta_agent_v2 import LettaAgentV2
 from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
-from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
+from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
 from letta.helpers import ToolRulesSolver
 from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
 from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -990,6 +990,9 @@ class LettaAgentV3(LettaAgentV2):
                    except ValueError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
                        raise e
                    except LLMEmptyResponseError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
                        raise e
                    except LLMError as e:
                        self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
                        raise e
--- a/letta/errors.py
+++ b/letta/errors.py
@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
    while processing the request."""
 class LLMEmptyResponseError(LLMServerError):
    """Raised when an LLM response carries neither content nor tool calls.

    Deriving from LLMServerError means any handler written for LLMServerError
    still catches this error (the stated intent is to keep existing retry
    behavior). Callers that want to treat an empty response differently, for
    example by modifying the request before retrying, can catch this narrower
    type instead.
    """
 class LLMTimeoutError(LLMError):
    """Error when LLM request times out"""
--- a/letta/interfaces/anthropic_streaming_interface.py
+++ b/letta/interfaces/anthropic_streaming_interface.py
@@ -30,6 +30,7 @@ from anthropic.types.beta import (
 )
 from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
 from letta.errors import LLMEmptyResponseError
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
        self.inner_thoughts_complete = False
        self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
        # Track whether any content was produced (text or tool calls)
        # Used to detect empty responses from models like Opus 4.6
        self.has_content = False
        # Buffer to handle partial XML tags across chunks
        self.partial_tag_buffer = ""
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
            if isinstance(content, BetaTextBlock):
                self.anthropic_mode = EventMode.TEXT
                self.has_content = True  # Track that we received text content
                # TODO: Can capture citations, etc.
            elif isinstance(content, BetaToolUseBlock):
                self.anthropic_mode = EventMode.TOOL_USE
                self.has_content = True  # Track that we received tool use content
                self.tool_call_id = content.id
                self.tool_call_name = content.name
                self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
            # message_delta event are *cumulative*." So we assign, not accumulate.
            self.output_tokens = event.usage.output_tokens
        elif isinstance(event, BetaRawMessageStopEvent):
-            # Don't do anything here! We don't want to stop the stream.
+            # Check if any content was produced during the stream
-            pass
+            # Empty responses (no text and no tool calls) should raise an error
            if not self.has_content:
                raise LLMEmptyResponseError(
                    message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
                )
        elif isinstance(event, BetaRawContentBlockStopEvent):
            # If we're exiting a tool use block and there are still buffered messages,
            # we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
            if isinstance(content, BetaTextBlock):
                self.anthropic_mode = EventMode.TEXT
                self.has_content = True  # Track that we received text content
                # TODO: Can capture citations, etc.
            elif isinstance(content, BetaToolUseBlock):
                self.anthropic_mode = EventMode.TOOL_USE
                self.has_content = True  # Track that we received tool use content
                self.tool_call_id = content.id
                self.tool_call_name = content.name
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
            self.output_tokens = event.usage.output_tokens
        elif isinstance(event, BetaRawMessageStopEvent):
-            # Don't do anything here! We don't want to stop the stream.
+            # Check if any content was produced during the stream
-            pass
+            # Empty responses (no text and no tool calls) should raise an error
            # has_content is assigned in this class only when a text or tool-use
            # block starts; no initializer for it is visible for
            # SimpleAnthropicStreamingInterface in this change. On a stream with
            # no content blocks the attribute may therefore not exist yet, so
            # default to False: the empty case must surface as
            # LLMEmptyResponseError, not as an AttributeError.
            if not getattr(self, "has_content", False):
                raise LLMEmptyResponseError(
                    message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
                )
        elif isinstance(event, BetaRawContentBlockStopEvent):
            self.anthropic_mode = None
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -19,6 +19,8 @@ from letta.errors import (
    LLMAuthenticationError,
    LLMBadRequestError,
    LLMConnectionError,
    LLMEmptyResponseError,
    LLMError,
    LLMInsufficientCreditsError,
    LLMNotFoundError,
    LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
    @trace_method
    def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
        # Pass through errors that are already LLMError instances unchanged
        # This preserves specific error types like LLMEmptyResponseError
        if isinstance(e, LLMError):
            return e
        is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
        # make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
                response.stop_reason,
                json.dumps(response_data),
            )
-            raise LLMServerError(
+            raise LLMEmptyResponseError(
                message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
                code=ErrorCode.INTERNAL_SERVER_ERROR,
                details={
--- a/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
+++ b/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
@@ -2,6 +2,12 @@ import anthropic
 import httpx
 import openai
 import pytest
 from anthropic.types.beta import (
    BetaMessage,
    BetaRawMessageStartEvent,
    BetaRawMessageStopEvent,
    BetaUsage,
 )
 from google.genai import errors as google_errors
 from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
    ContextWindowExceededError,
    LLMBadRequestError,
    LLMConnectionError,
    LLMEmptyResponseError,
    LLMInsufficientCreditsError,
    LLMServerError,
 )
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
    result = client.handle_llm_error(error)
    assert isinstance(result, LLMBadRequestError)
    assert not isinstance(result, LLMInsufficientCreditsError)
@pytest.mark.asyncio
 async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
    """LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
    This tests the case where Opus 4.6 returns a response with:
    - BetaRawMessageStartEvent (with usage tokens)
    - BetaRawMessageStopEvent (end_turn)
    - NO content blocks in between
    This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
    """
    # NOTE(review): this test drives LettaLLMStreamAdapter only. Confirm that the
    # SimpleAnthropicStreamingInterface path, which carries the same empty-response
    # check, has equivalent coverage.
    class FakeAsyncStream:
        """Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
        def __init__(self):
            self.events = [
                # Message start with some usage info
                BetaRawMessageStartEvent(
                    type="message_start",
                    message=BetaMessage(
                        id="msg_test_empty",
                        type="message",
                        role="assistant",
                        content=[],  # Empty content
                        model="claude-opus-4-6",
                        stop_reason="end_turn",
                        stop_sequence=None,
                        usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
                    ),
                ),
                # Message stop immediately after start - no content blocks
                BetaRawMessageStopEvent(type="message_stop"),
            ]
            # Cursor into self.events, advanced by __anext__
            self.index = 0
        # Async context-manager protocol: entering yields the stream itself.
        async def __aenter__(self):
            return self
        # A falsy return value never suppresses an exception raised inside the context.
        async def __aexit__(self, exc_type, exc, tb):
            return None
        # Async-iterator protocol: the stream is its own iterator.
        def __aiter__(self):
            return self
        # Replay the canned events in order, then signal exhaustion.
        async def __anext__(self):
            if self.index >= len(self.events):
                raise StopAsyncIteration
            event = self.events[self.index]
            self.index += 1
            return event
    # Stand-in for AnthropicClient.stream_async: ignores its arguments and returns the canned stream.
    async def fake_stream_async(self, request_data: dict, llm_config):
        return FakeAsyncStream()
    # raising=True makes the patch fail loudly if AnthropicClient has no stream_async attribute.
    monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
    llm_client = AnthropicClient()
    llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
    adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
    # Drain the generator; the error is expected to surface while the events are consumed.
    with pytest.raises(LLMEmptyResponseError):
        async for _ in gen:
            pass