fix(core): raise LLMEmptyResponseError for empty Anthropic responses (#9624)

* fix(core): raise LLMEmptyResponseError for empty Anthropic responses

Fixes LET-7679: Opus 4.6 occasionally returns empty responses (no content
and no tool calls), causing silent failures with stop_reason=end_turn.

Changes:
- Add LLMEmptyResponseError class (subclass of LLMServerError)
- Raise error in anthropic_client for empty non-streaming responses
- Raise error in anthropic_streaming_interface for empty streaming responses
- Pass through LLMError instances in handle_llm_error to preserve specific types
- Add test for empty streaming response detection

This allows clients (letta-code) to catch this specific error and implement
retry logic with cache-busting modifications.

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): set invalid_llm_response stop reason for empty responses

Catch LLMEmptyResponseError specifically and set stop_reason to
invalid_llm_response instead of llm_api_error. This allows clients
to distinguish empty responses from transient API errors.

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
cthomas
2026-02-24 19:57:23 -08:00
committed by Caren Thomas
parent 86ff216dc9
commit 3d781efd21
5 changed files with 116 additions and 6 deletions

View File

@@ -21,7 +21,7 @@ from letta.agents.helpers import (
 )
 from letta.agents.letta_agent_v2 import LettaAgentV2
 from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
-from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
+from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
 from letta.helpers import ToolRulesSolver
 from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
 from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -990,6 +990,9 @@ class LettaAgentV3(LettaAgentV2):
         except ValueError as e:
             self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
             raise e
+        except LLMEmptyResponseError as e:
+            self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
+            raise e
         except LLMError as e:
             self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
             raise e

View File

@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
     while processing the request."""
+
+
+class LLMEmptyResponseError(LLMServerError):
+    """Error when LLM returns an empty response (no content and no tool calls).
+
+    This is a subclass of LLMServerError to maintain retry behavior, but allows
+    specific handling for empty response cases which may benefit from request
+    modification before retry.
+    """


 class LLMTimeoutError(LLMError):
     """Error when LLM request times out"""

View File

@@ -30,6 +30,7 @@ from anthropic.types.beta import (
 )
 from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
+from letta.errors import LLMEmptyResponseError
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
         self.inner_thoughts_complete = False
         self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg

+        # Track whether any content was produced (text or tool calls)
+        # Used to detect empty responses from models like Opus 4.6
+        self.has_content = False
+
         # Buffer to handle partial XML tags across chunks
         self.partial_tag_buffer = ""
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
             if isinstance(content, BetaTextBlock):
                 self.anthropic_mode = EventMode.TEXT
+                self.has_content = True  # Track that we received text content
                 # TODO: Can capture citations, etc.
             elif isinstance(content, BetaToolUseBlock):
                 self.anthropic_mode = EventMode.TOOL_USE
+                self.has_content = True  # Track that we received tool use content
                 self.tool_call_id = content.id
                 self.tool_call_name = content.name
                 self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
             # message_delta event are *cumulative*." So we assign, not accumulate.
             self.output_tokens = event.usage.output_tokens
         elif isinstance(event, BetaRawMessageStopEvent):
-            # Don't do anything here! We don't want to stop the stream.
-            pass
+            # Check if any content was produced during the stream
+            # Empty responses (no text and no tool calls) should raise an error
+            if not self.has_content:
+                raise LLMEmptyResponseError(
+                    message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+                )
         elif isinstance(event, BetaRawContentBlockStopEvent):
             # If we're exiting a tool use block and there are still buffered messages,
             # we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
             if isinstance(content, BetaTextBlock):
                 self.anthropic_mode = EventMode.TEXT
+                self.has_content = True  # Track that we received text content
                 # TODO: Can capture citations, etc.
             elif isinstance(content, BetaToolUseBlock):
                 self.anthropic_mode = EventMode.TOOL_USE
+                self.has_content = True  # Track that we received tool use content
                 self.tool_call_id = content.id
                 self.tool_call_name = content.name
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
             self.output_tokens = event.usage.output_tokens
         elif isinstance(event, BetaRawMessageStopEvent):
-            # Don't do anything here! We don't want to stop the stream.
-            pass
+            # Check if any content was produced during the stream
+            # Empty responses (no text and no tool calls) should raise an error
+            if not self.has_content:
+                raise LLMEmptyResponseError(
+                    message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+                )
         elif isinstance(event, BetaRawContentBlockStopEvent):
             self.anthropic_mode = None

View File

@@ -19,6 +19,8 @@ from letta.errors import (
     LLMAuthenticationError,
     LLMBadRequestError,
     LLMConnectionError,
+    LLMEmptyResponseError,
+    LLMError,
     LLMInsufficientCreditsError,
     LLMNotFoundError,
     LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
     @trace_method
     def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+        # Pass through errors that are already LLMError instances unchanged
+        # This preserves specific error types like LLMEmptyResponseError
+        if isinstance(e, LLMError):
+            return e
+
         is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None

         # make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
                 response.stop_reason,
                 json.dumps(response_data),
             )
-            raise LLMServerError(
+            raise LLMEmptyResponseError(
                 message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
                 code=ErrorCode.INTERNAL_SERVER_ERROR,
                 details={

View File

@@ -2,6 +2,12 @@ import anthropic
 import httpx
 import openai
 import pytest
+from anthropic.types.beta import (
+    BetaMessage,
+    BetaRawMessageStartEvent,
+    BetaRawMessageStopEvent,
+    BetaUsage,
+)
 from google.genai import errors as google_errors

 from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
     ContextWindowExceededError,
     LLMBadRequestError,
     LLMConnectionError,
+    LLMEmptyResponseError,
     LLMInsufficientCreditsError,
     LLMServerError,
 )
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
     result = client.handle_llm_error(error)
     assert isinstance(result, LLMBadRequestError)
     assert not isinstance(result, LLMInsufficientCreditsError)
+
+
+@pytest.mark.asyncio
+async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
+    """LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
+
+    This tests the case where Opus 4.6 returns a response with:
+    - BetaRawMessageStartEvent (with usage tokens)
+    - BetaRawMessageStopEvent (end_turn)
+    - NO content blocks in between
+
+    This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
+    """
+
+    class FakeAsyncStream:
+        """Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
+
+        def __init__(self):
+            self.events = [
+                # Message start with some usage info
+                BetaRawMessageStartEvent(
+                    type="message_start",
+                    message=BetaMessage(
+                        id="msg_test_empty",
+                        type="message",
+                        role="assistant",
+                        content=[],  # Empty content
+                        model="claude-opus-4-6",
+                        stop_reason="end_turn",
+                        stop_sequence=None,
+                        usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
+                    ),
+                ),
+                # Message stop immediately after start - no content blocks
+                BetaRawMessageStopEvent(type="message_stop"),
+            ]
+            self.index = 0
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return None
+
+        def __aiter__(self):
+            return self
+
+        async def __anext__(self):
+            if self.index >= len(self.events):
+                raise StopAsyncIteration
+            event = self.events[self.index]
+            self.index += 1
+            return event
+
+    async def fake_stream_async(self, request_data: dict, llm_config):
+        return FakeAsyncStream()
+
+    monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
+
+    llm_client = AnthropicClient()
+    llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
+    adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
+
+    gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
+
+    with pytest.raises(LLMEmptyResponseError):
+        async for _ in gen:
+            pass