fix(core): raise LLMEmptyResponseError for empty Anthropic responses (#9624)
* fix(core): raise LLMEmptyResponseError for empty Anthropic responses Fixes LET-7679: Opus 4.6 occasionally returns empty responses (no content and no tool calls), causing silent failures with stop_reason=end_turn. Changes: - Add LLMEmptyResponseError class (subclass of LLMServerError) - Raise error in anthropic_client for empty non-streaming responses - Raise error in anthropic_streaming_interface for empty streaming responses - Pass through LLMError instances in handle_llm_error to preserve specific types - Add test for empty streaming response detection This allows clients (letta-code) to catch this specific error and implement retry logic with cache-busting modifications. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * fix(core): set invalid_llm_response stop reason for empty responses Catch LLMEmptyResponseError specifically and set stop_reason to invalid_llm_response instead of llm_api_error. This allows clients to distinguish empty responses from transient API errors. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> --------- Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -21,7 +21,7 @@ from letta.agents.helpers import (
|
||||
)
|
||||
from letta.agents.letta_agent_v2 import LettaAgentV2
|
||||
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
|
||||
from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
|
||||
from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
|
||||
from letta.helpers import ToolRulesSolver
|
||||
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
|
||||
from letta.helpers.tool_execution_helper import enable_strict_mode
|
||||
@@ -990,6 +990,9 @@ class LettaAgentV3(LettaAgentV2):
|
||||
except ValueError as e:
|
||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
||||
raise e
|
||||
except LLMEmptyResponseError as e:
|
||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
||||
raise e
|
||||
except LLMError as e:
|
||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
|
||||
raise e
|
||||
|
||||
@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
|
||||
while processing the request."""
|
||||
|
||||
|
||||
class LLMEmptyResponseError(LLMServerError):
|
||||
"""Error when LLM returns an empty response (no content and no tool calls).
|
||||
|
||||
This is a subclass of LLMServerError to maintain retry behavior, but allows
|
||||
specific handling for empty response cases which may benefit from request
|
||||
modification before retry.
|
||||
"""
|
||||
|
||||
|
||||
class LLMTimeoutError(LLMError):
|
||||
"""Error when LLM request times out"""
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ from anthropic.types.beta import (
|
||||
)
|
||||
|
||||
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
|
||||
from letta.errors import LLMEmptyResponseError
|
||||
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
|
||||
from letta.log import get_logger
|
||||
from letta.schemas.letta_message import (
|
||||
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
|
||||
self.inner_thoughts_complete = False
|
||||
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
|
||||
|
||||
# Track whether any content was produced (text or tool calls)
|
||||
# Used to detect empty responses from models like Opus 4.6
|
||||
self.has_content = False
|
||||
|
||||
# Buffer to handle partial XML tags across chunks
|
||||
self.partial_tag_buffer = ""
|
||||
|
||||
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
|
||||
|
||||
if isinstance(content, BetaTextBlock):
|
||||
self.anthropic_mode = EventMode.TEXT
|
||||
self.has_content = True # Track that we received text content
|
||||
# TODO: Can capture citations, etc.
|
||||
elif isinstance(content, BetaToolUseBlock):
|
||||
self.anthropic_mode = EventMode.TOOL_USE
|
||||
self.has_content = True # Track that we received tool use content
|
||||
self.tool_call_id = content.id
|
||||
self.tool_call_name = content.name
|
||||
self.inner_thoughts_complete = False
|
||||
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
|
||||
# message_delta event are *cumulative*." So we assign, not accumulate.
|
||||
self.output_tokens = event.usage.output_tokens
|
||||
elif isinstance(event, BetaRawMessageStopEvent):
|
||||
# We don't want to stop the stream here; the only action on message_stop is the empty-response check below.
|
||||
pass
|
||||
# Check if any content was produced during the stream
|
||||
# Empty responses (no text and no tool calls) should raise an error
|
||||
if not self.has_content:
|
||||
raise LLMEmptyResponseError(
|
||||
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
|
||||
)
|
||||
elif isinstance(event, BetaRawContentBlockStopEvent):
|
||||
# If we're exiting a tool use block and there are still buffered messages,
|
||||
# we should flush them now.
|
||||
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
|
||||
|
||||
if isinstance(content, BetaTextBlock):
|
||||
self.anthropic_mode = EventMode.TEXT
|
||||
self.has_content = True # Track that we received text content
|
||||
# TODO: Can capture citations, etc.
|
||||
|
||||
elif isinstance(content, BetaToolUseBlock):
|
||||
self.anthropic_mode = EventMode.TOOL_USE
|
||||
self.has_content = True # Track that we received tool use content
|
||||
self.tool_call_id = content.id
|
||||
self.tool_call_name = content.name
|
||||
|
||||
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
|
||||
self.output_tokens = event.usage.output_tokens
|
||||
|
||||
elif isinstance(event, BetaRawMessageStopEvent):
|
||||
# We don't want to stop the stream here; the only action on message_stop is the empty-response check below.
|
||||
pass
|
||||
# Check if any content was produced during the stream
|
||||
# Empty responses (no text and no tool calls) should raise an error
|
||||
if not self.has_content:
|
||||
raise LLMEmptyResponseError(
|
||||
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
|
||||
)
|
||||
|
||||
elif isinstance(event, BetaRawContentBlockStopEvent):
|
||||
self.anthropic_mode = None
|
||||
|
||||
@@ -19,6 +19,8 @@ from letta.errors import (
|
||||
LLMAuthenticationError,
|
||||
LLMBadRequestError,
|
||||
LLMConnectionError,
|
||||
LLMEmptyResponseError,
|
||||
LLMError,
|
||||
LLMInsufficientCreditsError,
|
||||
LLMNotFoundError,
|
||||
LLMPermissionDeniedError,
|
||||
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
# Pass through errors that are already LLMError instances unchanged
|
||||
# This preserves specific error types like LLMEmptyResponseError
|
||||
if isinstance(e, LLMError):
|
||||
return e
|
||||
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# make sure to check for overflow errors, regardless of error type
|
||||
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
|
||||
response.stop_reason,
|
||||
json.dumps(response_data),
|
||||
)
|
||||
raise LLMServerError(
|
||||
raise LLMEmptyResponseError(
|
||||
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={
|
||||
|
||||
@@ -2,6 +2,12 @@ import anthropic
|
||||
import httpx
|
||||
import openai
|
||||
import pytest
|
||||
from anthropic.types.beta import (
|
||||
BetaMessage,
|
||||
BetaRawMessageStartEvent,
|
||||
BetaRawMessageStopEvent,
|
||||
BetaUsage,
|
||||
)
|
||||
from google.genai import errors as google_errors
|
||||
|
||||
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
|
||||
@@ -9,6 +15,7 @@ from letta.errors import (
|
||||
ContextWindowExceededError,
|
||||
LLMBadRequestError,
|
||||
LLMConnectionError,
|
||||
LLMEmptyResponseError,
|
||||
LLMInsufficientCreditsError,
|
||||
LLMServerError,
|
||||
)
|
||||
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
|
||||
result = client.handle_llm_error(error)
|
||||
assert isinstance(result, LLMBadRequestError)
|
||||
assert not isinstance(result, LLMInsufficientCreditsError)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
|
||||
"""LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
|
||||
|
||||
This tests the case where Opus 4.6 returns a response with:
|
||||
- BetaRawMessageStartEvent (with usage tokens)
|
||||
- BetaRawMessageStopEvent (end_turn)
|
||||
- NO content blocks in between
|
||||
|
||||
This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
|
||||
"""
|
||||
|
||||
class FakeAsyncStream:
|
||||
"""Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
|
||||
|
||||
def __init__(self):
|
||||
self.events = [
|
||||
# Message start with some usage info
|
||||
BetaRawMessageStartEvent(
|
||||
type="message_start",
|
||||
message=BetaMessage(
|
||||
id="msg_test_empty",
|
||||
type="message",
|
||||
role="assistant",
|
||||
content=[], # Empty content
|
||||
model="claude-opus-4-6",
|
||||
stop_reason="end_turn",
|
||||
stop_sequence=None,
|
||||
usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
|
||||
),
|
||||
),
|
||||
# Message stop immediately after start - no content blocks
|
||||
BetaRawMessageStopEvent(type="message_stop"),
|
||||
]
|
||||
self.index = 0
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
return None
|
||||
|
||||
def __aiter__(self):
|
||||
return self
|
||||
|
||||
async def __anext__(self):
|
||||
if self.index >= len(self.events):
|
||||
raise StopAsyncIteration
|
||||
event = self.events[self.index]
|
||||
self.index += 1
|
||||
return event
|
||||
|
||||
async def fake_stream_async(self, request_data: dict, llm_config):
|
||||
return FakeAsyncStream()
|
||||
|
||||
monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
|
||||
|
||||
llm_client = AnthropicClient()
|
||||
llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
|
||||
adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
|
||||
|
||||
gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
|
||||
with pytest.raises(LLMEmptyResponseError):
|
||||
async for _ in gen:
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user