fix(core): raise LLMEmptyResponseError for empty Anthropic responses (#9624)
* fix(core): raise LLMEmptyResponseError for empty Anthropic responses

  Fixes LET-7679: Opus 4.6 occasionally returns empty responses (no content and no tool calls), causing silent failures with stop_reason=end_turn.

  Changes:
  - Add LLMEmptyResponseError class (subclass of LLMServerError)
  - Raise error in anthropic_client for empty non-streaming responses
  - Raise error in anthropic_streaming_interface for empty streaming responses
  - Pass through LLMError instances in handle_llm_error to preserve specific types
  - Add test for empty streaming response detection

  This allows clients (letta-code) to catch this specific error and implement retry logic with cache-busting modifications.

  🤖 Generated with [Letta Code](https://letta.com)

  Co-Authored-By: Letta <noreply@letta.com>

* fix(core): set invalid_llm_response stop reason for empty responses

  Catch LLMEmptyResponseError specifically and set stop_reason to invalid_llm_response instead of llm_api_error. This allows clients to distinguish empty responses from transient API errors.

  🤖 Generated with [Letta Code](https://letta.com)

  Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -21,7 +21,7 @@ from letta.agents.helpers import (
|
|||||||
)
|
)
|
||||||
from letta.agents.letta_agent_v2 import LettaAgentV2
|
from letta.agents.letta_agent_v2 import LettaAgentV2
|
||||||
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
|
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
|
||||||
from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
|
from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
|
||||||
from letta.helpers import ToolRulesSolver
|
from letta.helpers import ToolRulesSolver
|
||||||
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
|
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
|
||||||
from letta.helpers.tool_execution_helper import enable_strict_mode
|
from letta.helpers.tool_execution_helper import enable_strict_mode
|
||||||
@@ -990,6 +990,9 @@ class LettaAgentV3(LettaAgentV2):
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
||||||
raise e
|
raise e
|
||||||
|
except LLMEmptyResponseError as e:
|
||||||
|
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
||||||
|
raise e
|
||||||
except LLMError as e:
|
except LLMError as e:
|
||||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
|
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
|
||||||
raise e
|
raise e
|
||||||
|
|||||||
@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
|
|||||||
while processing the request."""
|
while processing the request."""
|
||||||
|
|
||||||
|
|
||||||
|
class LLMEmptyResponseError(LLMServerError):
|
||||||
|
"""Error when LLM returns an empty response (no content and no tool calls).
|
||||||
|
|
||||||
|
This is a subclass of LLMServerError to maintain retry behavior, but allows
|
||||||
|
specific handling for empty response cases which may benefit from request
|
||||||
|
modification before retry.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class LLMTimeoutError(LLMError):
|
class LLMTimeoutError(LLMError):
|
||||||
"""Error when LLM request times out"""
|
"""Error when LLM request times out"""
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from anthropic.types.beta import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
|
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
|
||||||
|
from letta.errors import LLMEmptyResponseError
|
||||||
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
|
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
|
||||||
from letta.log import get_logger
|
from letta.log import get_logger
|
||||||
from letta.schemas.letta_message import (
|
from letta.schemas.letta_message import (
|
||||||
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
|
|||||||
self.inner_thoughts_complete = False
|
self.inner_thoughts_complete = False
|
||||||
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
|
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
|
||||||
|
|
||||||
|
# Track whether any content was produced (text or tool calls)
|
||||||
|
# Used to detect empty responses from models like Opus 4.6
|
||||||
|
self.has_content = False
|
||||||
|
|
||||||
# Buffer to handle partial XML tags across chunks
|
# Buffer to handle partial XML tags across chunks
|
||||||
self.partial_tag_buffer = ""
|
self.partial_tag_buffer = ""
|
||||||
|
|
||||||
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
|
|||||||
|
|
||||||
if isinstance(content, BetaTextBlock):
|
if isinstance(content, BetaTextBlock):
|
||||||
self.anthropic_mode = EventMode.TEXT
|
self.anthropic_mode = EventMode.TEXT
|
||||||
|
self.has_content = True # Track that we received text content
|
||||||
# TODO: Can capture citations, etc.
|
# TODO: Can capture citations, etc.
|
||||||
elif isinstance(content, BetaToolUseBlock):
|
elif isinstance(content, BetaToolUseBlock):
|
||||||
self.anthropic_mode = EventMode.TOOL_USE
|
self.anthropic_mode = EventMode.TOOL_USE
|
||||||
|
self.has_content = True # Track that we received tool use content
|
||||||
self.tool_call_id = content.id
|
self.tool_call_id = content.id
|
||||||
self.tool_call_name = content.name
|
self.tool_call_name = content.name
|
||||||
self.inner_thoughts_complete = False
|
self.inner_thoughts_complete = False
|
||||||
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
|
|||||||
# message_delta event are *cumulative*." So we assign, not accumulate.
|
# message_delta event are *cumulative*." So we assign, not accumulate.
|
||||||
self.output_tokens = event.usage.output_tokens
|
self.output_tokens = event.usage.output_tokens
|
||||||
elif isinstance(event, BetaRawMessageStopEvent):
|
elif isinstance(event, BetaRawMessageStopEvent):
|
||||||
# Don't do anything here! We don't want to stop the stream.
|
# Check if any content was produced during the stream
|
||||||
pass
|
# Empty responses (no text and no tool calls) should raise an error
|
||||||
|
if not self.has_content:
|
||||||
|
raise LLMEmptyResponseError(
|
||||||
|
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
|
||||||
|
)
|
||||||
elif isinstance(event, BetaRawContentBlockStopEvent):
|
elif isinstance(event, BetaRawContentBlockStopEvent):
|
||||||
# If we're exiting a tool use block and there are still buffered messages,
|
# If we're exiting a tool use block and there are still buffered messages,
|
||||||
# we should flush them now.
|
# we should flush them now.
|
||||||
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
|
|||||||
|
|
||||||
if isinstance(content, BetaTextBlock):
|
if isinstance(content, BetaTextBlock):
|
||||||
self.anthropic_mode = EventMode.TEXT
|
self.anthropic_mode = EventMode.TEXT
|
||||||
|
self.has_content = True # Track that we received text content
|
||||||
# TODO: Can capture citations, etc.
|
# TODO: Can capture citations, etc.
|
||||||
|
|
||||||
elif isinstance(content, BetaToolUseBlock):
|
elif isinstance(content, BetaToolUseBlock):
|
||||||
self.anthropic_mode = EventMode.TOOL_USE
|
self.anthropic_mode = EventMode.TOOL_USE
|
||||||
|
self.has_content = True # Track that we received tool use content
|
||||||
self.tool_call_id = content.id
|
self.tool_call_id = content.id
|
||||||
self.tool_call_name = content.name
|
self.tool_call_name = content.name
|
||||||
|
|
||||||
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
|
|||||||
self.output_tokens = event.usage.output_tokens
|
self.output_tokens = event.usage.output_tokens
|
||||||
|
|
||||||
elif isinstance(event, BetaRawMessageStopEvent):
|
elif isinstance(event, BetaRawMessageStopEvent):
|
||||||
# Don't do anything here! We don't want to stop the stream.
|
# Check if any content was produced during the stream
|
||||||
pass
|
# Empty responses (no text and no tool calls) should raise an error
|
||||||
|
if not self.has_content:
|
||||||
|
raise LLMEmptyResponseError(
|
||||||
|
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(event, BetaRawContentBlockStopEvent):
|
elif isinstance(event, BetaRawContentBlockStopEvent):
|
||||||
self.anthropic_mode = None
|
self.anthropic_mode = None
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ from letta.errors import (
|
|||||||
LLMAuthenticationError,
|
LLMAuthenticationError,
|
||||||
LLMBadRequestError,
|
LLMBadRequestError,
|
||||||
LLMConnectionError,
|
LLMConnectionError,
|
||||||
|
LLMEmptyResponseError,
|
||||||
|
LLMError,
|
||||||
LLMInsufficientCreditsError,
|
LLMInsufficientCreditsError,
|
||||||
LLMNotFoundError,
|
LLMNotFoundError,
|
||||||
LLMPermissionDeniedError,
|
LLMPermissionDeniedError,
|
||||||
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
|
|||||||
|
|
||||||
@trace_method
|
@trace_method
|
||||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||||
|
# Pass through errors that are already LLMError instances unchanged
|
||||||
|
# This preserves specific error types like LLMEmptyResponseError
|
||||||
|
if isinstance(e, LLMError):
|
||||||
|
return e
|
||||||
|
|
||||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||||
|
|
||||||
# make sure to check for overflow errors, regardless of error type
|
# make sure to check for overflow errors, regardless of error type
|
||||||
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
|
|||||||
response.stop_reason,
|
response.stop_reason,
|
||||||
json.dumps(response_data),
|
json.dumps(response_data),
|
||||||
)
|
)
|
||||||
raise LLMServerError(
|
raise LLMEmptyResponseError(
|
||||||
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
|
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
|
||||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||||
details={
|
details={
|
||||||
|
|||||||
@@ -2,6 +2,12 @@ import anthropic
|
|||||||
import httpx
|
import httpx
|
||||||
import openai
|
import openai
|
||||||
import pytest
|
import pytest
|
||||||
|
from anthropic.types.beta import (
|
||||||
|
BetaMessage,
|
||||||
|
BetaRawMessageStartEvent,
|
||||||
|
BetaRawMessageStopEvent,
|
||||||
|
BetaUsage,
|
||||||
|
)
|
||||||
from google.genai import errors as google_errors
|
from google.genai import errors as google_errors
|
||||||
|
|
||||||
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
|
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
|
||||||
@@ -9,6 +15,7 @@ from letta.errors import (
|
|||||||
ContextWindowExceededError,
|
ContextWindowExceededError,
|
||||||
LLMBadRequestError,
|
LLMBadRequestError,
|
||||||
LLMConnectionError,
|
LLMConnectionError,
|
||||||
|
LLMEmptyResponseError,
|
||||||
LLMInsufficientCreditsError,
|
LLMInsufficientCreditsError,
|
||||||
LLMServerError,
|
LLMServerError,
|
||||||
)
|
)
|
||||||
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
|
|||||||
result = client.handle_llm_error(error)
|
result = client.handle_llm_error(error)
|
||||||
assert isinstance(result, LLMBadRequestError)
|
assert isinstance(result, LLMBadRequestError)
|
||||||
assert not isinstance(result, LLMInsufficientCreditsError)
|
assert not isinstance(result, LLMInsufficientCreditsError)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
|
||||||
|
"""LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
|
||||||
|
|
||||||
|
This tests the case where Opus 4.6 returns a response with:
|
||||||
|
- BetaRawMessageStartEvent (with usage tokens)
|
||||||
|
- BetaRawMessageStopEvent (end_turn)
|
||||||
|
- NO content blocks in between
|
||||||
|
|
||||||
|
This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
|
||||||
|
"""
|
||||||
|
|
||||||
|
class FakeAsyncStream:
|
||||||
|
"""Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.events = [
|
||||||
|
# Message start with some usage info
|
||||||
|
BetaRawMessageStartEvent(
|
||||||
|
type="message_start",
|
||||||
|
message=BetaMessage(
|
||||||
|
id="msg_test_empty",
|
||||||
|
type="message",
|
||||||
|
role="assistant",
|
||||||
|
content=[], # Empty content
|
||||||
|
model="claude-opus-4-6",
|
||||||
|
stop_reason="end_turn",
|
||||||
|
stop_sequence=None,
|
||||||
|
usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
# Message stop immediately after start - no content blocks
|
||||||
|
BetaRawMessageStopEvent(type="message_stop"),
|
||||||
|
]
|
||||||
|
self.index = 0
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc, tb):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def __aiter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __anext__(self):
|
||||||
|
if self.index >= len(self.events):
|
||||||
|
raise StopAsyncIteration
|
||||||
|
event = self.events[self.index]
|
||||||
|
self.index += 1
|
||||||
|
return event
|
||||||
|
|
||||||
|
async def fake_stream_async(self, request_data: dict, llm_config):
|
||||||
|
return FakeAsyncStream()
|
||||||
|
|
||||||
|
monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
|
||||||
|
|
||||||
|
llm_client = AnthropicClient()
|
||||||
|
llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
|
||||||
|
adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
|
||||||
|
|
||||||
|
gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
|
||||||
|
with pytest.raises(LLMEmptyResponseError):
|
||||||
|
async for _ in gen:
|
||||||
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user