Files
letta-server/letta/adapters/letta_llm_adapter.py
jnjpng e8d5922ff9 fix(core): handle ResponseIncompleteEvent in OpenAI Responses API streaming (#9535)
* fix(core): handle ResponseIncompleteEvent in OpenAI Responses API streaming

When reasoning models (gpt-5.x) exhaust their max_output_tokens budget
on chain-of-thought reasoning, OpenAI emits a ResponseIncompleteEvent
instead of ResponseCompletedEvent. This was previously unhandled, causing
final_response to remain None — which meant get_content() and
get_tool_call_objects() returned empty results, silently dropping the
partial response.

Now ResponseIncompleteEvent is handled identically to
ResponseCompletedEvent (extracting partial content, usage stats, and
token details), with an additional warning log indicating the incomplete
reason.

* fix(core): propagate finish_reason for Responses API incomplete events

- Guard usage extraction against None usage payload in
  ResponseIncompleteEvent handler
- Add _finish_reason override to LettaLLMAdapter so streaming adapters
  can explicitly set finish_reason without a chat_completions_response
- Map incomplete_details.reason="max_output_tokens" to
  finish_reason="length" in SimpleLLMStreamAdapter, matching the Chat
  Completions API convention
- This allows the agent loop's _decide_continuation to correctly return
  stop_reason="max_tokens_exceeded" instead of "end_turn" when the model
  exhausts its output token budget on reasoning

* fix(core): handle empty content parts in incomplete ResponseOutputMessage

When a model hits max_output_tokens after starting a ResponseOutputMessage
but before producing any content parts, the message has content=[]. This
previously raised ValueError("Got 0 content parts, expected 1"). Now it
logs a warning and skips the empty message, allowing reasoning-only
incomplete responses to be processed cleanly.

* fix(core): map all incomplete reasons to finish_reason, not just max_output_tokens

Handle content_filter and any future unknown incomplete reasons from the
Responses API instead of silently leaving finish_reason as None.
2026-02-24 10:55:11 -08:00

120 lines
4.8 KiB
Python

from abc import ABC, abstractmethod
from typing import AsyncGenerator
from letta.llm_api.llm_client_base import LLMClientBase
from letta.schemas.enums import LLMCallType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.telemetry_manager import TelemetryManager
class LettaLLMAdapter(ABC):
    """
    Abstract base for adapters that run an LLM call on behalf of an agent.

    The same interface covers blocking and streaming execution, so the agent
    can switch execution modes without changing how it drives the call.
    Concrete adapters must implement ``invoke_llm``; ``log_provider_trace``
    raises ``NotImplementedError`` here unless a subclass overrides it.
    """

    def __init__(
        self,
        llm_client: LLMClientBase,
        llm_config: LLMConfig,
        call_type: LLMCallType,
        agent_id: str | None = None,
        agent_tags: list[str] | None = None,
        run_id: str | None = None,
        org_id: str | None = None,
        user_id: str | None = None,
    ) -> None:
        """
        Store the call context and reset all per-call result fields.

        Args:
            llm_client: Client used to talk to the LLM provider.
            llm_config: Configuration of the model being called.
            call_type: The kind of LLM call this adapter is used for.
            agent_id: Optional ID of the agent issuing the call.
            agent_tags: Optional tags of the agent issuing the call.
            run_id: Optional ID of the enclosing run.
            org_id: Optional organization ID.
            user_id: Optional user ID.
        """
        # Call context, stored exactly as passed in.
        self.llm_client: LLMClientBase = llm_client
        self.llm_config: LLMConfig = llm_config
        self.call_type: LLMCallType = call_type
        self.agent_id: str | None = agent_id
        self.agent_tags: list[str] | None = agent_tags
        self.run_id: str | None = run_id
        self.org_id: str | None = org_id
        self.user_id: str | None = user_id

        # Per-call state. Everything starts empty here; presumably concrete
        # adapters fill these in while handling a call (not shown in this file).
        self.message_id: str | None = None
        self.request_data: dict | None = None
        self.response_data: dict | None = None
        self.chat_completions_response: ChatCompletionResponse | None = None
        self.reasoning_content: list[TextContent | ReasoningContent | RedactedReasoningContent] | None = None
        self.content: list[TextContent | ReasoningContent | RedactedReasoningContent] | None = None
        self.tool_call: ToolCall | None = None
        self.tool_calls: list[ToolCall] = []
        self.logprobs: ChoiceLogprobs | None = None

        # SGLang native endpoint data (for multi-turn RL training)
        self.output_ids: list[int] | None = None
        self.output_token_logprobs: list[list[float]] | None = None

        self.usage: LettaUsageStatistics = LettaUsageStatistics()
        self.telemetry_manager: TelemetryManager = TelemetryManager()
        self.llm_request_finish_timestamp_ns: int | None = None

        # Explicit override read first by the ``finish_reason`` property, so a
        # finish reason can be reported without a chat_completions_response.
        self._finish_reason: str | None = None

    # NOTE(review): ``requires_approval_tools`` has a mutable ``[]`` default.
    # It is kept as-is because overrides mirror this signature; implementations
    # should treat the list as read-only.
    @abstractmethod
    async def invoke_llm(
        self,
        request_data: dict,
        messages: list,
        tools: list,
        use_assistant_message: bool,
        requires_approval_tools: list[str] = [],
        step_id: str | None = None,
        actor: User | None = None,
    ) -> AsyncGenerator[LettaMessage | None, None]:
        """
        Execute the LLM call and yield results as they become available.

        Args:
            request_data: The prepared request data for the LLM API
            messages: The messages in context for the request
            tools: The tools available for the LLM to use
            use_assistant_message: If true, use assistant messages when streaming response
            requires_approval_tools: The subset of tools that require approval before use
            step_id: The step ID associated with this request. If provided, logs request and response data.
            actor: The optional actor associated with this request for logging purposes.

        Yields:
            LettaMessage: Chunks of data for streaming adapters, or None for blocking adapters

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    @property
    def finish_reason(self) -> str | None:
        """
        Get the finish_reason from the LLM response.

        An explicitly set ``_finish_reason`` takes precedence; otherwise the
        value comes from the first choice of ``chat_completions_response``.

        Returns:
            str | None: The finish_reason if available, None otherwise
        """
        override = self._finish_reason
        if override is not None:
            return override

        response = self.chat_completions_response
        # No response, or a response with no choices: nothing to report.
        if not response or not response.choices:
            return None
        return response.choices[0].finish_reason

    def supports_token_streaming(self) -> bool:
        """
        Check if the adapter supports token-level streaming.

        Returns:
            bool: True if the adapter can stream back tokens as they are
            generated, False otherwise. This base implementation returns False.
        """
        return False

    def log_provider_trace(self, step_id: str | None, actor: User | None) -> None:
        """
        Log provider trace data for telemetry purposes.

        Args:
            step_id: The step ID associated with this request for logging purposes
            actor: The user associated with this request for logging purposes

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError