From 8729a037b9035e71180098d807b800fce182184b Mon Sep 17 00:00:00 2001
From: Sarah Wooders <sarahwooders@gmail.com>
Date: Mon, 15 Dec 2025 20:29:44 -0800
Subject: [PATCH] fix: handle new openai overflow error format (#7110)

---
 .../interfaces/openai_streaming_interface.py  |  9 +++++++
 letta/llm_api/error_utils.py                  | 22 ++++++++++++++++
 letta/llm_api/openai_client.py                | 26 +++++++++++++++----
 3 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 letta/llm_api/error_utils.py

diff --git a/letta/interfaces/openai_streaming_interface.py b/letta/interfaces/openai_streaming_interface.py
index aa7dac9a..36e3dfa6 100644
--- a/letta/interfaces/openai_streaming_interface.py
+++ b/letta/interfaces/openai_streaming_interface.py
@@ -30,6 +30,7 @@ from openai.types.responses import (
 from openai.types.responses.response_stream_event import ResponseStreamEvent
 
 from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
+from letta.llm_api.error_utils import is_context_window_overflow_message
 from letta.llm_api.openai_client import is_openai_reasoning_model
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.log import get_logger
@@ -746,6 +747,14 @@ class SimpleOpenAIStreamingInterface:
         except Exception as e:
             import traceback
 
+            # IMPORTANT: If this is a context window overflow, we should propagate the
+            # exception upward so the agent loop can compact/summarize + retry.
+            # Yielding an error stop reason here would prematurely terminate the user's
+            # stream even though a retry path exists.
+            msg = str(e)
+            if is_context_window_overflow_message(msg):
+                raise
+
             logger.exception("Error processing stream: %s", e)
             if ttft_span:
                 ttft_span.add_event(
diff --git a/letta/llm_api/error_utils.py b/letta/llm_api/error_utils.py
new file mode 100644
index 00000000..b1d6e356
--- /dev/null
+++ b/letta/llm_api/error_utils.py
@@ -0,0 +1,22 @@
+"""Shared helpers for provider error detection/mapping.
+
+Keep these utilities free of heavy imports to avoid circular dependencies between
+LLM clients (provider-specific) and streaming interfaces.
+"""
+
+
+def is_context_window_overflow_message(msg: str) -> bool:
+    """Return True if ``msg`` contains a known context-window overflow phrase.
+
+    Best-effort heuristic: plain, case-sensitive substring checks against the
+    phrasings listed below, centralized so that clients, streaming interfaces,
+    and agent loops classify overflow errors the same way.
+    """
+    # NOTE: the "This model's ..." check is subsumed by "maximum context length".
+    return (
+        "exceeds the context window" in msg
+        or "This model's maximum context length is" in msg
+        or "maximum context length" in msg
+        or "context_length_exceeded" in msg
+        or "Input tokens exceed the configured limit" in msg
+    )
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 5bb5b08c..26dd493a 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -26,6 +26,7 @@ from letta.errors import (
     LLMTimeoutError,
     LLMUnprocessableEntityError,
 )
+from letta.llm_api.error_utils import is_context_window_overflow_message
 from letta.llm_api.helpers import (
     add_inner_thoughts_to_functions,
     convert_response_format_to_responses_api,
@@ -978,11 +979,7 @@ class OpenAIClient(LLMClientBase):
                     error_code = error_details.get("code")
 
             # Check both the error code and message content for context length issues
-            if (
-                error_code == "context_length_exceeded"
-                or "This model's maximum context length is" in str(e)
-                or "Input tokens exceed the configured limit" in str(e)
-            ):
+            if error_code == "context_length_exceeded" or is_context_window_overflow_message(str(e)):
                 return ContextWindowExceededError(
                     message=f"Bad request to OpenAI (context window exceeded): {str(e)}",
                 )
@@ -993,6 +990,25 @@ class OpenAIClient(LLMClientBase):
                     details=e.body,
                 )
 
+        # NOTE: The OpenAI Python SDK may raise a generic `openai.APIError` while *iterating*
+        # over a stream (e.g. Responses API streaming). In this case we don't necessarily
+        # get a `BadRequestError` with a structured error body, but we still want to
+        # trigger Letta's context window compaction / retry logic.
+        #
+        # Example message:
+        #   "Your input exceeds the context window of this model. Please adjust your input and try again."
+        if isinstance(e, openai.APIError):
+            msg = str(e)
+            if is_context_window_overflow_message(msg):
+                return ContextWindowExceededError(
+                    message=f"OpenAI request exceeded the context window: {msg}",
+                    details={
+                        "provider_exception_type": type(e).__name__,
+                        # Best-effort extraction (may not exist on APIError)
+                        "body": getattr(e, "body", None),
+                    },
+                )
+
         if isinstance(e, openai.AuthenticationError):
             logger.error(f"[OpenAI] Authentication error (401): {str(e)}")  # More severe log level
             return LLMAuthenticationError(