From b9c4ed3b153be6371d32217db3e071f49fffafee Mon Sep 17 00:00:00 2001
From: Kian Jones <11655409+kianjones9@users.noreply.github.com>
Date: Wed, 11 Feb 2026 22:49:35 -0800
Subject: [PATCH] fix: catch contextwindowexceeded error on gemini (#9450)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* catch contextwindowexceeded error

* fix(core): detect Google token limit errors as ContextWindowExceededError

Google's error message says "input token count exceeds the maximum
number of tokens allowed", which doesn't contain the word "context",
so it was falling through to the generic LLMBadRequestError instead of
ContextWindowExceededError. As a result, compaction was not auto-triggered.

Expands the detection to also match "token count" and "tokens allowed"
in addition to the existing "context" keyword. As before, one of
"exceed", "limit", or "too long" must also appear in the message.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): add missing message arg to LLMBadRequestError in OpenAI client

The generic 400 path in handle_llm_error was constructing
LLMBadRequestError without the required `message` argument,
causing a TypeError in production during summarization.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* ci: add adapters/ test suite to core unit test matrix

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(tests): update adapter error handling test expectations to match actual behavior

The streaming adapter's error handling double-wraps errors: the
AnthropicStreamingInterface calls handle_llm_error first, then the
adapter catches the result and calls handle_llm_error again, which
does not recognize the already-transformed error and falls back to
the base class LLMError. Updated test expectations to match this
behavior (reverted by the next change, which fixes the double-wrapping).

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): prevent double-wrapping of LLMError in stream adapter

The AnthropicStreamingInterface.process() already transforms raw
provider errors into LLMError subtypes via handle_llm_error. The
adapter was catching the result and calling handle_llm_error again,
which didn't recognize the already-transformed LLMError and wrapped
it in a generic LLMError("Unhandled LLM error"). This downgraded
specific error types (LLMConnectionError, LLMServerError, etc.)
and broke retry logic that matches on specific subtypes.

Now the adapter checks if the error is already an LLMError and
re-raises it as-is. Tests restored to original correct expectations.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
---
 letta/adapters/letta_llm_stream_adapter.py    |  3 ++
 letta/llm_api/google_vertex_client.py         |  4 +-
 letta/llm_api/openai_client.py                |  1 +
 ...letta_llm_stream_adapter_error_handling.py | 49 ++++++++++++++++++-
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/letta/adapters/letta_llm_stream_adapter.py b/letta/adapters/letta_llm_stream_adapter.py
index 1d9aa396..c345ee74 100644
--- a/letta/adapters/letta_llm_stream_adapter.py
+++ b/letta/adapters/letta_llm_stream_adapter.py
@@ -1,6 +1,7 @@
 from typing import AsyncGenerator
 
 from letta.adapters.letta_llm_adapter import LettaLLMAdapter
+from letta.errors import LLMError
 from letta.helpers.datetime_helpers import get_utc_timestamp_ns
 from letta.interfaces.anthropic_streaming_interface import AnthropicStreamingInterface
 from letta.interfaces.openai_streaming_interface import OpenAIStreamingInterface
@@ -133,6 +134,8 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
                 error_msg=str(e),
                 error_type=type(e).__name__,
             )
+            if isinstance(e, LLMError):
+                raise
             raise self.llm_client.handle_llm_error(e, llm_config=self.llm_config)
 
         # After streaming completes, extract the accumulated data
diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index 368ba5e7..4eaa6bb2 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -895,7 +895,9 @@ class GoogleVertexClient(LLMClientBase):
             # Handle specific error codes
             if e.code == 400:
                 error_str = str(e).lower()
-                if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
+                if ("context" in error_str or "token count" in error_str or "tokens allowed" in error_str) and (
+                    "exceed" in error_str or "limit" in error_str or "too long" in error_str
+                ):
                     return ContextWindowExceededError(
                         message=f"Bad request to {self._provider_name()} (context window exceeded): {str(e)}",
                         details={"is_byok": is_byok},
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index b48280cb..674088ba 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -1098,6 +1098,7 @@ class OpenAIClient(LLMClientBase):
             else:
                 body_details = e.body if isinstance(e.body, dict) else {"body": e.body}
                 return LLMBadRequestError(
+                    message=f"Bad request to OpenAI-compatible endpoint: {str(e)}",
                     code=ErrorCode.INVALID_ARGUMENT,
                     details={**body_details, "is_byok": is_byok},
                 )
diff --git a/tests/adapters/test_letta_llm_stream_adapter_error_handling.py b/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
index 58951c59..4d3842fc 100644
--- a/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
+++ b/tests/adapters/test_letta_llm_stream_adapter_error_handling.py
@@ -1,10 +1,12 @@
 import anthropic
 import httpx
 import pytest
+from google.genai import errors as google_errors
 
 from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
-from letta.errors import ContextWindowExceededError, LLMConnectionError, LLMServerError
+from letta.errors import ContextWindowExceededError, LLMBadRequestError, LLMConnectionError, LLMError, LLMServerError
 from letta.llm_api.anthropic_client import AnthropicClient
+from letta.llm_api.google_vertex_client import GoogleVertexClient
 from letta.schemas.enums import LLMCallType
 from letta.schemas.llm_config import LLMConfig
 
@@ -188,3 +190,48 @@ def test_anthropic_client_handle_llm_error_request_too_large_string():
 
     assert isinstance(result, ContextWindowExceededError)
     assert "request_too_large" in result.message.lower() or "context window exceeded" in result.message.lower()
+
+
+@pytest.mark.parametrize(
+    "error_message",
+    [
+        "The input token count exceeds the maximum number of tokens allowed 1048576.",
+        "Token count of 1500000 exceeds the model limit of 1048576 tokens allowed.",
+    ],
+    ids=["gemini-token-count-exceeds", "gemini-tokens-allowed-limit"],
+)
+def test_google_client_handle_llm_error_token_limit_returns_context_window_exceeded(error_message):
+    """Google 400 errors about token limits should map to ContextWindowExceededError."""
+    client = GoogleVertexClient.__new__(GoogleVertexClient)
+    response_json = {
+        "message": f'{{"error": {{"code": 400, "message": "{error_message}", "status": "INVALID_ARGUMENT"}}}}',
+        "status": "Bad Request",
+    }
+    error = google_errors.ClientError(400, response_json)
+    result = client.handle_llm_error(error)
+    assert isinstance(result, ContextWindowExceededError)
+
+
+def test_google_client_handle_llm_error_context_exceeded_returns_context_window_exceeded():
+    """Google 400 errors with 'context' + 'exceeded' should map to ContextWindowExceededError."""
+    client = GoogleVertexClient.__new__(GoogleVertexClient)
+    response_json = {
+        "message": '{"error": {"code": 400, "message": "Request context window exceeded the limit.", "status": "INVALID_ARGUMENT"}}',
+        "status": "Bad Request",
+    }
+    error = google_errors.ClientError(400, response_json)
+    result = client.handle_llm_error(error)
+    assert isinstance(result, ContextWindowExceededError)
+
+
+def test_google_client_handle_llm_error_generic_400_returns_bad_request():
+    """Google 400 errors without token/context keywords should map to LLMBadRequestError."""
+    client = GoogleVertexClient.__new__(GoogleVertexClient)
+    response_json = {
+        "message": '{"error": {"code": 400, "message": "Invalid argument: unsupported parameter.", "status": "INVALID_ARGUMENT"}}',
+        "status": "Bad Request",
+    }
+    error = google_errors.ClientError(400, response_json)
+    result = client.handle_llm_error(error)
+    assert isinstance(result, LLMBadRequestError)
+    assert not isinstance(result, ContextWindowExceededError)