From 57bb051ea423c63ad60463d9ee816eb7cdc5fae5 Mon Sep 17 00:00:00 2001
From: Sarah Wooders <sarahwooders@gmail.com>
Date: Fri, 31 Oct 2025 15:29:14 -0700
Subject: [PATCH] feat: add tool return truncation to summarization as a
 fallback [LET-5970] (#5859)

---
 letta/constants.py                            |  3 +
 letta/llm_api/anthropic_client.py             | 10 +++
 letta/llm_api/bedrock_client.py               |  1 +
 letta/llm_api/deepseek_client.py              |  1 +
 letta/llm_api/google_vertex_client.py         |  1 +
 letta/llm_api/groq_client.py                  |  1 +
 letta/llm_api/llm_client_base.py              | 16 +++-
 letta/llm_api/openai_client.py                |  8 +-
 letta/llm_api/xai_client.py                   |  1 +
 letta/schemas/message.py                      | 57 +++++++++---
 letta/services/summarizer/summarizer.py       | 30 ++++++-
 .../llm_model_configs/claude-4-5-haiku.json   |  9 ++
 tests/integration_test_summarizer.py          | 87 ++++++++++++++++++-
 13 files changed, 209 insertions(+), 16 deletions(-)
 create mode 100644 tests/configs/llm_model_configs/claude-4-5-haiku.json

diff --git a/letta/constants.py b/letta/constants.py
index 8990c227..5e568f91 100644
--- a/letta/constants.py
+++ b/letta/constants.py
@@ -378,6 +378,9 @@ FUNCTION_RETURN_CHAR_LIMIT = 50000  # ~300 words
 BASE_FUNCTION_RETURN_CHAR_LIMIT = 50000  # same as regular function limit
 FILE_IS_TRUNCATED_WARNING = "# NOTE: This block is truncated, use functions to view the full content."
 
+# Tool return truncation limit for LLM context window management
+TOOL_RETURN_TRUNCATION_CHARS = 5000
+
 MAX_PAUSE_HEARTBEATS = 360  # in min
 
 MESSAGE_CHATGPT_FUNCTION_MODEL = "gpt-3.5-turbo"
diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py
index d1dbe716..b9808e3b 100644
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -231,6 +231,7 @@ class AnthropicClient(LLMClientBase):
         tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         # TODO: This needs to get cleaned up. The logic here is pretty confusing.
         # TODO: I really want to get rid of prefixing, it's a recipe for disaster code maintenance wise
@@ -336,6 +337,7 @@ class AnthropicClient(LLMClientBase):
             # if react, use native content + strip heartbeats
             native_content=is_v1,
             strip_request_heartbeat=is_v1,
+            tool_return_truncation_chars=tool_return_truncation_chars,
         )
 
         # Ensure first message is user
@@ -474,6 +476,14 @@ class AnthropicClient(LLMClientBase):
 
     @trace_method
     def handle_llm_error(self, e: Exception) -> Exception:
+        # make sure to check for overflow errors, regardless of error type
+        error_str = str(e).lower()
+        if "prompt is too long" in error_str or "exceed context limit" in error_str or "exceeds context" in error_str:
+            logger.warning(f"[Anthropic] Context window exceeded: {str(e)}")
+            return ContextWindowExceededError(
+                message=f"Context window exceeded for Anthropic: {str(e)}",
+            )
+
         if isinstance(e, anthropic.APITimeoutError):
             logger.warning(f"[Anthropic] Request timeout: {e}")
             return LLMTimeoutError(
diff --git a/letta/llm_api/bedrock_client.py b/letta/llm_api/bedrock_client.py
index d2d5722c..d471424f 100644
--- a/letta/llm_api/bedrock_client.py
+++ b/letta/llm_api/bedrock_client.py
@@ -71,6 +71,7 @@ class BedrockClient(AnthropicClient):
         tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)
         # remove disallowed fields
diff --git a/letta/llm_api/deepseek_client.py b/letta/llm_api/deepseek_client.py
index deba4b53..e21d58b4 100644
--- a/letta/llm_api/deepseek_client.py
+++ b/letta/llm_api/deepseek_client.py
@@ -340,6 +340,7 @@ class DeepseekClient(OpenAIClient):
         tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         # Override put_inner_thoughts_in_kwargs to False for DeepSeek
         llm_config.put_inner_thoughts_in_kwargs = False
diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index abf3903b..a298950d 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -291,6 +291,7 @@ class GoogleVertexClient(LLMClientBase):
         tools: List[dict],
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         """
         Constructs a request object in the expected data format for this client.
diff --git a/letta/llm_api/groq_client.py b/letta/llm_api/groq_client.py
index f9d3baae..8a71ab28 100644
--- a/letta/llm_api/groq_client.py
+++ b/letta/llm_api/groq_client.py
@@ -30,6 +30,7 @@ class GroqClient(OpenAIClient):
         tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)
 
diff --git a/letta/llm_api/llm_client_base.py b/letta/llm_api/llm_client_base.py
index 4a2d8980..881239d3 100644
--- a/letta/llm_api/llm_client_base.py
+++ b/letta/llm_api/llm_client_base.py
@@ -47,13 +47,22 @@ class LLMClientBase:
         force_tool_call: Optional[str] = None,
         telemetry_manager: Optional["TelemetryManager"] = None,
         step_id: Optional[str] = None,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> Union[ChatCompletionResponse, Stream[ChatCompletionChunk]]:
         """
         Issues a request to the downstream model endpoint and parses response.
         If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(agent_type, messages, llm_config, tools, force_tool_call)
+        request_data = self.build_request_data(
+            agent_type,
+            messages,
+            llm_config,
+            tools,
+            force_tool_call,
+            requires_subsequent_tool_call=False,
+            tool_return_truncation_chars=tool_return_truncation_chars,
+        )
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
@@ -128,9 +137,14 @@ class LLMClientBase:
         tools: List[dict],
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         """
         Constructs a request object in the expected data format for this client.
+
+        Args:
+            tool_return_truncation_chars: If set, truncates tool return content to this many characters.
+                                         Used during summarization to avoid context window issues.
         """
         raise NotImplementedError
 
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 7a5257e0..2e36b747 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -229,6 +229,7 @@ class OpenAIClient(LLMClientBase):
         tools: Optional[List[dict]] = None,  # Keep as dict for now as per base class
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         """
         Constructs a request object in the expected data format for the OpenAI Responses API.
@@ -236,7 +237,9 @@ class OpenAIClient(LLMClientBase):
         if llm_config.put_inner_thoughts_in_kwargs:
             raise ValueError("Inner thoughts in kwargs are not supported for the OpenAI Responses API")
 
-        openai_messages_list = PydanticMessage.to_openai_responses_dicts_from_list(messages)
+        openai_messages_list = PydanticMessage.to_openai_responses_dicts_from_list(
+            messages, tool_return_truncation_chars=tool_return_truncation_chars
+        )
         # Add multi-modal support for Responses API by rewriting user messages
         # into input_text/input_image parts.
         openai_messages_list = fill_image_content_in_responses_input(openai_messages_list, messages)
@@ -377,6 +380,7 @@ class OpenAIClient(LLMClientBase):
         tools: Optional[List[dict]] = None,  # Keep as dict for now as per base class
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         """
         Constructs a request object in the expected data format for the OpenAI API.
@@ -390,6 +394,7 @@ class OpenAIClient(LLMClientBase):
                 tools=tools,
                 force_tool_call=force_tool_call,
                 requires_subsequent_tool_call=requires_subsequent_tool_call,
+                tool_return_truncation_chars=tool_return_truncation_chars,
             )
 
         if agent_type == AgentType.letta_v1_agent:
@@ -419,6 +424,7 @@ class OpenAIClient(LLMClientBase):
                 messages,
                 put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
                 use_developer_message=use_developer_message,
+                tool_return_truncation_chars=tool_return_truncation_chars,
             )
         ]
 
diff --git a/letta/llm_api/xai_client.py b/letta/llm_api/xai_client.py
index f39dfbc9..1085e5a6 100644
--- a/letta/llm_api/xai_client.py
+++ b/letta/llm_api/xai_client.py
@@ -30,6 +30,7 @@ class XAIClient(OpenAIClient):
         tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
         requires_subsequent_tool_call: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict:
         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)
 
diff --git a/letta/schemas/message.py b/letta/schemas/message.py
index a9d0b732..8773958e 100644
--- a/letta/schemas/message.py
+++ b/letta/schemas/message.py
@@ -56,6 +56,14 @@ from letta.system import unpack_message
 from letta.utils import parse_json, validate_function_response
 
 
+def truncate_tool_return(content: Optional[str], limit: Optional[int]) -> Optional[str]:
+    if limit is None or content is None:
+        return content
+    if len(content) <= limit:
+        return content
+    return content[:limit] + f"... [truncated {len(content) - limit} chars]"
+
+
 def add_inner_thoughts_to_tool_call(
     tool_call: OpenAIToolCall,
     inner_thoughts: str,
@@ -1090,6 +1098,7 @@ class Message(BaseMessage):
         # if true, then treat the content field as AssistantMessage
         native_content: bool = False,
         strip_request_heartbeat: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict | None:
         """Go from Message class to ChatCompletion message object"""
         assert not (native_content and put_inner_thoughts_in_kwargs), "native_content and put_inner_thoughts_in_kwargs cannot both be true"
@@ -1191,16 +1200,18 @@ class Message(BaseMessage):
                 tool_return = self.tool_returns[0]
                 if not tool_return.tool_call_id:
                     raise TypeError("OpenAI API requires tool_call_id to be set.")
+                func_response = truncate_tool_return(tool_return.func_response, tool_return_truncation_chars)
                 openai_message = {
-                    "content": tool_return.func_response,
+                    "content": func_response,
                     "role": self.role,
                     "tool_call_id": tool_return.tool_call_id[:max_tool_id_length] if max_tool_id_length else tool_return.tool_call_id,
                 }
             else:
                 # Legacy fallback for old message format
                 assert self.tool_call_id is not None, vars(self)
+                legacy_content = truncate_tool_return(text_content, tool_return_truncation_chars)
                 openai_message = {
-                    "content": text_content,
+                    "content": legacy_content,
                     "role": self.role,
                     "tool_call_id": self.tool_call_id[:max_tool_id_length] if max_tool_id_length else self.tool_call_id,
                 }
@@ -1232,6 +1243,7 @@ class Message(BaseMessage):
         max_tool_id_length: int = TOOL_CALL_ID_MAX_LEN,
         put_inner_thoughts_in_kwargs: bool = False,
         use_developer_message: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> List[dict]:
         messages = Message.filter_messages_for_llm_api(messages)
         result: List[dict] = []
@@ -1256,6 +1268,7 @@ class Message(BaseMessage):
                 max_tool_id_length=max_tool_id_length,
                 put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
                 use_developer_message=use_developer_message,
+                tool_return_truncation_chars=tool_return_truncation_chars,
             )
             if d is not None:
                 result.append(d)
@@ -1265,6 +1278,7 @@ class Message(BaseMessage):
     def to_openai_responses_dicts(
         self,
         max_tool_id_length: int = TOOL_CALL_ID_MAX_LEN,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> List[dict]:
         """Go from Message class to ChatCompletion message object"""
 
@@ -1345,22 +1359,24 @@ class Message(BaseMessage):
                 for tool_return in self.tool_returns:
                     if not tool_return.tool_call_id:
                         raise TypeError("OpenAI Responses API requires tool_call_id to be set.")
+                    func_response = truncate_tool_return(tool_return.func_response, tool_return_truncation_chars)
                     message_dicts.append(
                         {
                             "type": "function_call_output",
                             "call_id": tool_return.tool_call_id[:max_tool_id_length] if max_tool_id_length else tool_return.tool_call_id,
-                            "output": tool_return.func_response,
+                            "output": func_response,
                         }
                     )
             else:
                 # Legacy fallback for old message format
                 assert self.tool_call_id is not None, vars(self)
                 assert len(self.content) == 1 and isinstance(self.content[0], TextContent), vars(self)
+                legacy_output = truncate_tool_return(self.content[0].text, tool_return_truncation_chars)
                 message_dicts.append(
                     {
                         "type": "function_call_output",
                         "call_id": self.tool_call_id[:max_tool_id_length] if max_tool_id_length else self.tool_call_id,
-                        "output": self.content[0].text,
+                        "output": legacy_output,
                     }
                 )
 
@@ -1373,11 +1389,16 @@ class Message(BaseMessage):
     def to_openai_responses_dicts_from_list(
         messages: List[Message],
         max_tool_id_length: int = TOOL_CALL_ID_MAX_LEN,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> List[dict]:
         messages = Message.filter_messages_for_llm_api(messages)
         result = []
         for message in messages:
-            result.extend(message.to_openai_responses_dicts(max_tool_id_length=max_tool_id_length))
+            result.extend(
+                message.to_openai_responses_dicts(
+                    max_tool_id_length=max_tool_id_length, tool_return_truncation_chars=tool_return_truncation_chars
+                )
+            )
         return result
 
     def to_anthropic_dict(
@@ -1388,6 +1409,7 @@ class Message(BaseMessage):
         # if true, then treat the content field as AssistantMessage
         native_content: bool = False,
         strip_request_heartbeat: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict | None:
         """
         Convert to an Anthropic message dictionary
@@ -1563,11 +1585,12 @@ class Message(BaseMessage):
             for tool_return in self.tool_returns:
                 if not tool_return.tool_call_id:
                     raise TypeError("Anthropic API requires tool_use_id to be set.")
+                func_response = truncate_tool_return(tool_return.func_response, tool_return_truncation_chars)
                 content.append(
                     {
                         "type": "tool_result",
                         "tool_use_id": tool_return.tool_call_id,
-                        "content": tool_return.func_response,
+                        "content": func_response,
                     }
                 )
             if content:
@@ -1580,6 +1603,7 @@ class Message(BaseMessage):
                     raise TypeError("Anthropic API requires tool_use_id to be set.")
 
                 # This is for legacy reasons
+                legacy_content = truncate_tool_return(text_content, tool_return_truncation_chars)
                 anthropic_message = {
                     "role": "user",  # NOTE: diff
                     "content": [
@@ -1587,7 +1611,7 @@ class Message(BaseMessage):
                         {
                             "type": "tool_result",
                             "tool_use_id": self.tool_call_id,
-                            "content": text_content,
+                            "content": legacy_content,
                         }
                     ],
                 }
@@ -1606,6 +1630,7 @@ class Message(BaseMessage):
         # if true, then treat the content field as AssistantMessage
         native_content: bool = False,
         strip_request_heartbeat: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> List[dict]:
         messages = Message.filter_messages_for_llm_api(messages)
         result = [
@@ -1615,6 +1640,7 @@ class Message(BaseMessage):
                 put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
                 native_content=native_content,
                 strip_request_heartbeat=strip_request_heartbeat,
+                tool_return_truncation_chars=tool_return_truncation_chars,
             )
             for m in messages
         ]
@@ -1628,6 +1654,7 @@ class Message(BaseMessage):
         # if true, then treat the content field as AssistantMessage
         native_content: bool = False,
         strip_request_heartbeat: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ) -> dict | None:
         """
         Go from Message class to Google AI REST message object
@@ -1776,11 +1803,14 @@ class Message(BaseMessage):
                     # Use the function name if available, otherwise use tool_call_id
                     function_name = self.name if self.name else tool_return.tool_call_id
 
+                    # Truncate the tool return if needed
+                    func_response = truncate_tool_return(tool_return.func_response, tool_return_truncation_chars)
+
                     # NOTE: Google AI API wants the function response as JSON only, no string
                     try:
-                        function_response = parse_json(tool_return.func_response)
+                        function_response = parse_json(func_response)
                     except:
-                        function_response = {"function_response": tool_return.func_response}
+                        function_response = {"function_response": func_response}
 
                     parts.append(
                         {
@@ -1808,11 +1838,14 @@ class Message(BaseMessage):
                 else:
                     function_name = self.name
 
+                # Truncate the legacy content if needed
+                legacy_content = truncate_tool_return(text_content, tool_return_truncation_chars)
+
                 # NOTE: Google AI API wants the function response as JSON only, no string
                 try:
-                    function_response = parse_json(text_content)
+                    function_response = parse_json(legacy_content)
                 except:
-                    function_response = {"function_response": text_content}
+                    function_response = {"function_response": legacy_content}
 
                 google_ai_message = {
                     "role": "function",
@@ -1848,6 +1881,7 @@ class Message(BaseMessage):
         current_model: str,
         put_inner_thoughts_in_kwargs: bool = True,
         native_content: bool = False,
+        tool_return_truncation_chars: Optional[int] = None,
     ):
         messages = Message.filter_messages_for_llm_api(messages)
         result = [
@@ -1855,6 +1889,7 @@ class Message(BaseMessage):
                 current_model=current_model,
                 put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
                 native_content=native_content,
+                tool_return_truncation_chars=tool_return_truncation_chars,
             )
             for m in messages
         ]
diff --git a/letta/services/summarizer/summarizer.py b/letta/services/summarizer/summarizer.py
index 84cb62ca..24422fb7 100644
--- a/letta/services/summarizer/summarizer.py
+++ b/letta/services/summarizer/summarizer.py
@@ -4,7 +4,13 @@ import traceback
 from typing import List, Optional, Tuple, Union
 
 from letta.agents.ephemeral_summary_agent import EphemeralSummaryAgent
-from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG, MESSAGE_SUMMARY_REQUEST_ACK
+from letta.constants import (
+    DEFAULT_MESSAGE_TOOL,
+    DEFAULT_MESSAGE_TOOL_KWARG,
+    MESSAGE_SUMMARY_REQUEST_ACK,
+    TOOL_RETURN_TRUNCATION_CHARS,
+)
+from letta.errors import ContextWindowExceededError
 from letta.helpers.message_helper import convert_message_creates_to_messages
 from letta.llm_api.llm_client import LLMClient
 from letta.log import get_logger
@@ -394,7 +400,27 @@ async def simple_summary(messages: List[Message], llm_config: LLMConfig, actor:
         response_data = await llm_client.request_async(request_data, summarizer_llm_config)
     except Exception as e:
         # handle LLM error (likely a context window exceeded error)
-        raise llm_client.handle_llm_error(e)
+        try:
+            raise llm_client.handle_llm_error(e)
+        except ContextWindowExceededError as context_error:
+            logger.warning(
+                f"Context window exceeded during summarization, falling back to truncated tool returns. Original error: {context_error}"
+            )
+
+            # Fallback: rebuild request with truncated tool returns
+            request_data = llm_client.build_request_data(
+                AgentType.letta_v1_agent,
+                input_messages_obj,
+                summarizer_llm_config,
+                tools=[],
+                tool_return_truncation_chars=TOOL_RETURN_TRUNCATION_CHARS,
+            )
+
+            try:
+                response_data = await llm_client.request_async(request_data, summarizer_llm_config)
+            except Exception as fallback_error:
+                logger.error(f"Fallback summarization also failed: {fallback_error}")
+                raise llm_client.handle_llm_error(fallback_error)
     response = llm_client.convert_response_to_chat_completion(response_data, input_messages_obj, summarizer_llm_config)
     if response.choices[0].message.content is None:
         logger.warning("No content returned from summarizer")
diff --git a/tests/configs/llm_model_configs/claude-4-5-haiku.json b/tests/configs/llm_model_configs/claude-4-5-haiku.json
new file mode 100644
index 00000000..93654ec4
--- /dev/null
+++ b/tests/configs/llm_model_configs/claude-4-5-haiku.json
@@ -0,0 +1,9 @@
+{
+  "model": "claude-haiku-4-5",
+  "model_endpoint_type": "anthropic",
+  "model_endpoint": "https://api.anthropic.com/v1",
+  "model_wrapper": null,
+  "context_window": 200000,
+  "put_inner_thoughts_in_kwargs": true,
+  "enable_reasoner": true
+}
diff --git a/tests/integration_test_summarizer.py b/tests/integration_test_summarizer.py
index 29f16dd5..b4b45fff 100644
--- a/tests/integration_test_summarizer.py
+++ b/tests/integration_test_summarizer.py
@@ -39,13 +39,15 @@ def get_llm_config(filename: str, llm_config_dir: str = "tests/configs/llm_model
 # Test configurations - using a subset of models for summarization tests
 all_configs = [
     "openai-gpt-5-mini.json",
+    "claude-4-5-haiku.json",
+    "gemini-2.5-flash.json",
+    # "gemini-2.5-flash-vertex.json",  # Requires Vertex AI credentials
     # "openai-gpt-4.1.json",
     # "openai-o1.json",
     # "openai-o3.json",
     # "openai-o4-mini.json",
     # "claude-4-sonnet.json",
     # "claude-3-7-sonnet.json",
-    # "gemini-2.5-flash-vertex.json",
     # "gemini-2.5-pro-vertex.json",
 ]
 
@@ -517,3 +519,86 @@ async def test_summarize_multiple_large_tool_calls(server: SyncServer, actor, ll
         assert hasattr(msg, "content")
 
     print(f"Summarized {len(in_context_messages)} messages with {total_content_size} chars to {len(result)} messages")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "llm_config",
+    TESTED_LLM_CONFIGS,
+    ids=[c.model for c in TESTED_LLM_CONFIGS],
+)
+async def test_summarize_truncates_large_tool_return(server: SyncServer, actor, llm_config: LLMConfig):
+    """
+    Test that summarization properly truncates very large tool returns.
+    This ensures that oversized tool returns don't consume excessive context.
+    """
+    # Create an extremely large tool return (100k chars)
+    large_return = create_large_tool_return(100000)
+    original_size = len(large_return)
+
+    # Create messages with a large tool return
+    messages = [
+        PydanticMessage(
+            role=MessageRole.user,
+            content=[TextContent(type="text", text="Please run the database query.")],
+        ),
+        PydanticMessage(
+            role=MessageRole.assistant,
+            content=[
+                TextContent(type="text", text="Running query..."),
+                ToolCallContent(
+                    type="tool_call",
+                    id="call_1",
+                    name="run_query",
+                    input={"query": "SELECT * FROM large_table"},
+                ),
+            ],
+        ),
+        PydanticMessage(
+            role=MessageRole.tool,
+            tool_call_id="call_1",
+            content=[
+                ToolReturnContent(
+                    type="tool_return",
+                    tool_call_id="call_1",
+                    content=large_return,
+                    is_error=False,
+                )
+            ],
+        ),
+        PydanticMessage(
+            role=MessageRole.assistant,
+            content=[TextContent(type="text", text="Query completed successfully with many results.")],
+        ),
+    ]
+
+    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
+
+    # Verify the original tool return is indeed large
+    assert original_size > 90000, f"Expected tool return >90k chars, got {original_size}"
+
+    # Run summarization
+    result = await run_summarization(server, agent_state, in_context_messages, actor)
+
+    # Verify result
+    assert isinstance(result, list)
+    assert len(result) >= 1
+
+    # Find tool return messages in the result and verify truncation occurred
+    tool_returns_found = False
+    for msg in result:
+        if msg.role == MessageRole.tool:
+            for content in msg.content:
+                if isinstance(content, ToolReturnContent):
+                    tool_returns_found = True
+                    result_size = len(content.content)
+                    # Verify that the tool return has been truncated
+                    assert result_size < original_size, (
+                        f"Expected tool return to be truncated from {original_size} chars, but got {result_size} chars"
+                    )
+                    print(f"Tool return successfully truncated from {original_size} to {result_size} chars")
+
+    # If we didn't find any tool returns in the result, that's also acceptable
+    # (they may have been completely removed during aggressive summarization)
+    if not tool_returns_found:
+        print("Tool returns were completely removed during summarization")