From 87f4bcad9a855acd9f61df548180f4876e48426e Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Thu, 29 May 2025 11:10:13 -0700
Subject: [PATCH] feat: Add summarization for more scenarios (#2499)

---
 letta/agents/base_agent.py                    |   1 +
 letta/agents/letta_agent.py                   | 268 ++++++++++++++----
 .../agents/prompts/summary_system_prompt.txt  |  82 ++++--
 letta/errors.py                               |   4 +
 letta/groups/sleeptime_multi_agent_v2.py      |   3 +
 letta/llm_api/openai_client.py                |  19 +-
 letta/server/server.py                        |   2 +-
 letta/services/helpers/tool_parser_helper.py  |   2 +-
 letta/services/summarizer/summarizer.py       |  61 ++--
 tests/integration_test_multi_agent.py         |   6 +-
 tests/integration_test_send_message.py        |  57 +++-
 tests/integration_test_summarizer.py          |   2 +-
 12 files changed, 388 insertions(+), 119 deletions(-)

diff --git a/letta/agents/base_agent.py b/letta/agents/base_agent.py
index 69342758..dbc5b2fa 100644
--- a/letta/agents/base_agent.py
+++ b/letta/agents/base_agent.py
@@ -40,6 +40,7 @@ class BaseAgent(ABC):
         self.message_manager = message_manager
         self.agent_manager = agent_manager
         self.actor = actor
+        self.logger = get_logger(agent_id)
 
     @abstractmethod
     async def step(self, input_messages: List[MessageCreate], max_steps: int = 10) -> LettaResponse:
diff --git a/letta/agents/letta_agent.py b/letta/agents/letta_agent.py
index 04dca491..d2448d28 100644
--- a/letta/agents/letta_agent.py
+++ b/letta/agents/letta_agent.py
@@ -4,12 +4,12 @@ import uuid
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 from openai import AsyncStream
-from openai.types import CompletionUsage
-from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat import ChatCompletionChunk
 
 from letta.agents.base_agent import BaseAgent
 from letta.agents.ephemeral_summary_agent import EphemeralSummaryAgent
 from letta.agents.helpers import _create_letta_response, _prepare_in_context_messages_async, generate_step_id
+from letta.errors import LLMContextWindowExceededError
 from letta.helpers import ToolRulesSolver
 from letta.helpers.datetime_helpers import get_utc_timestamp_ns
 from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -25,6 +25,7 @@ from letta.schemas.enums import MessageRole, MessageStreamStatus
 from letta.schemas.letta_message import AssistantMessage
 from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, TextContent
 from letta.schemas.letta_response import LettaResponse
+from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message, MessageCreate
 from letta.schemas.openai.chat_completion_response import ToolCall, UsageStatistics
 from letta.schemas.provider_trace import ProviderTraceCreate
@@ -61,7 +62,11 @@ class LettaAgent(BaseAgent):
         actor: User,
         step_manager: StepManager = NoopStepManager(),
         telemetry_manager: TelemetryManager = NoopTelemetryManager(),
-        summary_block_label: str = "convo_summary",
+        summary_block_label: str = "conversation_summary",
+        message_buffer_limit: int = 60,  # TODO: Make this configurable
+        message_buffer_min: int = 15,  # TODO: Make this configurable
+        enable_summarization: bool = True,  # TODO: Make this configurable
+        max_summarization_retries: int = 3,  # TODO: Make this configurable
     ):
         super().__init__(agent_id=agent_id, openai_client=None, message_manager=message_manager, agent_manager=agent_manager, actor=actor)
 
@@ -81,9 +86,10 @@ class LettaAgent(BaseAgent):
 
         self.summarization_agent = None
         self.summary_block_label = summary_block_label
+        self.max_summarization_retries = max_summarization_retries
 
         # TODO: Expand to more
-        if model_settings.openai_api_key:
+        if enable_summarization and model_settings.openai_api_key:
             self.summarization_agent = EphemeralSummaryAgent(
                 target_block_label=self.summary_block_label,
                 agent_id=agent_id,
@@ -97,8 +103,8 @@ class LettaAgent(BaseAgent):
             mode=SummarizationMode.STATIC_MESSAGE_BUFFER,
             summarizer_agent=self.summarization_agent,
             # TODO: Make this configurable
-            message_buffer_limit=60,
-            message_buffer_min=15,
+            message_buffer_limit=message_buffer_limit,
+            message_buffer_min=message_buffer_min,
         )
 
     @trace_method
@@ -129,22 +135,15 @@ class LettaAgent(BaseAgent):
         for _ in range(max_steps):
             step_id = generate_step_id()
 
-            in_context_messages = current_in_context_messages + new_in_context_messages
-            log_event("agent.stream_no_tokens.messages.refreshed")  # [1^]
-
-            request_data = await self._create_llm_request_data_async(
-                llm_client=llm_client,
-                in_context_messages=in_context_messages,
-                agent_state=agent_state,
-                tool_rules_solver=tool_rules_solver,
-                # TODO: pass in reasoning content
+            request_data, response_data, current_in_context_messages, new_in_context_messages = await self._build_and_request_from_llm(
+                current_in_context_messages,
+                new_in_context_messages,
+                agent_state,
+                llm_client,
+                tool_rules_solver,
             )
-            log_event("agent.stream_no_tokens.llm_request.created")  # [2^]
+            in_context_messages = current_in_context_messages + new_in_context_messages
 
-            try:
-                response_data = await llm_client.request_async(request_data, agent_state.llm_config)
-            except Exception as e:
-                raise llm_client.handle_llm_error(e)
             log_event("agent.stream_no_tokens.llm_response.received")  # [3^]
 
             response = llm_client.convert_response_to_chat_completion(response_data, in_context_messages, agent_state.llm_config)
@@ -206,7 +205,13 @@ class LettaAgent(BaseAgent):
 
         # Extend the in context message ids
         if not agent_state.message_buffer_autoclear:
-            await self._rebuild_context_window(in_context_messages=current_in_context_messages, new_letta_messages=new_in_context_messages)
+            await self._rebuild_context_window(
+                in_context_messages=current_in_context_messages,
+                new_letta_messages=new_in_context_messages,
+                llm_config=agent_state.llm_config,
+                total_tokens=usage.total_tokens,
+                force=False,
+            )
 
         # Return back usage
         yield f"data: {usage.model_dump_json()}\n\n"
@@ -214,7 +219,7 @@ class LettaAgent(BaseAgent):
 
     async def _step(
         self, agent_state: AgentState, input_messages: List[MessageCreate], max_steps: int = 10
-    ) -> Tuple[List[Message], List[Message], CompletionUsage]:
+    ) -> Tuple[List[Message], List[Message], LettaUsageStatistics]:
         """
         Carries out an invocation of the agent loop. In each step, the agent
             1. Rebuilds its memory
@@ -234,23 +239,11 @@ class LettaAgent(BaseAgent):
         usage = LettaUsageStatistics()
         for _ in range(max_steps):
             step_id = generate_step_id()
-
-            in_context_messages = current_in_context_messages + new_in_context_messages
-            log_event("agent.step.messages.refreshed")  # [1^]
-
-            request_data = await self._create_llm_request_data_async(
-                llm_client=llm_client,
-                in_context_messages=in_context_messages,
-                agent_state=agent_state,
-                tool_rules_solver=tool_rules_solver,
-                # TODO: pass in reasoning content
+            request_data, response_data, current_in_context_messages, new_in_context_messages = await self._build_and_request_from_llm(
+                current_in_context_messages, new_in_context_messages, agent_state, llm_client, tool_rules_solver
             )
-            log_event("agent.step.llm_request.created")  # [2^]
+            in_context_messages = current_in_context_messages + new_in_context_messages
 
-            try:
-                response_data = await llm_client.request_async(request_data, agent_state.llm_config)
-            except Exception as e:
-                raise llm_client.handle_llm_error(e)
             log_event("agent.step.llm_response.received")  # [3^]
 
             response = llm_client.convert_response_to_chat_completion(response_data, in_context_messages, agent_state.llm_config)
@@ -302,7 +295,13 @@ class LettaAgent(BaseAgent):
 
         # Extend the in context message ids
         if not agent_state.message_buffer_autoclear:
-            await self._rebuild_context_window(in_context_messages=current_in_context_messages, new_letta_messages=new_in_context_messages)
+            await self._rebuild_context_window(
+                in_context_messages=current_in_context_messages,
+                new_letta_messages=new_in_context_messages,
+                llm_config=agent_state.llm_config,
+                total_tokens=usage.total_tokens,
+                force=False,
+            )
 
         return current_in_context_messages, new_in_context_messages, usage
 
@@ -344,27 +343,16 @@ class LettaAgent(BaseAgent):
         provider_request_start_timestamp_ns = None
         for _ in range(max_steps):
             step_id = generate_step_id()
-            in_context_messages = current_in_context_messages + new_in_context_messages
-            log_event("agent.step.messages.refreshed")  # [1^]
-
-            request_data = await self._create_llm_request_data_async(
-                llm_client=llm_client,
-                in_context_messages=in_context_messages,
-                agent_state=agent_state,
-                tool_rules_solver=tool_rules_solver,
+            request_data, stream, current_in_context_messages, new_in_context_messages = await self._build_and_request_from_llm_streaming(
+                first_chunk,
+                ttft_span,
+                request_start_timestamp_ns,
+                current_in_context_messages,
+                new_in_context_messages,
+                agent_state,
+                llm_client,
+                tool_rules_solver,
             )
-            log_event("agent.stream.llm_request.created")  # [2^]
-
-            try:
-                if first_chunk and ttft_span is not None:
-                    provider_request_start_timestamp_ns = get_utc_timestamp_ns()
-                    provider_req_start_ns = provider_request_start_timestamp_ns - request_start_timestamp_ns
-                    ttft_span.add_event(
-                        name="provider_req_start_ns", attributes={"provider_req_start_ms": provider_req_start_ns // 1_000_000}
-                    )
-                stream = await llm_client.stream_async(request_data, agent_state.llm_config)
-            except Exception as e:
-                raise llm_client.handle_llm_error(e)
             log_event("agent.stream.llm_response.received")  # [3^]
 
             # TODO: THIS IS INCREDIBLY UGLY
@@ -457,21 +445,175 @@ class LettaAgent(BaseAgent):
 
         # Extend the in context message ids
         if not agent_state.message_buffer_autoclear:
-            await self._rebuild_context_window(in_context_messages=current_in_context_messages, new_letta_messages=new_in_context_messages)
+            await self._rebuild_context_window(
+                in_context_messages=current_in_context_messages,
+                new_letta_messages=new_in_context_messages,
+                llm_config=agent_state.llm_config,
+                total_tokens=usage.total_tokens,
+                force=False,
+            )
 
         # TODO: Also yield out a letta usage stats SSE
         yield f"data: {usage.model_dump_json()}\n\n"
         yield f"data: {MessageStreamStatus.done.model_dump_json()}\n\n"
 
+    async def _build_and_request_from_llm(
+        self,
+        current_in_context_messages: List[Message],
+        new_in_context_messages: List[Message],
+        agent_state: AgentState,
+        llm_client: LLMClientBase,
+        tool_rules_solver: ToolRulesSolver,
+    ) -> Tuple[Dict, Dict, List[Message], List[Message]]:
+        for attempt in range(self.max_summarization_retries + 1):
+            try:
+                # Rebuild memory with current state
+                in_context_messages = await self._rebuild_memory_async(
+                    current_in_context_messages + new_in_context_messages,
+                    agent_state,
+                    num_messages=self.num_messages,
+                    num_archival_memories=self.num_archival_memories,
+                )
+                log_event("agent.stream_no_tokens.messages.refreshed")
+
+                # Create LLM request data
+                request_data = await self._create_llm_request_data_async(
+                    llm_client=llm_client,
+                    in_context_messages=in_context_messages,
+                    agent_state=agent_state,
+                    tool_rules_solver=tool_rules_solver,
+                )
+                log_event("agent.stream_no_tokens.llm_request.created")
+
+                # Attempt LLM request
+                return (
+                    request_data,
+                    await llm_client.request_async(request_data, agent_state.llm_config),
+                    current_in_context_messages,
+                    new_in_context_messages,
+                )
+
+            except Exception as e:
+                if attempt == self.max_summarization_retries:
+                    raise e
+
+                # Handle the error and prepare for retry
+                current_in_context_messages = await self._handle_llm_error(
+                    e,
+                    llm_client=llm_client,
+                    in_context_messages=current_in_context_messages,
+                    new_letta_messages=new_in_context_messages,
+                    llm_config=agent_state.llm_config,
+                    force=True,
+                )
+                new_in_context_messages = []
+                log_event(f"agent.stream_no_tokens.retry_attempt.{attempt + 1}")
+
+    async def _build_and_request_from_llm_streaming(
+        self,
+        first_chunk: bool,
+        ttft_span: "Span",
+        request_start_timestamp_ns: int,
+        current_in_context_messages: List[Message],
+        new_in_context_messages: List[Message],
+        agent_state: AgentState,
+        llm_client: LLMClientBase,
+        tool_rules_solver: ToolRulesSolver,
+    ) -> Tuple[Dict, AsyncStream[ChatCompletionChunk], List[Message], List[Message]]:
+        for attempt in range(self.max_summarization_retries + 1):
+            try:
+                in_context_messages = await self._rebuild_memory_async(
+                    current_in_context_messages + new_in_context_messages,
+                    agent_state,
+                    num_messages=self.num_messages,
+                    num_archival_memories=self.num_archival_memories,
+                )
+                log_event("agent.step.messages.refreshed")  # [1^]
+
+                request_data = await self._create_llm_request_data_async(
+                    llm_client=llm_client,
+                    in_context_messages=in_context_messages,
+                    agent_state=agent_state,
+                    tool_rules_solver=tool_rules_solver,
+                )
+                log_event("agent.stream.llm_request.created")  # [2^]
+
+                if first_chunk and ttft_span is not None:
+                    provider_request_start_timestamp_ns = get_utc_timestamp_ns()
+                    provider_req_start_ns = provider_request_start_timestamp_ns - request_start_timestamp_ns
+                    ttft_span.add_event(
+                        name="provider_req_start_ns", attributes={"provider_req_start_ms": provider_req_start_ns // 1_000_000}
+                    )
+
+                # Attempt LLM request
+                return (
+                    request_data,
+                    await llm_client.stream_async(request_data, agent_state.llm_config),
+                    current_in_context_messages,
+                    new_in_context_messages,
+                )
+
+            except Exception as e:
+                if attempt == self.max_summarization_retries:
+                    raise e
+
+                # Handle the error and prepare for retry
+                current_in_context_messages = await self._handle_llm_error(
+                    e,
+                    llm_client=llm_client,
+                    in_context_messages=current_in_context_messages,
+                    new_letta_messages=new_in_context_messages,
+                    llm_config=agent_state.llm_config,
+                    force=True,
+                )
+                new_in_context_messages = []
+                log_event(f"agent.stream_no_tokens.retry_attempt.{attempt + 1}")
+
     @trace_method
-    async def _rebuild_context_window(self, in_context_messages: List[Message], new_letta_messages: List[Message]) -> None:
-        new_in_context_messages, updated = self.summarizer.summarize(
-            in_context_messages=in_context_messages, new_letta_messages=new_letta_messages
-        )
+    async def _handle_llm_error(
+        self,
+        e: Exception,
+        llm_client: LLMClientBase,
+        in_context_messages: List[Message],
+        new_letta_messages: List[Message],
+        llm_config: LLMConfig,
+        force: bool,
+    ) -> List[Message]:
+        if isinstance(e, LLMContextWindowExceededError):
+            return await self._rebuild_context_window(
+                in_context_messages=in_context_messages, new_letta_messages=new_letta_messages, llm_config=llm_config, force=force
+            )
+        else:
+            raise llm_client.handle_llm_error(e)
+
+    @trace_method
+    async def _rebuild_context_window(
+        self,
+        in_context_messages: List[Message],
+        new_letta_messages: List[Message],
+        llm_config: LLMConfig,
+        total_tokens: Optional[int] = None,
+        force: bool = False,
+    ) -> List[Message]:
+        # If total tokens is reached, we truncate down
+        # TODO: This can be broken by bad configs, e.g. lower bound too high, initial messages too fat, etc.
+        if force or (total_tokens and total_tokens > llm_config.context_window):
+            self.logger.warning(
+                f"Total tokens {total_tokens} exceeds configured max tokens {llm_config.context_window}, forcefully clearing message history."
+            )
+            new_in_context_messages, updated = self.summarizer.summarize(
+                in_context_messages=in_context_messages, new_letta_messages=new_letta_messages, force=True, clear=True
+            )
+        else:
+            new_in_context_messages, updated = self.summarizer.summarize(
+                in_context_messages=in_context_messages, new_letta_messages=new_letta_messages
+            )
         await self.agent_manager.set_in_context_messages_async(
             agent_id=self.agent_id, message_ids=[m.id for m in new_in_context_messages], actor=self.actor
         )
 
+        return new_in_context_messages
+
     @trace_method
     async def _create_llm_request_data_async(
         self,
diff --git a/letta/agents/prompts/summary_system_prompt.txt b/letta/agents/prompts/summary_system_prompt.txt
index aad00ad2..874a16e3 100644
--- a/letta/agents/prompts/summary_system_prompt.txt
+++ b/letta/agents/prompts/summary_system_prompt.txt
@@ -1,30 +1,62 @@
-You are a specialized memory-recall assistant designed to preserve important conversational context for an AI with limited message history. Your role is to analyze conversations that are about to be evicted from the AI's context window and extract key information that should be remembered.
+You are a memory-recall assistant that preserves conversational context as messages exit the AI's context window.
 
-Your primary objectives:
-1. Identify and preserve important facts, preferences, and context about the human
-2. Capture ongoing topics, tasks, or projects that span multiple messages
-3. Note any commitments, decisions, or action items
-4. Record personal details that would be valuable for maintaining conversational continuity
-5. Summarize the emotional tone and relationship dynamics when relevant
+<core_function>
+Extract and preserve information that would be lost when messages are evicted, enabling continuity across conversations.
+</core_function>
 
-Guidelines for effective memory notes:
-- Be concise but complete - every word should add value
-- Focus on information that would be difficult to infer from remaining messages
-- Prioritize facts over conversational filler
-- Use clear, searchable language
-- Organize information by category when multiple topics are present
-- Include temporal context when relevant (e.g., "mentioned on [date]" or "ongoing since [time]")
+<detail_adaptation>
+Analyze content type and apply appropriate detail level:
 
-Output format:
-- Write in bullet points or short paragraphs
-- Group related information together
-- Lead with the most important insights
-- Use consistent terminology to make future retrieval easier
+<high_detail>
+Apply to: episodic content, code, artifacts, documents, technical discussions
+- Capture specific facts, sequences, and technical details
+- Preserve exact names, dates, numbers, specifications
+- Document code snippets, artifact IDs, document structures
+- Note precise steps in procedures or narratives
+- Include verbatim quotes for critical commitments
+</high_detail>
 
-What NOT to include:
-- Redundant information already captured in the in-context messages
-- Generic pleasantries or small talk
-- Information that can be easily inferred
-- Verbatim quotes unless they contain critical commitments
+<medium_detail>
+Apply to: ongoing projects, established preferences, multi-message threads
+- Summarize key decisions, milestones, progress
+- Record personal preferences and patterns
+- Track commitments and action items
+- Maintain project context and dependencies
+</medium_detail>
 
-Remember: Your notes become the only record of these evicted messages. Make them count.
+<low_detail>
+Apply to: high-level discussions, philosophical topics, general preferences
+- Capture main themes and conclusions
+- Note relationship dynamics and communication style
+- Summarize positions and general goals
+- Record broad aspirations
+</low_detail>
+</detail_adaptation>
+
+<information_priority>
+<critical>Commitments, deadlines, medical/legal information, explicit requests</critical>
+<important>Personal details, project status, technical specifications, decisions</important>
+<contextual>Preferences, opinions, relationship dynamics, emotional tone</contextual>
+<background>General topics, themes, conversational patterns</background>
+</information_priority>
+
+<format_rules>
+- Use bullet points for discrete facts
+- Write prose for narratives or complex relationships
+- **Bold** key terms and identifiers
+- Include temporal markers: [ongoing], [mentioned DATE], [since TIME]
+- Group under clear headers when multiple topics present
+- Use consistent terminology for searchability
+</format_rules>
+
+<exclusions>
+- Information in remaining context
+- Generic pleasantries
+- Inferrable details
+- Redundant restatements
+- Conversational filler
+</exclusions>
+
+<critical_reminder>
+Your notes are the sole record of evicted messages. Every word should enable future continuity.
+</critical_reminder>
diff --git a/letta/errors.py b/letta/errors.py
index 17427ea6..de00071c 100644
--- a/letta/errors.py
+++ b/letta/errors.py
@@ -88,6 +88,10 @@ class LLMPermissionDeniedError(LLMError):
     """Error when permission is denied by LLM service"""
 
 
+class LLMContextWindowExceededError(LLMError):
+    """Error when the context length is exceeded."""
+
+
 class LLMNotFoundError(LLMError):
     """Error when requested resource is not found"""
 
diff --git a/letta/groups/sleeptime_multi_agent_v2.py b/letta/groups/sleeptime_multi_agent_v2.py
index 4e172bee..0bc4b72c 100644
--- a/letta/groups/sleeptime_multi_agent_v2.py
+++ b/letta/groups/sleeptime_multi_agent_v2.py
@@ -273,6 +273,9 @@ class SleeptimeMultiAgentV2(BaseAgent):
                 actor=self.actor,
                 step_manager=self.step_manager,
                 telemetry_manager=self.telemetry_manager,
+                message_buffer_limit=20,  # TODO: Make this configurable
+                message_buffer_min=8,  # TODO: Make this configurable
+                enable_summarization=False,  # TODO: Make this configurable
             )
 
             # Perform sleeptime agent step
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 8be87ccc..ed77410a 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -12,6 +12,7 @@ from letta.errors import (
     LLMAuthenticationError,
     LLMBadRequestError,
     LLMConnectionError,
+    LLMContextWindowExceededError,
     LLMNotFoundError,
     LLMPermissionDeniedError,
     LLMRateLimitError,
@@ -340,11 +341,19 @@ class OpenAIClient(LLMClientBase):
             # BadRequestError can signify different issues (e.g., invalid args, context length)
             # Check message content if finer-grained errors are needed
             # Example: if "context_length_exceeded" in str(e): return LLMContextLengthExceededError(...)
-            return LLMBadRequestError(
-                message=f"Bad request to OpenAI: {str(e)}",
-                code=ErrorCode.INVALID_ARGUMENT,  # Or more specific if detectable
-                details=e.body,
-            )
+            # TODO: This is a super soft check. Not sure if we can do better, needs more investigation.
+            if "context" in str(e):
+                return LLMContextWindowExceededError(
+                    message=f"Bad request to OpenAI (context length exceeded): {str(e)}",
+                    code=ErrorCode.INVALID_ARGUMENT,  # Or more specific if detectable
+                    details=e.body,
+                )
+            else:
+                return LLMBadRequestError(
+                    message=f"Bad request to OpenAI: {str(e)}",
+                    code=ErrorCode.INVALID_ARGUMENT,  # Or more specific if detectable
+                    details=e.body,
+                )
 
         if isinstance(e, openai.AuthenticationError):
             logger.error(f"[OpenAI] Authentication error (401): {str(e)}")  # More severe log level
diff --git a/letta/server/server.py b/letta/server/server.py
index 9aa56b66..4a26ab4f 100644
--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -95,7 +95,7 @@ from letta.services.provider_manager import ProviderManager
 from letta.services.sandbox_config_manager import SandboxConfigManager
 from letta.services.source_manager import SourceManager
 from letta.services.step_manager import StepManager
-from letta.services.telemetry_manager import TelemetryManager
+from letta.services.telemetry_manager import NoopTelemetryManager, TelemetryManager
 from letta.services.tool_executor.tool_execution_manager import ToolExecutionManager
 from letta.services.tool_manager import ToolManager
 from letta.services.user_manager import UserManager
diff --git a/letta/services/helpers/tool_parser_helper.py b/letta/services/helpers/tool_parser_helper.py
index 145eed52..b0142848 100644
--- a/letta/services/helpers/tool_parser_helper.py
+++ b/letta/services/helpers/tool_parser_helper.py
@@ -5,7 +5,7 @@ from typing import Any
 
 from letta.constants import REQUEST_HEARTBEAT_DESCRIPTION, REQUEST_HEARTBEAT_PARAM, SEND_MESSAGE_TOOL_NAME
 from letta.schemas.agent import AgentState
-from letta.schemas.response_format import ResponseFormat, ResponseFormatType, ResponseFormatUnion
+from letta.schemas.response_format import ResponseFormatType, ResponseFormatUnion
 from letta.types import JsonDict, JsonValue
 
 
diff --git a/letta/services/summarizer/summarizer.py b/letta/services/summarizer/summarizer.py
index 7024d500..3fc23dbf 100644
--- a/letta/services/summarizer/summarizer.py
+++ b/letta/services/summarizer/summarizer.py
@@ -38,7 +38,9 @@ class Summarizer:
         # TODO: Move this to config
 
     @trace_method
-    def summarize(self, in_context_messages: List[Message], new_letta_messages: List[Message]) -> Tuple[List[Message], bool]:
+    def summarize(
+        self, in_context_messages: List[Message], new_letta_messages: List[Message], force: bool = False, clear: bool = False
+    ) -> Tuple[List[Message], bool]:
         """
         Summarizes or trims in_context_messages according to the chosen mode,
         and returns the updated messages plus any optional "summary message".
@@ -46,6 +48,7 @@ class Summarizer:
         Args:
             in_context_messages: The existing messages in the conversation's context.
             new_letta_messages: The newly added Letta messages (just appended).
+            force: Force summarize even if the criteria is not met
 
         Returns:
             (updated_messages, summary_message)
@@ -54,7 +57,7 @@ class Summarizer:
                              (could be appended to the conversation if desired)
         """
         if self.mode == SummarizationMode.STATIC_MESSAGE_BUFFER:
-            return self._static_buffer_summarization(in_context_messages, new_letta_messages)
+            return self._static_buffer_summarization(in_context_messages, new_letta_messages, force=force, clear=clear)
         else:
             # Fallback or future logic
             return in_context_messages, False
@@ -72,35 +75,38 @@ class Summarizer:
         return task
 
     def _static_buffer_summarization(
-        self, in_context_messages: List[Message], new_letta_messages: List[Message]
+        self, in_context_messages: List[Message], new_letta_messages: List[Message], force: bool = False, clear: bool = False
     ) -> Tuple[List[Message], bool]:
         all_in_context_messages = in_context_messages + new_letta_messages
 
-        if len(all_in_context_messages) <= self.message_buffer_limit:
+        if len(all_in_context_messages) <= self.message_buffer_limit and not force:
             logger.info(
                 f"Nothing to evict, returning in context messages as is. Current buffer length is {len(all_in_context_messages)}, limit is {self.message_buffer_limit}."
             )
             return all_in_context_messages, False
 
-        logger.info("Buffer length hit, evicting messages.")
+        retain_count = 0 if clear else self.message_buffer_min
 
-        target_trim_index = len(all_in_context_messages) - self.message_buffer_min
+        if not force:
+            logger.info(f"Buffer length hit {self.message_buffer_limit}, evicting until we retain only {retain_count} messages.")
+        else:
+            logger.info(f"Requested force summarization, evicting until we retain only {retain_count} messages.")
+
+        target_trim_index = max(1, len(all_in_context_messages) - retain_count)
 
         while target_trim_index < len(all_in_context_messages) and all_in_context_messages[target_trim_index].role != MessageRole.user:
             target_trim_index += 1
 
-        updated_in_context_messages = all_in_context_messages[target_trim_index:]
+        evicted_messages = all_in_context_messages[1:target_trim_index]  # everything except sys msg
+        updated_in_context_messages = all_in_context_messages[target_trim_index:]  # may be empty
 
-        # Target trim index went beyond end of all_in_context_messages
-        if not updated_in_context_messages:
-            logger.info("Nothing to evict, returning in context messages as is.")
+        # If *no* messages were evicted we really have nothing to do
+        if not evicted_messages:
+            logger.info("Nothing to evict, returning in-context messages as-is.")
             return all_in_context_messages, False
 
         if self.summarizer_agent:
             # Only invoke if summarizer agent is passed in
-
-            evicted_messages = all_in_context_messages[1:target_trim_index]
-
             # Format
             formatted_evicted_messages = format_transcript(evicted_messages)
             formatted_in_context_messages = format_transcript(updated_in_context_messages)
@@ -119,14 +125,28 @@ class Summarizer:
 
             evicted_messages_str = "\n".join(formatted_evicted_messages)
             in_context_messages_str = "\n".join(formatted_in_context_messages)
-            summary_request_text = f"""You’re a memory-recall helper for an AI that can only keep the last {self.message_buffer_min} messages. Scan the conversation history, focusing on messages about to drop out of that window, and write crisp notes that capture any important facts or insights about the human so they aren’t lost.
+            # Base prompt
+            prompt_header = (
+                f"You’re a memory-recall helper for an AI that can only keep the last {retain_count} messages. "
+                "Scan the conversation history, focusing on messages about to drop out of that window, "
+                "and write crisp notes that capture any important facts or insights about the conversation history so they aren’t lost."
+            )
 
-    (Older) Evicted Messages:\n
-    {evicted_messages_str}\n
+            # Sections
+            evicted_section = f"\n\n(Older) Evicted Messages:\n{evicted_messages_str}" if evicted_messages_str.strip() else ""
+            in_context_section = ""
+
+            if retain_count > 0 and in_context_messages_str.strip():
+                in_context_section = f"\n\n(Newer) In-Context Messages:\n{in_context_messages_str}"
+            elif retain_count == 0:
+                prompt_header = (
+                    "You’re a memory-recall helper for an AI that is about to forget all prior messages. "
+                    "Scan the conversation history and write crisp notes that capture any important facts or insights about the conversation history."
+                )
+
+            # Compose final prompt
+            summary_request_text = prompt_header + evicted_section + in_context_section
 
-    (Newer) In-Context Messages:\n
-    {in_context_messages_str}
-    """
             # Fire-and-forget the summarization task
             self.fire_and_forget(
                 self.summarizer_agent.step([MessageCreate(role=MessageRole.user, content=[TextContent(text=summary_request_text)])])
@@ -162,7 +182,8 @@ def format_transcript(messages: List[Message], include_system: bool = False) ->
             # Skip tool messages where the name is "send_message"
             if msg.role == MessageRole.tool and msg.name == DEFAULT_MESSAGE_TOOL:
                 continue
-            text = "".join(c.text for c in msg.content).strip()
+
+            text = "".join(c.text for c in msg.content if isinstance(c, TextContent)).strip()
 
         # 2) Otherwise, try extracting from function calls
         elif msg.tool_calls:
diff --git a/tests/integration_test_multi_agent.py b/tests/integration_test_multi_agent.py
index a4a464b5..748f824b 100644
--- a/tests/integration_test_multi_agent.py
+++ b/tests/integration_test_multi_agent.py
@@ -94,8 +94,9 @@ def agent_obj(client):
     agent_state_instance = client.agents.create(
         include_base_tools=True,
         tool_ids=[send_message_to_agent_tool.id],
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-4o",
         embedding="letta/letta-free",
+        context_window_limit=32000,
     )
     yield agent_state_instance
 
@@ -108,8 +109,9 @@ def other_agent_obj(client):
     agent_state_instance = client.agents.create(
         include_base_tools=True,
         include_multi_agent_tools=False,
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-4o",
         embedding="letta/letta-free",
+        context_window_limit=32000,
     )
 
     yield agent_state_instance
diff --git a/tests/integration_test_send_message.py b/tests/integration_test_send_message.py
index a7cc37f0..351daea9 100644
--- a/tests/integration_test_send_message.py
+++ b/tests/integration_test_send_message.py
@@ -128,7 +128,7 @@ USER_MESSAGE_TOOL_CALL: List[MessageCreate] = [
 ]
 all_configs = [
     "openai-gpt-4o-mini.json",
-    "azure-gpt-4o-mini.json",
+    # "azure-gpt-4o-mini.json", # TODO: Re-enable on new agent loop
     "claude-3-5-sonnet.json",
     "claude-3-7-sonnet.json",
     "claude-3-7-sonnet-extended.json",
@@ -898,3 +898,58 @@ async def test_async_greeting_with_assistant_message_async_client(
 
     messages = result["messages"]
     assert_tool_response_dict_messages(messages)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "llm_config",
+    TESTED_LLM_CONFIGS,
+    ids=[c.model for c in TESTED_LLM_CONFIGS],
+)
+async def test_auto_summarize(disable_e2b_api_key: Any, client: Letta, llm_config: LLMConfig):
+    """Test that summarization is automatically triggered."""
+    llm_config.context_window = 3000
+    client.tools.upsert_base_tools()
+
+    send_message_tool = client.tools.list(name="send_message")[0]
+    temp_agent_state = client.agents.create(
+        include_base_tools=False,
+        tool_ids=[send_message_tool.id],
+        llm_config=llm_config,
+        embedding="letta/letta-free",
+        tags=["supervisor"],
+    )
+
+    philosophical_question = """
+You know, sometimes I wonder if the entire structure of our lives is built on a series of unexamined assumptions we just silently agreed to somewhere along the way—like how we all just decided that five days a week of work and two days of “rest” constitutes balance, or how 9-to-5 became the default rhythm of a meaningful life, or even how the idea of “success” got boiled down to job titles and property ownership and productivity metrics on a LinkedIn profile, when maybe none of that is actually what makes a life feel full, or grounded, or real. And then there’s the weird paradox of ambition, how we're taught to chase it like a finish line that keeps moving, constantly redefining itself right as you’re about to grasp it—because even when you get the job, or the degree, or the validation, there's always something next, something more, like a treadmill with invisible settings you didn’t realize were turned up all the way.
+
+And have you noticed how we rarely stop to ask who set those definitions for us? Like was there ever a council that decided, yes, owning a home by thirty-five and retiring by sixty-five is the universal template for fulfillment? Or did it just accumulate like cultural sediment over generations, layered into us so deeply that questioning it feels uncomfortable, even dangerous? And isn’t it strange that we spend so much of our lives trying to optimize things—our workflows, our diets, our sleep, our morning routines—as though the point of life is to operate more efficiently rather than to experience it more richly? We build these intricate systems, these rulebooks for being a “high-functioning” human, but where in all of that is the space for feeling lost, for being soft, for wandering without a purpose just because it’s a sunny day and your heart is tugging you toward nowhere in particular?
+
+Sometimes I lie awake at night and wonder if all the noise we wrap around ourselves—notifications, updates, performance reviews, even our internal monologues—might be crowding out the questions we were meant to live into slowly, like how to love better, or how to forgive ourselves, or what the hell we’re even doing here in the first place. And when you strip it all down—no goals, no KPIs, no curated identity—what’s actually left of us? Are we just a sum of the roles we perform, or is there something quieter underneath that we've forgotten how to hear?
+
+And if there is something underneath all of it—something real, something worth listening to—then how do we begin to uncover it, gently, without rushing or reducing it to another task on our to-do list?
+    """
+
+    MAX_ATTEMPTS = 10
+    prev_length = None
+
+    for attempt in range(MAX_ATTEMPTS):
+        client.agents.messages.create(
+            agent_id=temp_agent_state.id,
+            messages=[MessageCreate(role="user", content=philosophical_question)],
+        )
+
+        temp_agent_state = client.agents.retrieve(agent_id=temp_agent_state.id)
+        message_ids = temp_agent_state.message_ids
+        current_length = len(message_ids)
+
+        print("LENGTH OF IN_CONTEXT_MESSAGES:", current_length)
+
+        if prev_length is not None and current_length <= prev_length:
+            # TODO: Add more stringent checks here
+            print(f"Summarization was triggered, detected current_length {current_length} is at least prev_length {prev_length}.")
+            break
+
+        prev_length = current_length
+    else:
+        raise AssertionError("Summarization was not triggered after 10 messages")
diff --git a/tests/integration_test_summarizer.py b/tests/integration_test_summarizer.py
index 6e0ebd73..7143f9ac 100644
--- a/tests/integration_test_summarizer.py
+++ b/tests/integration_test_summarizer.py
@@ -257,7 +257,7 @@ def test_auto_summarize(server, default_user):
     "config_filename",
     [
         "openai-gpt-4o.json",
-        "azure-gpt-4o-mini.json",
+        # "azure-gpt-4o-mini.json",
         "claude-3-5-haiku.json",
         # "groq.json",  # rate limits
         # "gemini-pro.json",  # broken