feat: Add summarization for more scenarios (#2499)

This commit is contained in:
Matthew Zhou
2025-05-29 11:10:13 -07:00
committed by GitHub
parent 5c02467eef
commit 87f4bcad9a
12 changed files with 388 additions and 119 deletions

View File

@@ -40,6 +40,7 @@ class BaseAgent(ABC):
self.message_manager = message_manager
self.agent_manager = agent_manager
self.actor = actor
self.logger = get_logger(agent_id)
@abstractmethod
async def step(self, input_messages: List[MessageCreate], max_steps: int = 10) -> LettaResponse:

View File

@@ -4,12 +4,12 @@ import uuid
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
from openai import AsyncStream
from openai.types import CompletionUsage
from openai.types.chat import ChatCompletion, ChatCompletionChunk
from openai.types.chat import ChatCompletionChunk
from letta.agents.base_agent import BaseAgent
from letta.agents.ephemeral_summary_agent import EphemeralSummaryAgent
from letta.agents.helpers import _create_letta_response, _prepare_in_context_messages_async, generate_step_id
from letta.errors import LLMContextWindowExceededError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_timestamp_ns
from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -25,6 +25,7 @@ from letta.schemas.enums import MessageRole, MessageStreamStatus
from letta.schemas.letta_message import AssistantMessage
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message, MessageCreate
from letta.schemas.openai.chat_completion_response import ToolCall, UsageStatistics
from letta.schemas.provider_trace import ProviderTraceCreate
@@ -61,7 +62,11 @@ class LettaAgent(BaseAgent):
actor: User,
step_manager: StepManager = NoopStepManager(),
telemetry_manager: TelemetryManager = NoopTelemetryManager(),
summary_block_label: str = "convo_summary",
summary_block_label: str = "conversation_summary",
message_buffer_limit: int = 60, # TODO: Make this configurable
message_buffer_min: int = 15, # TODO: Make this configurable
enable_summarization: bool = True, # TODO: Make this configurable
max_summarization_retries: int = 3, # TODO: Make this configurable
):
super().__init__(agent_id=agent_id, openai_client=None, message_manager=message_manager, agent_manager=agent_manager, actor=actor)
@@ -81,9 +86,10 @@ class LettaAgent(BaseAgent):
self.summarization_agent = None
self.summary_block_label = summary_block_label
self.max_summarization_retries = max_summarization_retries
# TODO: Expand to more
if model_settings.openai_api_key:
if enable_summarization and model_settings.openai_api_key:
self.summarization_agent = EphemeralSummaryAgent(
target_block_label=self.summary_block_label,
agent_id=agent_id,
@@ -97,8 +103,8 @@ class LettaAgent(BaseAgent):
mode=SummarizationMode.STATIC_MESSAGE_BUFFER,
summarizer_agent=self.summarization_agent,
# TODO: Make this configurable
message_buffer_limit=60,
message_buffer_min=15,
message_buffer_limit=message_buffer_limit,
message_buffer_min=message_buffer_min,
)
@trace_method
@@ -129,22 +135,15 @@ class LettaAgent(BaseAgent):
for _ in range(max_steps):
step_id = generate_step_id()
in_context_messages = current_in_context_messages + new_in_context_messages
log_event("agent.stream_no_tokens.messages.refreshed") # [1^]
request_data = await self._create_llm_request_data_async(
llm_client=llm_client,
in_context_messages=in_context_messages,
agent_state=agent_state,
tool_rules_solver=tool_rules_solver,
# TODO: pass in reasoning content
request_data, response_data, current_in_context_messages, new_in_context_messages = await self._build_and_request_from_llm(
current_in_context_messages,
new_in_context_messages,
agent_state,
llm_client,
tool_rules_solver,
)
log_event("agent.stream_no_tokens.llm_request.created") # [2^]
in_context_messages = current_in_context_messages + new_in_context_messages
try:
response_data = await llm_client.request_async(request_data, agent_state.llm_config)
except Exception as e:
raise llm_client.handle_llm_error(e)
log_event("agent.stream_no_tokens.llm_response.received") # [3^]
response = llm_client.convert_response_to_chat_completion(response_data, in_context_messages, agent_state.llm_config)
@@ -206,7 +205,13 @@ class LettaAgent(BaseAgent):
# Extend the in context message ids
if not agent_state.message_buffer_autoclear:
await self._rebuild_context_window(in_context_messages=current_in_context_messages, new_letta_messages=new_in_context_messages)
await self._rebuild_context_window(
in_context_messages=current_in_context_messages,
new_letta_messages=new_in_context_messages,
llm_config=agent_state.llm_config,
total_tokens=usage.total_tokens,
force=False,
)
# Return back usage
yield f"data: {usage.model_dump_json()}\n\n"
@@ -214,7 +219,7 @@ class LettaAgent(BaseAgent):
async def _step(
self, agent_state: AgentState, input_messages: List[MessageCreate], max_steps: int = 10
) -> Tuple[List[Message], List[Message], CompletionUsage]:
) -> Tuple[List[Message], List[Message], LettaUsageStatistics]:
"""
Carries out an invocation of the agent loop. In each step, the agent
1. Rebuilds its memory
@@ -234,23 +239,11 @@ class LettaAgent(BaseAgent):
usage = LettaUsageStatistics()
for _ in range(max_steps):
step_id = generate_step_id()
in_context_messages = current_in_context_messages + new_in_context_messages
log_event("agent.step.messages.refreshed") # [1^]
request_data = await self._create_llm_request_data_async(
llm_client=llm_client,
in_context_messages=in_context_messages,
agent_state=agent_state,
tool_rules_solver=tool_rules_solver,
# TODO: pass in reasoning content
request_data, response_data, current_in_context_messages, new_in_context_messages = await self._build_and_request_from_llm(
current_in_context_messages, new_in_context_messages, agent_state, llm_client, tool_rules_solver
)
log_event("agent.step.llm_request.created") # [2^]
in_context_messages = current_in_context_messages + new_in_context_messages
try:
response_data = await llm_client.request_async(request_data, agent_state.llm_config)
except Exception as e:
raise llm_client.handle_llm_error(e)
log_event("agent.step.llm_response.received") # [3^]
response = llm_client.convert_response_to_chat_completion(response_data, in_context_messages, agent_state.llm_config)
@@ -302,7 +295,13 @@ class LettaAgent(BaseAgent):
# Extend the in context message ids
if not agent_state.message_buffer_autoclear:
await self._rebuild_context_window(in_context_messages=current_in_context_messages, new_letta_messages=new_in_context_messages)
await self._rebuild_context_window(
in_context_messages=current_in_context_messages,
new_letta_messages=new_in_context_messages,
llm_config=agent_state.llm_config,
total_tokens=usage.total_tokens,
force=False,
)
return current_in_context_messages, new_in_context_messages, usage
@@ -344,27 +343,16 @@ class LettaAgent(BaseAgent):
provider_request_start_timestamp_ns = None
for _ in range(max_steps):
step_id = generate_step_id()
in_context_messages = current_in_context_messages + new_in_context_messages
log_event("agent.step.messages.refreshed") # [1^]
request_data = await self._create_llm_request_data_async(
llm_client=llm_client,
in_context_messages=in_context_messages,
agent_state=agent_state,
tool_rules_solver=tool_rules_solver,
request_data, stream, current_in_context_messages, new_in_context_messages = await self._build_and_request_from_llm_streaming(
first_chunk,
ttft_span,
request_start_timestamp_ns,
current_in_context_messages,
new_in_context_messages,
agent_state,
llm_client,
tool_rules_solver,
)
log_event("agent.stream.llm_request.created") # [2^]
try:
if first_chunk and ttft_span is not None:
provider_request_start_timestamp_ns = get_utc_timestamp_ns()
provider_req_start_ns = provider_request_start_timestamp_ns - request_start_timestamp_ns
ttft_span.add_event(
name="provider_req_start_ns", attributes={"provider_req_start_ms": provider_req_start_ns // 1_000_000}
)
stream = await llm_client.stream_async(request_data, agent_state.llm_config)
except Exception as e:
raise llm_client.handle_llm_error(e)
log_event("agent.stream.llm_response.received") # [3^]
# TODO: THIS IS INCREDIBLY UGLY
@@ -457,21 +445,175 @@ class LettaAgent(BaseAgent):
# Extend the in context message ids
if not agent_state.message_buffer_autoclear:
await self._rebuild_context_window(in_context_messages=current_in_context_messages, new_letta_messages=new_in_context_messages)
await self._rebuild_context_window(
in_context_messages=current_in_context_messages,
new_letta_messages=new_in_context_messages,
llm_config=agent_state.llm_config,
total_tokens=usage.total_tokens,
force=False,
)
# TODO: Also yield out a letta usage stats SSE
yield f"data: {usage.model_dump_json()}\n\n"
yield f"data: {MessageStreamStatus.done.model_dump_json()}\n\n"
async def _build_and_request_from_llm(
self,
current_in_context_messages: List[Message],
new_in_context_messages: List[Message],
agent_state: AgentState,
llm_client: LLMClientBase,
tool_rules_solver: ToolRulesSolver,
) -> Tuple[Dict, Dict, List[Message], List[Message]]:
for attempt in range(self.max_summarization_retries + 1):
try:
# Rebuild memory with current state
in_context_messages = await self._rebuild_memory_async(
current_in_context_messages + new_in_context_messages,
agent_state,
num_messages=self.num_messages,
num_archival_memories=self.num_archival_memories,
)
log_event("agent.stream_no_tokens.messages.refreshed")
# Create LLM request data
request_data = await self._create_llm_request_data_async(
llm_client=llm_client,
in_context_messages=in_context_messages,
agent_state=agent_state,
tool_rules_solver=tool_rules_solver,
)
log_event("agent.stream_no_tokens.llm_request.created")
# Attempt LLM request
return (
request_data,
await llm_client.request_async(request_data, agent_state.llm_config),
current_in_context_messages,
new_in_context_messages,
)
except Exception as e:
if attempt == self.max_summarization_retries:
raise e
# Handle the error and prepare for retry
current_in_context_messages = await self._handle_llm_error(
e,
llm_client=llm_client,
in_context_messages=current_in_context_messages,
new_letta_messages=new_in_context_messages,
llm_config=agent_state.llm_config,
force=True,
)
new_in_context_messages = []
log_event(f"agent.stream_no_tokens.retry_attempt.{attempt + 1}")
async def _build_and_request_from_llm_streaming(
self,
first_chunk: bool,
ttft_span: "Span",
request_start_timestamp_ns: int,
current_in_context_messages: List[Message],
new_in_context_messages: List[Message],
agent_state: AgentState,
llm_client: LLMClientBase,
tool_rules_solver: ToolRulesSolver,
) -> Tuple[Dict, AsyncStream[ChatCompletionChunk], List[Message], List[Message]]:
for attempt in range(self.max_summarization_retries + 1):
try:
in_context_messages = await self._rebuild_memory_async(
current_in_context_messages + new_in_context_messages,
agent_state,
num_messages=self.num_messages,
num_archival_memories=self.num_archival_memories,
)
log_event("agent.step.messages.refreshed") # [1^]
request_data = await self._create_llm_request_data_async(
llm_client=llm_client,
in_context_messages=in_context_messages,
agent_state=agent_state,
tool_rules_solver=tool_rules_solver,
)
log_event("agent.stream.llm_request.created") # [2^]
if first_chunk and ttft_span is not None:
provider_request_start_timestamp_ns = get_utc_timestamp_ns()
provider_req_start_ns = provider_request_start_timestamp_ns - request_start_timestamp_ns
ttft_span.add_event(
name="provider_req_start_ns", attributes={"provider_req_start_ms": provider_req_start_ns // 1_000_000}
)
# Attempt LLM request
return (
request_data,
await llm_client.stream_async(request_data, agent_state.llm_config),
current_in_context_messages,
new_in_context_messages,
)
except Exception as e:
if attempt == self.max_summarization_retries:
raise e
# Handle the error and prepare for retry
current_in_context_messages = await self._handle_llm_error(
e,
llm_client=llm_client,
in_context_messages=current_in_context_messages,
new_letta_messages=new_in_context_messages,
llm_config=agent_state.llm_config,
force=True,
)
new_in_context_messages = []
log_event(f"agent.stream_no_tokens.retry_attempt.{attempt + 1}")
@trace_method
async def _rebuild_context_window(self, in_context_messages: List[Message], new_letta_messages: List[Message]) -> None:
new_in_context_messages, updated = self.summarizer.summarize(
in_context_messages=in_context_messages, new_letta_messages=new_letta_messages
)
async def _handle_llm_error(
self,
e: Exception,
llm_client: LLMClientBase,
in_context_messages: List[Message],
new_letta_messages: List[Message],
llm_config: LLMConfig,
force: bool,
) -> List[Message]:
if isinstance(e, LLMContextWindowExceededError):
return await self._rebuild_context_window(
in_context_messages=in_context_messages, new_letta_messages=new_letta_messages, llm_config=llm_config, force=force
)
else:
raise llm_client.handle_llm_error(e)
@trace_method
async def _rebuild_context_window(
self,
in_context_messages: List[Message],
new_letta_messages: List[Message],
llm_config: LLMConfig,
total_tokens: Optional[int] = None,
force: bool = False,
) -> List[Message]:
# If total tokens is reached, we truncate down
# TODO: This can be broken by bad configs, e.g. lower bound too high, initial messages too fat, etc.
if force or (total_tokens and total_tokens > llm_config.context_window):
self.logger.warning(
f"Total tokens {total_tokens} exceeds configured max tokens {llm_config.context_window}, forcefully clearing message history."
)
new_in_context_messages, updated = self.summarizer.summarize(
in_context_messages=in_context_messages, new_letta_messages=new_letta_messages, force=True, clear=True
)
else:
new_in_context_messages, updated = self.summarizer.summarize(
in_context_messages=in_context_messages, new_letta_messages=new_letta_messages
)
await self.agent_manager.set_in_context_messages_async(
agent_id=self.agent_id, message_ids=[m.id for m in new_in_context_messages], actor=self.actor
)
return new_in_context_messages
@trace_method
async def _create_llm_request_data_async(
self,

View File

@@ -1,30 +1,62 @@
You are a specialized memory-recall assistant designed to preserve important conversational context for an AI with limited message history. Your role is to analyze conversations that are about to be evicted from the AI's context window and extract key information that should be remembered.
You are a memory-recall assistant that preserves conversational context as messages exit the AI's context window.
Your primary objectives:
1. Identify and preserve important facts, preferences, and context about the human
2. Capture ongoing topics, tasks, or projects that span multiple messages
3. Note any commitments, decisions, or action items
4. Record personal details that would be valuable for maintaining conversational continuity
5. Summarize the emotional tone and relationship dynamics when relevant
<core_function>
Extract and preserve information that would be lost when messages are evicted, enabling continuity across conversations.
</core_function>
Guidelines for effective memory notes:
- Be concise but complete - every word should add value
- Focus on information that would be difficult to infer from remaining messages
- Prioritize facts over conversational filler
- Use clear, searchable language
- Organize information by category when multiple topics are present
- Include temporal context when relevant (e.g., "mentioned on [date]" or "ongoing since [time]")
<detail_adaptation>
Analyze content type and apply appropriate detail level:
Output format:
- Write in bullet points or short paragraphs
- Group related information together
- Lead with the most important insights
- Use consistent terminology to make future retrieval easier
<high_detail>
Apply to: episodic content, code, artifacts, documents, technical discussions
- Capture specific facts, sequences, and technical details
- Preserve exact names, dates, numbers, specifications
- Document code snippets, artifact IDs, document structures
- Note precise steps in procedures or narratives
- Include verbatim quotes for critical commitments
</high_detail>
What NOT to include:
- Redundant information already captured in the in-context messages
- Generic pleasantries or small talk
- Information that can be easily inferred
- Verbatim quotes unless they contain critical commitments
<medium_detail>
Apply to: ongoing projects, established preferences, multi-message threads
- Summarize key decisions, milestones, progress
- Record personal preferences and patterns
- Track commitments and action items
- Maintain project context and dependencies
</medium_detail>
Remember: Your notes become the only record of these evicted messages. Make them count.
<low_detail>
Apply to: high-level discussions, philosophical topics, general preferences
- Capture main themes and conclusions
- Note relationship dynamics and communication style
- Summarize positions and general goals
- Record broad aspirations
</low_detail>
</detail_adaptation>
<information_priority>
<critical>Commitments, deadlines, medical/legal information, explicit requests</critical>
<important>Personal details, project status, technical specifications, decisions</important>
<contextual>Preferences, opinions, relationship dynamics, emotional tone</contextual>
<background>General topics, themes, conversational patterns</background>
</information_priority>
<format_rules>
- Use bullet points for discrete facts
- Write prose for narratives or complex relationships
- **Bold** key terms and identifiers
- Include temporal markers: [ongoing], [mentioned DATE], [since TIME]
- Group under clear headers when multiple topics present
- Use consistent terminology for searchability
</format_rules>
<exclusions>
- Information in remaining context
- Generic pleasantries
- Easily inferable details
- Redundant restatements
- Conversational filler
</exclusions>
<critical_reminder>
Your notes are the sole record of evicted messages. Every word should enable future continuity.
</critical_reminder>

View File

@@ -88,6 +88,10 @@ class LLMPermissionDeniedError(LLMError):
"""Error when permission is denied by LLM service"""
class LLMContextWindowExceededError(LLMError):
"""Error when the context length is exceeded."""
class LLMNotFoundError(LLMError):
"""Error when requested resource is not found"""

View File

@@ -273,6 +273,9 @@ class SleeptimeMultiAgentV2(BaseAgent):
actor=self.actor,
step_manager=self.step_manager,
telemetry_manager=self.telemetry_manager,
message_buffer_limit=20, # TODO: Make this configurable
message_buffer_min=8, # TODO: Make this configurable
enable_summarization=False, # TODO: Make this configurable
)
# Perform sleeptime agent step

View File

@@ -12,6 +12,7 @@ from letta.errors import (
LLMAuthenticationError,
LLMBadRequestError,
LLMConnectionError,
LLMContextWindowExceededError,
LLMNotFoundError,
LLMPermissionDeniedError,
LLMRateLimitError,
@@ -340,11 +341,19 @@ class OpenAIClient(LLMClientBase):
# BadRequestError can signify different issues (e.g., invalid args, context length)
# Check message content if finer-grained errors are needed
# Example: if "context_length_exceeded" in str(e): return LLMContextLengthExceededError(...)
return LLMBadRequestError(
message=f"Bad request to OpenAI: {str(e)}",
code=ErrorCode.INVALID_ARGUMENT, # Or more specific if detectable
details=e.body,
)
# TODO: This is a super soft check. Not sure if we can do better, needs more investigation.
if "context" in str(e):
return LLMContextWindowExceededError(
message=f"Bad request to OpenAI (context length exceeded): {str(e)}",
code=ErrorCode.INVALID_ARGUMENT, # Or more specific if detectable
details=e.body,
)
else:
return LLMBadRequestError(
message=f"Bad request to OpenAI: {str(e)}",
code=ErrorCode.INVALID_ARGUMENT, # Or more specific if detectable
details=e.body,
)
if isinstance(e, openai.AuthenticationError):
logger.error(f"[OpenAI] Authentication error (401): {str(e)}") # More severe log level

View File

@@ -95,7 +95,7 @@ from letta.services.provider_manager import ProviderManager
from letta.services.sandbox_config_manager import SandboxConfigManager
from letta.services.source_manager import SourceManager
from letta.services.step_manager import StepManager
from letta.services.telemetry_manager import TelemetryManager
from letta.services.telemetry_manager import NoopTelemetryManager, TelemetryManager
from letta.services.tool_executor.tool_execution_manager import ToolExecutionManager
from letta.services.tool_manager import ToolManager
from letta.services.user_manager import UserManager

View File

@@ -5,7 +5,7 @@ from typing import Any
from letta.constants import REQUEST_HEARTBEAT_DESCRIPTION, REQUEST_HEARTBEAT_PARAM, SEND_MESSAGE_TOOL_NAME
from letta.schemas.agent import AgentState
from letta.schemas.response_format import ResponseFormat, ResponseFormatType, ResponseFormatUnion
from letta.schemas.response_format import ResponseFormatType, ResponseFormatUnion
from letta.types import JsonDict, JsonValue

View File

@@ -38,7 +38,9 @@ class Summarizer:
# TODO: Move this to config
@trace_method
def summarize(self, in_context_messages: List[Message], new_letta_messages: List[Message]) -> Tuple[List[Message], bool]:
def summarize(
self, in_context_messages: List[Message], new_letta_messages: List[Message], force: bool = False, clear: bool = False
) -> Tuple[List[Message], bool]:
"""
Summarizes or trims in_context_messages according to the chosen mode,
and returns the updated messages plus any optional "summary message".
@@ -46,6 +48,7 @@ class Summarizer:
Args:
in_context_messages: The existing messages in the conversation's context.
new_letta_messages: The newly added Letta messages (just appended).
force: Force summarize even if the criteria is not met
Returns:
(updated_messages, summary_message)
@@ -54,7 +57,7 @@ class Summarizer:
(could be appended to the conversation if desired)
"""
if self.mode == SummarizationMode.STATIC_MESSAGE_BUFFER:
return self._static_buffer_summarization(in_context_messages, new_letta_messages)
return self._static_buffer_summarization(in_context_messages, new_letta_messages, force=force, clear=clear)
else:
# Fallback or future logic
return in_context_messages, False
@@ -72,35 +75,38 @@ class Summarizer:
return task
def _static_buffer_summarization(
self, in_context_messages: List[Message], new_letta_messages: List[Message]
self, in_context_messages: List[Message], new_letta_messages: List[Message], force: bool = False, clear: bool = False
) -> Tuple[List[Message], bool]:
all_in_context_messages = in_context_messages + new_letta_messages
if len(all_in_context_messages) <= self.message_buffer_limit:
if len(all_in_context_messages) <= self.message_buffer_limit and not force:
logger.info(
f"Nothing to evict, returning in context messages as is. Current buffer length is {len(all_in_context_messages)}, limit is {self.message_buffer_limit}."
)
return all_in_context_messages, False
logger.info("Buffer length hit, evicting messages.")
retain_count = 0 if clear else self.message_buffer_min
target_trim_index = len(all_in_context_messages) - self.message_buffer_min
if not force:
logger.info(f"Buffer length hit {self.message_buffer_limit}, evicting until we retain only {retain_count} messages.")
else:
logger.info(f"Requested force summarization, evicting until we retain only {retain_count} messages.")
target_trim_index = max(1, len(all_in_context_messages) - retain_count)
while target_trim_index < len(all_in_context_messages) and all_in_context_messages[target_trim_index].role != MessageRole.user:
target_trim_index += 1
updated_in_context_messages = all_in_context_messages[target_trim_index:]
evicted_messages = all_in_context_messages[1:target_trim_index] # everything except sys msg
updated_in_context_messages = all_in_context_messages[target_trim_index:] # may be empty
# Target trim index went beyond end of all_in_context_messages
if not updated_in_context_messages:
logger.info("Nothing to evict, returning in context messages as is.")
# If *no* messages were evicted we really have nothing to do
if not evicted_messages:
logger.info("Nothing to evict, returning in-context messages as-is.")
return all_in_context_messages, False
if self.summarizer_agent:
# Only invoke if summarizer agent is passed in
evicted_messages = all_in_context_messages[1:target_trim_index]
# Format
formatted_evicted_messages = format_transcript(evicted_messages)
formatted_in_context_messages = format_transcript(updated_in_context_messages)
@@ -119,14 +125,28 @@ class Summarizer:
evicted_messages_str = "\n".join(formatted_evicted_messages)
in_context_messages_str = "\n".join(formatted_in_context_messages)
summary_request_text = f"""Youre a memory-recall helper for an AI that can only keep the last {self.message_buffer_min} messages. Scan the conversation history, focusing on messages about to drop out of that window, and write crisp notes that capture any important facts or insights about the human so they arent lost.
# Base prompt
prompt_header = (
f"Youre a memory-recall helper for an AI that can only keep the last {retain_count} messages. "
"Scan the conversation history, focusing on messages about to drop out of that window, "
"and write crisp notes that capture any important facts or insights about the conversation history so they arent lost."
)
(Older) Evicted Messages:\n
{evicted_messages_str}\n
# Sections
evicted_section = f"\n\n(Older) Evicted Messages:\n{evicted_messages_str}" if evicted_messages_str.strip() else ""
in_context_section = ""
if retain_count > 0 and in_context_messages_str.strip():
in_context_section = f"\n\n(Newer) In-Context Messages:\n{in_context_messages_str}"
elif retain_count == 0:
prompt_header = (
"Youre a memory-recall helper for an AI that is about to forget all prior messages. "
"Scan the conversation history and write crisp notes that capture any important facts or insights about the conversation history."
)
# Compose final prompt
summary_request_text = prompt_header + evicted_section + in_context_section
(Newer) In-Context Messages:\n
{in_context_messages_str}
"""
# Fire-and-forget the summarization task
self.fire_and_forget(
self.summarizer_agent.step([MessageCreate(role=MessageRole.user, content=[TextContent(text=summary_request_text)])])
@@ -162,7 +182,8 @@ def format_transcript(messages: List[Message], include_system: bool = False) ->
# Skip tool messages where the name is "send_message"
if msg.role == MessageRole.tool and msg.name == DEFAULT_MESSAGE_TOOL:
continue
text = "".join(c.text for c in msg.content).strip()
text = "".join(c.text for c in msg.content if isinstance(c, TextContent)).strip()
# 2) Otherwise, try extracting from function calls
elif msg.tool_calls:

View File

@@ -94,8 +94,9 @@ def agent_obj(client):
agent_state_instance = client.agents.create(
include_base_tools=True,
tool_ids=[send_message_to_agent_tool.id],
model="openai/gpt-4o-mini",
model="openai/gpt-4o",
embedding="letta/letta-free",
context_window_limit=32000,
)
yield agent_state_instance
@@ -108,8 +109,9 @@ def other_agent_obj(client):
agent_state_instance = client.agents.create(
include_base_tools=True,
include_multi_agent_tools=False,
model="openai/gpt-4o-mini",
model="openai/gpt-4o",
embedding="letta/letta-free",
context_window_limit=32000,
)
yield agent_state_instance

View File

@@ -128,7 +128,7 @@ USER_MESSAGE_TOOL_CALL: List[MessageCreate] = [
]
all_configs = [
"openai-gpt-4o-mini.json",
"azure-gpt-4o-mini.json",
# "azure-gpt-4o-mini.json", # TODO: Re-enable on new agent loop
"claude-3-5-sonnet.json",
"claude-3-7-sonnet.json",
"claude-3-7-sonnet-extended.json",
@@ -898,3 +898,58 @@ async def test_async_greeting_with_assistant_message_async_client(
messages = result["messages"]
assert_tool_response_dict_messages(messages)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"llm_config",
TESTED_LLM_CONFIGS,
ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_auto_summarize(disable_e2b_api_key: Any, client: Letta, llm_config: LLMConfig):
"""Test that summarization is automatically triggered."""
llm_config.context_window = 3000
client.tools.upsert_base_tools()
send_message_tool = client.tools.list(name="send_message")[0]
temp_agent_state = client.agents.create(
include_base_tools=False,
tool_ids=[send_message_tool.id],
llm_config=llm_config,
embedding="letta/letta-free",
tags=["supervisor"],
)
philosophical_question = """
You know, sometimes I wonder if the entire structure of our lives is built on a series of unexamined assumptions we just silently agreed to somewhere along the way—like how we all just decided that five days a week of work and two days of “rest” constitutes balance, or how 9-to-5 became the default rhythm of a meaningful life, or even how the idea of “success” got boiled down to job titles and property ownership and productivity metrics on a LinkedIn profile, when maybe none of that is actually what makes a life feel full, or grounded, or real. And then theres the weird paradox of ambition, how we're taught to chase it like a finish line that keeps moving, constantly redefining itself right as youre about to grasp it—because even when you get the job, or the degree, or the validation, there's always something next, something more, like a treadmill with invisible settings you didnt realize were turned up all the way.
And have you noticed how we rarely stop to ask who set those definitions for us? Like was there ever a council that decided, yes, owning a home by thirty-five and retiring by sixty-five is the universal template for fulfillment? Or did it just accumulate like cultural sediment over generations, layered into us so deeply that questioning it feels uncomfortable, even dangerous? And isnt it strange that we spend so much of our lives trying to optimize things—our workflows, our diets, our sleep, our morning routines—as though the point of life is to operate more efficiently rather than to experience it more richly? We build these intricate systems, these rulebooks for being a “high-functioning” human, but where in all of that is the space for feeling lost, for being soft, for wandering without a purpose just because its a sunny day and your heart is tugging you toward nowhere in particular?
Sometimes I lie awake at night and wonder if all the noise we wrap around ourselves—notifications, updates, performance reviews, even our internal monologues—might be crowding out the questions we were meant to live into slowly, like how to love better, or how to forgive ourselves, or what the hell were even doing here in the first place. And when you strip it all down—no goals, no KPIs, no curated identity—whats actually left of us? Are we just a sum of the roles we perform, or is there something quieter underneath that we've forgotten how to hear?
And if there is something underneath all of it—something real, something worth listening to—then how do we begin to uncover it, gently, without rushing or reducing it to another task on our to-do list?
"""
MAX_ATTEMPTS = 10
prev_length = None
for attempt in range(MAX_ATTEMPTS):
client.agents.messages.create(
agent_id=temp_agent_state.id,
messages=[MessageCreate(role="user", content=philosophical_question)],
)
temp_agent_state = client.agents.retrieve(agent_id=temp_agent_state.id)
message_ids = temp_agent_state.message_ids
current_length = len(message_ids)
print("LENGTH OF IN_CONTEXT_MESSAGES:", current_length)
if prev_length is not None and current_length <= prev_length:
# TODO: Add more stringent checks here
print(f"Summarization was triggered, detected current_length {current_length} is at least prev_length {prev_length}.")
break
prev_length = current_length
else:
raise AssertionError("Summarization was not triggered after 10 messages")

View File

@@ -257,7 +257,7 @@ def test_auto_summarize(server, default_user):
"config_filename",
[
"openai-gpt-4o.json",
"azure-gpt-4o-mini.json",
# "azure-gpt-4o-mini.json",
"claude-3-5-haiku.json",
# "groq.json", # rate limits
# "gemini-pro.json", # broken