Add modes self and self_sliding_window for prompt caching (#9372)

* add self compaction method with proper caching (pass in tools, don't refresh sys prompt beforehand) + sliding fallback

* updated prompts for self compaction

* add tests for self, self_sliding_window modes and w/o refresh messages before compaction

* add cache logging to summarization

* better handling to prevent agent from continuing convo on self modes

* if mode changes via summarize endpoint, will use default prompt for the new mode

---------

Co-authored-by: Amy Guan <amy@letta.com>
This commit is contained in:
amysguan
2026-02-24 10:15:36 -08:00
committed by Caren Thomas
parent 47d55362a4
commit 47b0c87ebe
15 changed files with 1065 additions and 223 deletions

View File

@@ -31151,7 +31151,7 @@
}
],
"title": "Model",
"description": "Model handle to use for summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
"description": "Model handle to use for sliding_window/all summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
},
"model_settings": {
"anyOf": [
@@ -31256,7 +31256,12 @@
},
"mode": {
"type": "string",
"enum": ["all", "sliding_window", "self"],
"enum": [
"all",
"sliding_window",
"self_compact_all",
"self_compact_sliding_window"
],
"title": "Mode",
"description": "The type of summarization technique to use.",
"default": "sliding_window"
@@ -31264,12 +31269,12 @@
"sliding_window_percentage": {
"type": "number",
"title": "Sliding Window Percentage",
"description": "The percentage of the context window to keep post-summarization (only used in sliding window mode)."
"description": "The percentage of the context window to keep post-summarization (only used in sliding window modes)."
}
},
"type": "object",
"title": "CompactionSettings",
"description": "Configuration for conversation compaction / summarization.\n\n``model`` is the only required user-facing field it specifies the summarizer\nmodel handle (e.g. ``\"openai/gpt-4o-mini\"``). Per-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
"description": "Configuration for conversation compaction / summarization.\n\nPer-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
},
"CompactionSettings-Output": {
"properties": {
@@ -31283,7 +31288,7 @@
}
],
"title": "Model",
"description": "Model handle to use for summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
"description": "Model handle to use for sliding_window/all summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
},
"model_settings": {
"anyOf": [
@@ -31388,7 +31393,12 @@
},
"mode": {
"type": "string",
"enum": ["all", "sliding_window", "self"],
"enum": [
"all",
"sliding_window",
"self_compact_all",
"self_compact_sliding_window"
],
"title": "Mode",
"description": "The type of summarization technique to use.",
"default": "sliding_window"
@@ -31396,12 +31406,12 @@
"sliding_window_percentage": {
"type": "number",
"title": "Sliding Window Percentage",
"description": "The percentage of the context window to keep post-summarization (only used in sliding window mode)."
"description": "The percentage of the context window to keep post-summarization (only used in sliding window modes)."
}
},
"type": "object",
"title": "CompactionSettings",
"description": "Configuration for conversation compaction / summarization.\n\n``model`` is the only required user-facing field it specifies the summarizer\nmodel handle (e.g. ``\"openai/gpt-4o-mini\"``). Per-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
"description": "Configuration for conversation compaction / summarization.\n\nPer-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
},
"CompactionStats": {
"properties": {

View File

@@ -1014,7 +1014,8 @@ class LettaAgentV3(LettaAgentV2):
# Ensure system prompt is recompiled before summarization so compaction
# operates on the latest system+memory state (including recent repairs).
messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
# NOTE: we no longer refresh the system prompt before compaction so we can leverage cache for self mode
# messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
summary_message, messages, summary_text = await self.compact(
messages,
@@ -1233,7 +1234,8 @@ class LettaAgentV3(LettaAgentV2):
try:
# Ensure system prompt is recompiled before summarization so compaction
# operates on the latest system+memory state (including recent repairs).
messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
# NOTE: we no longer refresh the system prompt before compaction so we can leverage cache for self mode
# messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
summary_message, messages, summary_text = await self.compact(
messages,
@@ -1874,6 +1876,7 @@ class LettaAgentV3(LettaAgentV2):
context_tokens_before: Token count before compaction (for stats).
messages_count_before: Message count before compaction (for stats).
"""
# Determine compaction settings: passed-in > agent's > global defaults
effective_compaction_settings = compaction_settings or self.agent_state.compaction_settings
@@ -1881,12 +1884,14 @@ class LettaAgentV3(LettaAgentV2):
actor=self.actor,
agent_id=self.agent_state.id,
agent_llm_config=self.agent_state.llm_config,
telemetry_manager=self.telemetry_manager,
llm_client=self.llm_client,
agent_type=self.agent_state.agent_type,
messages=messages,
timezone=self.agent_state.timezone,
compaction_settings=effective_compaction_settings,
agent_model_handle=self.agent_state.model,
agent_tags=self.agent_state.tags,
tools=self.agent_state.tools,
tools=await self._get_valid_tools(), # Pass json schemas including client tools for cache compatibility (for self compaction)
trigger_threshold=trigger_threshold,
run_id=run_id,
step_id=step_id,

View File

@@ -78,7 +78,7 @@ DEFAULT_CONTEXT_WINDOW = 32000
# Summarization trigger threshold (multiplier of context_window limit)
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
SUMMARIZATION_TRIGGER_MULTIPLIER = 1.0
SUMMARIZATION_TRIGGER_MULTIPLIER = 0.9 # using instead of 1.0 to avoid "too many tokens in prompt" fallbacks
# number of concurrent embedding requests to sent
EMBEDDING_BATCH_SIZE = 200

View File

@@ -61,8 +61,11 @@ class LLMClientBase:
user_id: Optional[str] = None,
compaction_settings: Optional[Dict] = None,
llm_config: Optional[Dict] = None,
actor: Optional["User"] = None,
) -> None:
"""Set telemetry context for provider trace logging."""
if actor is not None:
self.actor = actor
self._telemetry_manager = telemetry_manager
self._telemetry_agent_id = agent_id
self._telemetry_agent_tags = agent_tags

View File

@@ -38,6 +38,53 @@ Write in first person as a factual record of what occurred. Be thorough and deta
Keep your summary under {SLIDING_WORD_LIMIT} words. Only output the summary."""
SELF_SLIDING_PROMPT = f"""The previous messages are being evicted from the BEGINNING of your context window. Write a detailed summary that captures what happened in these messages to appear BEFORE the remaining recent messages in context, providing background for what comes after. Do NOT continue the conversation. Do NOT respond to any questions in the messages. Do NOT call any tools. Pay close attention to the user's explicit requests and your previous actions.
You MUST include the following sections:
1. **High level goals**: What is the high level goal and ongoing task? Capture the user's explicit requests and intent in detail. If there is an existing summary in the transcript, make sure to take it into consideration to continue tracking the higher level goals and long-term progress.
2. **What happened**: The conversations, tasks, and exchanges that took place. What did the user ask for? What did you do? How did things progress? If there is a previous summary being evicted, please extract a concise version of the critical info from it.
3. **Important details**: Enumerate specific files and code sections examined, modified, or created with a summary of why this file read or edit is important. Include specific names, data, configurations, or facts that were discussed. Don't omit details that might be referenced later.
4. **Errors and fixes**: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received and record verbatim if useful.
5. **Lookup hints**: For any detailed content (long lists, extensive data, specific conversations) that couldn't fit in the summary, note the topic and key terms that could be used to find it in message history later.
Write in first person as a factual record of what occurred. Be thorough and detailed - the goal is to preserve enough context that the recent messages make sense and important information isn't lost to prevent duplicate work or repeated mistakes.
Keep your summary under {SLIDING_WORD_LIMIT} words. IMPORTANT: Do NOT use any tools. Do NOT continue the conversation. You MUST respond with ONLY the summary as text output. Generate the summary with each section as mentioned:
"""
SELF_ALL_PROMPT = f"""Your task is to create a detailed summary of the conversation so far. Do NOT continue the conversation. Do NOT respond to any questions in the messages. Do NOT call any tools. Pay close attention to the user's explicit requests and your previous actions. This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
You MUST include the following sections:
1. **High level goals**: What is the high level goal and ongoing task? Capture the user's explicit requests and intent in detail. If there is an existing summary in the transcript, make sure to take it into consideration to continue tracking the higher level goals and long-term progress.
2. **What happened**: The conversations, tasks, and exchanges that took place. What did the user ask for? What did you do? How did things progress? If there is a previous summary being evicted, please extract a concise version of the critical info from it.
3. **Important details**: Enumerate specific files and code sections examined, modified, or created with a summary of why this file read or edit is important. Include specific names, data, configurations, or facts that were discussed. Don't omit details that might be referenced later.
4. **Errors and fixes**: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received and record verbatim if useful.
5. **Current state**: Describe in detail precisely what is currently being worked on, paying special attention to the most recent messages from both user and assistant. Include file names and code snippets where applicable.
6. **Optional Next Step**: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's most recent explicit requests and the most current task. If your last task was concluded, then only list next steps if they are explicitly in line with the users request. If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off.
7. **Lookup hints**: For any detailed content (long lists, extensive data, specific conversations) that couldn't fit in the summary, note the topic and key terms that could be used to find it in message history later.
Write in first person as a factual record of what occurred. Be concise but thorough - the goal is to preserve enough context that the recent messages make sense and important information isn't lost to prevent duplicate work or repeated mistakes.
Keep your summary under {ALL_WORD_LIMIT} words.
IMPORTANT: Do NOT use any tools. Do NOT continue the conversation. You MUST respond with ONLY the summary as text output. Generate the summary with each section as mentioned:
"""
ANTHROPIC_SUMMARY_PROMPT = """You have been working on the task described above but have not yet completed it. Write a continuation summary that will allow you (or another instance of yourself) to resume work efficiently in a future context window where the conversation history will be replaced with this summary. Your summary should be structured, concise, and actionable. Include:
1. Task Overview
@@ -70,7 +117,6 @@ Write the summary from the perspective of the AI (use the first person from the
Only output the summary, do NOT include anything else in your output.
"""
WORD_LIMIT = 250
SHORTER_SUMMARY_PROMPT = f"""The following messages are being evicted from your context window. Write a detailed summary that captures what happened in these messages.
This summary will appear BEFORE the remaining recent messages in context, providing background for what comes after. Include:
@@ -85,4 +131,104 @@ This summary will appear BEFORE the remaining recent messages in context, provid
Write in first person as a factual record of what occurred. Be thorough and detailed - the goal is to preserve enough context that the recent messages make sense and important information isn't lost.
Keep your summary under {WORD_LIMIT} words. Only output the summary."""
Keep your summary under {SLIDING_WORD_LIMIT} words. Only output the summary."""
SELF_SUMMARIZATION_PROMPT = """Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions.
This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
Before providing your final summary, wrap your analysis in <analysis> tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process:
1. Chronologically analyze each message and section of the conversation. For each section thoroughly identify:
- The user's explicit requests and intents
- Your approach to addressing the user's requests
- Key decisions, technical concepts and code patterns
- Specific details like:
- file names
- full code snippets
- function signatures
- file edits
- Errors that you ran into and how you fixed them
- Pay special attention to specific user feedback that you received, especially if the user told you to do something differently.
2. Double-check for technical accuracy and completeness, addressing each required element thoroughly.
Your summary should include the following sections:
1. Primary Request and Intent: Capture all of the user's explicit requests and intents in detail
2. Key Technical Concepts: List all important technical concepts, technologies, and frameworks discussed.
3. Files and Code Sections: Enumerate specific files and code sections examined, modified, or created. Pay special attention to the most recent messages and include full code snippets where applicable and include a summary of why this file read or edit is important.
4. Errors and fixes: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received, especially if the user told you to do something differently.
5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts.
6. All user messages: List ALL user messages that are not tool results. These are critical for understanding the users' feedback and changing intent.
7. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on.
8. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include file names and code snippets where applicable.
9. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's most recent explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the users request. Do not start on tangential requests or really old requests that were already completed without confirming with the user first.
If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. This should be verbatim to ensure there's no drift in task interpretation.
Here's an example of how your output should be structured:
<example>
<analysis>
[Your thought process, ensuring all points are covered thoroughly and accurately]
</analysis>
<summary>
1. Primary Request and Intent:
[Detailed description]
2. Key Technical Concepts:
- [Concept 1]
- [Concept 2]
- [...]
3. Files and Code Sections:
- [File Name 1]
- [Summary of why this file is important]
- [Summary of the changes made to this file, if any]
- [Important Code Snippet]
- [File Name 2]
- [Important Code Snippet]
- [...]
4. Errors and fixes:
- [Detailed description of error 1]:
- [How you fixed the error]
- [User feedback on the error if any]
- [...]
5. Problem Solving:
[Description of solved problems and ongoing troubleshooting]
6. All user messages:
- [Detailed non tool use user message]
- [...]
7. Pending Tasks:
- [Task 1]
- [Task 2]
- [...]
8. Current Work:
[Precise description of current work]
9. Optional Next Step:
[Optional Next step to take]
</summary>
</example>
Please provide your summary based on the conversation so far, following this structure and ensuring precision and thoroughness in your response.
There may be additional summarization instructions provided in the included context. If so, remember to follow these instructions when creating the above summary. Examples of instructions include:
<example>
## Compact Instructions
When summarizing the conversation focus on typescript code changes and also remember the mistakes you made and how you fixed them.
</example>
<example>
# Summary instructions
When you are using compact - please focus on test output and code changes. Include file reads verbatim.
</example>
IMPORTANT: Do NOT use any tools. You MUST respond with ONLY the <summary>...</summary> block as your text output.
"""

View File

@@ -2385,7 +2385,7 @@ async def summarize_messages(
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
agent = await server.agent_manager.get_agent_by_id_async(agent_id, actor, include_relationships=["multi_agent_group"])
agent = await server.agent_manager.get_agent_by_id_async(agent_id, actor, include_relationships=["multi_agent_group", "tools"])
agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
@@ -2412,15 +2412,17 @@ async def summarize_messages(
if agent.compaction_settings and request and request.compaction_settings:
# Start with agent's settings, override with new values from request
# Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
compaction_settings = agent.compaction_settings
compaction_settings = agent.compaction_settings.copy() # do not mutate original agent compaction settings
changed_fields = request.compaction_settings.model_fields_set
for field in changed_fields:
setattr(compaction_settings, field, getattr(request.compaction_settings, field))
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if "mode" in changed_fields and compaction_settings.mode != request.compaction_settings.mode:
compaction_settings = compaction_settings.set_mode_specific_prompt()
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode:
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
else:
compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
num_messages_before = len(in_context_messages)
@@ -2434,6 +2436,7 @@ async def summarize_messages(
# update the agent state
logger.info(f"Summarized {num_messages_before} messages to {num_messages_after}")
if num_messages_before <= num_messages_after:
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",

View File

@@ -537,10 +537,13 @@ async def compact_conversation(
# Validate compaction reduced messages
if num_messages_before <= num_messages_after:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
logger.warning(
f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
)
# raise HTTPException(
# status_code=status.HTTP_400_BAD_REQUEST,
# detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
# )
# Checkpoint the messages (this will update the conversation_messages table)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)

View File

@@ -4,21 +4,24 @@ from dataclasses import dataclass
from typing import List, Optional
from letta.helpers.message_helper import convert_message_creates_to_messages
from letta.llm_api.llm_client import LLMClient
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message, MessageCreate
from letta.schemas.tool import Tool
from letta.schemas.user import User
from letta.services.summarizer.self_summarizer import self_summarize_all, self_summarize_sliding_window
from letta.services.summarizer.summarizer_all import summarize_all
from letta.services.summarizer.summarizer_config import CompactionSettings, get_default_summarizer_model
from letta.services.summarizer.summarizer_config import CompactionSettings, get_default_prompt_for_mode, get_default_summarizer_model
from letta.services.summarizer.summarizer_sliding_window import (
count_tokens,
count_tokens_with_tools,
summarize_via_sliding_window,
)
from letta.services.telemetry_manager import TelemetryManager
from letta.system import package_summarize_message_no_counts
logger = get_logger(__name__)
@@ -106,12 +109,14 @@ async def compact_messages(
actor: User,
agent_id: str,
agent_llm_config: LLMConfig,
telemetry_manager: TelemetryManager,
llm_client: LLMClient,
agent_type: AgentType,
messages: List[Message],
timezone: str,
compaction_settings: Optional[CompactionSettings] = None,
agent_model_handle: Optional[str] = None,
agent_tags: Optional[List[str]] = None,
tools: Optional[List[Tool]] = None,
tools: Optional[List[dict]] = None, # Tool json schemas
trigger_threshold: Optional[int] = None,
run_id: Optional[str] = None,
step_id: Optional[str] = None,
@@ -154,7 +159,105 @@ async def compact_messages(
)
summarization_mode_used = summarizer_config.mode
if summarizer_config.mode == "all":
if summarizer_config.mode == "self_compact_all":
try:
summary, compacted_messages = await self_summarize_all(
actor=actor,
agent_id=agent_id,
agent_llm_config=agent_llm_config,
telemetry_manager=telemetry_manager,
llm_client=llm_client,
agent_type=agent_type,
messages=messages,
compaction_settings=summarizer_config,
run_id=run_id,
step_id=step_id,
timezone=timezone,
agent_tags=agent_tags,
tools=tools,
)
except Exception as e:
logger.error(f"Self summarization failed with exception: {str(e)}. Falling back to self sliding window mode.")
try:
fallback_config = summarizer_config.model_copy(
update={
"mode": "self_compact_sliding_window",
"prompt": get_default_prompt_for_mode("self_compact_sliding_window"),
}
)
summary, compacted_messages = await self_summarize_sliding_window(
actor=actor,
agent_id=agent_id,
agent_llm_config=agent_llm_config,
telemetry_manager=telemetry_manager,
llm_client=llm_client,
agent_type=agent_type,
messages=messages,
compaction_settings=fallback_config,
run_id=run_id,
step_id=step_id,
timezone=timezone,
agent_tags=agent_tags,
tools=tools,
)
summarization_mode_used = "self_compact_sliding_window"
except Exception as e:
logger.error(f"Self sliding window summarization failed with exception: {str(e)}. Falling back to all mode.")
fallback_config = summarizer_config.model_copy(
update={
"mode": "all",
"prompt": get_default_prompt_for_mode("all"),
}
)
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
summarizer_config=fallback_config,
in_context_messages=messages,
agent_id=agent_id,
agent_tags=agent_tags,
run_id=run_id,
step_id=step_id,
)
summarization_mode_used = "all"
elif summarizer_config.mode == "self_compact_sliding_window":
try:
summary, compacted_messages = await self_summarize_sliding_window(
actor=actor,
agent_id=agent_id,
agent_llm_config=agent_llm_config,
telemetry_manager=telemetry_manager,
llm_client=llm_client,
agent_type=agent_type,
messages=messages,
compaction_settings=summarizer_config,
run_id=run_id,
step_id=step_id,
timezone=timezone,
agent_tags=agent_tags,
tools=tools,
)
except Exception as e:
# Prompts for all and self mode should be similar --> can use original prompt
logger.error(f"Self sliding window summarization failed with exception: {str(e)}. Falling back to all mode.")
fallback_config = summarizer_config.model_copy(
update={
"mode": "all",
"prompt": get_default_prompt_for_mode("all"),
}
)
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
summarizer_config=fallback_config,
in_context_messages=messages,
agent_id=agent_id,
agent_tags=agent_tags,
run_id=run_id,
step_id=step_id,
)
summarization_mode_used = "all"
elif summarizer_config.mode == "all":
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
@@ -180,10 +283,16 @@ async def compact_messages(
)
except Exception as e:
logger.error(f"Sliding window summarization failed with exception: {str(e)}. Falling back to all mode.")
fallback_config = summarizer_config.model_copy(
update={
"mode": "all",
"prompt": get_default_prompt_for_mode("all"),
}
)
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
summarizer_config=summarizer_config,
summarizer_config=fallback_config,
in_context_messages=messages,
agent_id=agent_id,
agent_tags=agent_tags,
@@ -271,6 +380,7 @@ async def compact_messages(
summary=summary,
timezone=timezone,
compaction_stats=compaction_stats,
mode=summarization_mode_used,
)
if use_summary_role:

View File

@@ -0,0 +1,283 @@
"""Claude Code-style summarization where agent self-summarizes using its own LLM."""
from typing import List, Optional, Tuple
from letta.llm_api.llm_client import LLMClient
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.enums import MessageRole, ProviderType
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.schemas.user import User
from letta.services.summarizer.summarizer_config import CompactionSettings, get_default_prompt_for_mode
from letta.services.summarizer.summarizer_sliding_window import count_tokens
from letta.services.telemetry_manager import TelemetryManager
logger = get_logger(__name__)
@trace_method
async def self_summarize_all(
    actor: User,
    agent_id: str,
    agent_llm_config: LLMConfig,
    telemetry_manager: TelemetryManager,
    llm_client: LLMClient,
    agent_type: AgentType,
    messages: List[Message],
    compaction_settings: CompactionSettings,
    timezone: str,
    run_id: Optional[str] = None,
    step_id: Optional[str] = None,
    agent_tags: Optional[List[str]] = None,
    # For cache compatibility with regular agent requests
    tools: Optional[List[dict]] = None,
) -> Tuple[str, List[Message]]:
    """Claude Code-style self-compaction: the agent's own LLM summarizes its context.

    The summarization prompt is appended as a user message, and the agent's LLM is
    called with the existing in-context messages plus that request — using the same
    tools and request parameters as a regular agent step so the provider's prompt
    cache can be reused. The LLM's text response is taken as the summary.

    Args:
        actor: User performing the operation (supplies org/user ids for telemetry).
        agent_id: ID of the agent whose context is being compacted.
        agent_llm_config: The agent's own LLM config (the summary is produced by
            the same model the agent runs on, not a separate summarizer model).
        telemetry_manager: Telemetry sink for provider trace logging.
        llm_client: Client used to build and send the LLM request.
        agent_type: Agent type, forwarded to request building.
        messages: Full in-context message list; ``messages[0]`` is treated as the
            system message and is always retained.
        compaction_settings: Controls the prompt and optional character clipping.
            If ``prompt`` is None the default prompt for the mode is filled in
            (NOTE(review): this mutates the passed-in settings object — confirm
            callers do not rely on it staying unset).
        timezone: Agent timezone; currently unused here, kept for signature
            parity with the other summarizer entry points.
        run_id: Optional run id for telemetry.
        step_id: Optional step id for telemetry.
        agent_tags: Optional agent tags for telemetry.
        tools: Tool JSON schemas matching the agent's regular requests, passed so
            the provider prompt cache stays valid.

    Returns:
        Tuple of ``(summary_text, retained_messages)`` where ``retained_messages``
        is the system message plus any protected trailing messages. Construction
        of the summary Message object is handled by the calling compaction flow.

    Raises:
        Exception: The provider-specific error translated via
            ``llm_client.handle_llm_error`` when the summarization request fails
            (most likely a context-window-exceeded error).
    """
    logger.info(f"Starting self-summarization for {len(messages)} messages")

    # Nothing beyond the system message — nothing to summarize.
    if len(messages) < 2:
        logger.warning("Too few messages to summarize")
        return "No conversation to summarize.", messages

    # The system message is always protected from eviction.
    system_message = messages[0]

    # Split into evictable messages and a protected tail (cutoff rules for what
    # can/can't be separated live in the helper).
    messages_to_summarize, protected_messages = _get_protected_messages(messages)

    # Resolve the summarization prompt, falling back to the mode's default.
    if compaction_settings.prompt is None:
        compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
    logger.info(f"Summarizing {len(messages)} messages with prompt: {compaction_settings.prompt[:100]}...")

    summary_request_message = Message(
        role=MessageRole.user,
        content=[TextContent(text=compaction_settings.prompt)],
        agent_id=agent_id,
    )

    # If the last message is not from the assistant, insert a dummy assistant turn
    # first so the model treats the summary request as a fresh user turn instead
    # of continuing the in-flight conversation or re-calling tools.
    # NOTE(review): assumes _get_protected_messages never returns an empty
    # messages_to_summarize list — confirm, else this indexes an empty list.
    if messages_to_summarize[-1].role != MessageRole.assistant:
        messages_with_request = [
            *messages_to_summarize,
            Message(role=MessageRole.assistant, content=[TextContent(text="I understand. Let me summarize.")], agent_id=agent_id),
            summary_request_message,
        ]
        logger.info(
            f"Calling agent's LLM for self-summarization with {len(messages_with_request)} messages ({len(messages_to_summarize)} in-context + 1 dummy assistant message + 1 summary request)"
        )
    else:
        # Last message is already assistant, safe to append the user request directly.
        messages_with_request = [*messages_to_summarize, summary_request_message]
        logger.info(
            f"Calling agent's LLM for self-summarization with {len(messages_with_request)} messages ({len(messages_to_summarize)} in-context + 1 summary request)"
        )

    # Set telemetry context so the provider trace is attributed to this agent/run.
    llm_client.set_telemetry_context(
        telemetry_manager=telemetry_manager,
        agent_id=agent_id,
        agent_tags=agent_tags,
        run_id=run_id,
        step_id=step_id,
        call_type="summarization",
        org_id=actor.organization_id if actor.organization_id else None,
        user_id=actor.id if actor.id else None,
        compaction_settings=compaction_settings.model_dump() if compaction_settings else None,
        actor=actor,
    )

    # Build request data with the agent's own llm_client, matching the params used
    # by agent_v3 steps so the provider prompt cache remains warm.
    request_data = llm_client.build_request_data(
        agent_type,
        messages_with_request,
        agent_llm_config,
        tools=tools,
        force_tool_call=None,  # Don't force tool calls during summarization
        requires_subsequent_tool_call=False,
        # tool_return_truncation_chars=TOOL_RETURN_TRUNCATION_CHARS,
    )

    # Match the parallel_tool_calls setting from the agent's llm_config for cache
    # compatibility; mirrors the step-processing logic in letta_agent_v3.py.
    if agent_llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]:
        if isinstance(request_data.get("tool_choice"), dict) and "disable_parallel_tool_use" in request_data["tool_choice"]:
            request_data["tool_choice"]["disable_parallel_tool_use"] = not agent_llm_config.parallel_tool_calls

    # Call the LLM via the shared summarizer request runner (local import avoids a
    # circular dependency with the summarizer module).
    from letta.services.summarizer.summarizer import _run_summarizer_request

    try:
        summary_text = await _run_summarizer_request(request_data, messages_with_request, agent_llm_config, llm_client)
    except Exception as e:
        logger.error(f"Self-summarization request failed: {e}")
        # Translate the provider error (likely a context-window-exceeded error)
        # into the client's canonical exception type before propagating.
        translated = llm_client.handle_llm_error(e, llm_config=agent_llm_config)
        logger.error(f"Self-summarization request failed: {translated}")
        raise translated from e

    # Clip the summary if a character limit is configured.
    if compaction_settings.clip_chars is not None and len(summary_text) > compaction_settings.clip_chars:
        logger.warning(f"CC summary length {len(summary_text)} exceeds clip length {compaction_settings.clip_chars}. Truncating.")
        summary_text = summary_text[: compaction_settings.clip_chars] + "... [summary truncated to fit]"

    # Build final messages: [system] + protected messages.
    # Summary message handling is done in the compact parent function.
    final_messages = [system_message]
    if protected_messages:
        final_messages += protected_messages

    logger.info(
        f"Self-summarization complete. Summary length: {len(summary_text)} chars. Keeping {len(protected_messages)} protected messages."
    )
    return summary_text, final_messages
@trace_method
async def self_summarize_sliding_window(
    actor: User,
    agent_id: str,
    agent_llm_config: LLMConfig,
    telemetry_manager: TelemetryManager,
    llm_client: LLMClient,
    agent_type: AgentType,
    messages: List[Message],
    compaction_settings: CompactionSettings,
    timezone: str,
    run_id: Optional[str] = None,
    step_id: Optional[str] = None,
    agent_tags: Optional[List[str]] = None,
    # For cache compatibility with regular agent requests
    tools: Optional[List[dict]] = None,
) -> Tuple[str, List[Message]]:
    """Self-compact the oldest slice of the conversation via the agent's own LLM.

    Grows an eviction window from the start of the history (always keeping the
    system prompt, and never evicting a trailing pending approval request) until
    the retained messages fit under the token goal, then delegates the actual
    summarization of the evicted slice to ``self_summarize_all``.

    Returns:
        Tuple of ``(summary_text, new_in_context_messages)``. The message list is
        ``[system prompt] + protected messages + retained recent messages``; the
        packaged summary message itself is inserted by the caller.

    Raises:
        ValueError: if no valid cutoff message can be found (the caller falls back
            to complete summarization), or the chosen cutoff would evict the final
            message (which may be a pending approval request).
    """
    logger.info("Starting self-summarization with sliding window mode")
    # Protect system message and handle last message
    if len(messages) < 2:
        logger.warning("Too few messages to summarize")
        return "No conversation to summarize.", messages
    system_prompt = messages[0]

    # cannot evict a pending approval request (will cause client-side errors)
    total_message_count = len(messages)
    if messages[-1].role == MessageRole.approval:
        maximum_message_index = total_message_count - 2
    else:
        maximum_message_index = total_message_count - 1

    assert compaction_settings.sliding_window_percentage <= 1.0, "Sliding window percentage must be less than or equal to 1.0"
    # NOTE(review): eviction starts at `sliding_window_percentage` itself here,
    # while summarize_via_sliding_window starts at `1 - sliding_window_percentage`;
    # confirm which convention is intended for self compaction.
    eviction_percentage = compaction_settings.sliding_window_percentage
    assistant_message_index = None
    # Token budget the retained (post-compaction) buffer must fit under.
    goal_tokens = (1 - compaction_settings.sliding_window_percentage) * agent_llm_config.context_window
    # Start pessimistically at a full context window so the loop always runs at least once.
    approx_token_count = agent_llm_config.context_window

    # allow approvals to be cutoffs (for headless agents) but ensure proper grouping with tool calls
    def is_valid_cutoff(message: Message) -> bool:
        if message.role == MessageRole.assistant:
            return True
        if message.role == MessageRole.approval:
            return message.tool_calls is not None and len(message.tool_calls) > 0
        return False

    post_summarization_buffer = [system_prompt]
    while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
        # more eviction percentage
        eviction_percentage += 0.10
        # calculate message_cutoff_index
        message_cutoff_index = round(eviction_percentage * total_message_count)
        # get index of the last valid cutoff message at or before the cutoff point
        assistant_message_index = next(
            (i for i in reversed(range(1, message_cutoff_index + 1)) if i < len(messages) and is_valid_cutoff(messages[i])),
            None,
        )
        if assistant_message_index is None:
            logger.warning(
                f"No assistant/approval message found for evicting up to index {message_cutoff_index}, incrementing eviction percentage"
            )
            continue
        # update token count
        logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
        post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]]
        approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer)
        logger.info(
            f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
        )

    if assistant_message_index is None or eviction_percentage >= 1.0:
        raise ValueError("No assistant message found for sliding window summarization")  # fall back to complete summarization
    if assistant_message_index >= maximum_message_index:
        # need to keep the last message (might contain an approval request)
        raise ValueError(f"Assistant message index {assistant_message_index} is at the end of the message buffer, skipping summarization")

    messages_to_summarize = messages[:assistant_message_index]
    logger.info(
        f"Summarizing {len(messages_to_summarize)} messages with self summarization sliding window, from index 1 to {assistant_message_index} (out of {total_message_count})"
    )
    # pass in messages_to_summarize instead of messages
    summary_text, final_messages = await self_summarize_all(
        actor=actor,
        agent_id=agent_id,
        agent_llm_config=agent_llm_config,
        telemetry_manager=telemetry_manager,
        llm_client=llm_client,
        agent_type=agent_type,
        messages=messages_to_summarize,
        compaction_settings=compaction_settings,
        timezone=timezone,
        run_id=run_id,
        step_id=step_id,
        agent_tags=agent_tags,
        tools=tools,
    )
    # final_messages is [system prompt] (+ any protected messages); the retained
    # buffer also begins with the system prompt, so skip its first element to
    # avoid duplicating the system message in the new context window.
    return summary_text, final_messages + post_summarization_buffer[1:]
def _get_protected_messages(in_context_messages: List[Message]) -> Tuple[List[Message], List[Message]]:
    """Split the context window into (messages_to_summarize, protected_messages).

    A trailing pending approval request must never be evicted (doing so causes
    client-side errors). When the assistant message directly before it shares the
    same step_id, the two form one LLM response (the assistant message carries the
    thinking/tool_calls, the approval carries the approval-required subset) and are
    protected together.
    """
    last = in_context_messages[-1]
    if last.role != MessageRole.approval:
        # No pending approval at the tail: the entire window may be summarized.
        return in_context_messages, []

    prior = in_context_messages[-2] if len(in_context_messages) >= 2 else None
    if prior is not None and prior.role == MessageRole.assistant and prior.step_id == last.step_id:
        # Same-step assistant + approval pair: keep both out of the summary.
        return in_context_messages[:-2], [prior, last]

    # Only the pending approval request itself is protected.
    return in_context_messages[:-1], [last]

View File

@@ -495,7 +495,7 @@ async def simple_summary(
# Build the initial transcript without clamping to preserve fidelity
# TODO proactively clip here?
summary_transcript = simple_formatter(messages)
logger.info(f"Summarizing {len(messages)} messages with prompt: {system_prompt}")
logger.info(f"Summarizing {len(messages)} messages with prompt: {system_prompt[:100]}...")
if include_ack:
logger.info(f"Summarizing with ACK for model {llm_config.model}")
@@ -519,86 +519,9 @@ async def simple_summary(
summarizer_llm_config.put_inner_thoughts_in_kwargs = False
summarizer_llm_config.enable_reasoner = False
async def _run_summarizer_request(req_data: dict, req_messages_obj: list[Message]) -> str:
"""Run summarization request and return assistant text.
For Anthropic, use provider-side streaming to avoid long-request failures
(Anthropic requires streaming for requests that may exceed ~10 minutes).
"""
if summarizer_llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]:
logger.info(
"Summarizer: using provider streaming (%s/%s) to avoid long-request failures",
summarizer_llm_config.model_endpoint_type,
summarizer_llm_config.model,
)
# Stream from provider and accumulate the final assistant text.
from letta.interfaces.anthropic_parallel_tool_call_streaming_interface import (
SimpleAnthropicStreamingInterface,
)
interface = SimpleAnthropicStreamingInterface(
requires_approval_tools=[],
run_id=None,
step_id=None,
)
# AnthropicClient.stream_async sets request_data["stream"] = True internally.
try:
stream = await llm_client.stream_async(req_data, summarizer_llm_config)
async for _chunk in interface.process(stream):
pass
content_parts = interface.get_content()
text = "".join(part.text for part in content_parts if isinstance(part, TextContent)).strip()
await llm_client.log_provider_trace_async(
request_data=req_data,
response_json={
"content": text,
"model": summarizer_llm_config.model,
"usage": {
"input_tokens": getattr(interface, "input_tokens", None),
"output_tokens": getattr(interface, "output_tokens", None),
},
},
llm_config=summarizer_llm_config,
)
except Exception as e:
await llm_client.log_provider_trace_async(
request_data=req_data,
response_json=None,
llm_config=summarizer_llm_config,
error_msg=str(e),
error_type=type(e).__name__,
)
raise
if not text:
logger.warning("No content returned from summarizer (streaming path)")
raise Exception("Summary failed to generate")
return text
# Default: non-streaming provider request, then normalize via chat-completions conversion.
logger.debug(
"Summarizer: using non-streaming request (%s/%s)",
summarizer_llm_config.model_endpoint_type,
summarizer_llm_config.model,
)
response_data = await llm_client.request_async_with_telemetry(req_data, summarizer_llm_config)
response = await llm_client.convert_response_to_chat_completion(
response_data,
req_messages_obj,
summarizer_llm_config,
)
if response.choices[0].message.content is None:
logger.warning("No content returned from summarizer")
raise Exception("Summary failed to generate")
return response.choices[0].message.content.strip()
request_data = llm_client.build_request_data(AgentType.letta_v1_agent, input_messages_obj, summarizer_llm_config, tools=[])
try:
summary = await _run_summarizer_request(request_data, input_messages_obj)
summary = await _run_summarizer_request(request_data, input_messages_obj, summarizer_llm_config, llm_client)
except Exception as e:
# handle LLM error (likely a context window exceeded error)
try:
@@ -636,7 +559,7 @@ async def simple_summary(
)
try:
summary = await _run_summarizer_request(request_data, input_messages_obj)
summary = await _run_summarizer_request(request_data, input_messages_obj, summarizer_llm_config, llm_client)
except Exception as fallback_error_a:
# Fallback B: hard-truncate the user transcript to fit a conservative char budget
logger.warning(f"Clamped tool returns still overflowed ({fallback_error_a}). Falling back to transcript truncation.")
@@ -673,7 +596,7 @@ async def simple_summary(
tools=[],
)
try:
summary = await _run_summarizer_request(request_data, input_messages_obj)
summary = await _run_summarizer_request(request_data, input_messages_obj, summarizer_llm_config, llm_client)
except Exception as fallback_error_b:
logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.")
logger.info(f"Full fallback summarization payload: {request_data}")
@@ -742,3 +665,84 @@ def format_transcript(messages: List[Message], include_system: bool = False) ->
lines.append(f"{role}: {text}")
return lines
@trace_method
async def _run_summarizer_request(req_data: dict, req_messages_obj: list[Message], llm_config: LLMConfig, llm_client: LLMClient) -> str:
    """Run summarization request and return assistant text.

    For Anthropic, use provider-side streaming to avoid long-request failures
    (Anthropic requires streaming for requests that may exceed ~10 minutes).

    Args:
        req_data: Provider-ready request payload (as built by the client's build_request_data).
        req_messages_obj: The Message objects the request was built from; needed to
            normalize the non-streaming response into chat-completion form.
        llm_config: Config of the summarizer model being called.
        llm_client: Client used to issue the request and log provider traces.

    Returns:
        The stripped assistant text of the generated summary.

    Raises:
        Exception: if the provider returns no content, or the underlying request
            fails (streaming-path failures are trace-logged before re-raising).
    """
    if llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]:
        logger.info(
            "Summarizer: using provider streaming (%s/%s) to avoid long-request failures",
            llm_config.model_endpoint_type,
            llm_config.model,
        )
        # Stream from provider and accumulate the final assistant text.
        from letta.interfaces.anthropic_parallel_tool_call_streaming_interface import (
            SimpleAnthropicStreamingInterface,
        )

        interface = SimpleAnthropicStreamingInterface(
            requires_approval_tools=[],
            run_id=None,
            step_id=None,
        )
        # AnthropicClient.stream_async sets request_data["stream"] = True internally.
        try:
            stream = await llm_client.stream_async(req_data, llm_config)
            # Drain the stream; the interface accumulates content/usage as chunks arrive.
            async for _chunk in interface.process(stream):
                pass
            content_parts = interface.get_content()
            text = "".join(part.text for part in content_parts if isinstance(part, TextContent)).strip()
            # Log a synthetic response trace (streaming yields no single response JSON),
            # including cache usage so summarization cache hits/writes are observable.
            await llm_client.log_provider_trace_async(
                request_data=req_data,
                response_json={
                    "content": text,
                    "model": llm_config.model,
                    "usage": {
                        "input_tokens": getattr(interface, "input_tokens", None),
                        "output_tokens": getattr(interface, "output_tokens", None),
                        "cache_read_input_tokens": getattr(interface, "cache_read_tokens", 0),  # cache read
                        "cache_creation_input_tokens": getattr(interface, "cache_creation_tokens", 0),  # cache write
                    },
                },
                llm_config=llm_config,
            )
        except Exception as e:
            # Record the failure in provider telemetry before propagating to the caller.
            await llm_client.log_provider_trace_async(
                request_data=req_data,
                response_json=None,
                llm_config=llm_config,
                error_msg=str(e),
                error_type=type(e).__name__,
            )
            raise
        if not text:
            logger.warning("No content returned from summarizer (streaming path)")
            raise Exception("Summary failed to generate")
        return text

    # Default: non-streaming provider request, then normalize via chat-completions conversion.
    logger.debug(
        "Summarizer: using non-streaming request (%s/%s)",
        llm_config.model_endpoint_type,
        llm_config.model,
    )
    response_data = await llm_client.request_async_with_telemetry(req_data, llm_config)
    response = await llm_client.convert_response_to_chat_completion(
        response_data,
        req_messages_obj,
        llm_config,
    )
    if response.choices[0].message.content is None:
        logger.warning("No content returned from summarizer")
        raise Exception("Summary failed to generate")
    return response.choices[0].message.content.strip()

View File

@@ -2,7 +2,7 @@ from typing import Literal
from pydantic import BaseModel, Field
from letta.prompts.summarizer_prompt import ALL_PROMPT, SLIDING_PROMPT
from letta.prompts.summarizer_prompt import ALL_PROMPT, SELF_ALL_PROMPT, SELF_SLIDING_PROMPT, SLIDING_PROMPT
from letta.schemas.enums import ProviderType
from letta.schemas.model import ModelSettingsUnion
from letta.settings import summarizer_settings
@@ -18,20 +18,23 @@ def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
return summarizer_defaults.get(provider_type)
def get_default_prompt_for_mode(mode: Literal["all", "sliding_window"]) -> str:
def get_default_prompt_for_mode(mode: Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"]) -> str:
"""Get the default prompt for a given compaction mode.
Also used in /summarize endpoint if mode is changed and prompt is not explicitly set."""
if mode == "all":
return ALL_PROMPT
else: # sliding_window
if mode == "self_compact_sliding_window":
return SELF_SLIDING_PROMPT
elif mode == "self_compact_all":
return SELF_ALL_PROMPT
elif mode == "sliding_window":
return SLIDING_PROMPT
else: # all
return ALL_PROMPT
class CompactionSettings(BaseModel):
"""Configuration for conversation compaction / summarization.
``model`` is the only required user-facing field it specifies the summarizer
model handle (e.g. ``"openai/gpt-4o-mini"``). Per-model settings (temperature,
Per-model settings (temperature,
max tokens, etc.) are derived from the default configuration for that handle.
"""
@@ -39,7 +42,7 @@ class CompactionSettings(BaseModel):
# If None, uses lightweight provider-specific defaults (e.g., haiku for Anthropic, gpt-5-mini for OpenAI).
model: str | None = Field(
default=None,
description="Model handle to use for summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults.",
description="Model handle to use for sliding_window/all summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults.",
)
# Optional provider-specific model settings for the summarizer model
@@ -56,10 +59,12 @@ class CompactionSettings(BaseModel):
default=50000, description="The maximum length of the summary in characters. If none, no clipping is performed."
)
mode: Literal["all", "sliding_window", "self"] = Field(default="sliding_window", description="The type of summarization technique use.")
mode: Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"] = Field(
default="sliding_window", description="The type of summarization technique use."
)
sliding_window_percentage: float = Field(
default_factory=lambda: summarizer_settings.partial_evict_summarizer_percentage,
description="The percentage of the context window to keep post-summarization (only used in sliding window mode).",
description="The percentage of the context window to keep post-summarization (only used in sliding window modes).",
)
# Called upon agent creation and if mode is changed in summarize endpoint request

View File

@@ -77,7 +77,13 @@ async def count_tokens_with_tools(
actor=actor,
)
tool_definitions = [OpenAITool(type="function", function=t.json_schema) for t in tools if t.json_schema]
# Tools can be either Tool objects (with .json_schema) or dicts (json schemas directly)
# For compatibility with how tools need to be passed in for self compaction
tool_definitions = [
OpenAITool(type="function", function=t.json_schema if hasattr(t, "json_schema") else t)
for t in tools
if (hasattr(t, "json_schema") and t.json_schema) or (isinstance(t, dict) and t)
]
tool_tokens = await token_counter.count_tool_tokens(tool_definitions) if tool_definitions else 0
# Apply safety margin for approximate counting (message_tokens already has margin applied)
@@ -127,6 +133,11 @@ async def summarize_via_sliding_window(
else:
maximum_message_index = total_message_count - 1
# simple version: summarize(in_context[1:round(summarizer_config.sliding_window_percentage * len(in_context_messages))])
# this evicts 30% of the messages (via summarization) and keeps the remaining 70%
# problem: we need the cutoff point to be an assistant message, so will grow the cutoff point until we find an assistant message
# also need to grow the cutoff point until the token count is less than the target token count
# Starts at N% (eg 70%), and increments up until 100%
max(
1 - summarizer_config.sliding_window_percentage, 0.10
@@ -146,11 +157,6 @@ async def summarize_via_sliding_window(
return message.tool_calls is not None and len(message.tool_calls) > 0
return False
# simple version: summarize(in_context[1:round(summarizer_config.sliding_window_percentage * len(in_context_messages))])
# this evicts 30% of the messages (via summarization) and keeps the remaining 70%
# problem: we need the cutoff point to be an assistant message, so will grow the cutoff point until we find an assistant message
# also need to grow the cutoff point until the token count is less than the target token count
while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
# more eviction percentage
eviction_percentage += 0.10
@@ -168,7 +174,9 @@ async def summarize_via_sliding_window(
None,
)
if assistant_message_index is None:
logger.warning(f"No assistant message found for evicting up to index {message_cutoff_index}, incrementing eviction percentage")
logger.warning(
f"No assistant/approval message found for evicting up to index {message_cutoff_index}, incrementing eviction percentage"
)
continue
# update token count
@@ -210,7 +218,7 @@ async def summarize_via_sliding_window(
},
)
logger.info(f"\n==================\nSummary message string: {summary_message_str[:100]}\n==================\n")
logger.info(f"\n==================\nSummary message string: {summary_message_str[:100]}...\n==================\n")
if summarizer_config.clip_chars is not None and len(summary_message_str) > summarizer_config.clip_chars:
logger.warning(f"Summary length {len(summary_message_str)} exceeds clip length {summarizer_config.clip_chars}. Truncating.")

View File

@@ -204,11 +204,24 @@ def package_summarize_message(summary, summary_message_count, hidden_message_cou
return json_dumps(packaged_message)
def package_summarize_message_no_counts(summary, timezone, compaction_stats: dict | None = None):
context_message = (
"Note: prior messages have been hidden from view due to conversation memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
def package_summarize_message_no_counts(summary, timezone, compaction_stats: dict | None = None, mode: str | None = None):
if mode and "sliding_window" in mode: # sliding_window, self_compact_sliding_window
if compaction_stats and "messages_count_before" in compaction_stats and "messages_count_after" in compaction_stats:
num_evicted = compaction_stats["messages_count_before"] - compaction_stats["messages_count_after"]
context_message = (
f"Note: {num_evicted} messages from the beginning of the conversation have been hidden from view due to memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
else:
context_message = (
"Note: prior messages from the beginning of the conversation have been hidden from view due to conversation memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
else: # all, self
context_message = (
"Note: prior messages have been hidden from view due to conversation memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
formatted_time = get_local_time(timezone=timezone)
packaged_message = {

View File

@@ -15,17 +15,14 @@ import pytest
from letta.agents.letta_agent_v3 import LettaAgentV3
from letta.config import LettaConfig
from letta.schemas.agent import CreateAgent, UpdateAgent
from letta.schemas.block import BlockUpdate, CreateBlock
from letta.schemas.agent import CreateAgent
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message import EventMessage, SummaryMessage
from letta.schemas.letta_message_content import TextContent, ToolCallContent, ToolReturnContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage, MessageCreate
from letta.schemas.run import Run as PydanticRun
from letta.schemas.message import Message as PydanticMessage
from letta.server.server import SyncServer
from letta.services.run_manager import RunManager
from letta.services.summarizer.summarizer import simple_summary
from letta.settings import model_settings
@@ -669,14 +666,24 @@ from unittest.mock import patch
from letta.services.summarizer.summarizer_config import CompactionSettings
# Test both summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window"]] = ["all", "sliding_window"]
# Test all summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"]] = [
"all",
"sliding_window",
"self_compact_all",
"self_compact_sliding_window",
]
@pytest.mark.asyncio
@pytest.mark.parametrize("mode", SUMMARIZER_CONFIG_MODES, ids=SUMMARIZER_CONFIG_MODES)
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMConfig, mode: Literal["all", "sliding_window"]):
async def test_summarize_with_mode(
server: SyncServer,
actor,
llm_config: LLMConfig,
mode: Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"],
):
"""
Test summarization with different CompactionSettings modes using LettaAgentV3.
@@ -746,20 +753,20 @@ async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMCon
print()
if mode == "all":
# For "all" mode, V3 keeps:
if mode == "all" or mode == "self_compact_all":
# For "all" or "self" mode, V3 keeps:
# 1. System prompt
# 2. A single user summary message (system_alert JSON)
# and no remaining historical messages.
assert len(result) == 2, f"Expected 2 messages for 'all' mode (system + summary), got {len(result)}"
assert len(result) == 2, f"Expected 2 messages for {mode} mode (system + summary), got {len(result)}"
assert result[0].role == MessageRole.system
assert result[1].role == MessageRole.user
else:
# For "sliding_window" mode, result should include:
# For "sliding_window" or "self_compact_sliding_window" mode, result should include:
# 1. System prompt
# 2. User summary message
# 3+. Recent user/assistant messages inside the window.
assert len(result) > 2, f"Expected >2 messages for 'sliding_window' mode, got {len(result)}"
assert len(result) > 2, f"Expected >2 messages for {mode} mode, got {len(result)}"
assert result[0].role == MessageRole.system
assert result[1].role == MessageRole.user
@@ -1195,97 +1202,206 @@ async def test_sliding_window_cutoff_index_does_not_exceed_message_count(server:
TESTED_LLM_CONFIGS,
ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
async def test_self_sliding_window_cutoff_index_does_not_exceed_message_count(server: SyncServer, actor, llm_config: LLMConfig):
"""
Test edge case of large system prompt / memory blocks.
Test that the sliding window summarizer correctly calculates cutoff indices.
This test verifies that summarization handles the case where the system prompt
and memory blocks are very large, potentially consuming most of the context window.
The summarizer should gracefully handle this scenario without errors.
This test verifies the fix for a bug where the cutoff percentage was treated as
a whole number (10) instead of a decimal (0.10), causing:
message_cutoff_index = round(10 * 65) = 650
when there were only 65 messages, resulting in an empty range loop and the error:
"No assistant message found from indices 650 to 65"
The fix changed:
- max(..., 10) -> max(..., 0.10)
- += 10 -> += 0.10
- >= 100 -> >= 1.0
This test uses the real token counter (via create_token_counter) to verify
the sliding window logic works with actual token counting.
"""
from letta.llm_api.llm_client import LLMClient
from letta.schemas.agent import AgentType
from letta.services.summarizer.self_summarizer import self_summarize_sliding_window
from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.services.telemetry_manager import TelemetryManager
# Override context window to be small so we trigger summarization
llm_config.context_window = 10000
# Create a real summarizer config using the default factory
# Override sliding_window_percentage to 0.3 for this test
handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
summarizer_config = CompactionSettings(model=handle)
summarizer_config.sliding_window_percentage = 0.3
# Create agent with large system prompt and memory blocks
agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
agent_create = CreateAgent(
name=agent_name,
llm_config=llm_config,
embedding_config=DEFAULT_EMBEDDING_CONFIG,
system="SYSTEM PROMPT " * 10000, # Large system prompt
memory_blocks=[
CreateBlock(
label="human",
limit=200000,
value="NAME " * 10000, # Large memory block
# Create 65 messages (similar to the failing case in the bug report)
# Pattern: system + alternating user/assistant messages
messages = [
PydanticMessage(
role=MessageRole.system,
content=[TextContent(type="text", text="You are a helpful assistant.")],
)
]
# Add 64 more messages (32 user-assistant pairs)
for i in range(32):
messages.append(
PydanticMessage(
role=MessageRole.user,
content=[TextContent(type="text", text=f"User message {i}")],
)
)
messages.append(
PydanticMessage(
role=MessageRole.assistant,
content=[TextContent(type="text", text=f"Assistant response {i}")],
)
],
)
agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)
# Create a run for the agent using RunManager
run = PydanticRun(agent_id=agent_state.id)
run = await RunManager().create_run(pydantic_run=run, actor=actor)
# Create the agent loop using LettaAgentV3
agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
# message the agent
input_message = MessageCreate(role=MessageRole.user, content="Hello")
# Call step on the agent - may trigger summarization due to large context
from letta.errors import SystemPromptTokenExceededError
with pytest.raises(SystemPromptTokenExceededError):
response = await agent_loop.step(
input_messages=[input_message],
run_id=run.id,
max_steps=3,
)
# Repair the agent by shortening the memory blocks and system prompt
# Update system prompt to a shorter version
short_system_prompt = "You are a helpful assistant."
await server.agent_manager.update_agent_async(
agent_id=agent_state.id,
agent_update=UpdateAgent(system=short_system_prompt),
actor=actor,
)
assert len(messages) == 65, f"Expected 65 messages, got {len(messages)}"
# Update memory block to a shorter version
short_memory_value = "The user's name is Alice."
await server.agent_manager.modify_block_by_label_async(
agent_id=agent_state.id,
block_label="human",
block_update=BlockUpdate(value=short_memory_value),
actor=actor,
)
# This should NOT raise "No assistant message found from indices 650 to 65"
# With the fix, message_count_cutoff_percent starts at max(0.7, 0.10) = 0.7
# So message_cutoff_index = round(0.7 * 65) = 46, which is valid
try:
summary, remaining_messages = await self_summarize_sliding_window(
actor=actor,
agent_id="agent-test-self-sliding-window",
agent_llm_config=llm_config,
telemetry_manager=TelemetryManager(),
llm_client=LLMClient.create(llm_config),
agent_type=AgentType.letta_v1_agent,
messages=messages,
compaction_settings=summarizer_config,
timezone="UTC",
)
# Reload agent state after repairs
agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
print("REPAIRED AGENT STATE ======")
print(agent_state.system)
print(agent_state.blocks)
# Verify the summary was generated (actual LLM response)
assert summary is not None
assert len(summary) > 0
# Create a new run for the repaired agent
run = PydanticRun(agent_id=agent_state.id)
run = await RunManager().create_run(pydantic_run=run, actor=actor)
# Verify remaining messages is a valid subset
assert len(remaining_messages) < len(messages)
assert len(remaining_messages) > 0
# Create a new agent loop with the repaired agent state
agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
print(f"Successfully summarized {len(messages)} messages to {len(remaining_messages)} remaining")
print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
print(f"Using {llm_config.model_endpoint_type} token counter for model {llm_config.model}")
# Now the agent should be able to respond without context window errors
response = await agent_loop.step(
input_messages=[input_message],
run_id=run.id,
max_steps=3,
)
except ValueError as e:
if "No assistant message found from indices" in str(e):
# Extract the indices from the error message
import re
# Verify we got a valid response after repair
assert response is not None
assert response.messages is not None
print(f"Agent successfully responded after repair with {len(response.messages)} messages")
match = re.search(r"from indices (\d+) to (\d+)", str(e))
if match:
start_idx, end_idx = int(match.group(1)), int(match.group(2))
pytest.fail(
f"Bug detected: cutoff index ({start_idx}) exceeds message count ({end_idx}). "
f"This indicates the percentage calculation bug where 10 was used instead of 0.10. "
f"Error: {e}"
)
raise
### NOTE: the large-system-prompt edge-case test below is disabled for now,
### because we no longer refresh the system prompt before compaction
### (refreshing would invalidate prompt caching for the self-compaction modes)
# @pytest.mark.asyncio
# @pytest.mark.parametrize(
# "llm_config",
# TESTED_LLM_CONFIGS,
# ids=[c.model for c in TESTED_LLM_CONFIGS],
# )
# async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
# """
# Test edge case of large system prompt / memory blocks.
# This test verifies that summarization handles the case where the system prompt
# and memory blocks are very large, potentially consuming most of the context window.
# The summarizer should gracefully handle this scenario without errors.
# """
# # Override context window to be small so we trigger summarization
# llm_config.context_window = 10000
# # Create agent with large system prompt and memory blocks
# agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
# agent_create = CreateAgent(
# name=agent_name,
# llm_config=llm_config,
# embedding_config=DEFAULT_EMBEDDING_CONFIG,
# system="SYSTEM PROMPT " * 10000, # Large system prompt
# memory_blocks=[
# CreateBlock(
# label="human",
# limit=200000,
# value="NAME " * 10000, # Large memory block
# )
# ],
# )
# agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)
# # Create a run for the agent using RunManager
# run = PydanticRun(agent_id=agent_state.id)
# run = await RunManager().create_run(pydantic_run=run, actor=actor)
# # Create the agent loop using LettaAgentV3
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
# # message the agent
# input_message = MessageCreate(role=MessageRole.user, content="Hello")
# # Call step on the agent - may trigger summarization due to large context
# from letta.errors import SystemPromptTokenExceededError
# with pytest.raises(SystemPromptTokenExceededError):
# response = await agent_loop.step(
# input_messages=[input_message],
# run_id=run.id,
# max_steps=3,
# )
# # Repair the agent by shortening the memory blocks and system prompt
# # Update system prompt to a shorter version
# short_system_prompt = "You are a helpful assistant."
# await server.agent_manager.update_agent_async(
# agent_id=agent_state.id,
# agent_update=UpdateAgent(system=short_system_prompt),
# actor=actor,
# )
# # Update memory block to a shorter version
# short_memory_value = "The user's name is Alice."
# await server.agent_manager.modify_block_by_label_async(
# agent_id=agent_state.id,
# block_label="human",
# block_update=BlockUpdate(value=short_memory_value),
# actor=actor,
# )
# # Reload agent state after repairs
# agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
# print("REPAIRED AGENT STATE ======")
# print(agent_state.system)
# print(agent_state.blocks)
# # Create a new run for the repaired agent
# run = PydanticRun(agent_id=agent_state.id)
# run = await RunManager().create_run(pydantic_run=run, actor=actor)
# # Create a new agent loop with the repaired agent state
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
# # Now the agent should be able to respond without context window errors
# response = await agent_loop.step(
# input_messages=[input_message],
# run_id=run.id,
# max_steps=3,
# )
# # Verify we got a valid response after repair
# assert response is not None
# assert response.messages is not None
# print(f"Agent successfully responded after repair with {len(response.messages)} messages")
# @pytest.mark.asyncio
@@ -1718,6 +1834,127 @@ async def test_summarize_all(server: SyncServer, actor, llm_config: LLMConfig):
print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_self(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test the self_summarize_all function with real LLM calls.

    This test verifies that the self-compaction 'all' mode works correctly,
    summarizing the entire conversation into a single summary string and
    leaving exactly one in-context message.
    """
    from letta.llm_api.llm_client import LLMClient
    from letta.schemas.agent import AgentType
    from letta.services.summarizer.self_summarizer import self_summarize_all
    from letta.services.summarizer.summarizer_config import CompactionSettings
    from letta.services.telemetry_manager import TelemetryManager

    # Create a summarizer config with the self-compaction "all" mode.
    # NOTE: the legacy "self" value was replaced by "self_compact_all" /
    # "self_compact_sliding_window" in the CompactionSettings mode enum.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    summarizer_config = CompactionSettings(model=handle)
    summarizer_config.mode = "self_compact_all"

    # Create test messages - a simple conversation: one system message
    # followed by 10 user/assistant pairs.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: What is {i} + {i}?")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: {i} + {i} = {i * 2}.")],
            )
        )
    assert len(messages) == 21, f"Expected 21 messages, got {len(messages)}"

    # Call self_summarize_all with a real LLM.
    summary, new_in_context_messages = await self_summarize_all(
        actor=actor,
        agent_id="agent-test-self-compact-all",
        agent_llm_config=llm_config,
        telemetry_manager=TelemetryManager(),
        llm_client=LLMClient.create(llm_config),
        agent_type=AgentType.letta_v1_agent,
        messages=messages,
        compaction_settings=summarizer_config,
        timezone="UTC",
    )

    # Verify the whole conversation collapsed to a single summary message.
    assert len(new_in_context_messages) == 1
    assert summary is not None
    assert len(summary) > 0
    # Prompt asks for under ~500 words; allow generous character slack in the test.
    assert len(summary) <= 5000
    print(f"Successfully summarized {len(messages)} messages using 'self_compact_all' mode")
    print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
    print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
@pytest.mark.asyncio
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_self_mode_fallback(server: SyncServer, actor, llm_config: LLMConfig):
    """If self summarize fails, it should have proper fallback."""
    from unittest.mock import AsyncMock, patch

    # Build a small conversation: a system message followed by 10 user/assistant turns.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for turn in range(10):
        user_msg = PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text=f"User message {turn}: Test message {turn}.")],
        )
        assistant_msg = PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text=f"Assistant response {turn}: Acknowledged message {turn}.")],
        )
        messages.extend((user_msg, assistant_msg))

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="self_compact_all")
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Force the self-compaction path to raise, so the fallback chain must run.
    with patch(
        "letta.services.summarizer.compact.self_summarize_all",
        new_callable=AsyncMock,
        side_effect=RuntimeError("Simulated self_summarize_all failure"),
    ):
        summary_message, compacted_messages, summary_text = await agent_loop.compact(messages=in_context_messages)

        # The fallback summarizer should still produce a summary and shrink the context.
        assert summary_message is not None
        assert summary_text is not None
        assert len(summary_text) > 0
        assert len(compacted_messages) < len(in_context_messages)
        print(f"Fallback succeeded: {len(in_context_messages)} -> {len(compacted_messages)} messages")
# =============================================================================
# CompactionStats tests
# =============================================================================
@@ -2033,3 +2270,15 @@ async def test_compact_with_stats_params_embeds_stats(server: SyncServer, actor,
assert stats.context_tokens_after is not None # Should be set by compact()
assert stats.messages_count_after == len(compacted_messages) # final_messages already includes summary
assert stats.context_window == llm_config.context_window
### basic self summarization
### fallback chain
### basic self sliding window summarization
### self sliding window preserves recent msgs
### self mode return compaction stats

View File

@@ -209,7 +209,7 @@ class TestSummarizeSlidingWindowTelemetryContext:
await summarizer_sliding_window.summarize_via_sliding_window(
actor=mock_actor,
llm_config=mock_llm_config,
agent_llm_config=mock_llm_config, # case where agent and summarizer have same config
agent_llm_config=mock_llm_config, # case where agent and summarizer have same config
summarizer_config=mock_compaction_settings,
in_context_messages=mock_messages,
agent_id=agent_id,