Files
letta-server/tests/integration_test_summarizer.py
jnjpng 3f23a23227 feat: add compaction stats (#9219)
* base

* update

* last

* generate

* fix test
2026-02-24 10:52:06 -08:00

2036 lines
79 KiB
Python

"""
Integration tests for conversation history summarization.
These tests verify the complete summarization flow:
1. Creating a LettaAgentV3 instance
2. Fetching messages via message_manager.get_messages_by_ids_async
3. Calling agent_loop.compact on the fetched in-context messages
"""
import json
import os
from typing import List, Literal
import pytest
from letta.agents.letta_agent_v2 import LettaAgentV2
from letta.agents.letta_agent_v3 import LettaAgentV3
from letta.config import LettaConfig
from letta.schemas.agent import CreateAgent, UpdateAgent
from letta.schemas.block import BlockUpdate, CreateBlock
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message import EventMessage, LettaMessage, SummaryMessage
from letta.schemas.letta_message_content import TextContent, ToolCallContent, ToolReturnContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage, MessageCreate
from letta.schemas.run import Run as PydanticRun
from letta.server.server import SyncServer
from letta.services.run_manager import RunManager
from letta.services.summarizer.summarizer import simple_summary
from letta.settings import model_settings
# Constants
# Embedding config shared by every agent created in these tests (OpenAI provider default).
DEFAULT_EMBEDDING_CONFIG = EmbeddingConfig.default_config(provider="openai")
def get_llm_config(filename: str, llm_config_dir: str = "tests/configs/llm_model_configs") -> LLMConfig:
    """Build an LLMConfig from a JSON file under *llm_config_dir*."""
    config_path = os.path.join(llm_config_dir, filename)
    with open(config_path, "r") as config_file:
        return LLMConfig(**json.load(config_file))
# Test configurations - using a subset of models for summarization tests
all_configs = [
    "openai-gpt-5-mini.json",
    # "claude-4-5-haiku.json",
    # "gemini-2.5-flash.json",
    # "gemini-2.5-flash-vertex.json", # Requires Vertex AI credentials
    # "openai-gpt-4.1.json",
    # "openai-o1.json",
    # "openai-o3.json",
    # "openai-o4-mini.json",
    # "claude-4-sonnet.json",
    # "claude-3-7-sonnet.json",
    # "gemini-2.5-pro-vertex.json",
]

# LLM_CONFIG_FILE lets CI pin the whole run to a single model config.
requested = os.getenv("LLM_CONFIG_FILE")
filenames = [requested] if requested else all_configs
TESTED_LLM_CONFIGS: List[LLMConfig] = [get_llm_config(fn) for fn in filenames]


def _is_deprecated_gemini(cfg: LLMConfig) -> bool:
    """Return True for deprecated Gemini 1.5 configs that should be skipped."""
    return cfg.model_endpoint_type in ["google_vertex", "google_ai"] and cfg.model.startswith("gemini-1.5")


# Filter out deprecated Gemini 1.5 models
TESTED_LLM_CONFIGS = [cfg for cfg in TESTED_LLM_CONFIGS if not _is_deprecated_gemini(cfg)]
# ======================================================================================================================
# Fixtures
# ======================================================================================================================
@pytest.fixture
async def server():
    """Boot a SyncServer with the default org/user and base tools installed."""
    config = LettaConfig.load()
    config.save()
    server = SyncServer(init_with_default_org_and_user=True)
    await server.init_async()
    # Base tools must exist before the tests below create agents.
    await server.tool_manager.upsert_base_tools_async(actor=server.default_user)
    yield server
@pytest.fixture
async def default_organization(server: SyncServer):
    """Create and return the default organization."""
    # Depends on the `server` fixture so the DB/managers are initialized first.
    org = await server.organization_manager.create_default_organization_async()
    yield org
@pytest.fixture
async def default_user(server: SyncServer, default_organization):
    """Create and return the default user."""
    # The user is scoped to the default organization created above.
    user = await server.user_manager.create_default_actor_async(org_id=default_organization.id)
    yield user
@pytest.fixture
async def actor(default_user):
    """Return actor for authorization."""
    # Alias fixture: tests take `actor` to make the authorization role explicit.
    return default_user
# ======================================================================================================================
# Helper Functions
# ======================================================================================================================
def create_large_tool_return(size_chars: int = 50000) -> str:
    """Create a large tool return string for testing.

    Duplicates a realistic-looking record until the serialized items alone
    reach at least *size_chars* characters, then wraps them in a success
    envelope and returns the JSON string.
    """
    template = {
        "id": 12345,
        "name": "Sample Item",
        "description": "This is a sample item description that will be repeated many times to create a large payload",
        "metadata": {"created_at": "2025-01-01T00:00:00Z", "updated_at": "2025-01-01T00:00:00Z", "version": "1.0.0"},
        "tags": ["tag1", "tag2", "tag3", "tag4", "tag5"],
        "nested_data": {"level1": {"level2": {"level3": {"value": "deeply nested value"}}}},
    }
    per_item_chars = len(json.dumps(template))
    # ceil(size_chars / per_item_chars) copies; zero when size_chars <= 0.
    copies = -(-size_chars // per_item_chars) if size_chars > 0 else 0
    items = [template.copy() for _ in range(copies)]
    return json.dumps({"status": "success", "total_items": len(items), "items": items})
async def create_agent_with_messages(server: SyncServer, actor, llm_config: LLMConfig, messages: List[PydanticMessage]) -> tuple:
    """
    Create an agent and attach the given messages as its in-context history.
    Returns (agent_state, in_context_messages).
    """
    # Dots and slashes are invalid in agent names, so normalize the model id.
    agent_name = f"test_agent_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_state = await server.agent_manager.create_agent_async(
        CreateAgent(
            name=agent_name,
            llm_config=llm_config,
            embedding_config=DEFAULT_EMBEDDING_CONFIG,
        ),
        actor=actor,
    )
    # Stamp each message with the new agent id before persisting.
    message_objs = []
    for msg in messages:
        payload = msg.model_dump() if hasattr(msg, "model_dump") else msg.dict()
        payload["agent_id"] = agent_state.id
        message_objs.append(PydanticMessage(**payload))
    created_messages = await server.message_manager.create_many_messages_async(message_objs, actor=actor)
    # Point the agent's in-context window at the newly created rows.
    new_ids = [m.id for m in created_messages]
    await server.agent_manager.update_message_ids_async(agent_id=agent_state.id, message_ids=new_ids, actor=actor)
    # Reload so agent_state.message_ids reflects the update.
    agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
    # Fetch via the message manager, mirroring the production code path.
    in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent_state.message_ids, actor=actor)
    return agent_state, in_context_messages
async def run_summarization(server: SyncServer, agent_state, in_context_messages, actor, force=True):
    """
    Execute the summarization (compaction) code path under test.

    NOTE(review): despite the name and signature, this helper now drives
    LettaAgentV3.compact() directly. The `force` parameter is accepted for
    caller compatibility but is NOT forwarded (compact() always runs), and
    `server` is likewise unused here — confirm whether these should be wired
    through or dropped.

    Returns the (summary_message, messages, summary) triple from compact().
    """
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    # Compaction replaces the older summarize_conversation_history path.
    summary_message, messages, summary = await agent_loop.compact(messages=in_context_messages)
    return summary_message, messages, summary
# ======================================================================================================================
# Test Cases
# ======================================================================================================================
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_empty_message_buffer(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization when there are no messages in the buffer.
    Should handle gracefully - either return empty list or raise a clear error.
    """
    # Create agent with no messages (replace dots and slashes with underscores for valid names)
    agent_name = f"test_agent_empty_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_create = CreateAgent(
        name=agent_name,
        llm_config=llm_config,
        embedding_config=DEFAULT_EMBEDDING_CONFIG,
    )
    agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)
    # Get messages (should be empty or only contain system messages)
    in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent_state.message_ids, actor=actor)
    # Run summarization - this may fail with empty buffer, which is acceptable behavior
    try:
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)
        # If it succeeds, verify result
        assert isinstance(result, list)
        # When summarization runs, V3 ensures that in-context messages follow
        # the pattern:
        #   1. System prompt
        #   2. User summary message (system_alert JSON)
        #   3. Remaining messages (which may be empty for this test)
        # We should always keep the original system message at the front.
        assert len(result) >= 1
        assert result[0].role == MessageRole.system
        # If summarization did in fact add a summary message, we expect it to
        # be the second message with user role.
        if len(result) >= 2:
            assert result[1].role == MessageRole.user
    except ValueError as e:
        # It's acceptable for summarization to fail on empty buffer
        assert "No assistant message found" in str(e) or "empty" in str(e).lower()
@pytest.mark.asyncio
@pytest.mark.skipif(
    not model_settings.anthropic_api_key,
    reason="Missing LETTA_ANTHROPIC_API_KEY (or equivalent settings) for Anthropic integration test",
)
async def test_simple_summary_anthropic_uses_streaming_and_returns_summary(actor, monkeypatch):
    """Regression test: Anthropic summarization must use streaming and return real text."""
    # If the summarizer ever falls back to a non-streaming Anthropic call, make it fail fast.
    from letta.llm_api.anthropic_client import AnthropicClient

    # Poison the non-streaming entry point so any regression raises immediately.
    async def _nope_request_async(self, *args, **kwargs):
        raise AssertionError("Anthropic summarizer should not call request_async (must use streaming)")

    monkeypatch.setattr(AnthropicClient, "request_async", _nope_request_async)
    # Keep the prompt tiny so this is fast and cheap.
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="I'm planning a trip to Paris in April.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="Great—your priorities are museums and cafes, and you want to stay under $200/day.",
                )
            ],
        ),
    ]
    anthropic_config = get_llm_config("claude-4-5-haiku.json")
    summary = await simple_summary(messages=messages, llm_config=anthropic_config, actor=actor)
    assert isinstance(summary, str)
    assert len(summary) > 10
    # Sanity-check that the model is summarizing the right conversation.
    assert any(token in summary.lower() for token in ["paris", "april", "museum", "cafe", "$200", "200"])
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_initialization_messages_only(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization when only initialization/system messages are in the buffer.
    Should handle gracefully and likely not summarize.
    """
    # Build a buffer containing nothing but system initialization messages.
    system_texts = [
        "You are a helpful assistant. Your name is Letta.",
        "The current date and time is 2025-01-01 12:00:00 UTC.",
    ]
    messages = [
        PydanticMessage(role=MessageRole.system, content=[TextContent(type="text", text=text)])
        for text in system_texts
    ]
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    # Summarizing a system-only buffer may legitimately raise; both outcomes are acceptable.
    try:
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor, force=True)
        assert isinstance(result, list)
        # The system prompt should survive summarization.
        assert len(result) >= 1
    except ValueError as e:
        assert "No assistant message found" in str(e)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_small_conversation(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with approximately 5 messages in the buffer.
    This represents a typical small conversation.
    """
    # Create a small conversation with ~5 messages
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Hello! Can you help me with a Python question?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Of course! I'd be happy to help you with Python. What would you like to know?")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="How do I read a file in Python?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="You can read a file in Python using the open() function. Here's an example:\n\n```python\nwith open('file.txt', 'r') as f:\n    content = f.read()\n    print(content)\n```",
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Thank you! That's very helpful.")],
        ),
    ]
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    # Run summarization with force=True
    # Note: force=True with clear=True can be very aggressive and may fail on small message sets
    try:
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor, force=True)
        # Verify result
        assert isinstance(result, list)
        # With force=True, some summarization should occur
        # The result might be shorter than the original if summarization happened
        assert len(result) >= 1
        # Verify that the result contains valid messages
        for msg in result:
            assert hasattr(msg, "role")
            assert hasattr(msg, "content")
    except ValueError as e:
        # With force=True + clear=True, aggressive summarization might fail on small message sets
        # This is acceptable behavior
        assert "No assistant message found" in str(e)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_large_tool_calls(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with large tool calls and returns (~50k character tool returns).
    This tests the system's ability to handle and summarize very large context windows.
    """
    # Create a large tool return
    large_return = create_large_tool_return(50000)
    # Create messages with large tool calls and returns
    # Pattern: user request -> assistant tool call -> tool return -> assistant follow-ups.
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Please fetch all the data from the database.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="I'll fetch the data for you."),
                ToolCallContent(
                    type="tool_call",
                    id="call_1",
                    name="fetch_database_records",
                    input={"query": "SELECT * FROM records"},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_1",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_1",
                    content=large_return,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="I've successfully fetched all the records from the database. There are thousands of items in the result set.",
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Great! Can you summarize what you found?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="Based on the data I retrieved, there are numerous records containing various items with descriptions, metadata, and nested data structures. Each record includes timestamps and version information.",
                )
            ],
        ),
    ]
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    # Verify that we actually have large messages
    total_content_size = sum(len(str(content)) for msg in in_context_messages for content in msg.content)
    assert total_content_size > 40000, f"Expected large messages, got {total_content_size} chars"
    # Run summarization
    summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)
    # Verify result
    assert isinstance(result, list)
    assert len(result) >= 1
    # Verify that summarization reduced the context size
    result_content_size = sum(len(str(content)) for msg in result for content in msg.content)
    # The summarized result should be smaller than the original
    # (unless summarization was skipped for some reason)
    print(f"Original size: {total_content_size} chars, Summarized size: {result_content_size} chars")
    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_multiple_large_tool_calls(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with multiple large tool calls in sequence.
    This stress-tests the summarization with multiple large context items.
    """
    # Create multiple large tool returns
    large_return_1 = create_large_tool_return(25000)
    large_return_2 = create_large_tool_return(25000)
    # Two back-to-back tool-call/return pairs (~25k chars each).
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Fetch user data.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="Fetching users..."),
                ToolCallContent(
                    type="tool_call",
                    id="call_1",
                    name="fetch_users",
                    input={"limit": 10000},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_1",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_1",
                    content=large_return_1,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Retrieved user data. Now fetching product data.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="Fetching products..."),
                ToolCallContent(
                    type="tool_call",
                    id="call_2",
                    name="fetch_products",
                    input={"category": "all"},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_2",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_2",
                    content=large_return_2,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="I've successfully fetched both user and product data.")],
        ),
    ]
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    # Verify that we have large messages
    total_content_size = sum(len(str(content)) for msg in in_context_messages for content in msg.content)
    assert total_content_size > 40000, f"Expected large messages, got {total_content_size} chars"
    # Run summarization
    summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)
    # Verify result
    assert isinstance(result, list)
    assert len(result) >= 1
    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")
    print(f"Summarized {len(in_context_messages)} messages with {total_content_size} chars to {len(result)} messages")
# @pytest.mark.asyncio
# @pytest.mark.parametrize(
# "llm_config",
# TESTED_LLM_CONFIGS,
# ids=[c.model for c in TESTED_LLM_CONFIGS],
# )
# async def test_summarize_truncates_large_tool_return(server: SyncServer, actor, llm_config: LLMConfig):
# """
# Test that summarization properly truncates very large tool returns.
# This ensures that oversized tool returns don't consume excessive context.
# """
# # Create an extremely large tool return (100k chars)
# large_return = create_large_tool_return(100000)
# original_size = len(large_return)
#
# # Create messages with a large tool return
# messages = [
# PydanticMessage(
# role=MessageRole.user,
# content=[TextContent(type="text", text="Please run the database query.")],
# ),
# PydanticMessage(
# role=MessageRole.assistant,
# content=[
# TextContent(type="text", text="Running query..."),
# ToolCallContent(
# type="tool_call",
# id="call_1",
# name="run_query",
# input={"query": "SELECT * FROM large_table"},
# ),
# ],
# ),
# PydanticMessage(
# role=MessageRole.tool,
# tool_call_id="call_1",
# content=[
# ToolReturnContent(
# type="tool_return",
# tool_call_id="call_1",
# content=large_return,
# is_error=False,
# )
# ],
# ),
# PydanticMessage(
# role=MessageRole.assistant,
# content=[TextContent(type="text", text="Query completed successfully with many results.")],
# ),
# ]
#
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
#
# # Verify the original tool return is indeed large
# assert original_size > 90000, f"Expected tool return >90k chars, got {original_size}"
#
# # Run summarization
#     summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)
#
# # Verify result
# assert isinstance(result, list)
# assert len(result) >= 1
#
# # Find tool return messages in the result and verify truncation occurred
# tool_returns_found = False
# for msg in result:
# if msg.role == MessageRole.tool:
# for content in msg.content:
# if isinstance(content, ToolReturnContent):
# tool_returns_found = True
# result_size = len(content.content)
# # Verify that the tool return has been truncated
# assert result_size < original_size, (
# f"Expected tool return to be truncated from {original_size} chars, but got {result_size} chars"
# )
# print(f"Tool return successfully truncated from {original_size} to {result_size} chars")
#
# # If we didn't find any tool returns in the result, that's also acceptable
# # (they may have been completely removed during aggressive summarization)
# if not tool_returns_found:
# print("Tool returns were completely removed during summarization")
#
# ======================================================================================================================
# CompactionSettings Mode Tests - Using LettaAgentV3
# ======================================================================================================================
# NOTE(review): late imports — used only by the CompactionSettings tests below.
from unittest.mock import patch

from letta.services.summarizer.summarizer_config import CompactionSettings

# Test both summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window"]] = ["all", "sliding_window"]
@pytest.mark.asyncio
@pytest.mark.parametrize("mode", SUMMARIZER_CONFIG_MODES, ids=SUMMARIZER_CONFIG_MODES)
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMConfig, mode: Literal["all", "sliding_window"]):
    """
    Test summarization with different CompactionSettings modes using LettaAgentV3.
    This test verifies that both summarization modes work correctly:
    - "all": Summarizes the entire conversation history into a single summary
    - "sliding_window": Keeps recent messages and summarizes older ones
    """
    # Create a conversation with enough messages to trigger summarization
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    # 10 user/assistant exchange pairs on top of the system prompt.
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    # Create new messages that would be added during this step
    # NOTE(review): these persisted messages are not passed to compact() below — confirm they are intentional setup.
    new_letta_messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="This is a new user message during this step.")],
            agent_id=agent_state.id,
        )
    ]
    # Persist the new messages
    new_letta_messages = await server.message_manager.create_many_messages_async(new_letta_messages, actor=actor)
    # Override compaction settings directly on the agent state
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode=mode)
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    summary, result, summary_text = await agent_loop.compact(messages=in_context_messages)
    assert isinstance(result, list)
    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")
    # Verify the summary text (third return value) is a non-empty string.
    # This is used by the agent loop to construct a SummaryMessage for clients.
    assert isinstance(summary_text, str), f"Expected summary_text to be a string, got {type(summary_text)}"
    assert len(summary_text) > 0, "Expected non-empty summary text"
    print()
    print(f"RESULTS {mode} ======")
    for msg in result:
        print(f"MSG: {msg}")
    print(f"SUMMARY TEXT: {summary_text[:200]}...")
    print()
    if mode == "all":
        # For "all" mode, V3 keeps:
        #   1. System prompt
        #   2. A single user summary message (system_alert JSON)
        # and no remaining historical messages.
        assert len(result) == 2, f"Expected 2 messages for 'all' mode (system + summary), got {len(result)}"
        assert result[0].role == MessageRole.system
        assert result[1].role == MessageRole.user
    else:
        # For "sliding_window" mode, result should include:
        #   1. System prompt
        #   2. User summary message
        #   3+. Recent user/assistant messages inside the window.
        assert len(result) > 2, f"Expected >2 messages for 'sliding_window' mode, got {len(result)}"
        assert result[0].role == MessageRole.system
        assert result[1].role == MessageRole.user
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_returns_valid_summary_message_and_event_message(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that compact() return values can be used to construct valid SummaryMessage and EventMessage objects.
    This validates the contract that _step() relies on: compact() returns
    (summary_message_obj, compacted_messages, summary_text) where summary_text
    is used to build a SummaryMessage and the metadata is used for an EventMessage.
    """
    import uuid

    from letta.helpers.datetime_helpers import get_utc_time

    # Create a conversation with enough messages to summarize
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    # 10 user/assistant exchange pairs on top of the system prompt.
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(messages=in_context_messages)
    # Verify we can construct a valid SummaryMessage from compact() return values
    summary_msg = SummaryMessage(
        id=summary_message_obj.id,
        date=summary_message_obj.created_at,
        summary=summary_text,
        otid=PydanticMessage.generate_otid_from_id(summary_message_obj.id, 0),
        step_id=None,
        run_id=None,
    )
    assert summary_msg.message_type == "summary_message"
    assert isinstance(summary_msg.summary, str)
    assert len(summary_msg.summary) > 0
    assert summary_msg.id == summary_message_obj.id
    # Verify we can construct a valid EventMessage for compaction
    event_msg = EventMessage(
        id=str(uuid.uuid4()),
        date=get_utc_time(),
        event_type="compaction",
        event_data={
            "trigger": "post_step_context_check",
            "context_token_estimate": 1000,
            "context_window": agent_state.llm_config.context_window,
        },
        run_id=None,
        step_id=None,
    )
    assert event_msg.message_type == "event_message"
    assert event_msg.event_type == "compaction"
    assert "trigger" in event_msg.event_data
    assert "context_window" in event_msg.event_data
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_with_use_summary_role_creates_summary_message_role(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that compact() with use_summary_role=True creates a message with role=MessageRole.summary.
    This validates that manual compaction endpoints (which pass use_summary_role=True)
    will store summary messages with the dedicated 'summary' role instead of the legacy 'user' role.
    """
    # Create a conversation with enough messages to summarize
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    # 10 user/assistant exchange pairs on top of the system prompt.
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    # Call compact with use_summary_role=True (as the REST endpoints now do)
    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(
        messages=in_context_messages,
        use_summary_role=True,
    )
    # Verify the summary message has role=summary (not user)
    assert summary_message_obj.role == MessageRole.summary, (
        f"Expected summary message to have role=summary when use_summary_role=True, got {summary_message_obj.role}"
    )
    # Verify the compacted messages list structure
    assert len(compacted_messages) == 2, f"Expected 2 messages (system + summary), got {len(compacted_messages)}"
    assert compacted_messages[0].role == MessageRole.system
    assert compacted_messages[1].role == MessageRole.summary
    # Verify summary text is non-empty
    assert isinstance(summary_text, str)
    assert len(summary_text) > 0
@pytest.mark.asyncio
async def test_v3_compact_uses_compaction_settings_model_and_model_settings(server: SyncServer, actor):
    """Integration test: LettaAgentV3.compact uses the LLMConfig implied by CompactionSettings.
    We set a different summarizer model handle + model_settings and verify that
    the LLMConfig passed into simple_summary reflects both the handle and
    the model_settings overrides.
    """
    from letta.agents.letta_agent_v3 import LettaAgentV3
    from letta.schemas.model import OpenAIModelSettings, OpenAIReasoning
    from letta.services.summarizer import summarizer_all

    base_llm_config = LLMConfig.default_config("gpt-4o-mini")
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Hello")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Hi there")],
        ),
    ]
    # Create agent + messages via helper to get a real AgentState
    agent_state, in_context_messages = await create_agent_with_messages(
        server=server,
        actor=actor,
        llm_config=base_llm_config,
        messages=messages,
    )
    # Summarizer model deliberately differs from the agent's own model.
    summarizer_handle = "openai/gpt-5-mini"
    summarizer_model_settings = OpenAIModelSettings(
        max_output_tokens=4321,
        temperature=0.05,
        reasoning=OpenAIReasoning(reasoning_effort="high"),
        response_format=None,
    )
    agent_state.compaction_settings = CompactionSettings(
        model=summarizer_handle,
        model_settings=summarizer_model_settings,
        prompt="You are a summarizer.",
        prompt_acknowledgement=True,
        clip_chars=2000,
        mode="all",
        sliding_window_percentage=0.3,
    )
    captured_llm_config: dict = {}

    # NOTE(review): the fake mirrors simple_summary's call signature loosely via **kwargs —
    # confirm it stays compatible if simple_summary's parameters change.
    async def fake_simple_summary(messages, llm_config, actor, include_ack=True, prompt=None, **kwargs):  # type: ignore[override]
        captured_llm_config["value"] = llm_config
        return "summary text"

    # Patch simple_summary so we don't hit the real LLM and can inspect llm_config
    with patch.object(summarizer_all, "simple_summary", new=fake_simple_summary):
        agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
        summary_msg, compacted, _ = await agent_loop.compact(messages=in_context_messages)
    assert summary_msg is not None
    assert "value" in captured_llm_config
    summarizer_llm_config = captured_llm_config["value"]
    # Agent's llm_config remains the base config
    assert agent_state.llm_config.model == "gpt-4o-mini"
    # Summarizer llm_config should reflect compaction_settings.model and model_settings
    assert summarizer_llm_config.handle == summarizer_handle
    assert summarizer_llm_config.model == "gpt-5-mini"
    assert summarizer_llm_config.max_tokens == 4321
    assert summarizer_llm_config.temperature == 0.05
@pytest.mark.asyncio
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_v3_summarize_hard_eviction_when_still_over_threshold(
    server: SyncServer,
    actor,
    llm_config: LLMConfig,
    caplog,
):
    """Regression test: ensure V3 summarizer does a hard eviction when
    summarization fails to bring the context size below the proactive
    summarization threshold.

    This test simulates the edge case that previously led to summarization
    loops:

    1. A large pre-summarization token count triggers summarization.
    2. Even after summarization, the (mocked) post-summarization token count
       is still above the trigger threshold.
    3. We verify that LettaAgentV3:
       - Logs an error about summarization failing to reduce context size.
       - Evicts all prior messages, keeping only the system message plus a
         single synthetic user summary message (system_alert).
       - Updates `context_token_estimate` to the token count of the minimal
         context so future steps don't keep re-triggering summarization based
         on a stale, oversized value.
    """
    # Build a small but non-trivial conversation with an explicit system
    # message so that after hard eviction we expect to keep exactly that
    # system message plus a single user summary message.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="User message 0: hello")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Assistant response 0: hi there")],
        ),
    ]
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    print("ORIGINAL IN-CONTEXT MESSAGES ======")
    for msg in in_context_messages:
        print(f"MSG: {msg}")

    # Create the V3 agent loop
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # We don't care which summarizer mode is used here; we just need
    # summarize_conversation_history to run and then hit the branch where the
    # *post*-summarization token count is still above the proactive
    # summarization threshold. We simulate that by patching the
    # letta_agent_v3-level count_tokens helper to report an extremely large
    # token count for the first call (post-summary) and a small count for the
    # second call (after hard eviction).
    with patch("letta.agents.letta_agent_v3.count_tokens") as mock_count_tokens:
        # First call: pretend the summarized context is still huge relative to
        # this model's context window so that we always trigger the
        # hard-eviction path. Second call: minimal context (system only) is
        # small.
        context_limit = llm_config.context_window or 100_000
        huge_tokens = context_limit * 10  # safely above any reasonable trigger
        # NOTE: side_effect order matters — the first value is consumed by the
        # post-summary size check, the second by the post-eviction recount.
        mock_count_tokens.side_effect = [huge_tokens, 10]

        caplog.set_level("ERROR")

        summary, result, summary_text = await agent_loop.compact(
            messages=in_context_messages,
            trigger_threshold=context_limit,
        )

        # We should have made exactly two token-count calls: one for the
        # summarized context, one for the hard-evicted minimal context.
        assert mock_count_tokens.call_count == 2

    print("COMPACTED RESULT ======")
    for msg in result:
        print(f"MSG: {msg}")

    # After hard eviction, we keep only:
    # 1. The system prompt
    # 2. The synthetic user summary message.
    assert isinstance(result, list)
    assert len(result) == 2, f"Expected system + summary after hard eviction, got {len(result)} messages"
    assert result[0].role == MessageRole.system
    assert result[1].role == MessageRole.user

    # Verify the summary text is returned (used to construct SummaryMessage in the agent loop)
    assert isinstance(summary_text, str), f"Expected summary_text to be a string, got {type(summary_text)}"
    assert len(summary_text) > 0, "Expected non-empty summary text after hard eviction"
# ======================================================================================================================
# Sliding Window Summarizer Unit Tests
# ======================================================================================================================
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_sliding_window_cutoff_index_does_not_exceed_message_count(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Regression test for the sliding-window cutoff calculation.

    Historically the cutoff percentage was treated as a whole number (10) rather
    than a decimal (0.10), producing message_cutoff_index = round(10 * 65) = 650
    for a 65-message conversation, an empty range loop, and the error
    "No assistant message found from indices 650 to 65". The fix changed
    max(..., 10) -> max(..., 0.10), += 10 -> += 0.10, and >= 100 -> >= 1.0.

    Uses the real token counter (via create_token_counter) so the sliding-window
    logic is exercised with actual token counts.
    """
    from letta.services.summarizer.summarizer_config import CompactionSettings
    from letta.services.summarizer.summarizer_sliding_window import summarize_via_sliding_window

    # Real summarizer config from the default factory, with the sliding-window
    # percentage pinned to 0.3 for this test.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    window_config = CompactionSettings(model=handle)
    window_config.sliding_window_percentage = 0.3

    # Reproduce the failing shape from the bug report: a system message followed
    # by 32 user/assistant pairs, for 65 messages total.
    conversation = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(32):
        conversation.extend(
            [
                PydanticMessage(
                    role=MessageRole.user,
                    content=[TextContent(type="text", text=f"User message {i}")],
                ),
                PydanticMessage(
                    role=MessageRole.assistant,
                    content=[TextContent(type="text", text=f"Assistant response {i}")],
                ),
            ]
        )
    assert len(conversation) == 65, f"Expected 65 messages, got {len(conversation)}"

    # With the fix, message_count_cutoff_percent starts at max(0.7, 0.10) = 0.7,
    # so message_cutoff_index = round(0.7 * 65) = 46 — a valid index. The call
    # below must NOT raise "No assistant message found from indices 650 to 65".
    try:
        summary, remaining_messages = await summarize_via_sliding_window(
            actor=actor,
            llm_config=llm_config,
            summarizer_config=window_config,
            in_context_messages=conversation,
        )
    except ValueError as err:
        if "No assistant message found from indices" in str(err):
            # Pull the indices out of the error so the failure names the bug.
            import re

            match = re.search(r"from indices (\d+) to (\d+)", str(err))
            if match:
                start_idx, end_idx = int(match.group(1)), int(match.group(2))
                pytest.fail(
                    f"Bug detected: cutoff index ({start_idx}) exceeds message count ({end_idx}). "
                    f"This indicates the percentage calculation bug where 10 was used instead of 0.10. "
                    f"Error: {err}"
                )
        raise
    else:
        # A real LLM summary came back and the window kept a strict, non-empty
        # subset of the conversation.
        assert summary is not None
        assert len(summary) > 0
        assert len(remaining_messages) < len(conversation)
        assert len(remaining_messages) > 0
        print(f"Successfully summarized {len(conversation)} messages to {len(remaining_messages)} remaining")
        print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
        print(f"Using {llm_config.model_endpoint_type} token counter for model {llm_config.model}")
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test edge case of large system prompt / memory blocks.

    This test verifies that summarization handles the case where the system prompt
    and memory blocks are very large, potentially consuming most of the context window.
    The summarizer should gracefully handle this scenario without errors.

    Flow:
    1. Build an agent whose system prompt + memory block are far too large.
    2. Assert that stepping the agent raises SystemPromptTokenExceededError.
    3. Shrink the system prompt and memory block ("repair" the agent).
    4. Assert a fresh agent loop can respond normally afterwards.
    """
    # Mutate a deep copy so shrinking context_window below does not leak into the
    # shared TESTED_LLM_CONFIGS entry reused by other parametrized tests in this
    # module (parametrize hands every test the same LLMConfig object).
    llm_config = llm_config.model_copy(deep=True)
    # Override context window to be small so we trigger summarization
    llm_config.context_window = 10000

    # Create agent with large system prompt and memory blocks
    agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_create = CreateAgent(
        name=agent_name,
        llm_config=llm_config,
        embedding_config=DEFAULT_EMBEDDING_CONFIG,
        system="SYSTEM PROMPT " * 10000,  # Large system prompt
        memory_blocks=[
            CreateBlock(
                label="human",
                limit=200000,
                value="NAME " * 10000,  # Large memory block
            )
        ],
    )
    agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)

    # Create a run for the agent using RunManager
    run = PydanticRun(agent_id=agent_state.id)
    run = await RunManager().create_run(pydantic_run=run, actor=actor)

    # Create the agent loop using LettaAgentV3
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # message the agent
    input_message = MessageCreate(role=MessageRole.user, content="Hello")

    # The oversized system prompt should make the step fail fast rather than loop.
    from letta.errors import SystemPromptTokenExceededError

    with pytest.raises(SystemPromptTokenExceededError):
        response = await agent_loop.step(
            input_messages=[input_message],
            run_id=run.id,
            max_steps=3,
        )

    # Repair the agent by shortening the memory blocks and system prompt
    # Update system prompt to a shorter version
    short_system_prompt = "You are a helpful assistant."
    await server.agent_manager.update_agent_async(
        agent_id=agent_state.id,
        agent_update=UpdateAgent(system=short_system_prompt),
        actor=actor,
    )
    # Update memory block to a shorter version
    short_memory_value = "The user's name is Alice."
    await server.agent_manager.modify_block_by_label_async(
        agent_id=agent_state.id,
        block_label="human",
        block_update=BlockUpdate(value=short_memory_value),
        actor=actor,
    )

    # Reload agent state after repairs
    agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
    print("REPAIRED AGENT STATE ======")
    print(agent_state.system)
    print(agent_state.blocks)

    # Create a new run for the repaired agent
    run = PydanticRun(agent_id=agent_state.id)
    run = await RunManager().create_run(pydantic_run=run, actor=actor)

    # Create a new agent loop with the repaired agent state
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Now the agent should be able to respond without context window errors
    response = await agent_loop.step(
        input_messages=[input_message],
        run_id=run.id,
        max_steps=3,
    )

    # Verify we got a valid response after repair
    assert response is not None
    assert response.messages is not None
    print(f"Agent successfully responded after repair with {len(response.messages)} messages")
# @pytest.mark.asyncio
# async def test_context_window_overflow_triggers_summarization_in_streaming(server: SyncServer, actor):
# """
# Test that a ContextWindowExceededError during a streaming LLM request
# properly triggers the summarizer and compacts the in-context messages.
#
# This test simulates:
# 1. An LLM streaming request that fails with ContextWindowExceededError
# 2. The summarizer being invoked to reduce context size
# 3. Verification that messages are compacted and summary message exists
#
# Note: This test only runs with OpenAI since it uses OpenAI-specific error handling.
# """
# import uuid
# from unittest.mock import patch
#
# import openai
#
# from letta.schemas.message import MessageCreate
# from letta.schemas.run import Run
# from letta.services.run_manager import RunManager
#
# # Use OpenAI config for this test (since we're using OpenAI-specific error handling)
# llm_config = get_llm_config("openai-gpt-4o-mini.json")
#
# # Create test messages - enough to have something to summarize
# messages = []
# for i in range(15):
# messages.append(
# PydanticMessage(
# role=MessageRole.user,
# content=[TextContent(type="text", text=f"User message {i}: This is test message number {i}.")],
# )
# )
# messages.append(
# PydanticMessage(
# role=MessageRole.assistant,
# content=[TextContent(type="text", text=f"Assistant response {i}: I acknowledge message {i}.")],
# )
# )
#
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
# original_message_count = len(agent_state.message_ids)
#
# # Create an input message to trigger the agent
# input_message = MessageCreate(
# role=MessageRole.user,
# content=[TextContent(type="text", text="Hello, please respond.")],
# )
#
# # Create a proper run record in the database
# run_manager = RunManager()
# test_run_id = f"run-{uuid.uuid4()}"
# test_run = Run(
# id=test_run_id,
# agent_id=agent_state.id,
# )
# await run_manager.create_run(test_run, actor)
#
# # Create the agent loop using LettaAgentV3
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
#
# # Track how many times stream_async is called
# call_count = 0
#
# # Store original stream_async method
# original_stream_async = agent_loop.llm_client.stream_async
#
# async def mock_stream_async_with_error(request_data, llm_config):
# nonlocal call_count
# call_count += 1
# if call_count == 1:
# # First call raises OpenAI BadRequestError with context_length_exceeded error code
# # This will be properly converted to ContextWindowExceededError by handle_llm_error
# from unittest.mock import MagicMock
#
# import httpx
#
# # Create a mock response with the required structure
# mock_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
# mock_response = httpx.Response(
# status_code=400,
# request=mock_request,
# json={
# "error": {
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
# "type": "invalid_request_error",
# "code": "context_length_exceeded",
# }
# },
# )
#
# raise openai.BadRequestError(
# message="This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
# response=mock_response,
# body={
# "error": {
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
# "type": "invalid_request_error",
# "code": "context_length_exceeded",
# }
# },
# )
# # Subsequent calls use the real implementation
# return await original_stream_async(request_data, llm_config)
#
# # Patch the llm_client's stream_async to raise ContextWindowExceededError on first call
# with patch.object(agent_loop.llm_client, "stream_async", side_effect=mock_stream_async_with_error):
# # Execute a streaming step
# try:
# result_chunks = []
# async for chunk in agent_loop.stream(
# input_messages=[input_message],
# max_steps=1,
# stream_tokens=True,
# run_id=test_run_id,
# ):
# result_chunks.append(chunk)
# except Exception as e:
# # Some errors might happen due to real LLM calls after retry
# print(f"Exception during stream: {e}")
#
# # Reload agent state to get updated message_ids after summarization
# updated_agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
# updated_message_count = len(updated_agent_state.message_ids)
#
# # Fetch the updated in-context messages
# updated_in_context_messages = await server.message_manager.get_messages_by_ids_async(
# message_ids=updated_agent_state.message_ids, actor=actor
# )
#
# # Convert to LettaMessage format for easier content inspection
# letta_messages = PydanticMessage.to_letta_messages_from_list(updated_in_context_messages)
#
# # Verify a summary message exists with the correct format
# # The summary message has content with type="system_alert" and message containing:
# # "prior messages ... have been hidden" and "summary of the previous"
# import json
#
# summary_message_found = False
# summary_message_text = None
# for msg in letta_messages:
# # Not all message types have a content attribute (e.g., ReasoningMessage)
# if not hasattr(msg, "content"):
# continue
#
# content = msg.content
# # Content can be a string (JSON) or an object with type/message fields
# if isinstance(content, str):
# # Try to parse as JSON
# try:
# parsed = json.loads(content)
# if isinstance(parsed, dict) and parsed.get("type") == "system_alert":
# text_to_check = parsed.get("message", "").lower()
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
# summary_message_found = True
# summary_message_text = parsed.get("message")
# break
# except (json.JSONDecodeError, TypeError):
# pass
# # Check if content has system_alert type with the summary message (object form)
# elif hasattr(content, "type") and content.type == "system_alert":
# if hasattr(content, "message") and content.message:
# text_to_check = content.message.lower()
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
# summary_message_found = True
# summary_message_text = content.message
# break
#
# assert summary_message_found, (
# "A summary message should exist in the in-context messages after summarization. "
# "Expected format containing 'prior messages...hidden' and 'summary of the previous'"
# )
#
# # Verify we attempted multiple invocations (the failing one + retry after summarization)
# assert call_count >= 2, f"Expected at least 2 LLM invocations (initial + retry), got {call_count}"
#
# # The original messages should have been compacted - the updated count should be less than
# # original + the new messages added (input + assistant response + tool results)
# # Since summarization should have removed most of the original 30 messages
# print("Test passed: Summary message found in context")
# print(f"Original message count: {original_message_count}, Updated: {updated_message_count}")
# print(f"Summary message: {summary_message_text[:200] if summary_message_text else 'N/A'}...")
# print(f"Total LLM invocations: {call_count}")
#
#
# @pytest.mark.asyncio
# async def test_context_window_overflow_triggers_summarization_in_blocking(server: SyncServer, actor):
# """
# Test that a ContextWindowExceededError during a blocking (non-streaming) LLM request
# properly triggers the summarizer and compacts the in-context messages.
#
# This test is similar to the streaming test but uses the blocking step() method.
#
# Note: This test only runs with OpenAI since it uses OpenAI-specific error handling.
# """
# import uuid
# from unittest.mock import patch
#
# import openai
#
# from letta.schemas.message import MessageCreate
# from letta.schemas.run import Run
# from letta.services.run_manager import RunManager
#
# # Use OpenAI config for this test (since we're using OpenAI-specific error handling)
# llm_config = get_llm_config("openai-gpt-4o-mini.json")
#
# # Create test messages
# messages = []
# for i in range(15):
# messages.append(
# PydanticMessage(
# role=MessageRole.user,
# content=[TextContent(type="text", text=f"User message {i}: This is test message number {i}.")],
# )
# )
# messages.append(
# PydanticMessage(
# role=MessageRole.assistant,
# content=[TextContent(type="text", text=f"Assistant response {i}: I acknowledge message {i}.")],
# )
# )
#
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
# original_message_count = len(agent_state.message_ids)
#
# # Create an input message to trigger the agent
# input_message = MessageCreate(
# role=MessageRole.user,
# content=[TextContent(type="text", text="Hello, please respond.")],
# )
#
# # Create a proper run record in the database
# run_manager = RunManager()
# test_run_id = f"run-{uuid.uuid4()}"
# test_run = Run(
# id=test_run_id,
# agent_id=agent_state.id,
# )
# await run_manager.create_run(test_run, actor)
#
# # Create the agent loop using LettaAgentV3
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
#
# # Track how many times request_async is called
# call_count = 0
#
# # Store original request_async method
# original_request_async = agent_loop.llm_client.request_async
#
# async def mock_request_async_with_error(request_data, llm_config):
# nonlocal call_count
# call_count += 1
# if call_count == 1:
# # First call raises OpenAI BadRequestError with context_length_exceeded error code
# # This will be properly converted to ContextWindowExceededError by handle_llm_error
# import httpx
#
# # Create a mock response with the required structure
# mock_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
# mock_response = httpx.Response(
# status_code=400,
# request=mock_request,
# json={
# "error": {
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
# "type": "invalid_request_error",
# "code": "context_length_exceeded",
# }
# },
# )
#
# raise openai.BadRequestError(
# message="This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
# response=mock_response,
# body={
# "error": {
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
# "type": "invalid_request_error",
# "code": "context_length_exceeded",
# }
# },
# )
# # Subsequent calls use the real implementation
# return await original_request_async(request_data, llm_config)
#
# # Patch the llm_client's request_async to raise ContextWindowExceededError on first call
# with patch.object(agent_loop.llm_client, "request_async", side_effect=mock_request_async_with_error):
# # Execute a blocking step
# try:
# result = await agent_loop.step(
# input_messages=[input_message],
# max_steps=1,
# run_id=test_run_id,
# )
# except Exception as e:
# # Some errors might happen due to real LLM calls after retry
# print(f"Exception during step: {e}")
#
# # Reload agent state to get updated message_ids after summarization
# updated_agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
# updated_message_count = len(updated_agent_state.message_ids)
#
# # Fetch the updated in-context messages
# updated_in_context_messages = await server.message_manager.get_messages_by_ids_async(
# message_ids=updated_agent_state.message_ids, actor=actor
# )
#
# # Convert to LettaMessage format for easier content inspection
# letta_messages = PydanticMessage.to_letta_messages_from_list(updated_in_context_messages)
#
# # Verify a summary message exists with the correct format
# # The summary message has content with type="system_alert" and message containing:
# # "prior messages ... have been hidden" and "summary of the previous"
# import json
#
# summary_message_found = False
# summary_message_text = None
# for msg in letta_messages:
# # Not all message types have a content attribute (e.g., ReasoningMessage)
# if not hasattr(msg, "content"):
# continue
#
# content = msg.content
# # Content can be a string (JSON) or an object with type/message fields
# if isinstance(content, str):
# # Try to parse as JSON
# try:
# parsed = json.loads(content)
# if isinstance(parsed, dict) and parsed.get("type") == "system_alert":
# text_to_check = parsed.get("message", "").lower()
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
# summary_message_found = True
# summary_message_text = parsed.get("message")
# break
# except (json.JSONDecodeError, TypeError):
# pass
# # Check if content has system_alert type with the summary message (object form)
# elif hasattr(content, "type") and content.type == "system_alert":
# if hasattr(content, "message") and content.message:
# text_to_check = content.message.lower()
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
# summary_message_found = True
# summary_message_text = content.message
# break
#
# assert summary_message_found, (
# "A summary message should exist in the in-context messages after summarization. "
# "Expected format containing 'prior messages...hidden' and 'summary of the previous'"
# )
#
# # Verify we attempted multiple invocations (the failing one + retry after summarization)
# assert call_count >= 2, f"Expected at least 2 LLM invocations (initial + retry), got {call_count}"
#
# # The original messages should have been compacted - the updated count should be less than
# # original + the new messages added (input + assistant response + tool results)
# print("Test passed: Summary message found in context (blocking mode)")
# print(f"Original message count: {original_message_count}, Updated: {updated_message_count}")
# print(f"Summary message: {summary_message_text[:200] if summary_message_text else 'N/A'}...")
# print(f"Total LLM invocations: {call_count}")
#
#
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_all(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Exercise the 'all' summarization mode against a real LLM.

    Verifies that summarize_all collapses an entire conversation into a single
    bounded summary string plus a one-message context.
    """
    from letta.services.summarizer.summarizer_all import summarize_all
    from letta.services.summarizer.summarizer_config import CompactionSettings

    # Summarizer config pinned to "all" mode for this test.
    model_handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    all_mode_config = CompactionSettings(model=model_handle)
    all_mode_config.mode = "all"

    # Simple conversation: one system message plus 10 user/assistant pairs.
    conversation = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        conversation.extend(
            [
                PydanticMessage(
                    role=MessageRole.user,
                    content=[TextContent(type="text", text=f"User message {i}: What is {i} + {i}?")],
                ),
                PydanticMessage(
                    role=MessageRole.assistant,
                    content=[TextContent(type="text", text=f"Assistant response {i}: {i} + {i} = {i * 2}.")],
                ),
            ]
        )
    assert len(conversation) == 21, f"Expected 21 messages, got {len(conversation)}"

    # Run the real summarizer.
    summary, new_in_context_messages = await summarize_all(
        actor=actor,
        llm_config=llm_config,
        summarizer_config=all_mode_config,
        in_context_messages=conversation,
    )

    # Whole conversation collapses to a single message, and the summary is a
    # non-empty string within the clip bound.
    assert len(new_in_context_messages) == 1
    assert summary is not None
    assert len(summary) > 0
    assert len(summary) <= 2000
    print(f"Successfully summarized {len(conversation)} messages using 'all' mode")
    print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
    print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
# =============================================================================
# CompactionStats tests
# =============================================================================
def test_compaction_stats_embedding_in_packed_json():
    """Verify package_summarize_message_no_counts embeds compaction_stats in the packed JSON."""
    from letta.system import package_summarize_message_no_counts

    stats = {
        "trigger": "post_step_context_check",
        "context_tokens_before": 50000,
        "context_tokens_after": 15000,
        "context_window": 128000,
        "messages_count_before": 45,
        "messages_count_after": 12,
    }
    payload = json.loads(
        package_summarize_message_no_counts(
            summary="Test summary content",
            timezone="UTC",
            compaction_stats=stats,
        )
    )

    # Envelope structure of the packed alert.
    assert "type" in payload
    assert payload["type"] == "system_alert"
    assert "message" in payload
    assert "Test summary content" in payload["message"]
    assert "compaction_stats" in payload

    # Every stat field survives the round trip unchanged.
    embedded = payload["compaction_stats"]
    for field_name, expected in stats.items():
        assert embedded[field_name] == expected
def test_compaction_stats_embedding_without_stats():
    """The packed JSON must omit the compaction_stats key entirely when stats are None."""
    from letta.system import package_summarize_message_no_counts

    payload = json.loads(
        package_summarize_message_no_counts(
            summary="Test summary content",
            timezone="UTC",
            compaction_stats=None,
        )
    )

    assert "type" in payload
    assert "message" in payload
    assert "compaction_stats" not in payload
def test_extract_compaction_stats_from_packed_json():
    """Round-trip: a CompactionStats dict embedded in packed JSON is extracted intact."""
    from letta.schemas.letta_message import CompactionStats, extract_compaction_stats_from_packed_json

    raw_stats = {
        "trigger": "context_window_exceeded",
        "context_tokens_before": 100000,
        "context_tokens_after": 30000,
        "context_window": 128000,
        "messages_count_before": 50,
        "messages_count_after": 15,
    }
    packed_json = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "time": "2024-01-15T10:00:00",
            "compaction_stats": raw_stats,
        }
    )

    stats = extract_compaction_stats_from_packed_json(packed_json)

    assert stats is not None
    assert isinstance(stats, CompactionStats)
    # Each attribute on the parsed model matches the embedded dict.
    for field_name, expected in raw_stats.items():
        assert getattr(stats, field_name) == expected
def test_extract_compaction_stats_from_packed_json_without_stats():
    """Old-format payloads with no compaction_stats key extract as None (backward compatibility)."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    legacy_payload = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "time": "2024-01-15T10:00:00",
        }
    )

    assert extract_compaction_stats_from_packed_json(legacy_payload) is None
def test_extract_compaction_stats_from_packed_json_invalid_json():
    """Malformed input must yield None rather than raising."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    for bad_input in ("not valid json", ""):
        assert extract_compaction_stats_from_packed_json(bad_input) is None
def test_extract_compaction_stats_from_packed_json_invalid_stats():
    """A stats dict missing required fields fails validation and extracts as None."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    # "trigger" alone — context_window, messages_count_before, and
    # messages_count_after are absent, so model validation should fail.
    packed_json = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "compaction_stats": {"trigger": "test"},
        }
    )

    assert extract_compaction_stats_from_packed_json(packed_json) is None
def test_extract_compaction_stats_from_message():
    """CompactionStats can be pulled out of a summary Message's packed text content."""
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message
    from letta.schemas.letta_message import CompactionStats

    packed_content = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "time": "2024-01-15T10:00:00",
            "compaction_stats": {
                "trigger": "post_step_context_check",
                "context_tokens_before": 50000,
                "context_tokens_after": 15000,
                "context_window": 128000,
                "messages_count_before": 45,
                "messages_count_after": 12,
            },
        }
    )
    summary_message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=packed_content)],
    )

    stats = extract_compaction_stats_from_message(summary_message)

    assert stats is not None
    assert isinstance(stats, CompactionStats)
    assert stats.trigger == "post_step_context_check"
    assert stats.context_tokens_before == 50000
    assert stats.messages_count_after == 12
def test_extract_compaction_stats_from_message_without_stats():
    """Test that Message extraction returns None when no stats are present."""
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message

    # Old-format packed content: no "compaction_stats" key.
    legacy_packed = json.dumps(
        {
            "type": "system_alert",
            "message": "Old format summary",
            "time": "2024-01-15T10:00:00",
        }
    )
    legacy_message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=legacy_packed)],
    )
    assert extract_compaction_stats_from_message(legacy_message) is None
def test_message_to_summary_message_with_stats():
    """Test that Message._convert_summary_message extracts compaction_stats."""
    from letta.schemas.letta_message import CompactionStats

    # Summary message whose packed JSON carries a complete stats object.
    stats_payload = {
        "trigger": "context_window_exceeded",
        "context_tokens_before": 80000,
        "context_tokens_after": 25000,
        "context_window": 128000,
        "messages_count_before": 60,
        "messages_count_after": 20,
    }
    packed = json.dumps(
        {
            "type": "system_alert",
            "message": "Summary of conversation",
            "time": "2024-01-15T10:00:00",
            "compaction_stats": stats_payload,
        }
    )
    source_message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=packed)],
    )

    # Convert to SummaryMessage (as_user_message=False)
    converted = source_message._convert_summary_message(as_user_message=False)
    assert converted.message_type == "summary_message"
    assert converted.compaction_stats is not None
    assert isinstance(converted.compaction_stats, CompactionStats)
    assert converted.compaction_stats.trigger == "context_window_exceeded"
    assert converted.compaction_stats.context_tokens_before == 80000
def test_message_to_summary_message_backward_compatible():
    """Test that old messages without compaction_stats still convert correctly."""
    # Pre-stats packed format: only type/message/time keys.
    legacy_packed = json.dumps(
        {
            "type": "system_alert",
            "message": "Old format summary without stats",
            "time": "2024-01-15T10:00:00",
        }
    )
    legacy_message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=legacy_packed)],
    )

    converted = legacy_message._convert_summary_message(as_user_message=False)
    assert converted.message_type == "summary_message"
    assert converted.compaction_stats is None  # Should be None for old messages
    assert "Old format summary" in converted.summary
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_with_stats_params_embeds_stats(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Integration test: compact() with trigger/context_tokens_before/messages_count_before
    embeds compaction_stats in the packed message content.

    Parametrized over TESTED_LLM_CONFIGS; each run builds a fresh agent with a
    seeded conversation, calls LettaAgentV3.compact() with explicit "before"
    stats, and verifies the stats round-trip through the packed summary message.
    """
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message
    # Create a conversation with enough messages to summarize
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    # Seed 10 user/assistant exchange pairs (21 messages total with the system prompt).
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Response {i}")],
            )
        )
    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    # Derive the model handle when the config doesn't provide one explicitly.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    # mode="all" so compact() summarizes every in-context message.
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    # Call compact with stats params
    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(
        messages=in_context_messages,
        use_summary_role=True,
        trigger="post_step_context_check",
        context_tokens_before=50000,
        messages_count_before=len(in_context_messages),
    )
    # Extract stats from the message
    stats = extract_compaction_stats_from_message(summary_message_obj)
    assert stats is not None, "CompactionStats should be embedded in the message"
    # "before" values should echo exactly what we passed in.
    assert stats.trigger == "post_step_context_check"
    assert stats.context_tokens_before == 50000
    assert stats.messages_count_before == len(in_context_messages)
    assert stats.context_tokens_after is not None  # Should be set by compact()
    assert stats.messages_count_after == len(compacted_messages)  # final_messages already includes summary
    assert stats.context_window == llm_config.context_window