# Upstream change notes (converted to comments so the module stays importable):
# * fix: handle Anthropic overloaded_error in streaming interfaces
# * fix: handle Unicode surrogates in OpenAI requests
#   Sanitize Unicode surrogate pairs before sending requests to OpenAI API.
#   Surrogate pairs (U+D800-U+DFFF) are UTF-16 encoding artifacts that cause
#   UnicodeEncodeError when encoding to UTF-8.
#   Fixes Datadog error: 'utf-8' codec can't encode character '\ud83c' in
#   position 326605: surrogates not allowed
# * fix: handle UnicodeEncodeError from lone Unicode surrogates in OpenAI requests
#   Improved sanitize_unicode_surrogates() to explicitly filter out lone surrogate
#   characters (U+D800 to U+DFFF), which are invalid in UTF-8. The previous
#   implementation used errors='ignore', which could still fail in edge cases.
#   The new approach directly checks Unicode code points and removes any surrogates
#   before data reaches httpx encoding. Also added sanitization to the
#   stream_async_responses() method, which was missing it.
#   Fixes: 'utf-8' codec can't encode character '\ud83c' in position X:
#   surrogates not allowed
"""
|
|
Integration tests for conversation history summarization.
|
|
|
|
These tests verify the complete summarization flow:
|
|
1. Creating a LettaAgentV2 instance
|
|
2. Fetching messages via message_manager.get_messages_by_ids_async
|
|
3. Calling agent_loop.summarize_conversation_history with force=True
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from typing import List, Literal
|
|
|
|
import pytest
|
|
|
|
from letta.agents.letta_agent_v2 import LettaAgentV2
|
|
from letta.agents.letta_agent_v3 import LettaAgentV3
|
|
from letta.config import LettaConfig
|
|
from letta.schemas.agent import CreateAgent, UpdateAgent
|
|
from letta.schemas.block import BlockUpdate, CreateBlock
|
|
from letta.schemas.embedding_config import EmbeddingConfig
|
|
from letta.schemas.enums import MessageRole
|
|
from letta.schemas.letta_message import EventMessage, LettaMessage, SummaryMessage
|
|
from letta.schemas.letta_message_content import TextContent, ToolCallContent, ToolReturnContent
|
|
from letta.schemas.llm_config import LLMConfig
|
|
from letta.schemas.message import Message as PydanticMessage, MessageCreate
|
|
from letta.schemas.run import Run as PydanticRun
|
|
from letta.server.server import SyncServer
|
|
from letta.services.run_manager import RunManager
|
|
from letta.services.summarizer.summarizer import simple_summary
|
|
from letta.settings import model_settings
|
|
|
|
# Constants
|
|
DEFAULT_EMBEDDING_CONFIG = EmbeddingConfig.default_config(provider="openai")
|
|
|
|
|
|
def get_llm_config(filename: str, llm_config_dir: str = "tests/configs/llm_model_configs") -> LLMConfig:
    """Build an LLMConfig from a JSON file located under *llm_config_dir*."""
    config_path = os.path.join(llm_config_dir, filename)
    with open(config_path, "r") as config_file:
        raw_config = json.load(config_file)
    return LLMConfig(**raw_config)
|
|
|
|
|
|
# Test configurations - using a subset of models for summarization tests
all_configs = [
    "openai-gpt-5-mini.json",
    # "claude-4-5-haiku.json",
    # "gemini-2.5-flash.json",
    # "gemini-2.5-flash-vertex.json",  # Requires Vertex AI credentials
    # "openai-gpt-4.1.json",
    # "openai-o1.json",
    # "openai-o3.json",
    # "openai-o4-mini.json",
    # "claude-4-sonnet.json",
    # "claude-3-7-sonnet.json",
    # "gemini-2.5-pro-vertex.json",
]

# Allow a single-config run by pointing LLM_CONFIG_FILE at one JSON filename;
# otherwise every entry in all_configs is loaded at import time.
requested = os.getenv("LLM_CONFIG_FILE")
filenames = [requested] if requested else all_configs
TESTED_LLM_CONFIGS: List[LLMConfig] = [get_llm_config(fn) for fn in filenames]
# Filter out deprecated Gemini 1.5 models
TESTED_LLM_CONFIGS = [
    cfg
    for cfg in TESTED_LLM_CONFIGS
    if not (cfg.model_endpoint_type in ["google_vertex", "google_ai"] and cfg.model.startswith("gemini-1.5"))
]
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Fixtures
|
|
# ======================================================================================================================
|
|
|
|
|
|
@pytest.fixture
async def server():
    """Yield a SyncServer initialized with the default org/user and base tools installed."""
    letta_config = LettaConfig.load()
    letta_config.save()

    sync_server = SyncServer(init_with_default_org_and_user=True)
    await sync_server.init_async()
    await sync_server.tool_manager.upsert_base_tools_async(actor=sync_server.default_user)

    yield sync_server
|
|
|
|
|
|
@pytest.fixture
async def default_organization(server: SyncServer):
    """Yield the server's default organization."""
    organization = await server.organization_manager.create_default_organization_async()
    yield organization
|
|
|
|
|
|
@pytest.fixture
async def default_user(server: SyncServer, default_organization):
    """Yield the default user belonging to the default organization."""
    created_user = await server.user_manager.create_default_actor_async(org_id=default_organization.id)
    yield created_user
|
|
|
|
|
|
@pytest.fixture
async def actor(default_user):
    """Expose the default user under the name the tests use for authorization."""
    return default_user
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Helper Functions
|
|
# ======================================================================================================================
|
|
|
|
|
|
def create_large_tool_return(size_chars: int = 50000) -> str:
    """Build a JSON tool-return payload whose items' serialized size is at least *size_chars* chars."""
    # One realistic-looking record; it is shallow-copied repeatedly to pad the payload.
    base_item = {
        "id": 12345,
        "name": "Sample Item",
        "description": "This is a sample item description that will be repeated many times to create a large payload",
        "metadata": {"created_at": "2025-01-01T00:00:00Z", "updated_at": "2025-01-01T00:00:00Z", "version": "1.0.0"},
        "tags": ["tag1", "tag2", "tag3", "tag4", "tag5"],
        "nested_data": {"level1": {"level2": {"level3": {"value": "deeply nested value"}}}},
    }

    per_item_chars = len(json.dumps(base_item))
    # Smallest copy count whose combined serialized size reaches the threshold
    # (ceil division; zero copies when size_chars <= 0).
    copies_needed = max(0, -(-size_chars // per_item_chars))
    items = [base_item.copy() for _ in range(copies_needed)]

    return json.dumps({"status": "success", "total_items": len(items), "items": items})
|
|
|
|
|
|
async def create_agent_with_messages(server: SyncServer, actor, llm_config: LLMConfig, messages: List[PydanticMessage]) -> tuple:
    """
    Create an agent seeded with *messages* and return (agent_state, in_context_messages).

    Mirrors the production code path: messages are persisted via the message manager,
    the agent's message_ids are updated, and the in-context messages are re-fetched by id.
    """
    # Model names may contain "." or "/", which are invalid in agent names.
    safe_name = f"test_agent_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_state = await server.agent_manager.create_agent_async(
        CreateAgent(
            name=safe_name,
            llm_config=llm_config,
            embedding_config=DEFAULT_EMBEDDING_CONFIG,
        ),
        actor=actor,
    )

    def _bind_to_agent(message: PydanticMessage) -> PydanticMessage:
        # Stamp each message with the new agent's id before persisting.
        payload = message.model_dump() if hasattr(message, "model_dump") else message.dict()
        payload["agent_id"] = agent_state.id
        return PydanticMessage(**payload)

    persisted = await server.message_manager.create_many_messages_async(
        [_bind_to_agent(message) for message in messages], actor=actor
    )

    # Point the agent's in-context buffer at the persisted messages.
    await server.agent_manager.update_message_ids_async(
        agent_id=agent_state.id, message_ids=[m.id for m in persisted], actor=actor
    )

    # Reload so agent_state.message_ids reflects the update.
    agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)

    # Fetch messages using the message manager (as in the actual code path).
    in_context_messages = await server.message_manager.get_messages_by_ids_async(
        message_ids=agent_state.message_ids, actor=actor
    )

    return agent_state, in_context_messages
|
|
|
|
|
|
async def run_summarization(server: SyncServer, agent_state, in_context_messages, actor, force=True):
    """
    Drive the summarization code path under test.

    Builds a LettaAgentV3 for the agent and compacts the supplied in-context
    messages, returning the (summary_message, messages, summary) triple that
    ``compact`` produces.

    NOTE(review): ``force`` is accepted for call-site compatibility but is not
    forwarded — ``compact`` is invoked without it. Confirm whether force-mode
    plumbing is still needed.
    """
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    return await agent_loop.compact(messages=in_context_messages)
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Test Cases
|
|
# ======================================================================================================================
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_empty_message_buffer(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization when there are no messages in the buffer.
    Should handle gracefully - either return empty list or raise a clear error.
    """
    # Create agent with no messages (replace dots and slashes with underscores for valid names)
    agent_name = f"test_agent_empty_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_create = CreateAgent(
        name=agent_name,
        llm_config=llm_config,
        embedding_config=DEFAULT_EMBEDDING_CONFIG,
    )
    agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)

    # Get messages (should be empty or only contain system messages)
    # NOTE(review): a freshly created agent may still carry default init messages — confirm.
    in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent_state.message_ids, actor=actor)

    # Run summarization - this may fail with empty buffer, which is acceptable behavior
    try:
        # `summary` (the summary message object) is unused here; only the compacted list is checked.
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)
        # If it succeeds, verify result
        assert isinstance(result, list)

        # When summarization runs, V3 ensures that in-context messages follow
        # the pattern:
        # 1. System prompt
        # 2. User summary message (system_alert JSON)
        # 3. Remaining messages (which may be empty for this test)

        # We should always keep the original system message at the front.
        assert len(result) >= 1
        assert result[0].role == MessageRole.system

        # If summarization did in fact add a summary message, we expect it to
        # be the second message with user role.
        if len(result) >= 2:
            assert result[1].role == MessageRole.user
    except ValueError as e:
        # It's acceptable for summarization to fail on empty buffer
        assert "No assistant message found" in str(e) or "empty" in str(e).lower()
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.skipif(
    not model_settings.anthropic_api_key,
    reason="Missing LETTA_ANTHROPIC_API_KEY (or equivalent settings) for Anthropic integration test",
)
async def test_simple_summary_anthropic_uses_streaming_and_returns_summary(actor, monkeypatch):
    """Regression test: Anthropic summarization must use streaming and return real text."""

    # If the summarizer ever falls back to a non-streaming Anthropic call, make it fail fast.
    from letta.llm_api.anthropic_client import AnthropicClient

    async def _nope_request_async(self, *args, **kwargs):
        # Patched in below: any non-streaming request immediately fails the test.
        raise AssertionError("Anthropic summarizer should not call request_async (must use streaming)")

    monkeypatch.setattr(AnthropicClient, "request_async", _nope_request_async)

    # Keep the prompt tiny so this is fast and cheap.
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="I'm planning a trip to Paris in April.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="Great—your priorities are museums and cafes, and you want to stay under $200/day.",
                )
            ],
        ),
    ]

    anthropic_config = get_llm_config("claude-4-5-haiku.json")

    summary = await simple_summary(messages=messages, llm_config=anthropic_config, actor=actor)

    assert isinstance(summary, str)
    assert len(summary) > 10
    # Sanity-check that the model is summarizing the right conversation.
    # Loose keyword matching: any one of these tokens is enough, since LLM output varies.
    assert any(token in summary.lower() for token in ["paris", "april", "museum", "cafe", "$200", "200"])
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_initialization_messages_only(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization when only initialization/system messages are in the buffer.
    Should handle gracefully and likely not summarize.
    """
    # Create messages - only system initialization messages
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant. Your name is Letta.")],
        ),
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="The current date and time is 2025-01-01 12:00:00 UTC.")],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Run summarization - force=True with system messages only may fail
    try:
        # `summary` is intentionally unused; only the compacted message list is asserted on.
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor, force=True)

        # Verify result
        assert isinstance(result, list)
        # System messages should typically be preserved
        assert len(result) >= 1
    except ValueError as e:
        # It's acceptable for summarization to fail on system-only messages
        assert "No assistant message found" in str(e)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_small_conversation(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with approximately 5 messages in the buffer.
    This represents a typical small conversation.
    """
    # Create a small conversation with ~5 messages (alternating user/assistant turns)
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Hello! Can you help me with a Python question?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Of course! I'd be happy to help you with Python. What would you like to know?")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="How do I read a file in Python?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="You can read a file in Python using the open() function. Here's an example:\n\n```python\nwith open('file.txt', 'r') as f:\n content = f.read()\n print(content)\n```",
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Thank you! That's very helpful.")],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Run summarization with force=True
    # Note: force=True with clear=True can be very aggressive and may fail on small message sets
    try:
        # `summary` is intentionally unused; the assertions target the compacted list.
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor, force=True)

        # Verify result
        assert isinstance(result, list)
        # With force=True, some summarization should occur
        # The result might be shorter than the original if summarization happened
        assert len(result) >= 1

        # Verify that the result contains valid messages
        for msg in result:
            assert hasattr(msg, "role")
            assert hasattr(msg, "content")
    except ValueError as e:
        # With force=True + clear=True, aggressive summarization might fail on small message sets
        # This is acceptable behavior
        assert "No assistant message found" in str(e)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_large_tool_calls(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with large tool calls and returns (~50k character tool returns).
    This tests the system's ability to handle and summarize very large context windows.
    """
    # Create a large tool return
    large_return = create_large_tool_return(50000)

    # Create messages with large tool calls and returns
    # (user request -> assistant tool call -> oversized tool return -> follow-up turns)
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Please fetch all the data from the database.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="I'll fetch the data for you."),
                ToolCallContent(
                    type="tool_call",
                    id="call_1",
                    name="fetch_database_records",
                    input={"query": "SELECT * FROM records"},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_1",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_1",
                    content=large_return,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="I've successfully fetched all the records from the database. There are thousands of items in the result set.",
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Great! Can you summarize what you found?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="Based on the data I retrieved, there are numerous records containing various items with descriptions, metadata, and nested data structures. Each record includes timestamps and version information.",
                )
            ],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Verify that we actually have large messages
    total_content_size = sum(len(str(content)) for msg in in_context_messages for content in msg.content)
    assert total_content_size > 40000, f"Expected large messages, got {total_content_size} chars"

    # Run summarization
    summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)

    # Verify result
    assert isinstance(result, list)
    assert len(result) >= 1

    # Verify that summarization reduced the context size
    result_content_size = sum(len(str(content)) for msg in result for content in msg.content)

    # The summarized result should be smaller than the original
    # (unless summarization was skipped for some reason)
    # NOTE(review): size reduction is only printed, not asserted — intentional leniency?
    print(f"Original size: {total_content_size} chars, Summarized size: {result_content_size} chars")

    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_multiple_large_tool_calls(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with multiple large tool calls in sequence.
    This stress-tests the summarization with multiple large context items.
    """
    # Create multiple large tool returns (two ~25k payloads instead of one 50k payload)
    large_return_1 = create_large_tool_return(25000)
    large_return_2 = create_large_tool_return(25000)

    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Fetch user data.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="Fetching users..."),
                ToolCallContent(
                    type="tool_call",
                    id="call_1",
                    name="fetch_users",
                    input={"limit": 10000},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_1",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_1",
                    content=large_return_1,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Retrieved user data. Now fetching product data.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="Fetching products..."),
                ToolCallContent(
                    type="tool_call",
                    id="call_2",
                    name="fetch_products",
                    input={"category": "all"},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_2",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_2",
                    content=large_return_2,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="I've successfully fetched both user and product data.")],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Verify that we have large messages
    total_content_size = sum(len(str(content)) for msg in in_context_messages for content in msg.content)
    assert total_content_size > 40000, f"Expected large messages, got {total_content_size} chars"

    # Run summarization
    # `summary` is intentionally unused; assertions target the compacted list.
    summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)

    # Verify result
    assert isinstance(result, list)
    assert len(result) >= 1

    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")

    print(f"Summarized {len(in_context_messages)} messages with {total_content_size} chars to {len(result)} messages")
|
|
|
|
|
|
# @pytest.mark.asyncio
|
|
# @pytest.mark.parametrize(
|
|
# "llm_config",
|
|
# TESTED_LLM_CONFIGS,
|
|
# ids=[c.model for c in TESTED_LLM_CONFIGS],
|
|
# )
|
|
# async def test_summarize_truncates_large_tool_return(server: SyncServer, actor, llm_config: LLMConfig):
|
|
# """
|
|
# Test that summarization properly truncates very large tool returns.
|
|
# This ensures that oversized tool returns don't consume excessive context.
|
|
# """
|
|
# # Create an extremely large tool return (100k chars)
|
|
# large_return = create_large_tool_return(100000)
|
|
# original_size = len(large_return)
|
|
#
|
|
# # Create messages with a large tool return
|
|
# messages = [
|
|
# PydanticMessage(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text="Please run the database query.")],
|
|
# ),
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[
|
|
# TextContent(type="text", text="Running query..."),
|
|
# ToolCallContent(
|
|
# type="tool_call",
|
|
# id="call_1",
|
|
# name="run_query",
|
|
# input={"query": "SELECT * FROM large_table"},
|
|
# ),
|
|
# ],
|
|
# ),
|
|
# PydanticMessage(
|
|
# role=MessageRole.tool,
|
|
# tool_call_id="call_1",
|
|
# content=[
|
|
# ToolReturnContent(
|
|
# type="tool_return",
|
|
# tool_call_id="call_1",
|
|
# content=large_return,
|
|
# is_error=False,
|
|
# )
|
|
# ],
|
|
# ),
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[TextContent(type="text", text="Query completed successfully with many results.")],
|
|
# ),
|
|
# ]
|
|
#
|
|
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
#
|
|
# # Verify the original tool return is indeed large
|
|
# assert original_size > 90000, f"Expected tool return >90k chars, got {original_size}"
|
|
#
|
|
# # Run summarization
|
|
# summary, result = await run_summarization(server, agent_state, in_context_messages, actor)
|
|
#
|
|
# # Verify result
|
|
# assert isinstance(result, list)
|
|
# assert len(result) >= 1
|
|
#
|
|
# # Find tool return messages in the result and verify truncation occurred
|
|
# tool_returns_found = False
|
|
# for msg in result:
|
|
# if msg.role == MessageRole.tool:
|
|
# for content in msg.content:
|
|
# if isinstance(content, ToolReturnContent):
|
|
# tool_returns_found = True
|
|
# result_size = len(content.content)
|
|
# # Verify that the tool return has been truncated
|
|
# assert result_size < original_size, (
|
|
# f"Expected tool return to be truncated from {original_size} chars, but got {result_size} chars"
|
|
# )
|
|
# print(f"Tool return successfully truncated from {original_size} to {result_size} chars")
|
|
#
|
|
# # If we didn't find any tool returns in the result, that's also acceptable
|
|
# # (they may have been completely removed during aggressive summarization)
|
|
# if not tool_returns_found:
|
|
# print("Tool returns were completely removed during summarization")
|
|
#
|
|
|
|
# ======================================================================================================================
# CompactionSettings Mode Tests - Using LettaAgentV3
# ======================================================================================================================

# NOTE(review): `patch` does not appear to be used in this chunk — verify against
# the rest of the file before removing this mid-file import.
from unittest.mock import patch

from letta.services.summarizer.summarizer_config import CompactionSettings

# Test both summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window"]] = ["all", "sliding_window"]
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("mode", SUMMARIZER_CONFIG_MODES, ids=SUMMARIZER_CONFIG_MODES)
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMConfig, mode: Literal["all", "sliding_window"]):
    """
    Test summarization with different CompactionSettings modes using LettaAgentV3.

    This test verifies that both summarization modes work correctly:
    - "all": Summarizes the entire conversation history into a single summary
    - "sliding_window": Keeps recent messages and summarizes older ones
    """
    # Create a conversation with enough messages to trigger summarization
    # (one system prompt + 10 user/assistant pairs)
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Create new messages that would be added during this step
    new_letta_messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="This is a new user message during this step.")],
            agent_id=agent_state.id,
        )
    ]
    # Persist the new messages
    # NOTE(review): new_letta_messages is persisted but never referenced again below — confirm intent.
    new_letta_messages = await server.message_manager.create_many_messages_async(new_letta_messages, actor=actor)

    # Override compaction settings directly on the agent state
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode=mode)

    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    summary, result, summary_text = await agent_loop.compact(messages=in_context_messages)

    assert isinstance(result, list)

    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")

    # Verify the summary text (third return value) is a non-empty string.
    # This is used by the agent loop to construct a SummaryMessage for clients.
    assert isinstance(summary_text, str), f"Expected summary_text to be a string, got {type(summary_text)}"
    assert len(summary_text) > 0, "Expected non-empty summary text"

    print()
    print(f"RESULTS {mode} ======")
    for msg in result:
        print(f"MSG: {msg}")
    print(f"SUMMARY TEXT: {summary_text[:200]}...")

    print()

    if mode == "all":
        # For "all" mode, V3 keeps:
        # 1. System prompt
        # 2. A single user summary message (system_alert JSON)
        # and no remaining historical messages.
        assert len(result) == 2, f"Expected 2 messages for 'all' mode (system + summary), got {len(result)}"
        assert result[0].role == MessageRole.system
        assert result[1].role == MessageRole.user
    else:
        # For "sliding_window" mode, result should include:
        # 1. System prompt
        # 2. User summary message
        # 3+. Recent user/assistant messages inside the window.
        assert len(result) > 2, f"Expected >2 messages for 'sliding_window' mode, got {len(result)}"
        assert result[0].role == MessageRole.system
        assert result[1].role == MessageRole.user
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_returns_valid_summary_message_and_event_message(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that compact() return values can be used to construct valid SummaryMessage and EventMessage objects.

    This validates the contract that _step() relies on: compact() returns
    (summary_message_obj, compacted_messages, summary_text) where summary_text
    is used to build a SummaryMessage and the metadata is used for an EventMessage.
    """
    import uuid

    from letta.helpers.datetime_helpers import get_utc_time

    # Create a conversation with enough messages to summarize
    # (one system prompt + 10 user/assistant pairs)
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")

    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(messages=in_context_messages)

    # Verify we can construct a valid SummaryMessage from compact() return values
    summary_msg = SummaryMessage(
        id=summary_message_obj.id,
        date=summary_message_obj.created_at,
        summary=summary_text,
        otid=PydanticMessage.generate_otid_from_id(summary_message_obj.id, 0),
        step_id=None,
        run_id=None,
    )
    assert summary_msg.message_type == "summary_message"
    assert isinstance(summary_msg.summary, str)
    assert len(summary_msg.summary) > 0
    assert summary_msg.id == summary_message_obj.id

    # Verify we can construct a valid EventMessage for compaction
    # (event_data values mirror what _step() would record after a context check)
    event_msg = EventMessage(
        id=str(uuid.uuid4()),
        date=get_utc_time(),
        event_type="compaction",
        event_data={
            "trigger": "post_step_context_check",
            "context_token_estimate": 1000,
            "context_window": agent_state.llm_config.context_window,
        },
        run_id=None,
        step_id=None,
    )
    assert event_msg.message_type == "event_message"
    assert event_msg.event_type == "compaction"
    assert "trigger" in event_msg.event_data
    assert "context_window" in event_msg.event_data
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_with_use_summary_role_creates_summary_message_role(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that compact() with use_summary_role=True creates a message with role=MessageRole.summary.

    This validates that manual compaction endpoints (which pass use_summary_role=True)
    will store summary messages with the dedicated 'summary' role instead of the legacy 'user' role.
    """

    def _text_message(role: MessageRole, text: str) -> PydanticMessage:
        # Small helper to keep the conversation construction readable.
        return PydanticMessage(role=role, content=[TextContent(type="text", text=text)])

    # Build a conversation large enough to be worth compacting.
    messages = [_text_message(MessageRole.system, "You are a helpful assistant.")]
    for i in range(10):
        messages.extend(
            [
                _text_message(MessageRole.user, f"User message {i}: Test message {i}."),
                _text_message(MessageRole.assistant, f"Assistant response {i}: Acknowledged message {i}."),
            ]
        )

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Point compaction at the same model handle the agent uses, in "all" mode.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")

    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Call compact with use_summary_role=True (as the REST endpoints now do).
    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(
        messages=in_context_messages,
        use_summary_role=True,
    )

    # The stored summary message must carry the dedicated 'summary' role, not 'user'.
    assert summary_message_obj.role == MessageRole.summary, (
        f"Expected summary message to have role=summary when use_summary_role=True, got {summary_message_obj.role}"
    )

    # Post-compaction context is exactly: system prompt followed by the summary.
    assert len(compacted_messages) == 2, f"Expected 2 messages (system + summary), got {len(compacted_messages)}"
    assert compacted_messages[0].role == MessageRole.system
    assert compacted_messages[1].role == MessageRole.summary

    # The raw summary text should be a non-empty string.
    assert isinstance(summary_text, str)
    assert len(summary_text) > 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_v3_compact_uses_compaction_settings_model_and_model_settings(server: SyncServer, actor):
    """Integration test: LettaAgentV3.compact uses the LLMConfig implied by CompactionSettings.

    We set a different summarizer model handle + model_settings and verify that
    the LLMConfig passed into simple_summary reflects both the handle and
    the model_settings overrides.
    """

    from letta.agents.letta_agent_v3 import LettaAgentV3
    from letta.schemas.model import OpenAIModelSettings, OpenAIReasoning
    from letta.services.summarizer import summarizer_all

    base_llm_config = LLMConfig.default_config("gpt-4o-mini")

    # Minimal three-message conversation: system prompt plus one exchange.
    conversation = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Hello")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Hi there")],
        ),
    ]

    # Create agent + messages via helper to get a real AgentState.
    agent_state, in_context_messages = await create_agent_with_messages(
        server=server,
        actor=actor,
        llm_config=base_llm_config,
        messages=conversation,
    )

    # Configure a summarizer model different from the agent's own model, with
    # explicit model_settings overrides we can assert on afterwards.
    summarizer_handle = "openai/gpt-5-mini"
    agent_state.compaction_settings = CompactionSettings(
        model=summarizer_handle,
        model_settings=OpenAIModelSettings(
            max_output_tokens=4321,
            temperature=0.05,
            reasoning=OpenAIReasoning(reasoning_effort="high"),
            response_format=None,
        ),
        prompt="You are a summarizer.",
        prompt_acknowledgement=True,
        clip_chars=2000,
        mode="all",
        sliding_window_percentage=0.3,
    )

    captured: dict = {}

    async def fake_simple_summary(messages, llm_config, actor, include_ack=True, prompt=None, **kwargs):  # type: ignore[override]
        # Record the llm_config handed to the summarizer instead of calling a real LLM.
        captured["value"] = llm_config
        return "summary text"

    # Patch simple_summary so we don't hit the real LLM and can inspect llm_config.
    with patch.object(summarizer_all, "simple_summary", new=fake_simple_summary):
        agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
        summary_msg, compacted, _ = await agent_loop.compact(messages=in_context_messages)

    assert summary_msg is not None
    assert "value" in captured
    summarizer_llm_config = captured["value"]

    # The agent's own llm_config is untouched by compaction settings.
    assert agent_state.llm_config.model == "gpt-4o-mini"

    # The summarizer llm_config reflects both the handle and the model_settings overrides.
    assert summarizer_llm_config.handle == summarizer_handle
    assert summarizer_llm_config.model == "gpt-5-mini"
    assert summarizer_llm_config.max_tokens == 4321
    assert summarizer_llm_config.temperature == 0.05
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_v3_summarize_hard_eviction_when_still_over_threshold(
    server: SyncServer,
    actor,
    llm_config: LLMConfig,
    caplog,
):
    """Regression test: the V3 summarizer performs a hard eviction when
    summarization fails to bring the context size below the proactive
    summarization threshold.

    Reproduces the edge case that previously caused summarization loops:

    1. A large pre-summarization token count triggers summarization.
    2. Even after summarization, the (mocked) post-summarization token count
       is still above the trigger threshold.
    3. LettaAgentV3 must then:
       - Log an error about summarization failing to reduce context size.
       - Evict all prior messages, keeping only the system message plus a
         single synthetic user summary message (system_alert).
       - Update `context_token_estimate` to the token count of the minimal
         context so future steps don't keep re-triggering summarization based
         on a stale, oversized value.
    """

    def _msg(role: MessageRole, text: str) -> PydanticMessage:
        # Helper for building plain-text conversation messages.
        return PydanticMessage(role=role, content=[TextContent(type="text", text=text)])

    # A small but non-trivial conversation with an explicit system message, so
    # after hard eviction we expect exactly that system message plus a single
    # user summary message to remain.
    conversation = [
        _msg(MessageRole.system, "You are a helpful assistant."),
        _msg(MessageRole.user, "User message 0: hello"),
        _msg(MessageRole.assistant, "Assistant response 0: hi there"),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, conversation)

    print("ORIGINAL IN-CONTEXT MESSAGES ======")
    for msg in in_context_messages:
        print(f"MSG: {msg}")

    # Create the V3 agent loop.
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # The summarizer mode is irrelevant here; we only need compact() to run
    # and then hit the branch where the *post*-summarization token count is
    # still above the proactive threshold. Simulate that by patching the
    # count_tokens_with_tools helper: the first call (post-summary) reports an
    # extremely large count, the second call (after hard eviction) a small one.
    with patch("letta.services.summarizer.compact.count_tokens_with_tools") as mock_count_tokens:
        # Size the fake counts relative to this model's context window so the
        # hard-eviction path always triggers regardless of model.
        context_limit = llm_config.context_window or 100_000
        huge_tokens = context_limit * 10  # safely above any reasonable trigger
        mock_count_tokens.side_effect = [huge_tokens, 10]

        caplog.set_level("ERROR")

        summary, result, summary_text = await agent_loop.compact(
            messages=in_context_messages,
            trigger_threshold=context_limit,
        )

        # Exactly two token-count calls: one for the summarized context, one
        # for the hard-evicted minimal context.
        assert mock_count_tokens.call_count == 2

    print("COMPACTED RESULT ======")
    for msg in result:
        print(f"MSG: {msg}")

    # Hard eviction keeps only:
    # 1. The system prompt
    # 2. The synthetic user summary message.
    assert isinstance(result, list)
    assert len(result) == 2, f"Expected system + summary after hard eviction, got {len(result)} messages"
    assert result[0].role == MessageRole.system
    assert result[1].role == MessageRole.user

    # The summary text is returned so the agent loop can build a SummaryMessage.
    assert isinstance(summary_text, str), f"Expected summary_text to be a string, got {type(summary_text)}"
    assert len(summary_text) > 0, "Expected non-empty summary text after hard eviction"
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Sliding Window Summarizer Unit Tests
|
|
# ======================================================================================================================
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_sliding_window_cutoff_index_does_not_exceed_message_count(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that the sliding window summarizer correctly calculates cutoff indices.

    This test verifies the fix for a bug where the cutoff percentage was treated as
    a whole number (10) instead of a decimal (0.10), causing:
        message_cutoff_index = round(10 * 65) = 650
    when there were only 65 messages, resulting in an empty range loop and the error:
        "No assistant message found from indices 650 to 65"

    The fix changed:
        - max(..., 10) -> max(..., 0.10)
        - += 10 -> += 0.10
        - >= 100 -> >= 1.0

    This test uses the real token counter (via create_token_counter) to verify
    the sliding window logic works with actual token counting.
    """
    from letta.services.summarizer.summarizer_config import CompactionSettings
    from letta.services.summarizer.summarizer_sliding_window import summarize_via_sliding_window

    def _msg(role: MessageRole, text: str) -> PydanticMessage:
        # Helper for constructing single-text-block messages.
        return PydanticMessage(role=role, content=[TextContent(type="text", text=text)])

    # Real summarizer config via the default factory, with
    # sliding_window_percentage overridden to 0.3 for this test.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    summarizer_config = CompactionSettings(model=handle)
    summarizer_config.sliding_window_percentage = 0.3

    # 65 messages total (mirrors the failing case from the bug report):
    # a system message followed by 32 alternating user/assistant pairs.
    messages = [_msg(MessageRole.system, "You are a helpful assistant.")]
    for i in range(32):
        messages.extend(
            [
                _msg(MessageRole.user, f"User message {i}"),
                _msg(MessageRole.assistant, f"Assistant response {i}"),
            ]
        )

    assert len(messages) == 65, f"Expected 65 messages, got {len(messages)}"

    # This should NOT raise "No assistant message found from indices 650 to 65".
    # With the fix, message_count_cutoff_percent starts at max(0.7, 0.10) = 0.7,
    # so message_cutoff_index = round(0.7 * 65) = 46, which is valid.
    try:
        summary, remaining_messages = await summarize_via_sliding_window(
            actor=actor,
            llm_config=llm_config,
            agent_llm_config=llm_config,  # case where agent and summarizer have same config
            summarizer_config=summarizer_config,
            in_context_messages=messages,
        )

        # A real summary came back from the LLM.
        assert summary is not None
        assert len(summary) > 0

        # The retained window is a proper, non-empty subset of the input.
        assert len(remaining_messages) < len(messages)
        assert len(remaining_messages) > 0

        print(f"Successfully summarized {len(messages)} messages to {len(remaining_messages)} remaining")
        print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
        print(f"Using {llm_config.model_endpoint_type} token counter for model {llm_config.model}")

    except ValueError as e:
        if "No assistant message found from indices" in str(e):
            # Pull the offending indices out of the error message for a
            # targeted failure report.
            import re

            match = re.search(r"from indices (\d+) to (\d+)", str(e))
            if match:
                start_idx, end_idx = int(match.group(1)), int(match.group(2))
                pytest.fail(
                    f"Bug detected: cutoff index ({start_idx}) exceeds message count ({end_idx}). "
                    f"This indicates the percentage calculation bug where 10 was used instead of 0.10. "
                    f"Error: {e}"
                )
        raise
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test edge case of large system prompt / memory blocks.

    This test verifies that summarization handles the case where the system prompt
    and memory blocks are very large, potentially consuming most of the context window.
    The summarizer should gracefully handle this scenario without errors.
    """
    from letta.errors import SystemPromptTokenExceededError

    # Shrink the context window so the oversized prompt is guaranteed to overflow.
    llm_config.context_window = 10000

    # Build an agent whose system prompt + memory block dwarf the context window.
    agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_state = await server.agent_manager.create_agent_async(
        CreateAgent(
            name=agent_name,
            llm_config=llm_config,
            embedding_config=DEFAULT_EMBEDDING_CONFIG,
            system="SYSTEM PROMPT " * 10000,  # Large system prompt
            memory_blocks=[
                CreateBlock(
                    label="human",
                    limit=200000,
                    value="NAME " * 10000,  # Large memory block
                )
            ],
        ),
        actor=actor,
    )

    # Create a run for the agent using RunManager.
    run = await RunManager().create_run(pydantic_run=PydanticRun(agent_id=agent_state.id), actor=actor)

    # Create the agent loop using LettaAgentV3.
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Message the agent.
    input_message = MessageCreate(role=MessageRole.user, content="Hello")

    # Stepping must fail: the system prompt alone exceeds the context budget.
    with pytest.raises(SystemPromptTokenExceededError):
        await agent_loop.step(
            input_messages=[input_message],
            run_id=run.id,
            max_steps=3,
        )

    # Repair the agent: replace the system prompt with a short one...
    await server.agent_manager.update_agent_async(
        agent_id=agent_state.id,
        agent_update=UpdateAgent(system="You are a helpful assistant."),
        actor=actor,
    )

    # ...and shrink the memory block likewise.
    await server.agent_manager.modify_block_by_label_async(
        agent_id=agent_state.id,
        block_label="human",
        block_update=BlockUpdate(value="The user's name is Alice."),
        actor=actor,
    )

    # Reload agent state after repairs.
    agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
    print("REPAIRED AGENT STATE ======")
    print(agent_state.system)
    print(agent_state.blocks)

    # Fresh run + agent loop against the repaired state.
    run = await RunManager().create_run(pydantic_run=PydanticRun(agent_id=agent_state.id), actor=actor)
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Now the agent should respond without context window errors.
    response = await agent_loop.step(
        input_messages=[input_message],
        run_id=run.id,
        max_steps=3,
    )

    # Verify we got a valid response after repair.
    assert response is not None
    assert response.messages is not None
    print(f"Agent successfully responded after repair with {len(response.messages)} messages")
|
|
|
|
|
|
# @pytest.mark.asyncio
|
|
# async def test_context_window_overflow_triggers_summarization_in_streaming(server: SyncServer, actor):
|
|
# """
|
|
# Test that a ContextWindowExceededError during a streaming LLM request
|
|
# properly triggers the summarizer and compacts the in-context messages.
|
|
#
|
|
# This test simulates:
|
|
# 1. An LLM streaming request that fails with ContextWindowExceededError
|
|
# 2. The summarizer being invoked to reduce context size
|
|
# 3. Verification that messages are compacted and summary message exists
|
|
#
|
|
# Note: This test only runs with OpenAI since it uses OpenAI-specific error handling.
|
|
# """
|
|
# import uuid
|
|
# from unittest.mock import patch
|
|
#
|
|
# import openai
|
|
#
|
|
# from letta.schemas.message import MessageCreate
|
|
# from letta.schemas.run import Run
|
|
# from letta.services.run_manager import RunManager
|
|
#
|
|
# # Use OpenAI config for this test (since we're using OpenAI-specific error handling)
|
|
# llm_config = get_llm_config("openai-gpt-4o-mini.json")
|
|
#
|
|
# # Create test messages - enough to have something to summarize
|
|
# messages = []
|
|
# for i in range(15):
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text=f"User message {i}: This is test message number {i}.")],
|
|
# )
|
|
# )
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[TextContent(type="text", text=f"Assistant response {i}: I acknowledge message {i}.")],
|
|
# )
|
|
# )
|
|
#
|
|
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
# original_message_count = len(agent_state.message_ids)
|
|
#
|
|
# # Create an input message to trigger the agent
|
|
# input_message = MessageCreate(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text="Hello, please respond.")],
|
|
# )
|
|
#
|
|
# # Create a proper run record in the database
|
|
# run_manager = RunManager()
|
|
# test_run_id = f"run-{uuid.uuid4()}"
|
|
# test_run = Run(
|
|
# id=test_run_id,
|
|
# agent_id=agent_state.id,
|
|
# )
|
|
# await run_manager.create_run(test_run, actor)
|
|
#
|
|
# # Create the agent loop using LettaAgentV3
|
|
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
|
|
#
|
|
# # Track how many times stream_async is called
|
|
# call_count = 0
|
|
#
|
|
# # Store original stream_async method
|
|
# original_stream_async = agent_loop.llm_client.stream_async
|
|
#
|
|
# async def mock_stream_async_with_error(request_data, llm_config):
|
|
# nonlocal call_count
|
|
# call_count += 1
|
|
# if call_count == 1:
|
|
# # First call raises OpenAI BadRequestError with context_length_exceeded error code
|
|
# # This will be properly converted to ContextWindowExceededError by handle_llm_error
|
|
# from unittest.mock import MagicMock
|
|
#
|
|
# import httpx
|
|
#
|
|
# # Create a mock response with the required structure
|
|
# mock_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
|
|
# mock_response = httpx.Response(
|
|
# status_code=400,
|
|
# request=mock_request,
|
|
# json={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
#
|
|
# raise openai.BadRequestError(
|
|
# message="This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# response=mock_response,
|
|
# body={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
# # Subsequent calls use the real implementation
|
|
# return await original_stream_async(request_data, llm_config)
|
|
#
|
|
# # Patch the llm_client's stream_async to raise ContextWindowExceededError on first call
|
|
# with patch.object(agent_loop.llm_client, "stream_async", side_effect=mock_stream_async_with_error):
|
|
# # Execute a streaming step
|
|
# try:
|
|
# result_chunks = []
|
|
# async for chunk in agent_loop.stream(
|
|
# input_messages=[input_message],
|
|
# max_steps=1,
|
|
# stream_tokens=True,
|
|
# run_id=test_run_id,
|
|
# ):
|
|
# result_chunks.append(chunk)
|
|
# except Exception as e:
|
|
# # Some errors might happen due to real LLM calls after retry
|
|
# print(f"Exception during stream: {e}")
|
|
#
|
|
# # Reload agent state to get updated message_ids after summarization
|
|
# updated_agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
|
|
# updated_message_count = len(updated_agent_state.message_ids)
|
|
#
|
|
# # Fetch the updated in-context messages
|
|
# updated_in_context_messages = await server.message_manager.get_messages_by_ids_async(
|
|
# message_ids=updated_agent_state.message_ids, actor=actor
|
|
# )
|
|
#
|
|
# # Convert to LettaMessage format for easier content inspection
|
|
# letta_messages = PydanticMessage.to_letta_messages_from_list(updated_in_context_messages)
|
|
#
|
|
# # Verify a summary message exists with the correct format
|
|
# # The summary message has content with type="system_alert" and message containing:
|
|
# # "prior messages ... have been hidden" and "summary of the previous"
|
|
# import json
|
|
#
|
|
# summary_message_found = False
|
|
# summary_message_text = None
|
|
# for msg in letta_messages:
|
|
# # Not all message types have a content attribute (e.g., ReasoningMessage)
|
|
# if not hasattr(msg, "content"):
|
|
# continue
|
|
#
|
|
# content = msg.content
|
|
# # Content can be a string (JSON) or an object with type/message fields
|
|
# if isinstance(content, str):
|
|
# # Try to parse as JSON
|
|
# try:
|
|
# parsed = json.loads(content)
|
|
# if isinstance(parsed, dict) and parsed.get("type") == "system_alert":
|
|
# text_to_check = parsed.get("message", "").lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = parsed.get("message")
|
|
# break
|
|
# except (json.JSONDecodeError, TypeError):
|
|
# pass
|
|
# # Check if content has system_alert type with the summary message (object form)
|
|
# elif hasattr(content, "type") and content.type == "system_alert":
|
|
# if hasattr(content, "message") and content.message:
|
|
# text_to_check = content.message.lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = content.message
|
|
# break
|
|
#
|
|
# assert summary_message_found, (
|
|
# "A summary message should exist in the in-context messages after summarization. "
|
|
# "Expected format containing 'prior messages...hidden' and 'summary of the previous'"
|
|
# )
|
|
#
|
|
# # Verify we attempted multiple invocations (the failing one + retry after summarization)
|
|
# assert call_count >= 2, f"Expected at least 2 LLM invocations (initial + retry), got {call_count}"
|
|
#
|
|
# # The original messages should have been compacted - the updated count should be less than
|
|
# # original + the new messages added (input + assistant response + tool results)
|
|
# # Since summarization should have removed most of the original 30 messages
|
|
# print("Test passed: Summary message found in context")
|
|
# print(f"Original message count: {original_message_count}, Updated: {updated_message_count}")
|
|
# print(f"Summary message: {summary_message_text[:200] if summary_message_text else 'N/A'}...")
|
|
# print(f"Total LLM invocations: {call_count}")
|
|
#
|
|
#
|
|
# @pytest.mark.asyncio
|
|
# async def test_context_window_overflow_triggers_summarization_in_blocking(server: SyncServer, actor):
|
|
# """
|
|
# Test that a ContextWindowExceededError during a blocking (non-streaming) LLM request
|
|
# properly triggers the summarizer and compacts the in-context messages.
|
|
#
|
|
# This test is similar to the streaming test but uses the blocking step() method.
|
|
#
|
|
# Note: This test only runs with OpenAI since it uses OpenAI-specific error handling.
|
|
# """
|
|
# import uuid
|
|
# from unittest.mock import patch
|
|
#
|
|
# import openai
|
|
#
|
|
# from letta.schemas.message import MessageCreate
|
|
# from letta.schemas.run import Run
|
|
# from letta.services.run_manager import RunManager
|
|
#
|
|
# # Use OpenAI config for this test (since we're using OpenAI-specific error handling)
|
|
# llm_config = get_llm_config("openai-gpt-4o-mini.json")
|
|
#
|
|
# # Create test messages
|
|
# messages = []
|
|
# for i in range(15):
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text=f"User message {i}: This is test message number {i}.")],
|
|
# )
|
|
# )
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[TextContent(type="text", text=f"Assistant response {i}: I acknowledge message {i}.")],
|
|
# )
|
|
# )
|
|
#
|
|
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
# original_message_count = len(agent_state.message_ids)
|
|
#
|
|
# # Create an input message to trigger the agent
|
|
# input_message = MessageCreate(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text="Hello, please respond.")],
|
|
# )
|
|
#
|
|
# # Create a proper run record in the database
|
|
# run_manager = RunManager()
|
|
# test_run_id = f"run-{uuid.uuid4()}"
|
|
# test_run = Run(
|
|
# id=test_run_id,
|
|
# agent_id=agent_state.id,
|
|
# )
|
|
# await run_manager.create_run(test_run, actor)
|
|
#
|
|
# # Create the agent loop using LettaAgentV3
|
|
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
|
|
#
|
|
# # Track how many times request_async is called
|
|
# call_count = 0
|
|
#
|
|
# # Store original request_async method
|
|
# original_request_async = agent_loop.llm_client.request_async
|
|
#
|
|
# async def mock_request_async_with_error(request_data, llm_config):
|
|
# nonlocal call_count
|
|
# call_count += 1
|
|
# if call_count == 1:
|
|
# # First call raises OpenAI BadRequestError with context_length_exceeded error code
|
|
# # This will be properly converted to ContextWindowExceededError by handle_llm_error
|
|
# import httpx
|
|
#
|
|
# # Create a mock response with the required structure
|
|
# mock_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
|
|
# mock_response = httpx.Response(
|
|
# status_code=400,
|
|
# request=mock_request,
|
|
# json={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
#
|
|
# raise openai.BadRequestError(
|
|
# message="This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# response=mock_response,
|
|
# body={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
# # Subsequent calls use the real implementation
|
|
# return await original_request_async(request_data, llm_config)
|
|
#
|
|
# # Patch the llm_client's request_async to raise ContextWindowExceededError on first call
|
|
# with patch.object(agent_loop.llm_client, "request_async", side_effect=mock_request_async_with_error):
|
|
# # Execute a blocking step
|
|
# try:
|
|
# result = await agent_loop.step(
|
|
# input_messages=[input_message],
|
|
# max_steps=1,
|
|
# run_id=test_run_id,
|
|
# )
|
|
# except Exception as e:
|
|
# # Some errors might happen due to real LLM calls after retry
|
|
# print(f"Exception during step: {e}")
|
|
#
|
|
# # Reload agent state to get updated message_ids after summarization
|
|
# updated_agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
|
|
# updated_message_count = len(updated_agent_state.message_ids)
|
|
#
|
|
# # Fetch the updated in-context messages
|
|
# updated_in_context_messages = await server.message_manager.get_messages_by_ids_async(
|
|
# message_ids=updated_agent_state.message_ids, actor=actor
|
|
# )
|
|
#
|
|
# # Convert to LettaMessage format for easier content inspection
|
|
# letta_messages = PydanticMessage.to_letta_messages_from_list(updated_in_context_messages)
|
|
#
|
|
# # Verify a summary message exists with the correct format
|
|
# # The summary message has content with type="system_alert" and message containing:
|
|
# # "prior messages ... have been hidden" and "summary of the previous"
|
|
# import json
|
|
#
|
|
# summary_message_found = False
|
|
# summary_message_text = None
|
|
# for msg in letta_messages:
|
|
# # Not all message types have a content attribute (e.g., ReasoningMessage)
|
|
# if not hasattr(msg, "content"):
|
|
# continue
|
|
#
|
|
# content = msg.content
|
|
# # Content can be a string (JSON) or an object with type/message fields
|
|
# if isinstance(content, str):
|
|
# # Try to parse as JSON
|
|
# try:
|
|
# parsed = json.loads(content)
|
|
# if isinstance(parsed, dict) and parsed.get("type") == "system_alert":
|
|
# text_to_check = parsed.get("message", "").lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = parsed.get("message")
|
|
# break
|
|
# except (json.JSONDecodeError, TypeError):
|
|
# pass
|
|
# # Check if content has system_alert type with the summary message (object form)
|
|
# elif hasattr(content, "type") and content.type == "system_alert":
|
|
# if hasattr(content, "message") and content.message:
|
|
# text_to_check = content.message.lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = content.message
|
|
# break
|
|
#
|
|
# assert summary_message_found, (
|
|
# "A summary message should exist in the in-context messages after summarization. "
|
|
# "Expected format containing 'prior messages...hidden' and 'summary of the previous'"
|
|
# )
|
|
#
|
|
# # Verify we attempted multiple invocations (the failing one + retry after summarization)
|
|
# assert call_count >= 2, f"Expected at least 2 LLM invocations (initial + retry), got {call_count}"
|
|
#
|
|
# # The original messages should have been compacted - the updated count should be less than
|
|
# # original + the new messages added (input + assistant response + tool results)
|
|
# print("Test passed: Summary message found in context (blocking mode)")
|
|
# print(f"Original message count: {original_message_count}, Updated: {updated_message_count}")
|
|
# print(f"Summary message: {summary_message_text[:200] if summary_message_text else 'N/A'}...")
|
|
# print(f"Total LLM invocations: {call_count}")
|
|
#
|
|
#
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_all(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Exercise summarize_all end-to-end with real LLM calls.

    Verifies that the 'all' summarization mode collapses the entire
    conversation into a single summary string, leaving exactly one
    in-context message behind.
    """
    from letta.services.summarizer.summarizer_all import summarize_all
    from letta.services.summarizer.summarizer_config import CompactionSettings

    # Build a summarizer config targeting the "all" mode.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    summarizer_config = CompactionSettings(model=handle)
    summarizer_config.mode = "all"

    # Seed a simple conversation: one system prompt plus 10 user/assistant pairs.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for idx in range(10):
        messages.extend(
            [
                PydanticMessage(
                    role=MessageRole.user,
                    content=[TextContent(type="text", text=f"User message {idx}: What is {idx} + {idx}?")],
                ),
                PydanticMessage(
                    role=MessageRole.assistant,
                    content=[TextContent(type="text", text=f"Assistant response {idx}: {idx} + {idx} = {idx * 2}.")],
                ),
            ]
        )

    assert len(messages) == 21, f"Expected 21 messages, got {len(messages)}"

    # Run the summarizer against the real model.
    summary, new_in_context_messages = await summarize_all(
        actor=actor,
        llm_config=llm_config,
        summarizer_config=summarizer_config,
        in_context_messages=messages,
    )

    # Exactly one message remains, and the summary text is non-empty and bounded.
    assert len(new_in_context_messages) == 1
    assert summary is not None
    assert len(summary) > 0
    assert len(summary) <= 2000

    print(f"Successfully summarized {len(messages)} messages using 'all' mode")
    if len(summary) > 200:
        print(f"Summary: {summary[:200]}...")
    else:
        print(f"Summary: {summary}")
    print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
|
|
|
|
|
|
# =============================================================================
|
|
# CompactionStats tests
|
|
# =============================================================================
|
|
|
|
|
|
def test_compaction_stats_embedding_in_packed_json():
    """Verify package_summarize_message_no_counts embeds compaction_stats in the packed JSON."""
    from letta.system import package_summarize_message_no_counts

    stats = {
        "trigger": "post_step_context_check",
        "context_tokens_before": 50000,
        "context_tokens_after": 15000,
        "context_window": 128000,
        "messages_count_before": 45,
        "messages_count_after": 12,
    }

    packed = package_summarize_message_no_counts(
        summary="Test summary content",
        timezone="UTC",
        compaction_stats=stats,
    )

    # Decode the packed string back into a dict.
    payload = json.loads(packed)

    # Envelope structure checks.
    assert "type" in payload
    assert payload["type"] == "system_alert"
    assert "message" in payload
    assert "Test summary content" in payload["message"]
    assert "compaction_stats" in payload

    # Every stat we supplied should round-trip unchanged.
    embedded = payload["compaction_stats"]
    for key, expected in stats.items():
        assert embedded[key] == expected
|
|
|
|
|
|
def test_compaction_stats_embedding_without_stats():
    """Packed JSON omits the compaction_stats key entirely when none are supplied."""
    from letta.system import package_summarize_message_no_counts

    packed = package_summarize_message_no_counts(
        summary="Test summary content",
        timezone="UTC",
        compaction_stats=None,
    )

    payload = json.loads(packed)

    # Envelope is intact, but no stats key was injected.
    assert "type" in payload
    assert "message" in payload
    assert "compaction_stats" not in payload
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json():
    """CompactionStats can be recovered from a packed JSON string."""
    from letta.schemas.letta_message import CompactionStats, extract_compaction_stats_from_packed_json

    raw_stats = {
        "trigger": "context_window_exceeded",
        "context_tokens_before": 100000,
        "context_tokens_after": 30000,
        "context_window": 128000,
        "messages_count_before": 50,
        "messages_count_after": 15,
    }
    packed_json = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "time": "2024-01-15T10:00:00",
            "compaction_stats": raw_stats,
        }
    )

    stats = extract_compaction_stats_from_packed_json(packed_json)

    assert stats is not None
    assert isinstance(stats, CompactionStats)
    # Each embedded field should surface as the matching attribute.
    for field_name, expected in raw_stats.items():
        assert getattr(stats, field_name) == expected
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json_without_stats():
    """Extraction yields None for legacy payloads lacking compaction_stats (backward compatibility)."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    # Old-style payload: no compaction_stats key at all.
    legacy_payload = {
        "type": "system_alert",
        "message": "Test summary",
        "time": "2024-01-15T10:00:00",
    }

    stats = extract_compaction_stats_from_packed_json(json.dumps(legacy_payload))

    assert stats is None
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json_invalid_json():
    """Extraction tolerates malformed or empty input strings, returning None."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    for bad_input in ("not valid json", ""):
        assert extract_compaction_stats_from_packed_json(bad_input) is None
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json_invalid_stats():
    """Extraction returns None (not an exception) when embedded stats fail validation."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    # Stats block missing required fields:
    # context_window, messages_count_before, messages_count_after.
    packed_json = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "compaction_stats": {
                "trigger": "test",
            },
        }
    )

    stats = extract_compaction_stats_from_packed_json(packed_json)

    # Validation failure should surface as None.
    assert stats is None
|
|
|
|
|
|
def test_extract_compaction_stats_from_message():
    """CompactionStats can be recovered from a Message object's packed content."""
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message
    from letta.schemas.letta_message import CompactionStats

    payload = {
        "type": "system_alert",
        "message": "Test summary",
        "time": "2024-01-15T10:00:00",
        "compaction_stats": {
            "trigger": "post_step_context_check",
            "context_tokens_before": 50000,
            "context_tokens_after": 15000,
            "context_window": 128000,
            "messages_count_before": 45,
            "messages_count_after": 12,
        },
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(payload))],
    )

    stats = extract_compaction_stats_from_message(message)

    assert stats is not None
    assert isinstance(stats, CompactionStats)
    # Spot-check representative fields from the embedded stats.
    assert stats.trigger == "post_step_context_check"
    assert stats.context_tokens_before == 50000
    assert stats.messages_count_after == 12
|
|
|
|
|
|
def test_extract_compaction_stats_from_message_without_stats():
    """Message extraction yields None for legacy content carrying no stats."""
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message

    legacy_payload = {
        "type": "system_alert",
        "message": "Old format summary",
        "time": "2024-01-15T10:00:00",
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(legacy_payload))],
    )

    assert extract_compaction_stats_from_message(message) is None
|
|
|
|
|
|
def test_message_to_summary_message_with_stats():
    """Message._convert_summary_message surfaces embedded compaction_stats."""
    from letta.schemas.letta_message import CompactionStats

    payload = {
        "type": "system_alert",
        "message": "Summary of conversation",
        "time": "2024-01-15T10:00:00",
        "compaction_stats": {
            "trigger": "context_window_exceeded",
            "context_tokens_before": 80000,
            "context_tokens_after": 25000,
            "context_window": 128000,
            "messages_count_before": 60,
            "messages_count_after": 20,
        },
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(payload))],
    )

    # Convert to SummaryMessage (as_user_message=False).
    summary_msg = message._convert_summary_message(as_user_message=False)

    assert summary_msg.message_type == "summary_message"
    stats = summary_msg.compaction_stats
    assert stats is not None
    assert isinstance(stats, CompactionStats)
    assert stats.trigger == "context_window_exceeded"
    assert stats.context_tokens_before == 80000
|
|
|
|
|
|
def test_message_to_summary_message_backward_compatible():
    """Legacy summary messages without compaction_stats still convert cleanly."""
    legacy_payload = {
        "type": "system_alert",
        "message": "Old format summary without stats",
        "time": "2024-01-15T10:00:00",
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(legacy_payload))],
    )

    summary_msg = message._convert_summary_message(as_user_message=False)

    assert summary_msg.message_type == "summary_message"
    # Old messages carry no stats at all.
    assert summary_msg.compaction_stats is None
    assert "Old format summary" in summary_msg.summary
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.parametrize(
|
|
"llm_config",
|
|
TESTED_LLM_CONFIGS,
|
|
ids=[c.model for c in TESTED_LLM_CONFIGS],
|
|
)
|
|
async def test_compact_with_stats_params_embeds_stats(server: SyncServer, actor, llm_config: LLMConfig):
|
|
"""
|
|
Integration test: compact() with trigger/context_tokens_before/messages_count_before
|
|
embeds compaction_stats in the packed message content.
|
|
"""
|
|
from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message
|
|
|
|
# Create a conversation with enough messages to summarize
|
|
messages = [
|
|
PydanticMessage(
|
|
role=MessageRole.system,
|
|
content=[TextContent(type="text", text="You are a helpful assistant.")],
|
|
)
|
|
]
|
|
for i in range(10):
|
|
messages.append(
|
|
PydanticMessage(
|
|
role=MessageRole.user,
|
|
content=[TextContent(type="text", text=f"User message {i}")],
|
|
)
|
|
)
|
|
messages.append(
|
|
PydanticMessage(
|
|
role=MessageRole.assistant,
|
|
content=[TextContent(type="text", text=f"Response {i}")],
|
|
)
|
|
)
|
|
|
|
agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
|
|
handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
|
|
agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")
|
|
|
|
agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
|
|
|
|
# Call compact with stats params
|
|
summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(
|
|
messages=in_context_messages,
|
|
use_summary_role=True,
|
|
trigger="post_step_context_check",
|
|
context_tokens_before=50000,
|
|
messages_count_before=len(in_context_messages),
|
|
)
|
|
|
|
# Extract stats from the message
|
|
stats = extract_compaction_stats_from_message(summary_message_obj)
|
|
|
|
assert stats is not None, "CompactionStats should be embedded in the message"
|
|
assert stats.trigger == "post_step_context_check"
|
|
assert stats.context_tokens_before == 50000
|
|
assert stats.messages_count_before == len(in_context_messages)
|
|
assert stats.context_tokens_after is not None # Should be set by compact()
|
|
assert stats.messages_count_after == len(compacted_messages) # final_messages already includes summary
|
|
assert stats.context_window == llm_config.context_window
|