feat: add tool return truncation to summarization as a fallback [LET-5970] (#5859)

This commit is contained in:
Sarah Wooders
2025-10-31 15:29:14 -07:00
committed by Caren Thomas
parent cdde791b11
commit 57bb051ea4
13 changed files with 209 additions and 16 deletions

View File

@@ -39,13 +39,15 @@ def get_llm_config(filename: str, llm_config_dir: str = "tests/configs/llm_model
# Model config files exercised by the summarization tests. Only a subset of
# the available configs is active; the commented-out filenames are disabled.
all_configs = [
    "openai-gpt-5-mini.json",
    "claude-4-5-haiku.json",
    "gemini-2.5-flash.json",
    # --- disabled configs ---
    # "gemini-2.5-flash-vertex.json",  # Requires Vertex AI credentials
    # "openai-gpt-4.1.json",
    # "openai-o1.json",
    # "openai-o3.json",
    # "openai-o4-mini.json",
    # "claude-4-sonnet.json",
    # "claude-3-7-sonnet.json",
    # "gemini-2.5-pro-vertex.json",
]
@@ -517,3 +519,86 @@ async def test_summarize_multiple_large_tool_calls(server: SyncServer, actor, ll
assert hasattr(msg, "content")
print(f"Summarized {len(in_context_messages)} messages with {total_content_size} chars to {len(result)} messages")
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[cfg.model for cfg in TESTED_LLM_CONFIGS],
)
async def test_summarize_truncates_large_tool_return(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Check that an oversized tool return comes out of summarization smaller than it went in.

    A short user / assistant / tool / assistant exchange is built around one very
    large tool return and then summarized. Every tool return still present in the
    summarized messages must be shorter than the original payload. A result that
    contains no tool return at all is accepted as well.
    """
    # Oversized payload (size argument 100000); keep its length for the later comparison.
    oversized_payload = create_large_tool_return(100000)
    original_size = len(oversized_payload)

    # The four turns of the conversation, in order.
    user_turn = PydanticMessage(
        role=MessageRole.user,
        content=[TextContent(type="text", text="Please run the database query.")],
    )
    tool_request_turn = PydanticMessage(
        role=MessageRole.assistant,
        content=[
            TextContent(type="text", text="Running query..."),
            ToolCallContent(
                type="tool_call",
                id="call_1",
                name="run_query",
                input={"query": "SELECT * FROM large_table"},
            ),
        ],
    )
    tool_result_turn = PydanticMessage(
        role=MessageRole.tool,
        tool_call_id="call_1",
        content=[
            ToolReturnContent(
                type="tool_return",
                tool_call_id="call_1",
                content=oversized_payload,
                is_error=False,
            )
        ],
    )
    closing_turn = PydanticMessage(
        role=MessageRole.assistant,
        content=[TextContent(type="text", text="Query completed successfully with many results.")],
    )
    conversation = [user_turn, tool_request_turn, tool_result_turn, closing_turn]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, conversation)

    # Sanity-check the fixture itself before inspecting the summarizer output.
    assert original_size > 90000, f"Expected tool return >90k chars, got {original_size}"

    result = await run_summarization(server, agent_state, in_context_messages, actor)

    # The summarizer must hand back a non-empty list of messages.
    assert isinstance(result, list)
    assert len(result) >= 1

    # Walk the surviving tool messages and require each tool return to have shrunk.
    saw_tool_return = False
    for message in result:
        if message.role != MessageRole.tool:
            continue
        for part in message.content:
            if not isinstance(part, ToolReturnContent):
                continue
            saw_tool_return = True
            result_size = len(part.content)
            assert result_size < original_size, (
                f"Expected tool return to be truncated from {original_size} chars, but got {result_size} chars"
            )
            print(f"Tool return successfully truncated from {original_size} to {result_size} chars")

    # No tool return left at all is also fine: summarization may have dropped it entirely.
    if not saw_tool_return:
        print("Tool returns were completely removed during summarization")