Fix sliding window cutoff logic (#9261)

* fix sliding window cutoff calculations to use agent instead of summarizer config

* allow approval messages with tool_calls as valid cutoffs, prevent approval pairs from being split

* update tests with updated sliding window parameters

---------

Co-authored-by: Amy Guan <amy@letta.com>
2026-02-03 22:05:02 -08:00
parent b89387bf38
commit 16c96cc3c0
4 changed files with 25 additions and 10 deletions
--- a/letta/services/summarizer/compact.py
+++ b/letta/services/summarizer/compact.py
@@ -192,6 +192,7 @@ async def compact_messages(
            summary, compacted_messages = await summarize_via_sliding_window(
                actor=actor,
                llm_config=summarizer_llm_config,
                agent_llm_config=agent_llm_config,
                summarizer_config=summarizer_config,
                in_context_messages=messages,
                agent_id=agent_id,
--- a/letta/services/summarizer/summarizer_sliding_window.py
+++ b/letta/services/summarizer/summarizer_sliding_window.py
@@ -93,8 +93,10 @@ async def count_tokens_with_tools(
 async def summarize_via_sliding_window(
    # Required to tag LLM calls
    actor: User,
-    # Actual summarization configuration
+    # LLM config for the summarizer model (used to generate the summary)
    llm_config: LLMConfig,
    # LLM config for the agent model (used to determine context window cutoff for eviction)
    agent_llm_config: LLMConfig,
    summarizer_config: CompactionSettings,
    in_context_messages: List[Message],
    # Telemetry context
@@ -107,10 +109,10 @@ async def summarize_via_sliding_window(
    If the total tokens is greater than the context window limit (or force=True),
    then summarize and rearrange the in-context messages (with the summary in front).
-    Finding the summarization cutoff point (target of final post-summarize count is N% of configured context window):
+    Finding the summarization cutoff point (target of final post-summarize count is N% of agent's context window):
    1. Start at a message index cutoff (1-N%)
    2. Count tokens with system prompt, prior summary (if it exists), and messages past cutoff point (messages[0] + messages[cutoff:])
-    3. Is count(post_sum_messages) <= N% of configured context window?
+    3. Is count(post_sum_messages) <= N% of agent's context window?
      3a. Yes -> create new summary with [prior summary, cutoff:], and safety truncate summary with char count
      3b. No -> increment cutoff by 10%, and repeat
@@ -134,16 +136,24 @@ async def summarize_via_sliding_window(
    eviction_percentage = summarizer_config.sliding_window_percentage
    assert summarizer_config.sliding_window_percentage <= 1.0, "Sliding window percentage must be less than or equal to 1.0"
    assistant_message_index = None
-    approx_token_count = llm_config.context_window
+
-    # valid_cutoff_roles = {MessageRole.assistant, MessageRole.approval}
+    goal_tokens = (1 - summarizer_config.sliding_window_percentage) * agent_llm_config.context_window
-    valid_cutoff_roles = {MessageRole.assistant}
+    approx_token_count = agent_llm_config.context_window
    # allow approvals to be cutoffs (for headless agents) but ensure proper grouping with tool calls
    def is_valid_cutoff(message: Message):
        if message.role == MessageRole.assistant:
            return True
        if message.role == MessageRole.approval:
            return message.tool_calls is not None and len(message.tool_calls) > 0
        return False
    # simple version: summarize(in_context[1:round(summarizer_config.sliding_window_percentage * len(in_context_messages))])
    # this evicts 30% of the messages (via summarization) and keeps the remaining 70%
    # problem: we need the cutoff point to be an assistant message, so will grow the cutoff point until we find an assistant message
    # also need to grow the cutoff point until the token count is less than the target token count
-    while approx_token_count >= (1 - summarizer_config.sliding_window_percentage) * llm_config.context_window and eviction_percentage < 1.0:
+    while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
        # more eviction percentage
        eviction_percentage += 0.10
@@ -155,7 +165,7 @@ async def summarize_via_sliding_window(
            (
                i
                for i in reversed(range(1, message_cutoff_index + 1))
-                if i < len(in_context_messages) and in_context_messages[i].role in valid_cutoff_roles
+                if i < len(in_context_messages) and is_valid_cutoff(in_context_messages[i])
            ),
            None,
        )
@@ -166,9 +176,9 @@ async def summarize_via_sliding_window(
        # update token count
        logger.info(f"Attempting to compact messages index 1:{assistant_message_index} messages")
        post_summarization_buffer = [system_prompt] + in_context_messages[assistant_message_index:]
-        approx_token_count = await count_tokens(actor, llm_config, post_summarization_buffer)
+        approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer)
        logger.info(
-            f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {(1 - summarizer_config.sliding_window_percentage) * llm_config.context_window}"
+            f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
        )
    if assistant_message_index is None or eviction_percentage >= 1.0:
@@ -202,6 +212,8 @@ async def summarize_via_sliding_window(
        },
    )
    logger.info(f"\n==================\nSummary message string: {summary_message_str[:100]}\n==================\n")
    if summarizer_config.clip_chars is not None and len(summary_message_str) > summarizer_config.clip_chars:
        logger.warning(f"Summary length {len(summary_message_str)} exceeds clip length {summarizer_config.clip_chars}. Truncating.")
        summary_message_str = summary_message_str[: summarizer_config.clip_chars] + "... [summary truncated to fit]"
--- a/tests/integration_test_summarizer.py
+++ b/tests/integration_test_summarizer.py
@@ -1157,6 +1157,7 @@ async def test_sliding_window_cutoff_index_does_not_exceed_message_count(server:
        summary, remaining_messages = await summarize_via_sliding_window(
            actor=actor,
            llm_config=llm_config,
            agent_llm_config=llm_config, # case where agent and summarizer have same config
            summarizer_config=summarizer_config,
            in_context_messages=messages,
        )
--- a/tests/test_provider_trace_summarization.py
+++ b/tests/test_provider_trace_summarization.py
@@ -210,6 +210,7 @@ class TestSummarizeSlidingWindowTelemetryContext:
            await summarizer_sliding_window.summarize_via_sliding_window(
                actor=mock_actor,
                llm_config=mock_llm_config,
                agent_llm_config=mock_llm_config, # case where agent and summarizer have same config
                summarizer_config=mock_compaction_settings,
                in_context_messages=mock_messages,
                agent_id=agent_id,