Add modes self and self_sliding_window for prompt caching (#9372)

* add self compaction method with proper caching (pass in tools, don't refresh sys prompt beforehand) + sliding fallback

* updated prompts for self compaction

* add tests for self, self_sliding_window modes and w/o refresh messages before compaction

* add cache logging to summarization

* better handling to prevent agent from continuing convo on self modes

* if mode changes via summarize endpoint, will use default prompt for the new mode

---------

Co-authored-by: Amy Guan <amy@letta.com>
This commit is contained in:
amysguan
2026-02-24 10:15:36 -08:00
committed by Caren Thomas
parent 47d55362a4
commit 47b0c87ebe
15 changed files with 1065 additions and 223 deletions

View File

@@ -31151,7 +31151,7 @@
}
],
"title": "Model",
"description": "Model handle to use for summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
"description": "Model handle to use for sliding_window/all summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
},
"model_settings": {
"anyOf": [
@@ -31256,7 +31256,12 @@
},
"mode": {
"type": "string",
"enum": ["all", "sliding_window", "self"],
"enum": [
"all",
"sliding_window",
"self_compact_all",
"self_compact_sliding_window"
],
"title": "Mode",
"description": "The type of summarization technique to use.",
"default": "sliding_window"
@@ -31264,12 +31269,12 @@
"sliding_window_percentage": {
"type": "number",
"title": "Sliding Window Percentage",
"description": "The percentage of the context window to keep post-summarization (only used in sliding window mode)."
"description": "The percentage of the context window to keep post-summarization (only used in sliding window modes)."
}
},
"type": "object",
"title": "CompactionSettings",
"description": "Configuration for conversation compaction / summarization.\n\n``model`` is the only required user-facing field it specifies the summarizer\nmodel handle (e.g. ``\"openai/gpt-4o-mini\"``). Per-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
"description": "Configuration for conversation compaction / summarization.\n\nPer-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
},
"CompactionSettings-Output": {
"properties": {
@@ -31283,7 +31288,7 @@
}
],
"title": "Model",
"description": "Model handle to use for summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
"description": "Model handle to use for sliding_window/all summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults."
},
"model_settings": {
"anyOf": [
@@ -31388,7 +31393,12 @@
},
"mode": {
"type": "string",
"enum": ["all", "sliding_window", "self"],
"enum": [
"all",
"sliding_window",
"self_compact_all",
"self_compact_sliding_window"
],
"title": "Mode",
"description": "The type of summarization technique to use.",
"default": "sliding_window"
@@ -31396,12 +31406,12 @@
"sliding_window_percentage": {
"type": "number",
"title": "Sliding Window Percentage",
"description": "The percentage of the context window to keep post-summarization (only used in sliding window mode)."
"description": "The percentage of the context window to keep post-summarization (only used in sliding window modes)."
}
},
"type": "object",
"title": "CompactionSettings",
"description": "Configuration for conversation compaction / summarization.\n\n``model`` is the only required user-facing field it specifies the summarizer\nmodel handle (e.g. ``\"openai/gpt-4o-mini\"``). Per-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
"description": "Configuration for conversation compaction / summarization.\n\nPer-model settings (temperature,\nmax tokens, etc.) are derived from the default configuration for that handle."
},
"CompactionStats": {
"properties": {

View File

@@ -1014,7 +1014,8 @@ class LettaAgentV3(LettaAgentV2):
# Ensure system prompt is recompiled before summarization so compaction
# operates on the latest system+memory state (including recent repairs).
messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
# NOTE: we no longer refresh the system prompt before compaction so we can leverage cache for self mode
# messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
summary_message, messages, summary_text = await self.compact(
messages,
@@ -1233,7 +1234,8 @@ class LettaAgentV3(LettaAgentV2):
try:
# Ensure system prompt is recompiled before summarization so compaction
# operates on the latest system+memory state (including recent repairs).
messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
# NOTE: we no longer refresh the system prompt before compaction so we can leverage cache for self mode
# messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
summary_message, messages, summary_text = await self.compact(
messages,
@@ -1874,6 +1876,7 @@ class LettaAgentV3(LettaAgentV2):
context_tokens_before: Token count before compaction (for stats).
messages_count_before: Message count before compaction (for stats).
"""
# Determine compaction settings: passed-in > agent's > global defaults
effective_compaction_settings = compaction_settings or self.agent_state.compaction_settings
@@ -1881,12 +1884,14 @@ class LettaAgentV3(LettaAgentV2):
actor=self.actor,
agent_id=self.agent_state.id,
agent_llm_config=self.agent_state.llm_config,
telemetry_manager=self.telemetry_manager,
llm_client=self.llm_client,
agent_type=self.agent_state.agent_type,
messages=messages,
timezone=self.agent_state.timezone,
compaction_settings=effective_compaction_settings,
agent_model_handle=self.agent_state.model,
agent_tags=self.agent_state.tags,
tools=self.agent_state.tools,
tools=await self._get_valid_tools(), # Pass json schemas including client tools for cache compatibility (for self compaction)
trigger_threshold=trigger_threshold,
run_id=run_id,
step_id=step_id,

View File

@@ -78,7 +78,7 @@ DEFAULT_CONTEXT_WINDOW = 32000
# Summarization trigger threshold (multiplier of context_window limit)
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
SUMMARIZATION_TRIGGER_MULTIPLIER = 1.0
SUMMARIZATION_TRIGGER_MULTIPLIER = 0.9 # using instead of 1.0 to avoid "too many tokens in prompt" fallbacks
# number of concurrent embedding requests to sent
EMBEDDING_BATCH_SIZE = 200

View File

@@ -61,8 +61,11 @@ class LLMClientBase:
user_id: Optional[str] = None,
compaction_settings: Optional[Dict] = None,
llm_config: Optional[Dict] = None,
actor: Optional["User"] = None,
) -> None:
"""Set telemetry context for provider trace logging."""
if actor is not None:
self.actor = actor
self._telemetry_manager = telemetry_manager
self._telemetry_agent_id = agent_id
self._telemetry_agent_tags = agent_tags

View File

@@ -38,6 +38,53 @@ Write in first person as a factual record of what occurred. Be thorough and deta
Keep your summary under {SLIDING_WORD_LIMIT} words. Only output the summary."""
SELF_SLIDING_PROMPT = f"""The previous messages are being evicted from the BEGINNING of your context window. Write a detailed summary that captures what happened in these messages to appear BEFORE the remaining recent messages in context, providing background for what comes after. Do NOT continue the conversation. Do NOT respond to any questions in the messages. Do NOT call any tools. Pay close attention to the user's explicit requests and your previous actions.
You MUST include the following sections:
1. **High level goals**: What is the high level goal and ongoing task? Capture the user's explicit requests and intent in detail. If there is an existing summary in the transcript, make sure to take it into consideration to continue tracking the higher level goals and long-term progress.
2. **What happened**: The conversations, tasks, and exchanges that took place. What did the user ask for? What did you do? How did things progress? If there is a previous summary being evicted, please extract a concise version of the critical info from it.
3. **Important details**: Enumerate specific files and code sections examined, modified, or created with a summary of why this file read or edit is important. Include specific names, data, configurations, or facts that were discussed. Don't omit details that might be referenced later.
4. **Errors and fixes**: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received and record verbatim if useful.
5. **Lookup hints**: For any detailed content (long lists, extensive data, specific conversations) that couldn't fit in the summary, note the topic and key terms that could be used to find it in message history later.
Write in first person as a factual record of what occurred. Be thorough and detailed - the goal is to preserve enough context that the recent messages make sense and important information isn't lost to prevent duplicate work or repeated mistakes.
Keep your summary under {SLIDING_WORD_LIMIT} words. IMPORTANT: Do NOT use any tools. Do NOT continue the conversation. You MUST respond with ONLY the summary as text output. Generate the summary with each section as mentioned:
"""
SELF_ALL_PROMPT = f"""Your task is to create a detailed summary of the conversation so far. Do NOT continue the conversation. Do NOT respond to any questions in the messages. Do NOT call any tools. Pay close attention to the user's explicit requests and your previous actions. This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
You MUST include the following sections:
1. **High level goals**: What is the high level goal and ongoing task? Capture the user's explicit requests and intent in detail. If there is an existing summary in the transcript, make sure to take it into consideration to continue tracking the higher level goals and long-term progress.
2. **What happened**: The conversations, tasks, and exchanges that took place. What did the user ask for? What did you do? How did things progress? If there is a previous summary being evicted, please extract a concise version of the critical info from it.
3. **Important details**: Enumerate specific files and code sections examined, modified, or created with a summary of why this file read or edit is important. Include specific names, data, configurations, or facts that were discussed. Don't omit details that might be referenced later.
4. **Errors and fixes**: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received and record verbatim if useful.
5. **Current state**: Describe in detail precisely what is currently being worked on, paying special attention to the most recent messages from both user and assistant. Include file names and code snippets where applicable.
6. **Optional Next Step**: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's most recent explicit requests and the most current task. If your last task was concluded, then only list next steps if they are explicitly in line with the users request. If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off.
7. **Lookup hints**: For any detailed content (long lists, extensive data, specific conversations) that couldn't fit in the summary, note the topic and key terms that could be used to find it in message history later.
Write in first person as a factual record of what occurred. Be concise but thorough - the goal is to preserve enough context that the recent messages make sense and important information isn't lost to prevent duplicate work or repeated mistakes.
Keep your summary under {ALL_WORD_LIMIT} words.
IMPORTANT: Do NOT use any tools. Do NOT continue the conversation. You MUST respond with ONLY the summary as text output. Generate the summary with each section as mentioned:
"""
ANTHROPIC_SUMMARY_PROMPT = """You have been working on the task described above but have not yet completed it. Write a continuation summary that will allow you (or another instance of yourself) to resume work efficiently in a future context window where the conversation history will be replaced with this summary. Your summary should be structured, concise, and actionable. Include:
1. Task Overview
@@ -70,7 +117,6 @@ Write the summary from the perspective of the AI (use the first person from the
Only output the summary, do NOT include anything else in your output.
"""
WORD_LIMIT = 250
SHORTER_SUMMARY_PROMPT = f"""The following messages are being evicted from your context window. Write a detailed summary that captures what happened in these messages.
This summary will appear BEFORE the remaining recent messages in context, providing background for what comes after. Include:
@@ -85,4 +131,104 @@ This summary will appear BEFORE the remaining recent messages in context, provid
Write in first person as a factual record of what occurred. Be thorough and detailed - the goal is to preserve enough context that the recent messages make sense and important information isn't lost.
Keep your summary under {WORD_LIMIT} words. Only output the summary."""
Keep your summary under {SLIDING_WORD_LIMIT} words. Only output the summary."""
SELF_SUMMARIZATION_PROMPT = """Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions.
This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context.
Before providing your final summary, wrap your analysis in <analysis> tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process:
1. Chronologically analyze each message and section of the conversation. For each section thoroughly identify:
- The user's explicit requests and intents
- Your approach to addressing the user's requests
- Key decisions, technical concepts and code patterns
- Specific details like:
- file names
- full code snippets
- function signatures
- file edits
- Errors that you ran into and how you fixed them
- Pay special attention to specific user feedback that you received, especially if the user told you to do something differently.
2. Double-check for technical accuracy and completeness, addressing each required element thoroughly.
Your summary should include the following sections:
1. Primary Request and Intent: Capture all of the user's explicit requests and intents in detail
2. Key Technical Concepts: List all important technical concepts, technologies, and frameworks discussed.
3. Files and Code Sections: Enumerate specific files and code sections examined, modified, or created. Pay special attention to the most recent messages and include full code snippets where applicable and include a summary of why this file read or edit is important.
4. Errors and fixes: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received, especially if the user told you to do something differently.
5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts.
6. All user messages: List ALL user messages that are not tool results. These are critical for understanding the users' feedback and changing intent.
7. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on.
8. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include file names and code snippets where applicable.
9. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's most recent explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the users request. Do not start on tangential requests or really old requests that were already completed without confirming with the user first.
If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. This should be verbatim to ensure there's no drift in task interpretation.
Here's an example of how your output should be structured:
<example>
<analysis>
[Your thought process, ensuring all points are covered thoroughly and accurately]
</analysis>
<summary>
1. Primary Request and Intent:
[Detailed description]
2. Key Technical Concepts:
- [Concept 1]
- [Concept 2]
- [...]
3. Files and Code Sections:
- [File Name 1]
- [Summary of why this file is important]
- [Summary of the changes made to this file, if any]
- [Important Code Snippet]
- [File Name 2]
- [Important Code Snippet]
- [...]
4. Errors and fixes:
- [Detailed description of error 1]:
- [How you fixed the error]
- [User feedback on the error if any]
- [...]
5. Problem Solving:
[Description of solved problems and ongoing troubleshooting]
6. All user messages:
- [Detailed non tool use user message]
- [...]
7. Pending Tasks:
- [Task 1]
- [Task 2]
- [...]
8. Current Work:
[Precise description of current work]
9. Optional Next Step:
[Optional Next step to take]
</summary>
</example>
Please provide your summary based on the conversation so far, following this structure and ensuring precision and thoroughness in your response.
There may be additional summarization instructions provided in the included context. If so, remember to follow these instructions when creating the above summary. Examples of instructions include:
<example>
## Compact Instructions
When summarizing the conversation focus on typescript code changes and also remember the mistakes you made and how you fixed them.
</example>
<example>
# Summary instructions
When you are using compact - please focus on test output and code changes. Include file reads verbatim.
</example>
IMPORTANT: Do NOT use any tools. You MUST respond with ONLY the <summary>...</summary> block as your text output.
"""

View File

@@ -2385,7 +2385,7 @@ async def summarize_messages(
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
agent = await server.agent_manager.get_agent_by_id_async(agent_id, actor, include_relationships=["multi_agent_group"])
agent = await server.agent_manager.get_agent_by_id_async(agent_id, actor, include_relationships=["multi_agent_group", "tools"])
agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
@@ -2412,15 +2412,17 @@ async def summarize_messages(
if agent.compaction_settings and request and request.compaction_settings:
# Start with agent's settings, override with new values from request
# Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
compaction_settings = agent.compaction_settings
compaction_settings = agent.compaction_settings.copy() # do not mutate original agent compaction settings
changed_fields = request.compaction_settings.model_fields_set
for field in changed_fields:
setattr(compaction_settings, field, getattr(request.compaction_settings, field))
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if "mode" in changed_fields and compaction_settings.mode != request.compaction_settings.mode:
compaction_settings = compaction_settings.set_mode_specific_prompt()
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode:
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
else:
compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
num_messages_before = len(in_context_messages)
@@ -2434,6 +2436,7 @@ async def summarize_messages(
# update the agent state
logger.info(f"Summarized {num_messages_before} messages to {num_messages_after}")
if num_messages_before <= num_messages_after:
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",

View File

@@ -537,10 +537,13 @@ async def compact_conversation(
# Validate compaction reduced messages
if num_messages_before <= num_messages_after:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
logger.warning(
f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
)
# raise HTTPException(
# status_code=status.HTTP_400_BAD_REQUEST,
# detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
# )
# Checkpoint the messages (this will update the conversation_messages table)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)

View File

@@ -4,21 +4,24 @@ from dataclasses import dataclass
from typing import List, Optional
from letta.helpers.message_helper import convert_message_creates_to_messages
from letta.llm_api.llm_client import LLMClient
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message, MessageCreate
from letta.schemas.tool import Tool
from letta.schemas.user import User
from letta.services.summarizer.self_summarizer import self_summarize_all, self_summarize_sliding_window
from letta.services.summarizer.summarizer_all import summarize_all
from letta.services.summarizer.summarizer_config import CompactionSettings, get_default_summarizer_model
from letta.services.summarizer.summarizer_config import CompactionSettings, get_default_prompt_for_mode, get_default_summarizer_model
from letta.services.summarizer.summarizer_sliding_window import (
count_tokens,
count_tokens_with_tools,
summarize_via_sliding_window,
)
from letta.services.telemetry_manager import TelemetryManager
from letta.system import package_summarize_message_no_counts
logger = get_logger(__name__)
@@ -106,12 +109,14 @@ async def compact_messages(
actor: User,
agent_id: str,
agent_llm_config: LLMConfig,
telemetry_manager: TelemetryManager,
llm_client: LLMClient,
agent_type: AgentType,
messages: List[Message],
timezone: str,
compaction_settings: Optional[CompactionSettings] = None,
agent_model_handle: Optional[str] = None,
agent_tags: Optional[List[str]] = None,
tools: Optional[List[Tool]] = None,
tools: Optional[List[dict]] = None, # Tool json schemas
trigger_threshold: Optional[int] = None,
run_id: Optional[str] = None,
step_id: Optional[str] = None,
@@ -154,7 +159,105 @@ async def compact_messages(
)
summarization_mode_used = summarizer_config.mode
if summarizer_config.mode == "all":
if summarizer_config.mode == "self_compact_all":
try:
summary, compacted_messages = await self_summarize_all(
actor=actor,
agent_id=agent_id,
agent_llm_config=agent_llm_config,
telemetry_manager=telemetry_manager,
llm_client=llm_client,
agent_type=agent_type,
messages=messages,
compaction_settings=summarizer_config,
run_id=run_id,
step_id=step_id,
timezone=timezone,
agent_tags=agent_tags,
tools=tools,
)
except Exception as e:
logger.error(f"Self summarization failed with exception: {str(e)}. Falling back to self sliding window mode.")
try:
fallback_config = summarizer_config.model_copy(
update={
"mode": "self_compact_sliding_window",
"prompt": get_default_prompt_for_mode("self_compact_sliding_window"),
}
)
summary, compacted_messages = await self_summarize_sliding_window(
actor=actor,
agent_id=agent_id,
agent_llm_config=agent_llm_config,
telemetry_manager=telemetry_manager,
llm_client=llm_client,
agent_type=agent_type,
messages=messages,
compaction_settings=fallback_config,
run_id=run_id,
step_id=step_id,
timezone=timezone,
agent_tags=agent_tags,
tools=tools,
)
summarization_mode_used = "self_compact_sliding_window"
except Exception as e:
logger.error(f"Self sliding window summarization failed with exception: {str(e)}. Falling back to all mode.")
fallback_config = summarizer_config.model_copy(
update={
"mode": "all",
"prompt": get_default_prompt_for_mode("all"),
}
)
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
summarizer_config=fallback_config,
in_context_messages=messages,
agent_id=agent_id,
agent_tags=agent_tags,
run_id=run_id,
step_id=step_id,
)
summarization_mode_used = "all"
elif summarizer_config.mode == "self_compact_sliding_window":
try:
summary, compacted_messages = await self_summarize_sliding_window(
actor=actor,
agent_id=agent_id,
agent_llm_config=agent_llm_config,
telemetry_manager=telemetry_manager,
llm_client=llm_client,
agent_type=agent_type,
messages=messages,
compaction_settings=summarizer_config,
run_id=run_id,
step_id=step_id,
timezone=timezone,
agent_tags=agent_tags,
tools=tools,
)
except Exception as e:
# Prompts for all and self mode should be similar --> can use original prompt
logger.error(f"Self sliding window summarization failed with exception: {str(e)}. Falling back to all mode.")
fallback_config = summarizer_config.model_copy(
update={
"mode": "all",
"prompt": get_default_prompt_for_mode("all"),
}
)
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
summarizer_config=fallback_config,
in_context_messages=messages,
agent_id=agent_id,
agent_tags=agent_tags,
run_id=run_id,
step_id=step_id,
)
summarization_mode_used = "all"
elif summarizer_config.mode == "all":
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
@@ -180,10 +283,16 @@ async def compact_messages(
)
except Exception as e:
logger.error(f"Sliding window summarization failed with exception: {str(e)}. Falling back to all mode.")
fallback_config = summarizer_config.model_copy(
update={
"mode": "all",
"prompt": get_default_prompt_for_mode("all"),
}
)
summary, compacted_messages = await summarize_all(
actor=actor,
llm_config=summarizer_llm_config,
summarizer_config=summarizer_config,
summarizer_config=fallback_config,
in_context_messages=messages,
agent_id=agent_id,
agent_tags=agent_tags,
@@ -271,6 +380,7 @@ async def compact_messages(
summary=summary,
timezone=timezone,
compaction_stats=compaction_stats,
mode=summarization_mode_used,
)
if use_summary_role:

View File

@@ -0,0 +1,283 @@
"""Claude Code-style summarization where agent self-summarizes using its own LLM."""
from typing import List, Optional, Tuple
from letta.llm_api.llm_client import LLMClient
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.enums import MessageRole, ProviderType
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.schemas.user import User
from letta.services.summarizer.summarizer_config import CompactionSettings, get_default_prompt_for_mode
from letta.services.summarizer.summarizer_sliding_window import count_tokens
from letta.services.telemetry_manager import TelemetryManager
logger = get_logger(__name__)
@trace_method
async def self_summarize_all(
    actor: User,
    agent_id: str,
    agent_llm_config: LLMConfig,
    telemetry_manager: TelemetryManager,
    llm_client: LLMClient,
    agent_type: AgentType,
    messages: List[Message],
    compaction_settings: CompactionSettings,
    timezone: str,
    run_id: Optional[str] = None,
    step_id: Optional[str] = None,
    agent_tags: Optional[List[str]] = None,
    # For cache compatibility with regular agent requests
    tools: Optional[List[dict]] = None,
) -> Tuple[str, List[Message]]:
    """Claude Code-style self-compaction: the agent's own LLM summarizes its context.

    The summarization prompt is appended as a user message, and the agent's LLM is
    called with the existing in-context messages plus that request — using the same
    tools and request parameters as a regular agent step so the provider's prompt
    cache can be reused. The LLM's text response is taken as the summary.

    Args:
        actor: User performing the operation (supplies org/user ids for telemetry).
        agent_id: ID of the agent whose context is being compacted.
        agent_llm_config: The agent's own LLM config (the summary is produced by
            the same model the agent runs on, not a separate summarizer model).
        telemetry_manager: Telemetry sink for provider trace logging.
        llm_client: Client used to build and send the LLM request.
        agent_type: Agent type, forwarded to request building.
        messages: Full in-context message list; ``messages[0]`` is treated as the
            system message and is always retained.
        compaction_settings: Controls the prompt and optional character clipping.
            If ``prompt`` is None the default prompt for the mode is filled in
            (NOTE(review): this mutates the passed-in settings object — confirm
            callers do not rely on it staying unset).
        timezone: Agent timezone; currently unused here, kept for signature
            parity with the other summarizer entry points.
        run_id: Optional run id for telemetry.
        step_id: Optional step id for telemetry.
        agent_tags: Optional agent tags for telemetry.
        tools: Tool JSON schemas matching the agent's regular requests, passed so
            the provider prompt cache stays valid.

    Returns:
        Tuple of ``(summary_text, retained_messages)`` where ``retained_messages``
        is the system message plus any protected trailing messages. Construction
        of the summary Message object is handled by the calling compaction flow.

    Raises:
        Exception: The provider-specific error translated via
            ``llm_client.handle_llm_error`` when the summarization request fails
            (most likely a context-window-exceeded error).
    """
    logger.info(f"Starting self-summarization for {len(messages)} messages")

    # Nothing beyond the system message — nothing to summarize.
    if len(messages) < 2:
        logger.warning("Too few messages to summarize")
        return "No conversation to summarize.", messages

    # The system message is always protected from eviction.
    system_message = messages[0]

    # Split into evictable messages and a protected tail (cutoff rules for what
    # can/can't be separated live in the helper).
    messages_to_summarize, protected_messages = _get_protected_messages(messages)

    # Resolve the summarization prompt, falling back to the mode's default.
    if compaction_settings.prompt is None:
        compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
    logger.info(f"Summarizing {len(messages)} messages with prompt: {compaction_settings.prompt[:100]}...")

    summary_request_message = Message(
        role=MessageRole.user,
        content=[TextContent(text=compaction_settings.prompt)],
        agent_id=agent_id,
    )

    # If the last message is not from the assistant, insert a dummy assistant turn
    # first so the model treats the summary request as a fresh user turn instead
    # of continuing the in-flight conversation or re-calling tools.
    # NOTE(review): assumes _get_protected_messages never returns an empty
    # messages_to_summarize list — confirm, else this indexes an empty list.
    if messages_to_summarize[-1].role != MessageRole.assistant:
        messages_with_request = [
            *messages_to_summarize,
            Message(role=MessageRole.assistant, content=[TextContent(text="I understand. Let me summarize.")], agent_id=agent_id),
            summary_request_message,
        ]
        logger.info(
            f"Calling agent's LLM for self-summarization with {len(messages_with_request)} messages ({len(messages_to_summarize)} in-context + 1 dummy assistant message + 1 summary request)"
        )
    else:
        # Last message is already assistant, safe to append the user request directly.
        messages_with_request = [*messages_to_summarize, summary_request_message]
        logger.info(
            f"Calling agent's LLM for self-summarization with {len(messages_with_request)} messages ({len(messages_to_summarize)} in-context + 1 summary request)"
        )

    # Set telemetry context so the provider trace is attributed to this agent/run.
    llm_client.set_telemetry_context(
        telemetry_manager=telemetry_manager,
        agent_id=agent_id,
        agent_tags=agent_tags,
        run_id=run_id,
        step_id=step_id,
        call_type="summarization",
        org_id=actor.organization_id if actor.organization_id else None,
        user_id=actor.id if actor.id else None,
        compaction_settings=compaction_settings.model_dump() if compaction_settings else None,
        actor=actor,
    )

    # Build request data with the agent's own llm_client, matching the params used
    # by agent_v3 steps so the provider prompt cache remains warm.
    request_data = llm_client.build_request_data(
        agent_type,
        messages_with_request,
        agent_llm_config,
        tools=tools,
        force_tool_call=None,  # Don't force tool calls during summarization
        requires_subsequent_tool_call=False,
        # tool_return_truncation_chars=TOOL_RETURN_TRUNCATION_CHARS,
    )

    # Match the parallel_tool_calls setting from the agent's llm_config for cache
    # compatibility; mirrors the step-processing logic in letta_agent_v3.py.
    if agent_llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]:
        if isinstance(request_data.get("tool_choice"), dict) and "disable_parallel_tool_use" in request_data["tool_choice"]:
            request_data["tool_choice"]["disable_parallel_tool_use"] = not agent_llm_config.parallel_tool_calls

    # Call the LLM via the shared summarizer request runner (local import avoids a
    # circular dependency with the summarizer module).
    from letta.services.summarizer.summarizer import _run_summarizer_request

    try:
        summary_text = await _run_summarizer_request(request_data, messages_with_request, agent_llm_config, llm_client)
    except Exception as e:
        logger.error(f"Self-summarization request failed: {e}")
        # Translate the provider error (likely a context-window-exceeded error)
        # into the client's canonical exception type before propagating.
        translated = llm_client.handle_llm_error(e, llm_config=agent_llm_config)
        logger.error(f"Self-summarization request failed: {translated}")
        raise translated from e

    # Clip the summary if a character limit is configured.
    if compaction_settings.clip_chars is not None and len(summary_text) > compaction_settings.clip_chars:
        logger.warning(f"CC summary length {len(summary_text)} exceeds clip length {compaction_settings.clip_chars}. Truncating.")
        summary_text = summary_text[: compaction_settings.clip_chars] + "... [summary truncated to fit]"

    # Build final messages: [system] + protected messages.
    # Summary message handling is done in the compact parent function.
    final_messages = [system_message]
    if protected_messages:
        final_messages += protected_messages

    logger.info(
        f"Self-summarization complete. Summary length: {len(summary_text)} chars. Keeping {len(protected_messages)} protected messages."
    )
    return summary_text, final_messages
@trace_method
async def self_summarize_sliding_window(
    actor: User,
    agent_id: str,
    agent_llm_config: LLMConfig,
    telemetry_manager: TelemetryManager,
    llm_client: LLMClient,
    agent_type: AgentType,
    messages: List[Message],
    compaction_settings: CompactionSettings,
    timezone: str,
    run_id: Optional[str] = None,
    step_id: Optional[str] = None,
    agent_tags: Optional[List[str]] = None,
    # For cache compatibility with regular agent requests
    tools: Optional[List[dict]] = None,
) -> Tuple[str, List[Message]]:
    """Self-compact the oldest slice of the conversation via the agent's own LLM.

    Grows an eviction window from the start of the history (always keeping the
    system prompt, and never evicting a trailing pending approval request) until
    the retained messages fit under the token goal, then delegates the actual
    summarization of the evicted slice to ``self_summarize_all``.

    Returns:
        Tuple of ``(summary_text, new_in_context_messages)``. The message list is
        ``[system prompt] + protected messages + retained recent messages``; the
        packaged summary message itself is inserted by the caller.

    Raises:
        ValueError: if no valid cutoff message can be found (the caller falls back
            to complete summarization), or the chosen cutoff would evict the final
            message (which may be a pending approval request).
    """
    logger.info("Starting self-summarization with sliding window mode")
    # Protect system message and handle last message
    if len(messages) < 2:
        logger.warning("Too few messages to summarize")
        return "No conversation to summarize.", messages
    system_prompt = messages[0]

    # cannot evict a pending approval request (will cause client-side errors)
    total_message_count = len(messages)
    if messages[-1].role == MessageRole.approval:
        maximum_message_index = total_message_count - 2
    else:
        maximum_message_index = total_message_count - 1

    assert compaction_settings.sliding_window_percentage <= 1.0, "Sliding window percentage must be less than or equal to 1.0"
    # NOTE(review): eviction starts at `sliding_window_percentage` itself here,
    # while summarize_via_sliding_window starts at `1 - sliding_window_percentage`;
    # confirm which convention is intended for self compaction.
    eviction_percentage = compaction_settings.sliding_window_percentage
    assistant_message_index = None
    # Token budget the retained (post-compaction) buffer must fit under.
    goal_tokens = (1 - compaction_settings.sliding_window_percentage) * agent_llm_config.context_window
    # Start pessimistically at a full context window so the loop always runs at least once.
    approx_token_count = agent_llm_config.context_window

    # allow approvals to be cutoffs (for headless agents) but ensure proper grouping with tool calls
    def is_valid_cutoff(message: Message) -> bool:
        if message.role == MessageRole.assistant:
            return True
        if message.role == MessageRole.approval:
            return message.tool_calls is not None and len(message.tool_calls) > 0
        return False

    post_summarization_buffer = [system_prompt]
    while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
        # more eviction percentage
        eviction_percentage += 0.10
        # calculate message_cutoff_index
        message_cutoff_index = round(eviction_percentage * total_message_count)
        # get index of the last valid cutoff message at or before the cutoff point
        assistant_message_index = next(
            (i for i in reversed(range(1, message_cutoff_index + 1)) if i < len(messages) and is_valid_cutoff(messages[i])),
            None,
        )
        if assistant_message_index is None:
            logger.warning(
                f"No assistant/approval message found for evicting up to index {message_cutoff_index}, incrementing eviction percentage"
            )
            continue
        # update token count
        logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
        post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]]
        approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer)
        logger.info(
            f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
        )

    if assistant_message_index is None or eviction_percentage >= 1.0:
        raise ValueError("No assistant message found for sliding window summarization")  # fall back to complete summarization
    if assistant_message_index >= maximum_message_index:
        # need to keep the last message (might contain an approval request)
        raise ValueError(f"Assistant message index {assistant_message_index} is at the end of the message buffer, skipping summarization")

    messages_to_summarize = messages[:assistant_message_index]
    logger.info(
        f"Summarizing {len(messages_to_summarize)} messages with self summarization sliding window, from index 1 to {assistant_message_index} (out of {total_message_count})"
    )
    # pass in messages_to_summarize instead of messages
    summary_text, final_messages = await self_summarize_all(
        actor=actor,
        agent_id=agent_id,
        agent_llm_config=agent_llm_config,
        telemetry_manager=telemetry_manager,
        llm_client=llm_client,
        agent_type=agent_type,
        messages=messages_to_summarize,
        compaction_settings=compaction_settings,
        timezone=timezone,
        run_id=run_id,
        step_id=step_id,
        agent_tags=agent_tags,
        tools=tools,
    )
    # final_messages is [system prompt] (+ any protected messages); the retained
    # buffer also begins with the system prompt, so skip its first element to
    # avoid duplicating the system message in the new context window.
    return summary_text, final_messages + post_summarization_buffer[1:]
def _get_protected_messages(in_context_messages: List[Message]) -> Tuple[List[Message], List[Message]]:
    """Split the context window into (messages_to_summarize, protected_messages).

    A trailing pending approval request must never be evicted (doing so causes
    client-side errors). When the assistant message directly before it shares the
    same step_id, the two form one LLM response (the assistant message carries the
    thinking/tool_calls, the approval carries the approval-required subset) and are
    protected together.
    """
    last = in_context_messages[-1]
    if last.role != MessageRole.approval:
        # No pending approval at the tail: the entire window may be summarized.
        return in_context_messages, []

    prior = in_context_messages[-2] if len(in_context_messages) >= 2 else None
    if prior is not None and prior.role == MessageRole.assistant and prior.step_id == last.step_id:
        # Same-step assistant + approval pair: keep both out of the summary.
        return in_context_messages[:-2], [prior, last]

    # Only the pending approval request itself is protected.
    return in_context_messages[:-1], [last]

View File

@@ -495,7 +495,7 @@ async def simple_summary(
# Build the initial transcript without clamping to preserve fidelity
# TODO proactively clip here?
summary_transcript = simple_formatter(messages)
logger.info(f"Summarizing {len(messages)} messages with prompt: {system_prompt}")
logger.info(f"Summarizing {len(messages)} messages with prompt: {system_prompt[:100]}...")
if include_ack:
logger.info(f"Summarizing with ACK for model {llm_config.model}")
@@ -519,86 +519,9 @@ async def simple_summary(
summarizer_llm_config.put_inner_thoughts_in_kwargs = False
summarizer_llm_config.enable_reasoner = False
async def _run_summarizer_request(req_data: dict, req_messages_obj: list[Message]) -> str:
"""Run summarization request and return assistant text.
For Anthropic, use provider-side streaming to avoid long-request failures
(Anthropic requires streaming for requests that may exceed ~10 minutes).
"""
if summarizer_llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]:
logger.info(
"Summarizer: using provider streaming (%s/%s) to avoid long-request failures",
summarizer_llm_config.model_endpoint_type,
summarizer_llm_config.model,
)
# Stream from provider and accumulate the final assistant text.
from letta.interfaces.anthropic_parallel_tool_call_streaming_interface import (
SimpleAnthropicStreamingInterface,
)
interface = SimpleAnthropicStreamingInterface(
requires_approval_tools=[],
run_id=None,
step_id=None,
)
# AnthropicClient.stream_async sets request_data["stream"] = True internally.
try:
stream = await llm_client.stream_async(req_data, summarizer_llm_config)
async for _chunk in interface.process(stream):
pass
content_parts = interface.get_content()
text = "".join(part.text for part in content_parts if isinstance(part, TextContent)).strip()
await llm_client.log_provider_trace_async(
request_data=req_data,
response_json={
"content": text,
"model": summarizer_llm_config.model,
"usage": {
"input_tokens": getattr(interface, "input_tokens", None),
"output_tokens": getattr(interface, "output_tokens", None),
},
},
llm_config=summarizer_llm_config,
)
except Exception as e:
await llm_client.log_provider_trace_async(
request_data=req_data,
response_json=None,
llm_config=summarizer_llm_config,
error_msg=str(e),
error_type=type(e).__name__,
)
raise
if not text:
logger.warning("No content returned from summarizer (streaming path)")
raise Exception("Summary failed to generate")
return text
# Default: non-streaming provider request, then normalize via chat-completions conversion.
logger.debug(
"Summarizer: using non-streaming request (%s/%s)",
summarizer_llm_config.model_endpoint_type,
summarizer_llm_config.model,
)
response_data = await llm_client.request_async_with_telemetry(req_data, summarizer_llm_config)
response = await llm_client.convert_response_to_chat_completion(
response_data,
req_messages_obj,
summarizer_llm_config,
)
if response.choices[0].message.content is None:
logger.warning("No content returned from summarizer")
raise Exception("Summary failed to generate")
return response.choices[0].message.content.strip()
request_data = llm_client.build_request_data(AgentType.letta_v1_agent, input_messages_obj, summarizer_llm_config, tools=[])
try:
summary = await _run_summarizer_request(request_data, input_messages_obj)
summary = await _run_summarizer_request(request_data, input_messages_obj, summarizer_llm_config, llm_client)
except Exception as e:
# handle LLM error (likely a context window exceeded error)
try:
@@ -636,7 +559,7 @@ async def simple_summary(
)
try:
summary = await _run_summarizer_request(request_data, input_messages_obj)
summary = await _run_summarizer_request(request_data, input_messages_obj, summarizer_llm_config, llm_client)
except Exception as fallback_error_a:
# Fallback B: hard-truncate the user transcript to fit a conservative char budget
logger.warning(f"Clamped tool returns still overflowed ({fallback_error_a}). Falling back to transcript truncation.")
@@ -673,7 +596,7 @@ async def simple_summary(
tools=[],
)
try:
summary = await _run_summarizer_request(request_data, input_messages_obj)
summary = await _run_summarizer_request(request_data, input_messages_obj, summarizer_llm_config, llm_client)
except Exception as fallback_error_b:
logger.error(f"Transcript truncation fallback also failed: {fallback_error_b}. Propagating error.")
logger.info(f"Full fallback summarization payload: {request_data}")
@@ -742,3 +665,84 @@ def format_transcript(messages: List[Message], include_system: bool = False) ->
lines.append(f"{role}: {text}")
return lines
@trace_method
async def _run_summarizer_request(req_data: dict, req_messages_obj: list[Message], llm_config: LLMConfig, llm_client: LLMClient) -> str:
    """Run summarization request and return assistant text.

    For Anthropic, use provider-side streaming to avoid long-request failures
    (Anthropic requires streaming for requests that may exceed ~10 minutes).

    Args:
        req_data: Provider-ready request payload (as built by the client's build_request_data).
        req_messages_obj: The Message objects the request was built from; needed to
            normalize the non-streaming response into chat-completion form.
        llm_config: Config of the summarizer model being called.
        llm_client: Client used to issue the request and log provider traces.

    Returns:
        The stripped assistant text of the generated summary.

    Raises:
        Exception: if the provider returns no content, or the underlying request
            fails (streaming-path failures are trace-logged before re-raising).
    """
    if llm_config.model_endpoint_type in [ProviderType.anthropic, ProviderType.bedrock]:
        logger.info(
            "Summarizer: using provider streaming (%s/%s) to avoid long-request failures",
            llm_config.model_endpoint_type,
            llm_config.model,
        )
        # Stream from provider and accumulate the final assistant text.
        from letta.interfaces.anthropic_parallel_tool_call_streaming_interface import (
            SimpleAnthropicStreamingInterface,
        )

        interface = SimpleAnthropicStreamingInterface(
            requires_approval_tools=[],
            run_id=None,
            step_id=None,
        )
        # AnthropicClient.stream_async sets request_data["stream"] = True internally.
        try:
            stream = await llm_client.stream_async(req_data, llm_config)
            # Drain the stream; the interface accumulates content/usage as chunks arrive.
            async for _chunk in interface.process(stream):
                pass
            content_parts = interface.get_content()
            text = "".join(part.text for part in content_parts if isinstance(part, TextContent)).strip()
            # Log a synthetic response trace (streaming yields no single response JSON),
            # including cache usage so summarization cache hits/writes are observable.
            await llm_client.log_provider_trace_async(
                request_data=req_data,
                response_json={
                    "content": text,
                    "model": llm_config.model,
                    "usage": {
                        "input_tokens": getattr(interface, "input_tokens", None),
                        "output_tokens": getattr(interface, "output_tokens", None),
                        "cache_read_input_tokens": getattr(interface, "cache_read_tokens", 0),  # cache read
                        "cache_creation_input_tokens": getattr(interface, "cache_creation_tokens", 0),  # cache write
                    },
                },
                llm_config=llm_config,
            )
        except Exception as e:
            # Record the failure in provider telemetry before propagating to the caller.
            await llm_client.log_provider_trace_async(
                request_data=req_data,
                response_json=None,
                llm_config=llm_config,
                error_msg=str(e),
                error_type=type(e).__name__,
            )
            raise
        if not text:
            logger.warning("No content returned from summarizer (streaming path)")
            raise Exception("Summary failed to generate")
        return text

    # Default: non-streaming provider request, then normalize via chat-completions conversion.
    logger.debug(
        "Summarizer: using non-streaming request (%s/%s)",
        llm_config.model_endpoint_type,
        llm_config.model,
    )
    response_data = await llm_client.request_async_with_telemetry(req_data, llm_config)
    response = await llm_client.convert_response_to_chat_completion(
        response_data,
        req_messages_obj,
        llm_config,
    )
    if response.choices[0].message.content is None:
        logger.warning("No content returned from summarizer")
        raise Exception("Summary failed to generate")
    return response.choices[0].message.content.strip()

View File

@@ -2,7 +2,7 @@ from typing import Literal
from pydantic import BaseModel, Field
from letta.prompts.summarizer_prompt import ALL_PROMPT, SLIDING_PROMPT
from letta.prompts.summarizer_prompt import ALL_PROMPT, SELF_ALL_PROMPT, SELF_SLIDING_PROMPT, SLIDING_PROMPT
from letta.schemas.enums import ProviderType
from letta.schemas.model import ModelSettingsUnion
from letta.settings import summarizer_settings
@@ -18,20 +18,23 @@ def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
return summarizer_defaults.get(provider_type)
def get_default_prompt_for_mode(mode: Literal["all", "sliding_window"]) -> str:
def get_default_prompt_for_mode(mode: Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"]) -> str:
"""Get the default prompt for a given compaction mode.
Also used in /summarize endpoint if mode is changed and prompt is not explicitly set."""
if mode == "all":
return ALL_PROMPT
else: # sliding_window
if mode == "self_compact_sliding_window":
return SELF_SLIDING_PROMPT
elif mode == "self_compact_all":
return SELF_ALL_PROMPT
elif mode == "sliding_window":
return SLIDING_PROMPT
else: # all
return ALL_PROMPT
class CompactionSettings(BaseModel):
"""Configuration for conversation compaction / summarization.
``model`` is the only required user-facing field it specifies the summarizer
model handle (e.g. ``"openai/gpt-4o-mini"``). Per-model settings (temperature,
Per-model settings (temperature,
max tokens, etc.) are derived from the default configuration for that handle.
"""
@@ -39,7 +42,7 @@ class CompactionSettings(BaseModel):
# If None, uses lightweight provider-specific defaults (e.g., haiku for Anthropic, gpt-5-mini for OpenAI).
model: str | None = Field(
default=None,
description="Model handle to use for summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults.",
description="Model handle to use for sliding_window/all summarization (format: provider/model-name). If None, uses lightweight provider-specific defaults.",
)
# Optional provider-specific model settings for the summarizer model
@@ -56,10 +59,12 @@ class CompactionSettings(BaseModel):
default=50000, description="The maximum length of the summary in characters. If none, no clipping is performed."
)
mode: Literal["all", "sliding_window", "self"] = Field(default="sliding_window", description="The type of summarization technique use.")
mode: Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"] = Field(
default="sliding_window", description="The type of summarization technique use."
)
sliding_window_percentage: float = Field(
default_factory=lambda: summarizer_settings.partial_evict_summarizer_percentage,
description="The percentage of the context window to keep post-summarization (only used in sliding window mode).",
description="The percentage of the context window to keep post-summarization (only used in sliding window modes).",
)
# Called upon agent creation and if mode is changed in summarize endpoint request

View File

@@ -77,7 +77,13 @@ async def count_tokens_with_tools(
actor=actor,
)
tool_definitions = [OpenAITool(type="function", function=t.json_schema) for t in tools if t.json_schema]
# Tools can be either Tool objects (with .json_schema) or dicts (json schemas directly)
# For compatibility with how tools need to be passed in for self compaction
tool_definitions = [
OpenAITool(type="function", function=t.json_schema if hasattr(t, "json_schema") else t)
for t in tools
if (hasattr(t, "json_schema") and t.json_schema) or (isinstance(t, dict) and t)
]
tool_tokens = await token_counter.count_tool_tokens(tool_definitions) if tool_definitions else 0
# Apply safety margin for approximate counting (message_tokens already has margin applied)
@@ -127,6 +133,11 @@ async def summarize_via_sliding_window(
else:
maximum_message_index = total_message_count - 1
# simple version: summarize(in_context[1:round(summarizer_config.sliding_window_percentage * len(in_context_messages))])
# this evicts 30% of the messages (via summarization) and keeps the remaining 70%
# problem: we need the cutoff point to be an assistant message, so will grow the cutoff point until we find an assistant message
# also need to grow the cutoff point until the token count is less than the target token count
# Starts at N% (eg 70%), and increments up until 100%
max(
1 - summarizer_config.sliding_window_percentage, 0.10
@@ -146,11 +157,6 @@ async def summarize_via_sliding_window(
return message.tool_calls is not None and len(message.tool_calls) > 0
return False
# simple version: summarize(in_context[1:round(summarizer_config.sliding_window_percentage * len(in_context_messages))])
# this evicts 30% of the messages (via summarization) and keeps the remaining 70%
# problem: we need the cutoff point to be an assistant message, so will grow the cutoff point until we find an assistant message
# also need to grow the cutoff point until the token count is less than the target token count
while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
# more eviction percentage
eviction_percentage += 0.10
@@ -168,7 +174,9 @@ async def summarize_via_sliding_window(
None,
)
if assistant_message_index is None:
logger.warning(f"No assistant message found for evicting up to index {message_cutoff_index}, incrementing eviction percentage")
logger.warning(
f"No assistant/approval message found for evicting up to index {message_cutoff_index}, incrementing eviction percentage"
)
continue
# update token count
@@ -210,7 +218,7 @@ async def summarize_via_sliding_window(
},
)
logger.info(f"\n==================\nSummary message string: {summary_message_str[:100]}\n==================\n")
logger.info(f"\n==================\nSummary message string: {summary_message_str[:100]}...\n==================\n")
if summarizer_config.clip_chars is not None and len(summary_message_str) > summarizer_config.clip_chars:
logger.warning(f"Summary length {len(summary_message_str)} exceeds clip length {summarizer_config.clip_chars}. Truncating.")

View File

@@ -204,11 +204,24 @@ def package_summarize_message(summary, summary_message_count, hidden_message_cou
return json_dumps(packaged_message)
def package_summarize_message_no_counts(summary, timezone, compaction_stats: dict | None = None):
context_message = (
"Note: prior messages have been hidden from view due to conversation memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
def package_summarize_message_no_counts(summary, timezone, compaction_stats: dict | None = None, mode: str | None = None):
if mode and "sliding_window" in mode: # sliding_window, self_compact_sliding_window
if compaction_stats and "messages_count_before" in compaction_stats and "messages_count_after" in compaction_stats:
num_evicted = compaction_stats["messages_count_before"] - compaction_stats["messages_count_after"]
context_message = (
f"Note: {num_evicted} messages from the beginning of the conversation have been hidden from view due to memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
else:
context_message = (
"Note: prior messages from the beginning of the conversation have been hidden from view due to conversation memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
else: # all, self
context_message = (
"Note: prior messages have been hidden from view due to conversation memory constraints.\n"
+ f"The following is a summary of the previous messages:\n {summary}"
)
formatted_time = get_local_time(timezone=timezone)
packaged_message = {

View File

@@ -15,17 +15,14 @@ import pytest
from letta.agents.letta_agent_v3 import LettaAgentV3
from letta.config import LettaConfig
from letta.schemas.agent import CreateAgent, UpdateAgent
from letta.schemas.block import BlockUpdate, CreateBlock
from letta.schemas.agent import CreateAgent
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message import EventMessage, SummaryMessage
from letta.schemas.letta_message_content import TextContent, ToolCallContent, ToolReturnContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage, MessageCreate
from letta.schemas.run import Run as PydanticRun
from letta.schemas.message import Message as PydanticMessage
from letta.server.server import SyncServer
from letta.services.run_manager import RunManager
from letta.services.summarizer.summarizer import simple_summary
from letta.settings import model_settings
@@ -669,14 +666,24 @@ from unittest.mock import patch
from letta.services.summarizer.summarizer_config import CompactionSettings
# Test both summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window"]] = ["all", "sliding_window"]
# Test all summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"]] = [
"all",
"sliding_window",
"self_compact_all",
"self_compact_sliding_window",
]
@pytest.mark.asyncio
@pytest.mark.parametrize("mode", SUMMARIZER_CONFIG_MODES, ids=SUMMARIZER_CONFIG_MODES)
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMConfig, mode: Literal["all", "sliding_window"]):
async def test_summarize_with_mode(
server: SyncServer,
actor,
llm_config: LLMConfig,
mode: Literal["all", "sliding_window", "self_compact_all", "self_compact_sliding_window"],
):
"""
Test summarization with different CompactionSettings modes using LettaAgentV3.
@@ -746,20 +753,20 @@ async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMCon
print()
if mode == "all":
# For "all" mode, V3 keeps:
if mode == "all" or mode == "self_compact_all":
# For "all" or "self" mode, V3 keeps:
# 1. System prompt
# 2. A single user summary message (system_alert JSON)
# and no remaining historical messages.
assert len(result) == 2, f"Expected 2 messages for 'all' mode (system + summary), got {len(result)}"
assert len(result) == 2, f"Expected 2 messages for {mode} mode (system + summary), got {len(result)}"
assert result[0].role == MessageRole.system
assert result[1].role == MessageRole.user
else:
# For "sliding_window" mode, result should include:
# For "sliding_window" or "self_compact_sliding_window" mode, result should include:
# 1. System prompt
# 2. User summary message
# 3+. Recent user/assistant messages inside the window.
assert len(result) > 2, f"Expected >2 messages for 'sliding_window' mode, got {len(result)}"
assert len(result) > 2, f"Expected >2 messages for {mode} mode, got {len(result)}"
assert result[0].role == MessageRole.system
assert result[1].role == MessageRole.user
@@ -1195,97 +1202,206 @@ async def test_sliding_window_cutoff_index_does_not_exceed_message_count(server:
TESTED_LLM_CONFIGS,
ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
async def test_self_sliding_window_cutoff_index_does_not_exceed_message_count(server: SyncServer, actor, llm_config: LLMConfig):
"""
Test edge case of large system prompt / memory blocks.
Test that the sliding window summarizer correctly calculates cutoff indices.
This test verifies that summarization handles the case where the system prompt
and memory blocks are very large, potentially consuming most of the context window.
The summarizer should gracefully handle this scenario without errors.
This test verifies the fix for a bug where the cutoff percentage was treated as
a whole number (10) instead of a decimal (0.10), causing:
message_cutoff_index = round(10 * 65) = 650
when there were only 65 messages, resulting in an empty range loop and the error:
"No assistant message found from indices 650 to 65"
The fix changed:
- max(..., 10) -> max(..., 0.10)
- += 10 -> += 0.10
- >= 100 -> >= 1.0
This test uses the real token counter (via create_token_counter) to verify
the sliding window logic works with actual token counting.
"""
from letta.llm_api.llm_client import LLMClient
from letta.schemas.agent import AgentType
from letta.services.summarizer.self_summarizer import self_summarize_sliding_window
from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.services.telemetry_manager import TelemetryManager
# Override context window to be small so we trigger summarization
llm_config.context_window = 10000
# Create a real summarizer config using the default factory
# Override sliding_window_percentage to 0.3 for this test
handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
summarizer_config = CompactionSettings(model=handle)
summarizer_config.sliding_window_percentage = 0.3
# Create agent with large system prompt and memory blocks
agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
agent_create = CreateAgent(
name=agent_name,
llm_config=llm_config,
embedding_config=DEFAULT_EMBEDDING_CONFIG,
system="SYSTEM PROMPT " * 10000, # Large system prompt
memory_blocks=[
CreateBlock(
label="human",
limit=200000,
value="NAME " * 10000, # Large memory block
# Create 65 messages (similar to the failing case in the bug report)
# Pattern: system + alternating user/assistant messages
messages = [
PydanticMessage(
role=MessageRole.system,
content=[TextContent(type="text", text="You are a helpful assistant.")],
)
]
# Add 64 more messages (32 user-assistant pairs)
for i in range(32):
messages.append(
PydanticMessage(
role=MessageRole.user,
content=[TextContent(type="text", text=f"User message {i}")],
)
)
messages.append(
PydanticMessage(
role=MessageRole.assistant,
content=[TextContent(type="text", text=f"Assistant response {i}")],
)
],
)
agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)
# Create a run for the agent using RunManager
run = PydanticRun(agent_id=agent_state.id)
run = await RunManager().create_run(pydantic_run=run, actor=actor)
# Create the agent loop using LettaAgentV3
agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
# message the agent
input_message = MessageCreate(role=MessageRole.user, content="Hello")
# Call step on the agent - may trigger summarization due to large context
from letta.errors import SystemPromptTokenExceededError
with pytest.raises(SystemPromptTokenExceededError):
response = await agent_loop.step(
input_messages=[input_message],
run_id=run.id,
max_steps=3,
)
# Repair the agent by shortening the memory blocks and system prompt
# Update system prompt to a shorter version
short_system_prompt = "You are a helpful assistant."
await server.agent_manager.update_agent_async(
agent_id=agent_state.id,
agent_update=UpdateAgent(system=short_system_prompt),
actor=actor,
)
assert len(messages) == 65, f"Expected 65 messages, got {len(messages)}"
# Update memory block to a shorter version
short_memory_value = "The user's name is Alice."
await server.agent_manager.modify_block_by_label_async(
agent_id=agent_state.id,
block_label="human",
block_update=BlockUpdate(value=short_memory_value),
actor=actor,
)
# This should NOT raise "No assistant message found from indices 650 to 65"
# With the fix, message_count_cutoff_percent starts at max(0.7, 0.10) = 0.7
# So message_cutoff_index = round(0.7 * 65) = 46, which is valid
try:
summary, remaining_messages = await self_summarize_sliding_window(
actor=actor,
agent_id="agent-test-self-sliding-window",
agent_llm_config=llm_config,
telemetry_manager=TelemetryManager(),
llm_client=LLMClient.create(llm_config),
agent_type=AgentType.letta_v1_agent,
messages=messages,
compaction_settings=summarizer_config,
timezone="UTC",
)
# Reload agent state after repairs
agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
print("REPAIRED AGENT STATE ======")
print(agent_state.system)
print(agent_state.blocks)
# Verify the summary was generated (actual LLM response)
assert summary is not None
assert len(summary) > 0
# Create a new run for the repaired agent
run = PydanticRun(agent_id=agent_state.id)
run = await RunManager().create_run(pydantic_run=run, actor=actor)
# Verify remaining messages is a valid subset
assert len(remaining_messages) < len(messages)
assert len(remaining_messages) > 0
# Create a new agent loop with the repaired agent state
agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
print(f"Successfully summarized {len(messages)} messages to {len(remaining_messages)} remaining")
print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
print(f"Using {llm_config.model_endpoint_type} token counter for model {llm_config.model}")
# Now the agent should be able to respond without context window errors
response = await agent_loop.step(
input_messages=[input_message],
run_id=run.id,
max_steps=3,
)
except ValueError as e:
if "No assistant message found from indices" in str(e):
# Extract the indices from the error message
import re
# Verify we got a valid response after repair
assert response is not None
assert response.messages is not None
print(f"Agent successfully responded after repair with {len(response.messages)} messages")
match = re.search(r"from indices (\d+) to (\d+)", str(e))
if match:
start_idx, end_idx = int(match.group(1)), int(match.group(2))
pytest.fail(
f"Bug detected: cutoff index ({start_idx}) exceeds message count ({end_idx}). "
f"This indicates the percentage calculation bug where 10 was used instead of 0.10. "
f"Error: {e}"
)
raise
### NOTE: the large-system-prompt edge-case test below is disabled for now,
### because we no longer refresh the system prompt before compaction
### (refreshing would invalidate prompt caching for the self-compaction modes)
# @pytest.mark.asyncio
# @pytest.mark.parametrize(
# "llm_config",
# TESTED_LLM_CONFIGS,
# ids=[c.model for c in TESTED_LLM_CONFIGS],
# )
# async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
# """
# Test edge case of large system prompt / memory blocks.
# This test verifies that summarization handles the case where the system prompt
# and memory blocks are very large, potentially consuming most of the context window.
# The summarizer should gracefully handle this scenario without errors.
# """
# # Override context window to be small so we trigger summarization
# llm_config.context_window = 10000
# # Create agent with large system prompt and memory blocks
# agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
# agent_create = CreateAgent(
# name=agent_name,
# llm_config=llm_config,
# embedding_config=DEFAULT_EMBEDDING_CONFIG,
# system="SYSTEM PROMPT " * 10000, # Large system prompt
# memory_blocks=[
# CreateBlock(
# label="human",
# limit=200000,
# value="NAME " * 10000, # Large memory block
# )
# ],
# )
# agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)
# # Create a run for the agent using RunManager
# run = PydanticRun(agent_id=agent_state.id)
# run = await RunManager().create_run(pydantic_run=run, actor=actor)
# # Create the agent loop using LettaAgentV3
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
# # message the agent
# input_message = MessageCreate(role=MessageRole.user, content="Hello")
# # Call step on the agent - may trigger summarization due to large context
# from letta.errors import SystemPromptTokenExceededError
# with pytest.raises(SystemPromptTokenExceededError):
# response = await agent_loop.step(
# input_messages=[input_message],
# run_id=run.id,
# max_steps=3,
# )
# # Repair the agent by shortening the memory blocks and system prompt
# # Update system prompt to a shorter version
# short_system_prompt = "You are a helpful assistant."
# await server.agent_manager.update_agent_async(
# agent_id=agent_state.id,
# agent_update=UpdateAgent(system=short_system_prompt),
# actor=actor,
# )
# # Update memory block to a shorter version
# short_memory_value = "The user's name is Alice."
# await server.agent_manager.modify_block_by_label_async(
# agent_id=agent_state.id,
# block_label="human",
# block_update=BlockUpdate(value=short_memory_value),
# actor=actor,
# )
# # Reload agent state after repairs
# agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
# print("REPAIRED AGENT STATE ======")
# print(agent_state.system)
# print(agent_state.blocks)
# # Create a new run for the repaired agent
# run = PydanticRun(agent_id=agent_state.id)
# run = await RunManager().create_run(pydantic_run=run, actor=actor)
# # Create a new agent loop with the repaired agent state
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
# # Now the agent should be able to respond without context window errors
# response = await agent_loop.step(
# input_messages=[input_message],
# run_id=run.id,
# max_steps=3,
# )
# # Verify we got a valid response after repair
# assert response is not None
# assert response.messages is not None
# print(f"Agent successfully responded after repair with {len(response.messages)} messages")
# @pytest.mark.asyncio
@@ -1718,6 +1834,127 @@ async def test_summarize_all(server: SyncServer, actor, llm_config: LLMConfig):
print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_self(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test the self_summarize_all function with real LLM calls.

    This test verifies that the self-compaction 'all' mode works correctly,
    summarizing the entire conversation into a single summary string and
    leaving exactly one in-context message.
    """
    from letta.llm_api.llm_client import LLMClient
    from letta.schemas.agent import AgentType
    from letta.services.summarizer.self_summarizer import self_summarize_all
    from letta.services.summarizer.summarizer_config import CompactionSettings
    from letta.services.telemetry_manager import TelemetryManager

    # Create a summarizer config with the self-compaction "all" mode.
    # NOTE: the legacy "self" value was replaced by "self_compact_all" /
    # "self_compact_sliding_window" in the CompactionSettings mode enum.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    summarizer_config = CompactionSettings(model=handle)
    summarizer_config.mode = "self_compact_all"

    # Create test messages - a simple conversation: one system message
    # followed by 10 user/assistant pairs.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: What is {i} + {i}?")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: {i} + {i} = {i * 2}.")],
            )
        )
    assert len(messages) == 21, f"Expected 21 messages, got {len(messages)}"

    # Call self_summarize_all with a real LLM.
    summary, new_in_context_messages = await self_summarize_all(
        actor=actor,
        agent_id="agent-test-self-compact-all",
        agent_llm_config=llm_config,
        telemetry_manager=TelemetryManager(),
        llm_client=LLMClient.create(llm_config),
        agent_type=AgentType.letta_v1_agent,
        messages=messages,
        compaction_settings=summarizer_config,
        timezone="UTC",
    )

    # Verify the whole conversation collapsed to a single summary message.
    assert len(new_in_context_messages) == 1
    assert summary is not None
    assert len(summary) > 0
    # Prompt asks for under ~500 words; allow generous character slack in the test.
    assert len(summary) <= 5000
    print(f"Successfully summarized {len(messages)} messages using 'self_compact_all' mode")
    print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
    print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
@pytest.mark.asyncio
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_self_mode_fallback(server: SyncServer, actor, llm_config: LLMConfig):
    """If self summarize fails, it should have proper fallback."""
    from unittest.mock import AsyncMock, patch

    # Build a small conversation: a system message followed by 10 user/assistant turns.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for turn in range(10):
        user_msg = PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text=f"User message {turn}: Test message {turn}.")],
        )
        assistant_msg = PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text=f"Assistant response {turn}: Acknowledged message {turn}.")],
        )
        messages.extend((user_msg, assistant_msg))

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="self_compact_all")
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Force the self-compaction path to raise, so the fallback chain must run.
    with patch(
        "letta.services.summarizer.compact.self_summarize_all",
        new_callable=AsyncMock,
        side_effect=RuntimeError("Simulated self_summarize_all failure"),
    ):
        summary_message, compacted_messages, summary_text = await agent_loop.compact(messages=in_context_messages)

        # The fallback summarizer should still produce a summary and shrink the context.
        assert summary_message is not None
        assert summary_text is not None
        assert len(summary_text) > 0
        assert len(compacted_messages) < len(in_context_messages)
        print(f"Fallback succeeded: {len(in_context_messages)} -> {len(compacted_messages)} messages")
# =============================================================================
# CompactionStats tests
# =============================================================================
@@ -2033,3 +2270,15 @@ async def test_compact_with_stats_params_embeds_stats(server: SyncServer, actor,
assert stats.context_tokens_after is not None # Should be set by compact()
assert stats.messages_count_after == len(compacted_messages) # final_messages already includes summary
assert stats.context_window == llm_config.context_window
### basic self summarization
### fallback chain
### basic self sliding window summarization
### self sliding window preserves recent msgs
### self mode return compaction stats

View File

@@ -209,7 +209,7 @@ class TestSummarizeSlidingWindowTelemetryContext:
await summarizer_sliding_window.summarize_via_sliding_window(
actor=mock_actor,
llm_config=mock_llm_config,
agent_llm_config=mock_llm_config, # case where agent and summarizer have same config
agent_llm_config=mock_llm_config, # case where agent and summarizer have same config
summarizer_config=mock_compaction_settings,
in_context_messages=mock_messages,
agent_id=agent_id,