perf: optimize prefix caching by skipping system prompt rebuild on every step (#9080)

2026-02-06 18:08:34 -08:00
parent 825019c2ce
commit 9dbe28e8f1
8 changed files with 488 additions and 133 deletions
--- a/fern/openapi.json
+++ b/fern/openapi.json
@@ -6717,6 +6717,151 @@
        }
      }
    },
    "/v1/agents/{agent_id}/recompile": {
      "post": {
        "tags": ["agents"],
        "summary": "Recompile Agent",
        "description": "Manually trigger system prompt recompilation for an agent.",
        "operationId": "recompile_agent",
        "parameters": [
          {
            "name": "agent_id",
            "in": "path",
            "required": true,
            "schema": {
              "type": "string",
              "minLength": 42,
              "maxLength": 42,
              "pattern": "^agent-[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
              "description": "The ID of the agent in the format 'agent-<uuid4>'",
              "examples": ["agent-123e4567-e89b-42d3-8456-426614174000"],
              "title": "Agent Id"
            },
            "description": "The ID of the agent in the format 'agent-<uuid4>'"
          },
          {
            "name": "update_timestamp",
            "in": "query",
            "required": false,
            "schema": {
              "type": "boolean",
              "description": "If True, update the in-context memory last edit timestamp embedded in the system prompt.",
              "default": false,
              "title": "Update Timestamp"
            },
            "description": "If True, update the in-context memory last edit timestamp embedded in the system prompt."
          },
          {
            "name": "dry_run",
            "in": "query",
            "required": false,
            "schema": {
              "type": "boolean",
              "description": "If True, do not persist changes; still returns the compiled system prompt.",
              "default": false,
              "title": "Dry Run"
            },
            "description": "If True, do not persist changes; still returns the compiled system prompt."
          }
        ],
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {
              "application/json": {
                "schema": {
                  "type": "string",
                  "title": "Response Recompile Agent"
                }
              }
            }
          },
          "422": {
            "description": "Validation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/HTTPValidationError"
                }
              }
            }
          }
        }
      }
    },
    "/v1/agents/{agent_id}/system-prompt/recompile": {
      "post": {
        "tags": ["agents"],
        "summary": "Recompile Agent System Prompt",
        "description": "Deprecated alias for POST /v1/agents/{agent_id}/recompile.",
        "operationId": "recompile_agent_system_prompt",
        "deprecated": true,
        "parameters": [
          {
            "name": "agent_id",
            "in": "path",
            "required": true,
            "schema": {
              "type": "string",
              "minLength": 42,
              "maxLength": 42,
              "pattern": "^agent-[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
              "description": "The ID of the agent in the format 'agent-<uuid4>'",
              "examples": ["agent-123e4567-e89b-42d3-8456-426614174000"],
              "title": "Agent Id"
            },
            "description": "The ID of the agent in the format 'agent-<uuid4>'"
          },
          {
            "name": "update_timestamp",
            "in": "query",
            "required": false,
            "schema": {
              "type": "boolean",
              "description": "If True, update the in-context memory last edit timestamp embedded in the system prompt.",
              "default": false,
              "title": "Update Timestamp"
            },
            "description": "If True, update the in-context memory last edit timestamp embedded in the system prompt."
          },
          {
            "name": "dry_run",
            "in": "query",
            "required": false,
            "schema": {
              "type": "boolean",
              "description": "If True, do not persist changes; still returns the compiled system prompt.",
              "default": false,
              "title": "Dry Run"
            },
            "description": "If True, do not persist changes; still returns the compiled system prompt."
          }
        ],
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {
              "application/json": {
                "schema": {
                  "type": "string",
                  "title": "Response Recompile Agent System Prompt"
                }
              }
            }
          },
          "422": {
            "description": "Validation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/HTTPValidationError"
                }
              }
            }
          }
        }
      }
    },
    "/v1/agents/{agent_id}/core-memory/blocks/attach/{block_id}": {
      "patch": {
        "tags": ["agents"],
--- a/letta/agents/letta_agent_v2.py
+++ b/letta/agents/letta_agent_v2.py
@@ -687,20 +687,38 @@ class LettaAgentV2(BaseAgentV2):
            return False
    @trace_method
-    async def _refresh_messages(self, in_context_messages: list[Message]):
+    async def _refresh_messages(self, in_context_messages: list[Message], force_system_prompt_refresh: bool = False):
-        num_messages = await self.message_manager.size_async(
+        """Refresh in-context messages.
-            agent_id=self.agent_state.id,
+
-            actor=self.actor,
+        This performs two tasks:
-        )
+        1) Rebuild the *system prompt* only if the memory/tool-rules/directories section has changed.
-        num_archival_memories = await self.passage_manager.agent_passage_size_async(
+           This avoids rebuilding the system prompt on every step due to dynamic metadata (e.g. message counts),
-            agent_id=self.agent_state.id,
+           which can bust prefix caching.
-            actor=self.actor,
+        2) Scrub inner thoughts from messages.
-        )
+
-        in_context_messages = await self._rebuild_memory(
+        Args:
-            in_context_messages,
+            in_context_messages: Current in-context messages
-            num_messages=num_messages,
+            force_system_prompt_refresh: If True, re-raise any error from the system prompt rebuild instead of logging a warning.
-            num_archival_memories=num_archival_memories,
+                (The rebuild will still be skipped if memory/tool-rules/directories haven't changed.)
-        )
+
        Returns:
            Refreshed in-context messages.
        """
        # Always call _rebuild_memory: it compares the current and recomputed memory sections
        # and returns the messages untouched when neither they nor the system prompt have changed.
        try:
            in_context_messages = await self._rebuild_memory(
                in_context_messages,
                num_messages=None,
                num_archival_memories=None,
            )
        except Exception as e:
            # If callers requested a forced refresh, surface the error.
            if force_system_prompt_refresh:
                raise
            self.logger.warning(f"Failed to refresh system prompt/memory: {e}")
        # Always scrub inner thoughts regardless of system prompt refresh
        in_context_messages = scrub_inner_thoughts_from_messages(in_context_messages, self.agent_state.llm_config)
        return in_context_messages
@@ -708,8 +726,8 @@ class LettaAgentV2(BaseAgentV2):
    async def _rebuild_memory(
        self,
        in_context_messages: list[Message],
-        num_messages: int,
+        num_messages: int | None,
-        num_archival_memories: int,
+        num_archival_memories: int | None,
    ):
        agent_state = await self.agent_manager.refresh_memory_async(agent_state=self.agent_state, actor=self.actor)
@@ -769,10 +787,14 @@ class LettaAgentV2(BaseAgentV2):
        )
        new_memory_section = extract_memory_section(curr_memory_str)
-        # compare just the memory sections (memory blocks, tool rules, directories)
+        # Compare just the memory sections (memory blocks, tool rules, directories).
-        if curr_memory_section.strip() == new_memory_section.strip():
+        # Also ensure the configured system prompt is still present; if the system prompt
        # changed (e.g. via UpdateAgent(system=...)), we must rebuild.
        system_prompt_changed = agent_state.system not in curr_system_message_text
        if (not system_prompt_changed) and (curr_memory_section.strip() == new_memory_section.strip()):
            self.logger.debug(
-                f"Memory and sources haven't changed for agent id={agent_state.id} and actor=({self.actor.id}, {self.actor.name}), skipping system prompt rebuild"
+                f"Memory, sources, and system prompt haven't changed for agent id={agent_state.id} and actor=({self.actor.id}, {self.actor.name}), skipping system prompt rebuild"
            )
            return in_context_messages
--- a/letta/agents/letta_agent_v3.py
+++ b/letta/agents/letta_agent_v3.py
@@ -733,13 +733,11 @@ class LettaAgentV3(LettaAgentV2):
                    self.logger.info("switching to unconstrained mode (allowing non-tool responses)")
            self._require_tool_call = require_tool_call
-            # Always refresh messages at the start of each step to pick up external inputs
+            # Refresh messages at the start of each step to scrub inner thoughts.
-            # (e.g., approval responses submitted by the client while this stream is running)
+            # NOTE: We skip system prompt refresh during normal steps to preserve prefix caching.
            # The system prompt is only rebuilt after compaction or message reset.
            try:
-                # TODO: cleanup and de-dup
+                messages = await self._refresh_messages(messages, force_system_prompt_refresh=False)
                # updates the system prompt with the latest blocks / message histories
                messages = await self._refresh_messages(messages)
            except Exception as e:
                self.logger.warning(f"Failed to refresh messages at step start: {e}")
@@ -924,6 +922,8 @@ class LettaAgentV3(LettaAgentV2):
                                    context_tokens_before=context_tokens_before,
                                    messages_count_before=messages_count_before,
                                )
                                # Force system prompt rebuild after compaction to update memory blocks and timestamps
                                messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
                                self.logger.info("Summarization succeeded, continuing to retry LLM request")
                                # Persist the summary message
@@ -1081,6 +1081,10 @@ class LettaAgentV3(LettaAgentV2):
                        context_tokens_before=context_tokens_before,
                        messages_count_before=messages_count_before,
                    )
                    # Force system prompt rebuild after compaction to update memory blocks and timestamps
                    messages = await self._refresh_messages(messages, force_system_prompt_refresh=True)
                    # TODO: persist + return the summary message
                    # TODO: convert this to a SummaryMessage
                    self.response_messages.append(summary_message)
                    # Yield summary result message to client
--- a/letta/functions/function_sets/base.py
+++ b/letta/functions/function_sets/base.py
@@ -242,7 +242,7 @@ async def archival_memory_search(
    raise NotImplementedError("This should never be invoked directly. Contact Letta if you see this error message.")
-def core_memory_append(agent_state: "AgentState", label: str, content: str) -> Optional[str]:  # type: ignore
+def core_memory_append(agent_state: "AgentState", label: str, content: str) -> str:  # type: ignore
    """
    Append to the contents of core memory.
@@ -251,15 +251,15 @@ def core_memory_append(agent_state: "AgentState", label: str, content: str) -> O
        content (str): Content to write to the memory. All unicode (including emojis) are supported.
    Returns:
-        Optional[str]: None is always returned as this function does not produce a response.
+        str: The updated value of the memory block.
    """
    current_value = str(agent_state.memory.get_block(label).value)
    new_value = current_value + "\n" + str(content)
    agent_state.memory.update_block_value(label=label, value=new_value)
-    return None
+    return new_value
-def core_memory_replace(agent_state: "AgentState", label: str, old_content: str, new_content: str) -> Optional[str]:  # type: ignore
+def core_memory_replace(agent_state: "AgentState", label: str, old_content: str, new_content: str) -> str:  # type: ignore
    """
    Replace the contents of core memory. To delete memories, use an empty string for new_content.
@@ -269,14 +269,14 @@ def core_memory_replace(agent_state: "AgentState", label: str, old_content: str,
        new_content (str): Content to write to the memory. All unicode (including emojis) are supported.
    Returns:
-        Optional[str]: None is always returned as this function does not produce a response.
+        str: The updated value of the memory block.
    """
    current_value = str(agent_state.memory.get_block(label).value)
    if old_content not in current_value:
        raise ValueError(f"Old content '{old_content}' not found in memory block '{label}'")
    new_value = current_value.replace(str(old_content), str(new_content))
    agent_state.memory.update_block_value(label=label, value=new_value)
-    return None
+    return new_value
 def rethink_memory(agent_state: "AgentState", new_memory: str, target_block_label: str) -> None:
@@ -337,7 +337,7 @@ def memory_replace(agent_state: "AgentState", label: str, old_str: str, new_str:
        memory_replace(label="human", old_str="Their name is Alice", new_str="Their name is Bob")
    Returns:
-        str: The success message
+        str: The updated value of the memory block.
    """
    import re
@@ -382,19 +382,10 @@ def memory_replace(agent_state: "AgentState", label: str, old_str: str, new_str:
    # end_line = replacement_line + SNIPPET_LINES + new_str.count("\n")
    # snippet = "\n".join(new_value.split("\n")[start_line : end_line + 1])
-    # Prepare the success message
+    return new_value
    success_msg = (
        f"The core memory block with label `{label}` has been successfully edited. "
        f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
        f"Review the changes and make sure they are as expected (correct indentation, "
        f"no duplicate lines, etc)."
    )
    # return None
    return success_msg
-def memory_insert(agent_state: "AgentState", label: str, new_str: str, insert_line: int = -1) -> Optional[str]:  # type: ignore
+def memory_insert(agent_state: "AgentState", label: str, new_str: str, insert_line: int = -1) -> str:  # type: ignore
    """
    The memory_insert command allows you to insert text at a specific location in a memory block.
@@ -453,15 +444,7 @@ def memory_insert(agent_state: "AgentState", label: str, new_str: str, insert_li
    # Write into the block
    agent_state.memory.update_block_value(label=label, value=new_value)
-    # Prepare the success message
+    return new_value
    success_msg = (
        f"The core memory block with label `{label}` has been successfully edited. "
        f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
        f"Review the changes and make sure they are as expected (correct indentation, "
        f"no duplicate lines, etc)."
    )
    return success_msg
 def memory_apply_patch(agent_state: "AgentState", label: str, patch: str) -> str:  # type: ignore
@@ -499,7 +482,7 @@ def memory_apply_patch(agent_state: "AgentState", label: str, patch: str) -> str
    raise NotImplementedError("This should never be invoked directly. Contact Letta if you see this error message.")
-def memory_rethink(agent_state: "AgentState", label: str, new_memory: str) -> None:
+def memory_rethink(agent_state: "AgentState", label: str, new_memory: str) -> str:
    """
    The memory_rethink command allows you to completely rewrite the contents of a memory block. Use this tool to make large sweeping changes (e.g. when you want to condense or reorganize the memory blocks), do NOT use this tool to make small precise edits (e.g. add or remove a line, replace a specific string, etc).
@@ -528,17 +511,7 @@ def memory_rethink(agent_state: "AgentState", label: str, new_memory: str) -> No
        agent_state.memory.set_block(new_block)
    agent_state.memory.update_block_value(label=label, value=new_memory)
-
+    return new_memory
    # Prepare the success message
    success_msg = (
        f"The core memory block with label `{label}` has been successfully edited. "
        f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
        f"Review the changes and make sure they are as expected (correct indentation, "
        f"no duplicate lines, etc)."
    )
    # return None
    return success_msg
 def memory_finish_edits(agent_state: "AgentState") -> None:  # type: ignore
--- a/letta/server/rest_api/routers/v1/agents.py
+++ b/letta/server/rest_api/routers/v1/agents.py
@@ -1263,6 +1263,70 @@ async def modify_block_for_agent(
    return block
# Endpoint: POST /{agent_id}/recompile. Rebuilds the agent's system prompt and
# returns the compiled prompt text as a plain string (response_model=str).
@router.post(
    "/{agent_id}/recompile",
    response_model=str,
    operation_id="recompile_agent",
 )
 async def recompile_agent(
    agent_id: AgentId,
    server: "SyncServer" = Depends(get_letta_server),
    headers: HeaderParams = Depends(get_headers),
    update_timestamp: bool = Query(
        False,
        description="If True, update the in-context memory last edit timestamp embedded in the system prompt.",
    ),
    dry_run: bool = Query(
        False,
        description="If True, do not persist changes; still returns the compiled system prompt.",
    ),
 ):
    # NOTE(review): the docstring text below matches the "description" published for this
    # operation in fern/openapi.json; keep the two in sync when editing either.
    """Manually trigger system prompt recompilation for an agent."""
    # Resolve the actor for this request from the actor_id header.
    actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
    # The rebuild returns a 4-tuple; only the second element (the system message) is used here.
    # force=True is always passed; update_timestamp and dry_run are forwarded from the query string.
    _, system_message, _, _ = await server.agent_manager.rebuild_system_prompt_async(
        agent_id=agent_id,
        actor=actor,
        force=True,
        update_timestamp=update_timestamp,
        dry_run=dry_run,
    )
    # No system message to return: report it as a 404 rather than returning an empty body.
    if system_message is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No system message found for agent '{agent_id}'")
    # Return the "content" field of the OpenAI-style dict, or "" when that key is absent.
    return system_message.to_openai_dict().get("content", "")
# Endpoint: POST /{agent_id}/system-prompt/recompile. Registered with deprecated=True;
# it accepts the same parameters as recompile_agent and forwards them unchanged.
@router.post(
    "/{agent_id}/system-prompt/recompile",
    response_model=str,
    operation_id="recompile_agent_system_prompt",
    deprecated=True,
 )
 async def recompile_agent_system_prompt(
    agent_id: AgentId,
    server: "SyncServer" = Depends(get_letta_server),
    headers: HeaderParams = Depends(get_headers),
    update_timestamp: bool = Query(
        False,
        description="If True, update the in-context memory last edit timestamp embedded in the system prompt.",
    ),
    dry_run: bool = Query(
        False,
        description="If True, do not persist changes; still returns the compiled system prompt.",
    ),
 ):
    # NOTE(review): the docstring text below matches the "description" published for this
    # operation in fern/openapi.json; keep the two in sync when editing either.
    """Deprecated alias for POST /v1/agents/{agent_id}/recompile."""
    # Call the primary handler directly with the already-resolved dependencies so both
    # routes share a single implementation and return the same value.
    return await recompile_agent(
        agent_id=agent_id,
        server=server,
        headers=headers,
        update_timestamp=update_timestamp,
        dry_run=dry_run,
    )
@router.patch("/{agent_id}/core-memory/blocks/attach/{block_id}", response_model=AgentState, operation_id="attach_core_memory_block")
 async def attach_block_to_agent(
    block_id: BlockId,
--- a/letta/services/agent_manager.py
+++ b/letta/services/agent_manager.py
@@ -1567,21 +1567,30 @@ class AgentManager:
    @enforce_types
    @trace_method
    async def reset_messages_async(
-        self, agent_id: str, actor: PydanticUser, add_default_initial_messages: bool = False, needs_agent_state: bool = True
+        self,
        agent_id: str,
        actor: PydanticUser,
        add_default_initial_messages: bool = False,
        needs_agent_state: bool = True,
        rebuild_system_prompt: bool = False,
    ) -> Optional[PydanticAgentState]:
        """
        Clears all in-context messages for the specified agent except the original system message by:
          1) Preserving the first message ID (original system message).
          2) Updating the agent's message_ids to only contain the system message.
-          3) Optionally adding default initial messages after the system message.
+          3) Optionally rebuilding the system prompt with current memory blocks (for prefix caching optimization).
          4) Optionally adding default initial messages after the system message.
        Note: This only clears messages from the agent's context, it does not delete them from the database.
        Args:
            add_default_initial_messages: If true, adds the default initial messages after resetting.
            agent_id (str): The ID of the agent whose messages will be reset.
            actor (PydanticUser): The user performing this action.
            add_default_initial_messages: If true, adds the default initial messages after resetting.
            needs_agent_state: If True, returns the updated agent state. If False, returns None (for performance optimization)
            rebuild_system_prompt: If True, rebuilds the system prompt with current memory blocks.
                This ensures the system prompt reflects the latest memory state after reset.
                Defaults to False to preserve the original system message content.
        Returns:
            Optional[PydanticAgentState]: The updated agent state with only the original system message preserved, or None if needs_agent_state=False.
@@ -1601,12 +1610,17 @@ class AgentManager:
            agent.message_ids = [system_message_id]
            await agent.update_async(db_session=session, actor=actor)
-            # Only convert to pydantic if we need to return it or add initial messages
+            # Only convert to pydantic if we need to return it or add initial messages or rebuild system prompt
-            if add_default_initial_messages or needs_agent_state:
+            if add_default_initial_messages or needs_agent_state or rebuild_system_prompt:
-                agent_state = await agent.to_pydantic_async(include_relationships=["sources"] if add_default_initial_messages else None)
+                include_rels = ["sources", "memory"] if (add_default_initial_messages or rebuild_system_prompt) else None
                agent_state = await agent.to_pydantic_async(include_relationships=include_rels)
            else:
                agent_state = None
        # Optionally rebuild the system prompt with current memory blocks
        if rebuild_system_prompt and agent_state:
            agent_state, _, _, _ = await self.rebuild_system_prompt_async(agent_id=agent_state.id, actor=actor, force=True)
        # Optionally add default initial messages after the system message
        if add_default_initial_messages:
            init_messages = await initialize_message_sequence_async(
--- a/letta/services/tool_executor/core_tool_executor.py
+++ b/letta/services/tool_executor/core_tool_executor.py
@@ -318,14 +318,14 @@ class LettaCoreToolExecutor(ToolExecutor):
        await self.agent_manager.rebuild_system_prompt_async(agent_id=agent_state.id, actor=actor, force=True)
        return None
-    async def core_memory_append(self, agent_state: AgentState, actor: User, label: str, content: str) -> Optional[str]:
+    async def core_memory_append(self, agent_state: AgentState, actor: User, label: str, content: str) -> str:
        if agent_state.memory.get_block(label).read_only:
            raise ValueError(f"{READ_ONLY_BLOCK_EDIT_ERROR}")
        current_value = str(agent_state.memory.get_block(label).value)
        new_value = current_value + "\n" + str(content)
        agent_state.memory.update_block_value(label=label, value=new_value)
        await self.agent_manager.update_memory_if_changed_async(agent_id=agent_state.id, new_memory=agent_state.memory, actor=actor)
-        return None
+        return new_value
    async def core_memory_replace(
        self,
@@ -334,7 +334,7 @@ class LettaCoreToolExecutor(ToolExecutor):
        label: str,
        old_content: str,
        new_content: str,
-    ) -> Optional[str]:
+    ) -> str:
        if agent_state.memory.get_block(label).read_only:
            raise ValueError(f"{READ_ONLY_BLOCK_EDIT_ERROR}")
        current_value = str(agent_state.memory.get_block(label).value)
@@ -343,7 +343,7 @@ class LettaCoreToolExecutor(ToolExecutor):
        new_value = current_value.replace(str(old_content), str(new_content))
        agent_state.memory.update_block_value(label=label, value=new_value)
        await self.agent_manager.update_memory_if_changed_async(agent_id=agent_state.id, new_memory=agent_state.memory, actor=actor)
-        return None
+        return new_value
    async def memory_replace(self, agent_state: AgentState, actor: User, label: str, old_str: str, new_str: str) -> str:
        if agent_state.memory.get_block(label).read_only:
@@ -393,23 +393,7 @@ class LettaCoreToolExecutor(ToolExecutor):
        await self.agent_manager.update_memory_if_changed_async(agent_id=agent_state.id, new_memory=agent_state.memory, actor=actor)
-        # Create a snippet of the edited section
+        return new_value
        SNIPPET_LINES = 3
        replacement_line = current_value.split(old_str)[0].count("\n")
        start_line = max(0, replacement_line - SNIPPET_LINES)
        end_line = replacement_line + SNIPPET_LINES + new_str.count("\n")
        snippet = "\n".join(new_value.split("\n")[start_line : end_line + 1])
        # Prepare the success message
        success_msg = (
            f"The core memory block with label `{label}` has been successfully edited. "
            f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
            f"Review the changes and make sure they are as expected (correct indentation, "
            f"no duplicate lines, etc)."
        )
        # return None
        return success_msg
    async def memory_apply_patch(self, agent_state: AgentState, actor: User, label: str, patch: str) -> str:
        """Apply a simplified unified-diff style patch to one or more memory blocks.
@@ -545,11 +529,7 @@ class LettaCoreToolExecutor(ToolExecutor):
            agent_state.memory.update_block_value(label=label, value=new_value)
            await self.agent_manager.update_memory_if_changed_async(agent_id=agent_state.id, new_memory=agent_state.memory, actor=actor)
-            return (
+            return new_value
                f"The core memory block with label `{label}` has been successfully edited. "
                f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
                f"Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc)."
            )
        # Extended mode: parse codex-like patch operations for memory blocks
        lines = patch.splitlines()
@@ -753,15 +733,7 @@ class LettaCoreToolExecutor(ToolExecutor):
        await self.agent_manager.update_memory_if_changed_async(agent_id=agent_state.id, new_memory=agent_state.memory, actor=actor)
-        # Prepare the success message
+        return new_value
        success_msg = (
            f"The core memory block with label `{label}` has been successfully edited. "
            f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
            f"Review the changes and make sure they are as expected (correct indentation, "
            f"no duplicate lines, etc)."
        )
        return success_msg
    async def memory_rethink(self, agent_state: AgentState, actor: User, label: str, new_memory: str) -> str:
        if agent_state.memory.get_block(label).read_only:
@@ -793,16 +765,7 @@ class LettaCoreToolExecutor(ToolExecutor):
        await self.agent_manager.update_memory_if_changed_async(agent_id=agent_state.id, new_memory=agent_state.memory, actor=actor)
-        # Prepare the success message
+        return new_memory
        success_msg = (
            f"The core memory block with label `{label}` has been successfully edited. "
            f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
            f"Review the changes and make sure they are as expected (correct indentation, "
            f"no duplicate lines, etc)."
        )
        # return None
        return success_msg
    async def memory_finish_edits(self, agent_state: AgentState, actor: User) -> None:
        return None
@@ -965,17 +928,13 @@ class LettaCoreToolExecutor(ToolExecutor):
        # Write the new content to the block
        await self.block_manager.update_block_async(block_id=memory_block.id, block_update=BlockUpdate(value=new_value), actor=actor)
        # Keep in-memory AgentState consistent with DB
        agent_state.memory.update_block_value(label=label, value=new_value)
        await self.agent_manager.rebuild_system_prompt_async(agent_id=agent_state.id, actor=actor, force=True)
-        # Prepare the success message
+        return new_value
        success_msg = (
            f"The core memory block with label `{label}` has been successfully edited. "
            f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
            f"Review the changes and make sure they are as expected (correct indentation, "
            f"no duplicate lines, etc)."
        )
        return success_msg
    async def memory_str_insert(self, agent_state: AgentState, actor: User, path: str, insert_text: str, insert_line: int = -1) -> str:
        """Insert text into a memory block at a specific line."""
@@ -1032,17 +991,13 @@ class LettaCoreToolExecutor(ToolExecutor):
        # Write into the block
        await self.block_manager.update_block_async(block_id=memory_block.id, block_update=BlockUpdate(value=new_value), actor=actor)
        # Keep in-memory AgentState consistent with DB
        agent_state.memory.update_block_value(label=label, value=new_value)
        await self.agent_manager.rebuild_system_prompt_async(agent_id=agent_state.id, actor=actor, force=True)
-        # Prepare the success message
+        return new_value
        success_msg = (
            f"The core memory block with label `{label}` has been successfully edited. "
            f"Your system prompt has been recompiled with the updated memory contents and is now active in your context. "
            f"Review the changes and make sure they are as expected (correct indentation, "
            f"no duplicate lines, etc)."
        )
        return success_msg
    async def memory(
        self,
--- a/tests/integration_test_system_prompt_prefix_caching.py
+++ b/tests/integration_test_system_prompt_prefix_caching.py
@@ -0,0 +1,178 @@
 """
 Integration tests for system prompt prefix caching optimization.
 These tests verify that the system prompt is NOT rebuilt on every step,
 only after compaction or message reset. This helps preserve prefix caching
 for LLM providers.
 """
 import pytest
 from letta_client import Letta
@pytest.fixture(scope="module")
def client(server_url: str) -> Letta:
    """Build the synchronous Letta REST client shared by the tests in this module.

    The client is pointed at ``server_url`` and, because the fixture is
    module-scoped, constructed once per test module.
    """
    rest_client = Letta(base_url=server_url)
    return rest_client
@pytest.fixture(scope="function")
def agent(client: Letta):
    """Create a test agent for a single test and delete it afterwards.

    Yields the agent state returned by ``client.agents.create``. Teardown is
    best-effort: a failed delete must not fail the test that just ran, but it
    is logged (with traceback) so leaked agents can be noticed and cleaned up.
    """
    # Function-scope import keeps this fixture self-contained.
    import logging

    agent_state = client.agents.create(
        name="test-prefix-cache-agent",
        include_base_tools=True,
        model="openai/gpt-4o-mini",
        embedding="openai/text-embedding-ada-002",
    )
    yield agent_state

    # Cleanup: never raise from teardown, but do not swallow the error silently.
    try:
        client.agents.delete(agent_state.id)
    except Exception:
        logging.getLogger(__name__).warning(
            "Failed to delete test agent %s during cleanup",
            agent_state.id,
            exc_info=True,
        )
 class TestSystemPromptPrefixCaching:
    """Test that system prompt stays stable during normal agent execution.

    The system prompt is read through ``client.agents.context.retrieve`` and
    compared before and after memory edits, further messages and a message
    reset.
    """

    @staticmethod
    def _find_block(agent_state, label: str):
        """Return the first block in ``agent_state.memory.blocks`` with ``label``, or None."""
        return next((block for block in agent_state.memory.blocks if block.label == label), None)

    def test_system_prompt_stable_after_memory_tool_and_messages(self, client: Letta, agent):
        """
        Test workflow:
        1. Get initial system prompt and human block value
        2. Tell agent to update its memory block using the memory tool
        3. Verify block was modified but system prompt hasn't changed
        4. Send another message to the agent
        5. Verify system prompt still hasn't changed
        6. Manually update a block via API
        7. Send another message and verify system prompt still hasn't changed
           (memory block changes are deferred to compaction)
        """
        # Step 1: Get initial context window, system prompt, and human block value
        initial_context = client.agents.context.retrieve(agent.id)
        initial_system_prompt = initial_context.system_prompt
        assert initial_system_prompt, "Initial system prompt should not be empty"

        # `agent` is the state captured at creation time; it is only used here
        # to learn the block id and the starting value.
        human_block = self._find_block(agent, "human")
        assert human_block, "Agent should have a 'human' memory block"
        initial_block_value = human_block.value

        # Step 2: Tell the agent to update its memory using the memory tool
        response = client.agents.messages.create(
            agent_id=agent.id,
            messages=[
                {
                    "role": "user",
                    "content": "Please use the core_memory_append tool to add the following to your 'human' block: 'User likes pizza.'",
                }
            ],
        )
        assert response.messages, "Agent should respond with messages"

        # Step 3: Verify block was modified but system prompt hasn't changed
        # Check that the block was actually modified
        updated_block = client.blocks.retrieve(human_block.id)
        assert updated_block.value != initial_block_value, "Memory block should have been modified by the agent"
        assert "pizza" in updated_block.value.lower(), "Memory block should contain the new content about pizza"

        # Verify system prompt hasn't changed
        context_after_memory_update = client.agents.context.retrieve(agent.id)
        system_prompt_after_memory = context_after_memory_update.system_prompt
        assert system_prompt_after_memory == initial_system_prompt, (
            "System prompt should NOT change after agent uses memory tool (deferred to compaction)"
        )

        # Step 4: Send another message to the agent
        response2 = client.agents.messages.create(
            agent_id=agent.id,
            messages=[
                {
                    "role": "user",
                    "content": "What is my favorite food?",
                }
            ],
        )
        assert response2.messages, "Agent should respond with messages"

        # Step 5: Verify system prompt still hasn't changed
        context_after_second_message = client.agents.context.retrieve(agent.id)
        system_prompt_after_second = context_after_second_message.system_prompt
        assert system_prompt_after_second == initial_system_prompt, "System prompt should remain stable after multiple messages"

        # Step 6: Manually update a block via the API
        # Read the block's current value from the server first: the value held
        # by the `agent` fixture predates the agent's own edit in step 2, and
        # appending to it would overwrite that edit instead of extending it.
        current_block = client.blocks.retrieve(human_block.id)
        client.blocks.modify(
            block_id=human_block.id,
            value=current_block.value + "\nUser also likes sushi.",
        )

        # Step 7: Send another message and verify system prompt still hasn't changed
        response3 = client.agents.messages.create(
            agent_id=agent.id,
            messages=[
                {
                    "role": "user",
                    "content": "What foods do I like?",
                }
            ],
        )
        assert response3.messages, "Agent should respond with messages"

        # Verify system prompt STILL hasn't changed (deferred to compaction/reset)
        context_after_manual_update = client.agents.context.retrieve(agent.id)
        system_prompt_after_manual = context_after_manual_update.system_prompt
        assert system_prompt_after_manual == initial_system_prompt, (
            "System prompt should NOT change after manual block update (deferred to compaction)"
        )

    def test_system_prompt_updates_after_reset(self, client: Letta, agent):
        """
        Test that system prompt IS updated after message reset.
        1. Get initial system prompt
        2. Manually update a memory block
        3. Reset messages
        4. Verify system prompt HAS changed to include the new memory
        """
        # Step 1: Get initial system prompt
        initial_context = client.agents.context.retrieve(agent.id)
        initial_system_prompt = initial_context.system_prompt
        assert initial_system_prompt, "Initial system prompt should not be empty"

        # Step 2: Manually update a block via the API
        human_block = self._find_block(agent, "human")
        assert human_block, "Agent should have a 'human' memory block"

        # Add distinctive text that we can verify in the system prompt
        marker = "UNIQUE_TEST_MARKER_12345"
        new_memory_content = f"{marker}: User loves ice cream."
        client.blocks.modify(
            block_id=human_block.id,
            value=human_block.value + f"\n{new_memory_content}",
        )

        # Step 3: Reset messages (this should trigger system prompt rebuild)
        client.agents.messages.reset(agent.id)

        # Step 4: Verify system prompt HAS changed and includes the new memory
        context_after_reset = client.agents.context.retrieve(agent.id)
        system_prompt_after_reset = context_after_reset.system_prompt
        assert system_prompt_after_reset != initial_system_prompt, "System prompt SHOULD change after message reset"
        assert marker in system_prompt_after_reset, (
            "System prompt should include the updated memory block content after reset"
        )