From b4e19f9a70bf5e2a6c332819f867a891926a7f3b Mon Sep 17 00:00:00 2001
From: Sarah Wooders <sarahwooders@gmail.com>
Date: Tue, 8 Apr 2025 21:10:48 -0700
Subject: [PATCH] fix: patch summarizer for google and use new client  (#1639)

---
 letta/agent.py                       |  9 ++++++++-
 letta/llm_api/anthropic_client.py    |  9 +++++----
 letta/llm_api/openai_client.py       |  4 +++-
 letta/memory.py                      | 24 ++++++++++++++++++++----
 tests/integration_test_summarizer.py | 15 ---------------
 5 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/letta/agent.py b/letta/agent.py
index c4a5919c..f0941229 100644
--- a/letta/agent.py
+++ b/letta/agent.py
@@ -376,7 +376,6 @@ class Agent(BaseAgent):
                     else:
                         raise ValueError(f"Bad finish reason from API: {response.choices[0].finish_reason}")
                 log_telemetry(self.logger, "_handle_ai_response finish")
-                return response
 
             except ValueError as ve:
                 if attempt >= empty_response_retry_limit:
@@ -393,6 +392,14 @@ class Agent(BaseAgent):
                 log_telemetry(self.logger, "_handle_ai_response finish generic Exception")
                 raise e
 
+            # check if we are going over the context window: this allows for articifial constraints
+            if response.usage.total_tokens > self.agent_state.llm_config.context_window:
+                # trigger summarization
+                log_telemetry(self.logger, "_get_ai_reply summarize_messages_inplace")
+                self.summarize_messages_inplace()
+            # return the response
+            return response
+
         log_telemetry(self.logger, "_handle_ai_response finish catch-all exception")
         raise Exception("Retries exhausted and no valid response received.")
 
diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py
index 944858c2..9aabc8dd 100644
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -90,7 +90,7 @@ class AnthropicClient(LLMClientBase):
     def build_request_data(
         self,
         messages: List[PydanticMessage],
-        tools: List[dict],
+        tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
     ) -> dict:
         # TODO: This needs to get cleaned up. The logic here is pretty confusing.
@@ -146,11 +146,12 @@ class AnthropicClient(LLMClientBase):
             tools_for_request = [Tool(function=f) for f in tools] if tools is not None else None
 
         # Add tool choice
-        data["tool_choice"] = tool_choice
+        if tool_choice:
+            data["tool_choice"] = tool_choice
 
         # Add inner thoughts kwarg
         # TODO: Can probably make this more efficient
-        if len(tools_for_request) > 0 and self.llm_config.put_inner_thoughts_in_kwargs:
+        if tools_for_request and len(tools_for_request) > 0 and self.llm_config.put_inner_thoughts_in_kwargs:
             tools_with_inner_thoughts = add_inner_thoughts_to_functions(
                 functions=[t.function.model_dump() for t in tools_for_request],
                 inner_thoughts_key=INNER_THOUGHTS_KWARG,
@@ -158,7 +159,7 @@ class AnthropicClient(LLMClientBase):
             )
             tools_for_request = [Tool(function=f) for f in tools_with_inner_thoughts]
 
-        if len(tools_for_request) > 0:
+        if tools_for_request and len(tools_for_request) > 0:
             # TODO eventually enable parallel tool use
             data["tools"] = convert_tools_to_anthropic_format(tools_for_request)
 
diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 0d996977..856c795f 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -78,9 +78,11 @@ class OpenAIClient(LLMClientBase):
         # force function calling for reliability, see https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
         # TODO(matt) move into LLMConfig
         # TODO: This vllm checking is very brittle and is a patch at most
+        tool_choice = None
         if self.llm_config.model_endpoint == "https://inference.memgpt.ai" or (self.llm_config.handle and "vllm" in self.llm_config.handle):
             tool_choice = "auto"  # TODO change to "required" once proxy supports it
-        else:
+        elif tools:
+            # only set if tools is non-Null
             tool_choice = "required"
 
         if force_tool_call is not None:
diff --git a/letta/memory.py b/letta/memory.py
index 7f7d20d9..8554709e 100644
--- a/letta/memory.py
+++ b/letta/memory.py
@@ -2,6 +2,7 @@ from typing import Callable, Dict, List
 
 from letta.constants import MESSAGE_SUMMARY_REQUEST_ACK
 from letta.llm_api.llm_api_tools import create
+from letta.llm_api.llm_client import LLMClient
 from letta.prompts.gpt_summarize import SYSTEM as SUMMARY_PROMPT_SYSTEM
 from letta.schemas.agent import AgentState
 from letta.schemas.enums import MessageRole
@@ -9,6 +10,7 @@ from letta.schemas.letta_message_content import TextContent
 from letta.schemas.memory import Memory
 from letta.schemas.message import Message
 from letta.settings import summarizer_settings
+from letta.tracing import trace_method
 from letta.utils import count_tokens, printd
 
 
@@ -45,6 +47,7 @@ def _format_summary_history(message_history: List[Message]):
     return "\n".join([f"{m.role}: {get_message_text(m.content)}" for m in message_history])
 
 
+@trace_method
 def summarize_messages(
     agent_state: AgentState,
     message_sequence_to_summarize: List[Message],
@@ -74,12 +77,25 @@ def summarize_messages(
     # TODO: We need to eventually have a separate LLM config for the summarizer LLM
     llm_config_no_inner_thoughts = agent_state.llm_config.model_copy(deep=True)
     llm_config_no_inner_thoughts.put_inner_thoughts_in_kwargs = False
-    response = create(
+
+    llm_client = LLMClient.create(
         llm_config=llm_config_no_inner_thoughts,
-        user_id=agent_state.created_by_id,
-        messages=message_sequence,
-        stream=False,
+        put_inner_thoughts_first=False,
     )
+    # try to use new client, otherwise fallback to old flow
+    # TODO: we can just directly call the LLM here?
+    if llm_client:
+        response = llm_client.send_llm_request(
+            messages=message_sequence,
+            stream=False,
+        )
+    else:
+        response = create(
+            llm_config=llm_config_no_inner_thoughts,
+            user_id=agent_state.created_by_id,
+            messages=message_sequence,
+            stream=False,
+        )
 
     printd(f"summarize_messages gpt reply: {response.choices[0]}")
     reply = response.choices[0].message.content
diff --git a/tests/integration_test_summarizer.py b/tests/integration_test_summarizer.py
index b47ce3fa..ff446b4d 100644
--- a/tests/integration_test_summarizer.py
+++ b/tests/integration_test_summarizer.py
@@ -130,21 +130,6 @@ def test_summarize_many_messages_basic(client, disable_e2b_api_key):
     client.delete_agent(small_agent_state.id)
 
 
-def test_summarize_large_message_does_not_loop_infinitely(client, disable_e2b_api_key):
-    small_context_llm_config = LLMConfig.default_config("gpt-4o-mini")
-    small_context_llm_config.context_window = 2000
-    small_agent_state = client.create_agent(
-        name="super_small_context_agent",
-        llm_config=small_context_llm_config,
-    )
-    with pytest.raises(ContextWindowExceededError, match=f"Ran summarizer {summarizer_settings.max_summarizer_retries}"):
-        client.user_message(
-            agent_id=small_agent_state.id,
-            message="hi " * 1000,
-        )
-    client.delete_agent(small_agent_state.id)
-
-
 def test_summarize_messages_inplace(client, agent_state, disable_e2b_api_key):
     """Test summarization via sending the summarize CLI command or via a direct call to the agent object"""
     # First send a few messages (5)