diff --git a/letta/agents/letta_agent_v3.py b/letta/agents/letta_agent_v3.py index bd08ff0e..f19e2916 100644 --- a/letta/agents/letta_agent_v3.py +++ b/letta/agents/letta_agent_v3.py @@ -63,6 +63,7 @@ from letta.services.summarizer.summarizer_all import summarize_all from letta.services.summarizer.summarizer_config import CompactionSettings from letta.services.summarizer.summarizer_sliding_window import ( count_tokens, + count_tokens_with_tools, summarize_via_sliding_window, ) from letta.settings import settings, summarizer_settings @@ -1740,9 +1741,12 @@ class LettaAgentV3(LettaAgentV2): else: raise ValueError(f"Invalid summarizer mode: {summarizer_config.mode}") - # update the token count - self.context_token_estimate = await count_tokens( - actor=self.actor, llm_config=self.agent_state.llm_config, messages=compacted_messages + # update the token count (including tools for accurate comparison with LLM's prompt_tokens) + self.context_token_estimate = await count_tokens_with_tools( + actor=self.actor, + llm_config=self.agent_state.llm_config, + messages=compacted_messages, + tools=self.agent_state.tools, ) self.logger.info(f"Context token estimate after summarization: {self.context_token_estimate}") @@ -1775,8 +1779,11 @@ class LettaAgentV3(LettaAgentV2): ) summarization_mode_used = "all" - self.context_token_estimate = await count_tokens( - actor=self.actor, llm_config=self.agent_state.llm_config, messages=compacted_messages + self.context_token_estimate = await count_tokens_with_tools( + actor=self.actor, + llm_config=self.agent_state.llm_config, + messages=compacted_messages, + tools=self.agent_state.tools, ) # final edge case: the system prompt is the cause of the context overflow (raise error) diff --git a/letta/services/summarizer/summarizer_sliding_window.py b/letta/services/summarizer/summarizer_sliding_window.py index 87739393..96902363 100644 --- a/letta/services/summarizer/summarizer_sliding_window.py +++ 
async def count_tokens_with_tools(
    actor: User,
    llm_config: LLMConfig,
    messages: List[Message],
    tools: Optional[List["Tool"]] = None,
) -> int:
    """Count tokens in messages AND tool definitions.

    This provides a more accurate context token count by including tool definitions,
    which are sent to the LLM but not included in the messages list.

    Args:
        actor: The user making the request.
        llm_config: The LLM configuration for selecting the appropriate tokenizer.
        messages: The in-context messages (including system message).
        tools: Optional list of Tool objects. Only tools with a truthy
            ``json_schema`` are counted; the rest are skipped.

    Returns:
        Total token count for messages + tools. This is the message-only count
        when ``tools`` is None/empty or when no tool carries a schema.
    """
    # Delegate message counting to the existing helper.
    message_tokens = await count_tokens(actor, llm_config, messages)

    if not tools:
        return message_tokens

    # Imported lazily so the dependency is only touched when tools are actually counted.
    from openai.types.beta.function_tool import FunctionTool as OpenAITool

    tool_definitions = [OpenAITool(type="function", function=t.json_schema) for t in tools if t.json_schema]
    if not tool_definitions:
        # No tool has a schema, so there is nothing to count: skip building a
        # token counter just to add zero.
        return message_tokens

    from letta.services.context_window_calculator.token_counter import ApproxTokenCounter

    token_counter = create_token_counter(
        model_endpoint_type=llm_config.model_endpoint_type,
        model=llm_config.model,
        actor=actor,
    )
    tool_tokens = await token_counter.count_tool_tokens(tool_definitions)

    # Apply safety margin for approximate counting (message_tokens already has margin applied)
    if isinstance(token_counter, ApproxTokenCounter):
        tool_tokens = int(tool_tokens * APPROX_TOKEN_SAFETY_MARGIN)

    return message_tokens + tool_tokens
100644 --- a/tests/integration_test_summarizer.py +++ b/tests/integration_test_summarizer.py @@ -1046,10 +1046,10 @@ async def test_v3_summarize_hard_eviction_when_still_over_threshold( # summarize_conversation_history to run and then hit the branch where the # *post*-summarization token count is still above the proactive # summarization threshold. We simulate that by patching the - # letta_agent_v3-level count_tokens helper to report an extremely large + # letta_agent_v3-level count_tokens_with_tools helper to report an extremely large # token count for the first call (post-summary) and a small count for the # second call (after hard eviction). - with patch("letta.agents.letta_agent_v3.count_tokens") as mock_count_tokens: + with patch("letta.agents.letta_agent_v3.count_tokens_with_tools") as mock_count_tokens: # First call: pretend the summarized context is still huge relative to # this model's context window so that we always trigger the # hard-eviction path. Second call: minimal context (system only) is