feat: add new agent loop (squash rebase of OSS PR) (#4815)

* feat: squash rebase of OSS PR

* fix: revert changes that weren't on manual rebase

* fix: caught another one

* fix: disable force

* chore: drop print

* fix: just stage-api && just publish-api

* fix: make agent_type consistently an arg in the client

* fix: patch multi-modal support

* chore: put in todo stub

* fix: disable hardcoding for tests

* fix: patch validate agent sync (#4882)

patch validate agent sync

* fix: strip bad merge diff

* fix: revert unrelated diff

* fix: react_v2 naming -> letta_v1 naming

* fix: strip bad merge

---------

Co-authored-by: Kevin Lin <klin5061@gmail.com>
This commit is contained in:
Charles Packer
2025-09-23 17:49:59 -07:00
committed by Caren Thomas
parent 9edc7f4d64
commit a4041879a4
37 changed files with 3315 additions and 237 deletions

View File

@@ -34,6 +34,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
from letta.local_llm.utils import count_tokens
from letta.log import get_logger
from letta.otel.tracing import trace_method
from letta.schemas.agent import AgentType
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
from letta.schemas.openai.chat_completion_request import Tool
@@ -274,6 +275,7 @@ class GoogleVertexClient(LLMClientBase):
@trace_method
def build_request_data(
self,
agent_type: AgentType, # if react, use native content + strip heartbeats
messages: List[PydanticMessage],
llm_config: LLMConfig,
tools: List[dict],
@@ -282,6 +284,9 @@ class GoogleVertexClient(LLMClientBase):
"""
Constructs a request object in the expected data format for this client.
"""
# NOTE: forcing inner thoughts in kwargs off
if agent_type == AgentType.letta_v1_agent:
llm_config.put_inner_thoughts_in_kwargs = False
if tools:
tool_objs = [Tool(type="function", function=t) for t in tools]
@@ -293,7 +298,11 @@ class GoogleVertexClient(LLMClientBase):
tool_names = []
contents = self.add_dummy_model_messages(
PydanticMessage.to_google_dicts_from_list(messages),
PydanticMessage.to_google_dicts_from_list(
messages,
put_inner_thoughts_in_kwargs=False if agent_type == AgentType.letta_v1_agent else True,
native_content=True if agent_type == AgentType.letta_v1_agent else False,
),
)
request_data = {
@@ -312,16 +321,42 @@ class GoogleVertexClient(LLMClientBase):
request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0])
del request_data["config"]["tools"]
elif tools:
tool_config = ToolConfig(
function_calling_config=FunctionCallingConfig(
# ANY mode forces the model to predict only function calls
mode=FunctionCallingConfigMode.ANY,
# Provide the list of tools (though empty should also work, it seems not to)
allowed_function_names=tool_names,
if agent_type == AgentType.letta_v1_agent:
# don't require tools
tool_call_mode = FunctionCallingConfigMode.AUTO
tool_config = ToolConfig(
function_calling_config=FunctionCallingConfig(
mode=tool_call_mode,
)
)
)
else:
# require tools
tool_call_mode = FunctionCallingConfigMode.ANY
tool_config = ToolConfig(
function_calling_config=FunctionCallingConfig(
mode=tool_call_mode,
# Provide the list of tools (though empty should also work, it seems not to)
allowed_function_names=tool_names,
)
)
request_data["config"]["tool_config"] = tool_config.model_dump()
# https://ai.google.dev/gemini-api/docs/thinking#set-budget
# 2.5 Pro
# - Default: dynamic thinking
# - Dynamic thinking that cannot be disabled
# - Range: -1 (for dynamic), or 128-32768
# 2.5 Flash
# - Default: dynamic thinking
# - Dynamic thinking that *can* be disabled
# - Range: -1, 0, or 0-24576
# 2.5 Flash Lite
# - Default: no thinking
# - Dynamic thinking that *can* be disabled
# - Range: -1, 0, or 512-24576
# TODO when using v3 agent loop, properly support the native thinking in Gemini
# Add thinking_config for flash
# If enable_reasoner is False, set thinking_budget to 0
# Otherwise, use the value from max_reasoning_tokens
@@ -410,8 +445,10 @@ class GoogleVertexClient(LLMClientBase):
function_args = function_call.args
assert isinstance(function_args, dict), function_args
# NOTE: this also involves stripping the inner monologue out of the function
# TODO this is kind of funky - really, we should be passing 'native_content' as a kwarg to fork behavior
inner_thoughts = response_message.text
if llm_config.put_inner_thoughts_in_kwargs:
# NOTE: this also involves stripping the inner monologue out of the function
from letta.local_llm.constants import INNER_THOUGHTS_KWARG_VERTEX
assert INNER_THOUGHTS_KWARG_VERTEX in function_args, (
@@ -420,7 +457,9 @@ class GoogleVertexClient(LLMClientBase):
inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG_VERTEX)
assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
else:
inner_thoughts = None
pass
# inner_thoughts = None
# inner_thoughts = response_message.text
# Google AI API doesn't generate tool call IDs
openai_response_message = Message(