From e349ba3bddbd3c0142b4ca35e1c959272e844a6f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 25 Nov 2025 15:59:01 -0800 Subject: [PATCH] feat: support programmatic tool calling for custom tools [LET-6316] (#6369) --- .../tool_executor/builtin_tool_executor.py | 12 ++ tests/integration_test_builtin_tools.py | 183 +++++++++++++++++- 2 files changed, 192 insertions(+), 3 deletions(-) diff --git a/letta/services/tool_executor/builtin_tool_executor.py b/letta/services/tool_executor/builtin_tool_executor.py index d0c04be3..c5033fa4 100644 --- a/letta/services/tool_executor/builtin_tool_executor.py +++ b/letta/services/tool_executor/builtin_tool_executor.py @@ -51,6 +51,18 @@ class LettaBuiltinToolExecutor(ToolExecutor): raise ValueError("E2B_API_KEY is not set") sbx = await AsyncSandbox.create(api_key=tool_settings.e2b_api_key) + + # Inject source code from agent's tools to enable programmatic tool calling + # This allows Claude to compose tools in a single code execution, e.g.: + # run_code("result = add(multiply(4, 5), 6)") + if language == "python" and agent_state and agent_state.tools: + tool_source_code = "" + for tool in agent_state.tools: + if tool.source_code: + tool_source_code += tool.source_code + "\n\n" + if tool_source_code: + code = tool_source_code + code + params = {"code": code} if language != "python": # Leave empty for python diff --git a/tests/integration_test_builtin_tools.py b/tests/integration_test_builtin_tools.py index 827d0f8c..fd7433a3 100644 --- a/tests/integration_test_builtin_tools.py +++ b/tests/integration_test_builtin_tools.py @@ -10,8 +10,10 @@ import requests from dotenv import load_dotenv from letta_client import Letta from letta_client.types import AgentState, MessageCreateParam, ToolReturnMessage +from letta_client.types.agents import ToolCallMessage from letta.services.tool_executor.builtin_tool_executor import LettaBuiltinToolExecutor +from letta.settings import tool_settings # ------------------------------ # Fixtures @@ -72,9 +74,9 @@ def agent_state(client: Letta) -> AgentState: """ client.tools.upsert_base_tools() - send_message_tool = list(client.tools.list(name="send_message"))[0] - run_code_tool = list(client.tools.list(name="run_code"))[0] - web_search_tool = list(client.tools.list(name="web_search"))[0] + send_message_tool = client.tools.list(name="send_message").items[0] + run_code_tool = client.tools.list(name="run_code").items[0] + web_search_tool = client.tools.list(name="web_search").items[0] agent_state_instance = client.agents.create( name="test_builtin_tools_agent", include_base_tools=False, @@ -311,3 +313,178 @@ async def test_web_search_uses_exa(): assert "results" in response_json assert response_json["query"] == "test query" assert len(response_json["results"]) == 1 + + +# ------------------------------ +# Programmatic Tool Calling Tests +# ------------------------------ + + +ADD_TOOL_SOURCE = """ +def add(a: int, b: int) -> int: + \"\"\"Add two numbers together. + + Args: + a (int): The first number. + b (int): The second number. + + Returns: + int: The sum of a and b. + \"\"\" + return a + b +""" + +MULTIPLY_TOOL_SOURCE = """ +def multiply(a: int, b: int) -> int: + \"\"\"Multiply two numbers together. + + Args: + a (int): The first number. + b (int): The second number. + + Returns: + int: The product of a and b. + \"\"\" + return a * b +""" + + +@pytest.fixture(scope="function") +def agent_with_custom_tools(client: Letta) -> AgentState: + """ + Creates an agent with custom add/multiply tools and run_code tool + to test programmatic tool calling. + """ + client.tools.upsert_base_tools() + + # Create custom tools + add_tool = client.tools.create(source_code=ADD_TOOL_SOURCE) + multiply_tool = client.tools.create(source_code=MULTIPLY_TOOL_SOURCE) + + # Get the run_code tool + run_code_tool = client.tools.list(name="run_code").items[0] + send_message_tool = client.tools.list(name="send_message").items[0] + + agent_state_instance = client.agents.create( + name="test_programmatic_tool_calling_agent", + include_base_tools=False, + tool_ids=[send_message_tool.id, run_code_tool.id, add_tool.id, multiply_tool.id], + model="openai/gpt-4o", + embedding="letta/letta-free", + tags=["test_programmatic_tool_calling"], + ) + yield agent_state_instance + + # Cleanup + client.agents.delete(agent_state_instance.id) + client.tools.delete(add_tool.id) + client.tools.delete(multiply_tool.id) + + +def test_programmatic_tool_calling_compose_tools( + client: Letta, + agent_with_custom_tools: AgentState, +) -> None: + """ + Tests that run_code can compose agent tools programmatically in a SINGLE call. + This validates that: + 1. Tool source code is injected into the sandbox + 2. Claude composes tools in one run_code call, not multiple separate tool calls + 3. The result is computed correctly: add(multiply(4, 5), 6) = 26 + """ + # Expected result: multiply(4, 5) = 20, add(20, 6) = 26 + expected = "26" + + user_message = MessageCreateParam( + role="user", + content=( + "Use the run_code tool to execute Python code that composes the add and multiply tools. " + "Calculate add(multiply(4, 5), 6) and return the result. " + "The add and multiply functions are already available in the code execution environment. " + "Do this in a SINGLE run_code call - do NOT call add or multiply as separate tools." + ), + otid=str(uuid.uuid4()), + ) + + response = client.agents.messages.create( + agent_id=agent_with_custom_tools.id, + messages=[user_message], + ) + + # Extract all tool calls + tool_calls = [m for m in response.messages if isinstance(m, ToolCallMessage)] + assert tool_calls, "No ToolCallMessage found for programmatic tool calling test" + + # Verify the agent used run_code to compose tools, not direct add/multiply calls + tool_names = [m.tool_call.name for m in tool_calls] + run_code_calls = [name for name in tool_names if name == "run_code"] + direct_add_calls = [name for name in tool_names if name == "add"] + direct_multiply_calls = [name for name in tool_names if name == "multiply"] + + # The key assertion: tools should be composed via run_code, not called directly + assert len(run_code_calls) >= 1, f"Expected at least one run_code call, but got tool calls: {tool_names}" + assert len(direct_add_calls) == 0, ( + f"Expected no direct 'add' tool calls (should be called via run_code), but found {len(direct_add_calls)}" + ) + assert len(direct_multiply_calls) == 0, ( + f"Expected no direct 'multiply' tool calls (should be called via run_code), but found {len(direct_multiply_calls)}" + ) + + # Verify the result is correct + tool_returns = [m for m in response.messages if isinstance(m, ToolReturnMessage)] + returns = [m.tool_return for m in tool_returns] + assert any(expected in ret for ret in returns), f"Expected to find '{expected}' in tool_return, but got {returns!r}" + + +@pytest.mark.asyncio(scope="function") +async def test_run_code_injects_tool_source_code() -> None: + """ + Unit test that verifies run_code injects agent tool source code into the sandbox. + This test directly calls run_code with a mocked agent_state containing tools. + """ + from letta.schemas.tool import Tool + + # Create mock agent state with tools that have source code + mock_agent_state = MagicMock() + mock_agent_state.tools = [ + Tool( + id="tool-00000001", + name="add", + source_code=ADD_TOOL_SOURCE.strip(), + ), + Tool( + id="tool-00000002", + name="multiply", + source_code=MULTIPLY_TOOL_SOURCE.strip(), + ), + ] + + # Skip if E2B_API_KEY is not set + if not tool_settings.e2b_api_key: + pytest.skip("E2B_API_KEY not set, skipping run_code test") + + # Create executor with mock dependencies + executor = LettaBuiltinToolExecutor( + message_manager=MagicMock(), + agent_manager=MagicMock(), + block_manager=MagicMock(), + run_manager=MagicMock(), + passage_manager=MagicMock(), + actor=MagicMock(), + ) + + # Execute code that composes the tools + # Note: We don't define add/multiply in the code - they should be injected from tool source + result = await executor.run_code( + agent_state=mock_agent_state, + code="print(add(multiply(4, 5), 6))", + language="python", + ) + + response_json = json.loads(result) + + # Verify execution succeeded and returned correct result + assert "error" not in response_json or response_json.get("error") is None, f"Code execution failed: {response_json}" + assert "26" in str(response_json["results"]) or "26" in str(response_json["logs"]["stdout"]), ( + f"Expected '26' in results, got: {response_json}" + )