From 227b76fe0e81e9c67d1bfcaee7923e7ef6f409f7 Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Tue, 1 Apr 2025 16:54:09 -0700
Subject: [PATCH] feat: Add testing for SDK `send_message` variants (#1520)

---
 poetry.lock                                   |   8 +-
 pyproject.toml                                |   2 +-
 tests/conftest.py                             |  14 +-
 tests/integration_test_agent_tool_graph.py    |  18 +-
 tests/integration_test_async_tool_sandbox.py  |  20 +-
 tests/integration_test_chat_completions.py    |   6 +-
 tests/integration_test_experimental.py        |   6 +-
 .../integration_test_offline_memory_agent.py  |   6 +-
 tests/integration_test_send_message.py        | 333 ++++++++++++++++++
 tests/integration_test_summarizer.py          |   8 +-
 ...integration_test_tool_execution_sandbox.py |  28 +-
 tests/test_agent_serialization.py             |   4 +-
 tests/test_client.py                          |   2 +-
 tests/test_client_legacy.py                   |  12 +-
 tests/test_model_letta_performance.py         |  14 +-
 tests/test_server.py                          |  16 +-
 tests/test_streaming.py                       |   2 +-
 17 files changed, 414 insertions(+), 85 deletions(-)
 create mode 100644 tests/integration_test_send_message.py

diff --git a/poetry.lock b/poetry.lock
index b788c521..aa195e18 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2695,13 +2695,13 @@ pytest = ["pytest (>=7.0.0)", "rich (>=13.9.4,<14.0.0)"]
 
 [[package]]
 name = "letta-client"
-version = "0.1.84"
+version = "0.1.91"
 description = ""
 optional = false
 python-versions = "<4.0,>=3.8"
 files = [
-    {file = "letta_client-0.1.84-py3-none-any.whl", hash = "sha256:ac82b1d043dd6182b71f1abb339bc6b855f6aa851023ae67ae92c8b7c39ce0b5"},
-    {file = "letta_client-0.1.84.tar.gz", hash = "sha256:5705db7e89b0f598bd3645c668a14c55bc7cbe55db35bfd291646ab3d6eec434"},
+    {file = "letta_client-0.1.91-py3-none-any.whl", hash = "sha256:eb4508177dcbed5c4abc5cb1929cf67a7189851d9c310cab4e9bc8e4ce4d4d3f"},
+    {file = "letta_client-0.1.91.tar.gz", hash = "sha256:26b9936c4fca9fc9238afeaa8ce25fa6d4ef30153c425f0cfdd54c19ca78e028"},
 ]
 
 [package.dependencies]
@@ -6707,4 +6707,4 @@ tests = ["wikipedia"]
 [metadata]
 lock-version = "2.0"
 python-versions = "<3.14,>=3.10"
-content-hash = "6863aa7a366a80c9b7ba0904e1034974969184ecef5bf48abd5e02c33167ec71"
+content-hash = "d5db02048c6ad56bd289a76b8fdf522284f330ff6993e0825bece04d0bdda2c8"
diff --git a/pyproject.toml b/pyproject.toml
index 4f2b127d..7048b5fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,7 +74,7 @@ llama-index = "^0.12.2"
 llama-index-embeddings-openai = "^0.3.1"
 e2b-code-interpreter = {version = "^1.0.3", optional = true}
 anthropic = "^0.49.0"
-letta_client = "^0.1.65"
+letta_client = "^0.1.91"
 openai = "^1.60.0"
 opentelemetry-api = "1.30.0"
 opentelemetry-sdk = "1.30.0"
diff --git a/tests/conftest.py b/tests/conftest.py
index 78e60df1..220438e2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Generator
 
 import pytest
 
@@ -12,19 +13,16 @@ def pytest_configure(config):
 
 
 @pytest.fixture
-def mock_e2b_api_key_none():
+def disable_e2b_api_key() -> Generator[None, None, None]:
+    """
+    Temporarily disables the E2B API key by setting `tool_settings.e2b_api_key` to None
+    for the duration of the test. Restores the original value afterward.
+    """
     from letta.settings import tool_settings
 
-    # Store the original value of e2b_api_key
     original_api_key = tool_settings.e2b_api_key
-
-    # Set e2b_api_key to None
     tool_settings.e2b_api_key = None
-
-    # Yield control to the test
     yield
-
-    # Restore the original value of e2b_api_key
     tool_settings.e2b_api_key = original_api_key
 
 
diff --git a/tests/integration_test_agent_tool_graph.py b/tests/integration_test_agent_tool_graph.py
index 6e17bd92..bc3aee7a 100644
--- a/tests/integration_test_agent_tool_graph.py
+++ b/tests/integration_test_agent_tool_graph.py
@@ -109,7 +109,7 @@ def auto_error():
 
 
 @pytest.mark.timeout(60)  # Sets a 60-second timeout for the test since this could loop infinitely
-def test_single_path_agent_tool_call_graph(mock_e2b_api_key_none):
+def test_single_path_agent_tool_call_graph(disable_e2b_api_key):
     client = create_client()
     cleanup(client=client, agent_uuid=agent_uuid)
 
@@ -162,7 +162,7 @@ def test_single_path_agent_tool_call_graph(mock_e2b_api_key_none):
     cleanup(client=client, agent_uuid=agent_uuid)
 
 
-def test_check_tool_rules_with_different_models(mock_e2b_api_key_none):
+def test_check_tool_rules_with_different_models(disable_e2b_api_key):
     """Test that tool rules are properly checked for different model configurations."""
     client = create_client()
 
@@ -211,7 +211,7 @@ def test_check_tool_rules_with_different_models(mock_e2b_api_key_none):
         cleanup(client=client, agent_uuid=agent_uuid)
 
 
-def test_claude_initial_tool_rule_enforced(mock_e2b_api_key_none):
+def test_claude_initial_tool_rule_enforced(disable_e2b_api_key):
     """Test that the initial tool rule is enforced for the first message."""
     client = create_client()
 
@@ -262,7 +262,7 @@ def test_claude_initial_tool_rule_enforced(mock_e2b_api_key_none):
 
 
 @pytest.mark.timeout(60)  # Sets a 60-second timeout for the test since this could loop infinitely
-def test_agent_no_structured_output_with_one_child_tool(mock_e2b_api_key_none):
+def test_agent_no_structured_output_with_one_child_tool(disable_e2b_api_key):
     client = create_client()
     cleanup(client=client, agent_uuid=agent_uuid)
 
@@ -327,7 +327,7 @@ def test_agent_no_structured_output_with_one_child_tool(mock_e2b_api_key_none):
 
 
 # @pytest.mark.timeout(60)  # Sets a 60-second timeout for the test since this could loop infinitely
-# def test_agent_conditional_tool_easy(mock_e2b_api_key_none):
+# def test_agent_conditional_tool_easy(disable_e2b_api_key):
 #     """
 #     Test the agent with a conditional tool that has a child tool.
 #
@@ -395,7 +395,7 @@ def test_agent_no_structured_output_with_one_child_tool(mock_e2b_api_key_none):
 
 
 # @pytest.mark.timeout(60)
-# def test_agent_conditional_tool_without_default_child(mock_e2b_api_key_none):
+# def test_agent_conditional_tool_without_default_child(disable_e2b_api_key):
 #     """
 #     Test the agent with a conditional tool that allows any child tool to be called if a function returns None.
 #
@@ -456,7 +456,7 @@ def test_agent_no_structured_output_with_one_child_tool(mock_e2b_api_key_none):
 
 
 # @pytest.mark.timeout(60)
-# def test_agent_reload_remembers_function_response(mock_e2b_api_key_none):
+# def test_agent_reload_remembers_function_response(disable_e2b_api_key):
 #     """
 #     Test that when an agent is reloaded, it remembers the last function response for conditional tool chaining.
 #
@@ -512,7 +512,7 @@ def test_agent_no_structured_output_with_one_child_tool(mock_e2b_api_key_none):
 
 
 # @pytest.mark.timeout(60)  # Sets a 60-second timeout for the test since this could loop infinitely
-# def test_simple_tool_rule(mock_e2b_api_key_none):
+# def test_simple_tool_rule(disable_e2b_api_key):
 #     """
 #     Test a simple tool rule where fourth_secret_word must be called after flip_coin.
 #
@@ -676,7 +676,7 @@ def test_continue_tool_rule():
 
 @pytest.mark.timeout(60)
 @retry_until_success(max_attempts=3, sleep_time_seconds=2)
-def test_max_count_per_step_tool_rule_integration(mock_e2b_api_key_none):
+def test_max_count_per_step_tool_rule_integration(disable_e2b_api_key):
     """
     Test an agent with MaxCountPerStepToolRule to ensure a tool can only be called a limited number of times.
 
diff --git a/tests/integration_test_async_tool_sandbox.py b/tests/integration_test_async_tool_sandbox.py
index 11c64526..b85728db 100644
--- a/tests/integration_test_async_tool_sandbox.py
+++ b/tests/integration_test_async_tool_sandbox.py
@@ -253,7 +253,7 @@ def core_memory_tools(test_user):
 
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
-async def test_local_sandbox_default(mock_e2b_api_key_none, add_integers_tool, test_user):
+async def test_local_sandbox_default(disable_e2b_api_key, add_integers_tool, test_user):
     args = {"x": 10, "y": 5}
 
     # Mock and assert correct pathway was invoked
@@ -270,7 +270,7 @@ async def test_local_sandbox_default(mock_e2b_api_key_none, add_integers_tool, t
 
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
-async def test_local_sandbox_stateful_tool(mock_e2b_api_key_none, clear_core_memory_tool, test_user, agent_state):
+async def test_local_sandbox_stateful_tool(disable_e2b_api_key, clear_core_memory_tool, test_user, agent_state):
     args = {}
     sandbox = AsyncToolSandboxLocal(clear_core_memory_tool.name, args, user=test_user)
     result = await sandbox.run(agent_state=agent_state)
@@ -282,7 +282,7 @@ async def test_local_sandbox_stateful_tool(mock_e2b_api_key_none, clear_core_mem
 
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
-async def test_local_sandbox_with_list_rv(mock_e2b_api_key_none, list_tool, test_user):
+async def test_local_sandbox_with_list_rv(disable_e2b_api_key, list_tool, test_user):
     sandbox = AsyncToolSandboxLocal(list_tool.name, {}, user=test_user)
     result = await sandbox.run()
     assert len(result.func_return) == 5
@@ -290,7 +290,7 @@ async def test_local_sandbox_with_list_rv(mock_e2b_api_key_none, list_tool, test
 
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
-async def test_local_sandbox_env(mock_e2b_api_key_none, get_env_tool, test_user):
+async def test_local_sandbox_env(disable_e2b_api_key, get_env_tool, test_user):
     manager = SandboxConfigManager()
     sandbox_dir = str(Path(__file__).parent / "test_tool_sandbox")
     config_create = SandboxConfigCreate(config=LocalSandboxConfig(sandbox_dir=sandbox_dir).model_dump())
@@ -309,7 +309,7 @@ async def test_local_sandbox_env(mock_e2b_api_key_none, get_env_tool, test_user)
 
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
-async def test_local_sandbox_per_agent_env(mock_e2b_api_key_none, get_env_tool, agent_state, test_user):
+async def test_local_sandbox_per_agent_env(disable_e2b_api_key, get_env_tool, agent_state, test_user):
     manager = SandboxConfigManager()
     key = "secret_word"
     sandbox_dir = str(Path(__file__).parent / "test_tool_sandbox")
@@ -331,7 +331,7 @@ async def test_local_sandbox_per_agent_env(mock_e2b_api_key_none, get_env_tool,
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
 async def test_local_sandbox_external_codebase_with_venv(
-    mock_e2b_api_key_none, custom_test_sandbox_config, external_codebase_tool, test_user
+    disable_e2b_api_key, custom_test_sandbox_config, external_codebase_tool, test_user
 ):
     args = {"percentage": 10}
     sandbox = AsyncToolSandboxLocal(external_codebase_tool.name, args, user=test_user)
@@ -343,7 +343,7 @@ async def test_local_sandbox_external_codebase_with_venv(
 @pytest.mark.asyncio
 @pytest.mark.local_sandbox
 async def test_local_sandbox_with_venv_and_warnings_does_not_error(
-    mock_e2b_api_key_none, custom_test_sandbox_config, get_warning_tool, test_user
+    disable_e2b_api_key, custom_test_sandbox_config, get_warning_tool, test_user
 ):
     sandbox = AsyncToolSandboxLocal(get_warning_tool.name, {}, user=test_user)
     result = await sandbox.run()
@@ -352,7 +352,7 @@ async def test_local_sandbox_with_venv_and_warnings_does_not_error(
 
 @pytest.mark.asyncio
 @pytest.mark.e2b_sandbox
-async def test_local_sandbox_with_venv_errors(mock_e2b_api_key_none, custom_test_sandbox_config, always_err_tool, test_user):
+async def test_local_sandbox_with_venv_errors(disable_e2b_api_key, custom_test_sandbox_config, always_err_tool, test_user):
     sandbox = AsyncToolSandboxLocal(always_err_tool.name, {}, user=test_user)
     result = await sandbox.run()
     assert len(result.stdout) != 0
@@ -363,7 +363,7 @@ async def test_local_sandbox_with_venv_errors(mock_e2b_api_key_none, custom_test
 
 @pytest.mark.asyncio
 @pytest.mark.e2b_sandbox
-async def test_local_sandbox_with_venv_pip_installs_basic(mock_e2b_api_key_none, cowsay_tool, test_user):
+async def test_local_sandbox_with_venv_pip_installs_basic(disable_e2b_api_key, cowsay_tool, test_user):
     manager = SandboxConfigManager()
     config_create = SandboxConfigCreate(
         config=LocalSandboxConfig(use_venv=True, pip_requirements=[PipRequirement(name="cowsay")]).model_dump()
@@ -383,7 +383,7 @@ async def test_local_sandbox_with_venv_pip_installs_basic(mock_e2b_api_key_none,
 
 @pytest.mark.asyncio
 @pytest.mark.e2b_sandbox
-async def test_local_sandbox_with_venv_pip_installs_with_update(mock_e2b_api_key_none, cowsay_tool, test_user):
+async def test_local_sandbox_with_venv_pip_installs_with_update(disable_e2b_api_key, cowsay_tool, test_user):
     manager = SandboxConfigManager()
     config_create = SandboxConfigCreate(config=LocalSandboxConfig(use_venv=True).model_dump())
     config = manager.create_or_update_sandbox_config(config_create, test_user)
diff --git a/tests/integration_test_chat_completions.py b/tests/integration_test_chat_completions.py
index d14ace0e..3eb8d2bd 100644
--- a/tests/integration_test_chat_completions.py
+++ b/tests/integration_test_chat_completions.py
@@ -158,7 +158,7 @@ def _assert_valid_chunk(chunk, idx, chunks):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("message", ["Hi how are you today?"])
 @pytest.mark.parametrize("endpoint", ["v1/voice-beta"])
-async def test_latency(mock_e2b_api_key_none, client, agent, message, endpoint):
+async def test_latency(disable_e2b_api_key, client, agent, message, endpoint):
     """Tests chat completion streaming using the Async OpenAI client."""
     request = _get_chat_request(message)
 
@@ -172,7 +172,7 @@ async def test_latency(mock_e2b_api_key_none, client, agent, message, endpoint):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("message", ["Use recall memory tool to recall what my name is."])
 @pytest.mark.parametrize("endpoint", ["v1/voice-beta"])
-async def test_voice_recall_memory(mock_e2b_api_key_none, client, agent, message, endpoint):
+async def test_voice_recall_memory(disable_e2b_api_key, client, agent, message, endpoint):
     """Tests chat completion streaming using the Async OpenAI client."""
     request = _get_chat_request(message)
 
@@ -193,7 +193,7 @@ async def test_voice_recall_memory(mock_e2b_api_key_none, client, agent, message
 @pytest.mark.asyncio
 @pytest.mark.parametrize("message", ["Tell me something interesting about bananas.", "What's the weather in SF?"])
 @pytest.mark.parametrize("endpoint", ["openai/v1", "v1/voice-beta"])
-async def test_chat_completions_streaming_openai_client(mock_e2b_api_key_none, client, agent, message, endpoint):
+async def test_chat_completions_streaming_openai_client(disable_e2b_api_key, client, agent, message, endpoint):
     """Tests chat completion streaming using the Async OpenAI client."""
     request = _get_chat_request(message)
 
diff --git a/tests/integration_test_experimental.py b/tests/integration_test_experimental.py
index 111d5cc5..d1d4e486 100644
--- a/tests/integration_test_experimental.py
+++ b/tests/integration_test_experimental.py
@@ -238,7 +238,7 @@ def _assert_valid_chunk(chunk, idx, chunks):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("message", ["What is the weather today in SF?"])
-async def test_new_agent_loop(mock_e2b_api_key_none, openai_client, agent_state, message):
+async def test_new_agent_loop(disable_e2b_api_key, openai_client, agent_state, message):
     actor = UserManager().get_user_or_default(user_id="asf")
     agent = LettaAgent(
         agent_id=agent_state.id,
@@ -254,7 +254,7 @@ async def test_new_agent_loop(mock_e2b_api_key_none, openai_client, agent_state,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("message", ["Use your rethink tool to rethink the human memory considering Matt likes chicken."])
-async def test_rethink_tool(mock_e2b_api_key_none, openai_client, agent_state, message):
+async def test_rethink_tool(disable_e2b_api_key, openai_client, agent_state, message):
     actor = UserManager().get_user_or_default(user_id="asf")
     agent = LettaAgent(
         agent_id=agent_state.id,
@@ -271,7 +271,7 @@ async def test_rethink_tool(mock_e2b_api_key_none, openai_client, agent_state, m
 
 
 @pytest.mark.asyncio
-async def test_multi_agent_broadcast(mock_e2b_api_key_none, client, openai_client, weather_tool):
+async def test_multi_agent_broadcast(disable_e2b_api_key, client, openai_client, weather_tool):
     actor = UserManager().get_user_or_default(user_id="asf")
 
     stale_agents = AgentManager().list_agents(actor=actor, limit=300)
diff --git a/tests/integration_test_offline_memory_agent.py b/tests/integration_test_offline_memory_agent.py
index 5082251c..130fc1ef 100644
--- a/tests/integration_test_offline_memory_agent.py
+++ b/tests/integration_test_offline_memory_agent.py
@@ -28,7 +28,7 @@ def clear_agents(client):
         client.delete_agent(agent.id)
 
 
-def test_ripple_edit(client, mock_e2b_api_key_none):
+def test_ripple_edit(client, disable_e2b_api_key):
     trigger_rethink_memory_tool = client.create_or_update_tool(trigger_rethink_memory)
     send_message = client.server.tool_manager.get_tool_by_name(tool_name="send_message", actor=client.user)
 
@@ -120,7 +120,7 @@ def test_ripple_edit(client, mock_e2b_api_key_none):
     client.delete_agent(offline_memory_agent.id)
 
 
-def test_chat_only_agent(client, mock_e2b_api_key_none):
+def test_chat_only_agent(client, disable_e2b_api_key):
     from letta.offline_memory_agent import finish_rethinking_memory, rethink_memory
 
     send_message = client.server.tool_manager.get_tool_by_name(tool_name="send_message", actor=client.user)
@@ -202,7 +202,7 @@ def test_chat_only_agent(client, mock_e2b_api_key_none):
     client.delete_agent(offline_memory_agent.id)
 
 
-def test_initial_message_sequence(client, mock_e2b_api_key_none):
+def test_initial_message_sequence(client, disable_e2b_api_key):
     """
     Test that when we set the initial sequence to an empty list,
     we do not get the default initial message sequence.
diff --git a/tests/integration_test_send_message.py b/tests/integration_test_send_message.py
new file mode 100644
index 00000000..d5d69969
--- /dev/null
+++ b/tests/integration_test_send_message.py
@@ -0,0 +1,333 @@
+import os
+import threading
+import time
+from typing import Any, Dict, List
+
+import pytest
+from dotenv import load_dotenv
+from letta_client import AsyncLetta, Letta, Run, Tool
+from letta_client.types import AssistantMessage, LettaUsageStatistics, ReasoningMessage, ToolCallMessage, ToolReturnMessage
+
+from letta.schemas.agent import AgentState
+
+# ------------------------------
+# Fixtures
+# ------------------------------
+
+
+@pytest.fixture(scope="module")
+def server_url() -> str:
+    """
+    Provides the base URL of the Letta server used by the tests in this module.
+
+    If the environment variable 'LETTA_SERVER_URL' is unset or empty, this fixture
+    starts the Letta server in a background thread and returns the default URL.
+    """
+
+    def _run_server() -> None:
+        """Loads the .env file, then starts the Letta server."""
+        load_dotenv()  # Load environment variables from .env file
+        from letta.server.rest_api.app import start_server
+
+        start_server(debug=True)
+
+    # Read the variable once so the returned URL and the decision to start a
+    # local server cannot disagree (e.g. when the variable is set but empty).
+    url = os.getenv("LETTA_SERVER_URL")
+    if not url:
+        url = "http://localhost:8283"
+        threading.Thread(target=_run_server, daemon=True).start()
+        time.sleep(5)  # Allow time for the server to start
+
+    return url
+
+
+@pytest.fixture(scope="module")
+def client(server_url: str) -> Letta:
+    """
+    Yields a module-scoped synchronous Letta client that targets `server_url`.
+    """
+    sync_client = Letta(base_url=server_url)
+    yield sync_client
+
+
+@pytest.fixture(scope="module")
+def async_client(server_url: str) -> AsyncLetta:
+    """
+    Yields a module-scoped asynchronous Letta client that targets `server_url`.
+    """
+    aio_client = AsyncLetta(base_url=server_url)
+    yield aio_client
+
+
+@pytest.fixture(scope="module")
+def roll_dice_tool(client: Letta) -> Tool:
+    """
+    Registers the nested `roll_dice` function as a tool through `client` and yields it.
+
+    The tool performs no random roll: it always returns the string "Rolled a 10!".
+    """
+
+    def roll_dice() -> str:
+        """
+        Simulates rolling a die.
+
+        Returns:
+            str: The roll result.
+        """
+        # Note: The result here is intentionally incorrect for demonstration purposes.
+        return "Rolled a 10!"
+
+    tool = client.tools.upsert_from_function(func=roll_dice)
+    yield tool
+
+
+@pytest.fixture(scope="module")
+def agent_state(client: Letta, roll_dice_tool: Tool) -> AgentState:
+    """
+    Yields a 'supervisor' agent with base tools and the roll_dice tool; deletes it on teardown.
+    """
+    agent_state_instance = client.agents.create(
+        name="supervisor",
+        include_base_tools=True,
+        tool_ids=[roll_dice_tool.id],
+        model="openai/gpt-4o",
+        embedding="letta/letta-free",
+        tags=["supervisor"],
+    )
+    yield agent_state_instance
+    client.agents.delete(agent_state_instance.id)  # teardown: do not leave the agent behind on the server
+
+
+# ------------------------------
+# Helper Functions and Constants
+# ------------------------------
+
+USER_MESSAGE: List[Dict[str, str]] = [{"role": "user", "content": "Roll the dice."}]
+TESTED_MODELS: List[str] = ["openai/gpt-4o"]
+
+
+def assert_tool_response_messages(messages: List[Any]) -> None:
+    """
+    Asserts that the messages list starts with the expected sequence:
+    ReasoningMessage -> ToolCallMessage -> ToolReturnMessage ->
+    ReasoningMessage -> AssistantMessage.
+    """
+    expected = (ReasoningMessage, ToolCallMessage, ToolReturnMessage, ReasoningMessage, AssistantMessage)
+    # Check the length first so a truncated response fails as an assertion, not an IndexError.
+    assert len(messages) >= len(expected), f"Expected at least {len(expected)} messages, got {len(messages)}"
+    for idx, (message, expected_type) in enumerate(zip(messages, expected)):
+        assert isinstance(message, expected_type), f"messages[{idx}] is {type(message).__name__}"
+
+
+def assert_streaming_tool_response_messages(chunks: List[Any]) -> None:
+    """
+    Validates the chunk types found in a streamed response.
+
+    Requires at least one reasoning message, and exactly one each of tool call,
+    tool return, assistant message, and usage statistics.
+    """
+
+    def count_of(chunk_type: Any) -> int:
+        """Counts the chunks that are instances of `chunk_type`."""
+        return sum(1 for chunk in chunks if isinstance(chunk, chunk_type))
+
+    # Reasoning only has a lower bound.
+    assert count_of(ReasoningMessage) >= 1
+
+    # Every other chunk type must appear exactly once.
+    assert count_of(ToolCallMessage) == 1
+    assert count_of(ToolReturnMessage) == 1
+    assert count_of(AssistantMessage) == 1
+    assert count_of(LettaUsageStatistics) == 1
+
+
+def wait_for_run_completion(client: Letta, run_id: str, timeout: float = 30.0, interval: float = 0.5) -> Run:
+    """
+    Polls the run until its status is "completed", raising if it fails or times out.
+
+    Args:
+        client (Letta): The synchronous Letta client.
+        run_id (str): The identifier of the run to wait for.
+        timeout (float): Maximum time to wait (in seconds).
+        interval (float): Interval between status checks (in seconds).
+
+    Returns:
+        Run: The completed run object.
+
+    Raises:
+        RuntimeError: If the run fails.
+        TimeoutError: If the run does not complete within the specified timeout.
+    """
+    start = time.monotonic()  # monotonic clock: elapsed time is unaffected by wall-clock adjustments
+    while True:
+        run = client.runs.retrieve_run(run_id)
+        if run.status == "completed":
+            return run
+        if run.status == "failed":
+            raise RuntimeError(f"Run {run_id} did not complete: status = {run.status}")
+        if time.monotonic() - start > timeout:
+            raise TimeoutError(f"Run {run_id} did not complete within {timeout} seconds (last status: {run.status})")
+        time.sleep(interval)
+
+
+def assert_tool_response_dict_messages(messages: List[Dict[str, Any]]) -> None:
+    """
+    Asserts that a list of message dictionaries starts with the expected types and statuses.
+
+    Expected order:
+        1. reasoning_message
+        2. tool_call_message
+        3. tool_return_message (with status 'success')
+        4. reasoning_message
+        5. assistant_message
+    """
+    assert isinstance(messages, list)
+    expected_types = ("reasoning_message", "tool_call_message", "tool_return_message", "reasoning_message", "assistant_message")
+    # Check the length first so a truncated result fails as an assertion, not an IndexError.
+    assert len(messages) >= len(expected_types), f"Expected at least {len(expected_types)} messages, got {len(messages)}"
+    for idx, expected_type in enumerate(expected_types):
+        assert messages[idx]["message_type"] == expected_type, f"messages[{idx}] is {messages[idx]['message_type']}"
+
+    # Index 2 is the tool return; the tool call itself must have succeeded.
+    assert messages[2]["status"] == "success"
+
+
+# ------------------------------
+# Test Cases
+# ------------------------------
+
+
+@pytest.mark.parametrize("model", TESTED_MODELS)
+def test_send_message_sync_client(
+    disable_e2b_api_key: Any,
+    client: Letta,
+    agent_state: AgentState,
+    model: str,
+) -> None:
+    """
+    Sends a blocking message through the synchronous client and checks
+    that the returned messages follow the expected order.
+    """
+    agent_id = agent_state.id
+    client.agents.modify(agent_id=agent_id, model=model)
+    reply = client.agents.messages.create(agent_id=agent_id, messages=USER_MESSAGE)
+
+    # The blocking call exposes its messages on the response object.
+    assert_tool_response_messages(reply.messages)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", TESTED_MODELS)
+async def test_send_message_async_client(
+    disable_e2b_api_key: Any,
+    async_client: AsyncLetta,
+    agent_state: AgentState,
+    model: str,
+) -> None:
+    """
+    Sends a message through the asynchronous client and checks
+    that the returned messages follow the expected order.
+    """
+    agent_id = agent_state.id
+    await async_client.agents.modify(agent_id=agent_id, model=model)
+    reply = await async_client.agents.messages.create(agent_id=agent_id, messages=USER_MESSAGE)
+
+    # The awaited call exposes its messages on the response object.
+    assert_tool_response_messages(reply.messages)
+
+
+@pytest.mark.parametrize("model", TESTED_MODELS)
+def test_send_message_streaming_sync_client(
+    disable_e2b_api_key: Any,
+    client: Letta,
+    agent_state: AgentState,
+    model: str,
+) -> None:
+    """
+    Streams a message through the synchronous client and checks that
+    the collected chunks contain the expected message types.
+    """
+    agent_id = agent_state.id
+    client.agents.modify(agent_id=agent_id, model=model)
+    stream = client.agents.messages.create_stream(agent_id=agent_id, messages=USER_MESSAGE)
+
+    # Drain the stream completely before validating what it contained.
+    received = list(stream)
+    assert_streaming_tool_response_messages(received)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", TESTED_MODELS)
+async def test_send_message_streaming_async_client(
+    disable_e2b_api_key: Any,
+    async_client: AsyncLetta,
+    agent_state: AgentState,
+    model: str,
+) -> None:
+    """
+    Streams a message through the asynchronous client and checks that
+    the collected chunks contain the expected message types.
+    """
+    agent_id = agent_state.id
+    await async_client.agents.modify(agent_id=agent_id, model=model)
+    stream = async_client.agents.messages.create_stream(agent_id=agent_id, messages=USER_MESSAGE)
+
+    # Drain the stream completely before validating what it contained.
+    received = [item async for item in stream]
+    assert_streaming_tool_response_messages(received)
+
+
+@pytest.mark.parametrize("model", TESTED_MODELS)
+def test_send_message_job_sync_client(
+    disable_e2b_api_key: Any,
+    client: Letta,
+    agent_state: AgentState,
+    model: str,
+) -> None:
+    """
+    Submits a message as a background run with the synchronous client,
+    waits for the run to complete, and checks the messages in its result.
+    """
+    agent_id = agent_state.id
+    client.agents.modify(agent_id=agent_id, model=model)
+
+    # Submit the message as a background run, then poll until it finishes.
+    submitted = client.agents.messages.create_async(agent_id=agent_id, messages=USER_MESSAGE)
+    finished = wait_for_run_completion(client, submitted.id)
+
+    # The response payload is read from the run's "result" metadata entry.
+    result = finished.metadata.get("result")
+    assert result is not None, "Run metadata missing 'result' key"
+
+    # The messages in the result are validated as dictionaries.
+    assert_tool_response_dict_messages(result["messages"])
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", TESTED_MODELS)
+async def test_send_message_job_async_client(
+    disable_e2b_api_key: Any,
+    client: Letta,
+    async_client: AsyncLetta,
+    agent_state: AgentState,
+    model: str,
+) -> None:
+    """
+    Submits a message as a background run with the asynchronous client,
+    waits for the run to complete, and checks the messages in its result.
+    """
+    agent_id = agent_state.id
+    await async_client.agents.modify(agent_id=agent_id, model=model)
+
+    # Submit the message as a background run through the asynchronous client.
+    submitted = await async_client.agents.messages.create_async(agent_id=agent_id, messages=USER_MESSAGE)
+
+    # Use the synchronous client to check job completion
+    finished = wait_for_run_completion(client, submitted.id)
+
+    # The response payload is read from the run's "result" metadata entry.
+    result = finished.metadata.get("result")
+    assert result is not None, "Run metadata missing 'result' key"
+
+    assert_tool_response_dict_messages(result["messages"])
diff --git a/tests/integration_test_summarizer.py b/tests/integration_test_summarizer.py
index 87c63245..b47ce3fa 100644
--- a/tests/integration_test_summarizer.py
+++ b/tests/integration_test_summarizer.py
@@ -115,7 +115,7 @@ def test_cutoff_calculation(mocker):
     assert messages[cutoff - 1].role == MessageRole.user
 
 
-def test_summarize_many_messages_basic(client, mock_e2b_api_key_none):
+def test_summarize_many_messages_basic(client, disable_e2b_api_key):
     small_context_llm_config = LLMConfig.default_config("gpt-4o-mini")
     small_context_llm_config.context_window = 3000
     small_agent_state = client.create_agent(
@@ -130,7 +130,7 @@ def test_summarize_many_messages_basic(client, mock_e2b_api_key_none):
     client.delete_agent(small_agent_state.id)
 
 
-def test_summarize_large_message_does_not_loop_infinitely(client, mock_e2b_api_key_none):
+def test_summarize_large_message_does_not_loop_infinitely(client, disable_e2b_api_key):
     small_context_llm_config = LLMConfig.default_config("gpt-4o-mini")
     small_context_llm_config.context_window = 2000
     small_agent_state = client.create_agent(
@@ -145,7 +145,7 @@ def test_summarize_large_message_does_not_loop_infinitely(client, mock_e2b_api_k
     client.delete_agent(small_agent_state.id)
 
 
-def test_summarize_messages_inplace(client, agent_state, mock_e2b_api_key_none):
+def test_summarize_messages_inplace(client, agent_state, disable_e2b_api_key):
     """Test summarization via sending the summarize CLI command or via a direct call to the agent object"""
     # First send a few messages (5)
     response = client.user_message(
@@ -179,7 +179,7 @@ def test_summarize_messages_inplace(client, agent_state, mock_e2b_api_key_none):
     agent_obj.summarize_messages_inplace()
 
 
-def test_auto_summarize(client, mock_e2b_api_key_none):
+def test_auto_summarize(client, disable_e2b_api_key):
     """Test that the summarizer triggers by itself"""
     small_context_llm_config = LLMConfig.default_config("gpt-4o-mini")
     small_context_llm_config.context_window = 4000
diff --git a/tests/integration_test_tool_execution_sandbox.py b/tests/integration_test_tool_execution_sandbox.py
index 0554e3d4..85aaab97 100644
--- a/tests/integration_test_tool_execution_sandbox.py
+++ b/tests/integration_test_tool_execution_sandbox.py
@@ -251,7 +251,7 @@ def core_memory_tools(test_user):
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_default(mock_e2b_api_key_none, add_integers_tool, test_user):
+def test_local_sandbox_default(disable_e2b_api_key, add_integers_tool, test_user):
     args = {"x": 10, "y": 5}
 
     # Mock and assert correct pathway was invoked
@@ -267,7 +267,7 @@ def test_local_sandbox_default(mock_e2b_api_key_none, add_integers_tool, test_us
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_stateful_tool(mock_e2b_api_key_none, clear_core_memory_tool, test_user, agent_state):
+def test_local_sandbox_stateful_tool(disable_e2b_api_key, clear_core_memory_tool, test_user, agent_state):
     args = {}
     # Run again to get actual response
     sandbox = ToolExecutionSandbox(clear_core_memory_tool.name, args, user=test_user)
@@ -278,14 +278,14 @@ def test_local_sandbox_stateful_tool(mock_e2b_api_key_none, clear_core_memory_to
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_with_list_rv(mock_e2b_api_key_none, list_tool, test_user):
+def test_local_sandbox_with_list_rv(disable_e2b_api_key, list_tool, test_user):
     sandbox = ToolExecutionSandbox(list_tool.name, {}, user=test_user)
     result = sandbox.run()
     assert len(result.func_return) == 5
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_env(mock_e2b_api_key_none, get_env_tool, test_user):
+def test_local_sandbox_env(disable_e2b_api_key, get_env_tool, test_user):
     manager = SandboxConfigManager()
 
     # Make a custom local sandbox config
@@ -311,7 +311,7 @@ def test_local_sandbox_env(mock_e2b_api_key_none, get_env_tool, test_user):
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_per_agent_env(mock_e2b_api_key_none, get_env_tool, agent_state, test_user):
+def test_local_sandbox_per_agent_env(disable_e2b_api_key, get_env_tool, agent_state, test_user):
     manager = SandboxConfigManager()
     key = "secret_word"
 
@@ -346,7 +346,7 @@ def test_local_sandbox_per_agent_env(mock_e2b_api_key_none, get_env_tool, agent_
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_external_codebase_with_venv(mock_e2b_api_key_none, custom_test_sandbox_config, external_codebase_tool, test_user):
+def test_local_sandbox_external_codebase_with_venv(disable_e2b_api_key, custom_test_sandbox_config, external_codebase_tool, test_user):
     # Set the args
     args = {"percentage": 10}
 
@@ -360,16 +360,14 @@ def test_local_sandbox_external_codebase_with_venv(mock_e2b_api_key_none, custom
 
 
 @pytest.mark.local_sandbox
-def test_local_sandbox_with_venv_and_warnings_does_not_error(
-    mock_e2b_api_key_none, custom_test_sandbox_config, get_warning_tool, test_user
-):
+def test_local_sandbox_with_venv_and_warnings_does_not_error(disable_e2b_api_key, custom_test_sandbox_config, get_warning_tool, test_user):
     sandbox = ToolExecutionSandbox(get_warning_tool.name, {}, user=test_user)
     result = sandbox.run()
     assert result.func_return == "Hello World"
 
 
 @pytest.mark.e2b_sandbox
-def test_local_sandbox_with_venv_errors(mock_e2b_api_key_none, custom_test_sandbox_config, always_err_tool, test_user):
+def test_local_sandbox_with_venv_errors(disable_e2b_api_key, custom_test_sandbox_config, always_err_tool, test_user):
     sandbox = ToolExecutionSandbox(always_err_tool.name, {}, user=test_user)
 
     # run the sandbox
@@ -381,7 +379,7 @@ def test_local_sandbox_with_venv_errors(mock_e2b_api_key_none, custom_test_sandb
 
 
 @pytest.mark.e2b_sandbox
-def test_local_sandbox_with_venv_pip_installs_basic(mock_e2b_api_key_none, cowsay_tool, test_user):
+def test_local_sandbox_with_venv_pip_installs_basic(disable_e2b_api_key, cowsay_tool, test_user):
     manager = SandboxConfigManager()
     config_create = SandboxConfigCreate(
         config=LocalSandboxConfig(use_venv=True, pip_requirements=[PipRequirement(name="cowsay")]).model_dump()
@@ -401,7 +399,7 @@ def test_local_sandbox_with_venv_pip_installs_basic(mock_e2b_api_key_none, cowsa
 
 
 @pytest.mark.e2b_sandbox
-def test_local_sandbox_with_venv_pip_installs_with_update(mock_e2b_api_key_none, cowsay_tool, test_user):
+def test_local_sandbox_with_venv_pip_installs_with_update(disable_e2b_api_key, cowsay_tool, test_user):
     manager = SandboxConfigManager()
     config_create = SandboxConfigCreate(config=LocalSandboxConfig(use_venv=True).model_dump())
     config = manager.create_or_update_sandbox_config(config_create, test_user)
@@ -602,7 +600,7 @@ class TestCoreMemoryTools:
 
     # Local sandbox tests
     @pytest.mark.local_sandbox
-    def test_core_memory_replace_local(self, mock_e2b_api_key_none, core_memory_tools, test_user, agent_state):
+    def test_core_memory_replace_local(self, disable_e2b_api_key, core_memory_tools, test_user, agent_state):
         """Test successful replacement of content in core memory - local sandbox."""
         new_name = "Charles"
         args = {"label": "human", "old_content": "Chad", "new_content": new_name}
@@ -613,7 +611,7 @@ class TestCoreMemoryTools:
         assert result.func_return is None
 
     @pytest.mark.local_sandbox
-    def test_core_memory_append_local(self, mock_e2b_api_key_none, core_memory_tools, test_user, agent_state):
+    def test_core_memory_append_local(self, disable_e2b_api_key, core_memory_tools, test_user, agent_state):
         """Test successful appending of content to core memory - local sandbox."""
         append_text = "\nLikes coffee"
         args = {"label": "human", "content": append_text}
@@ -624,7 +622,7 @@ class TestCoreMemoryTools:
         assert result.func_return is None
 
     @pytest.mark.local_sandbox
-    def test_core_memory_replace_error_local(self, mock_e2b_api_key_none, core_memory_tools, test_user, agent_state):
+    def test_core_memory_replace_error_local(self, disable_e2b_api_key, core_memory_tools, test_user, agent_state):
         """Test error handling when trying to replace non-existent content - local sandbox."""
         nonexistent_name = "Alexander Wang"
         args = {"label": "human", "old_content": nonexistent_name, "new_content": "Charles"}
diff --git a/tests/test_agent_serialization.py b/tests/test_agent_serialization.py
index d73bffb1..2c8b1ddc 100644
--- a/tests/test_agent_serialization.py
+++ b/tests/test_agent_serialization.py
@@ -476,7 +476,7 @@ def test_agent_serialize_with_user_messages(local_client, server, serialize_test
     )
 
 
-def test_agent_serialize_tool_calls(mock_e2b_api_key_none, local_client, server, serialize_test_agent, default_user, other_user):
+def test_agent_serialize_tool_calls(disable_e2b_api_key, local_client, server, serialize_test_agent, default_user, other_user):
     """Test deserializing JSON into an Agent instance."""
     append_copy_suffix = False
     server.send_messages(
@@ -512,7 +512,7 @@ def test_agent_serialize_tool_calls(mock_e2b_api_key_none, local_client, server,
     assert copy_agent_response.completion_tokens > 0 and copy_agent_response.step_count > 0
 
 
-def test_agent_serialize_update_blocks(mock_e2b_api_key_none, local_client, server, serialize_test_agent, default_user, other_user):
+def test_agent_serialize_update_blocks(disable_e2b_api_key, local_client, server, serialize_test_agent, default_user, other_user):
     """Test deserializing JSON into an Agent instance."""
     append_copy_suffix = False
     server.send_messages(
diff --git a/tests/test_client.py b/tests/test_client.py
index f0da5930..10121100 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -267,7 +267,7 @@ def test_agent_tags(client: Letta):
 # --------------------------------------------------------------------------------------------------------------------
 # Agent memory blocks
 # --------------------------------------------------------------------------------------------------------------------
-def test_shared_blocks(mock_e2b_api_key_none, client: Letta):
+def test_shared_blocks(disable_e2b_api_key, client: Letta):
     # create a block
     block = client.blocks.create(label="human", value="username: sarah")
 
diff --git a/tests/test_client_legacy.py b/tests/test_client_legacy.py
index 3a3a1b3e..ba164542 100644
--- a/tests/test_client_legacy.py
+++ b/tests/test_client_legacy.py
@@ -124,7 +124,7 @@ def default_user(default_organization):
     yield user
 
 
-def test_agent(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], agent: AgentState):
+def test_agent(disable_e2b_api_key, client: Union[LocalClient, RESTClient], agent: AgentState):
 
     # test client.rename_agent
     new_name = "RenamedTestAgent"
@@ -143,7 +143,7 @@ def test_agent(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], ag
     assert client.agent_exists(agent_id=delete_agent.id) == False, "Agent deletion failed"
 
 
-def test_memory(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], agent: AgentState):
+def test_memory(disable_e2b_api_key, client: Union[LocalClient, RESTClient], agent: AgentState):
     # _reset_config()
 
     memory_response = client.get_in_context_memory(agent_id=agent.id)
@@ -159,7 +159,7 @@ def test_memory(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], a
     ), "Memory update failed"
 
 
-def test_agent_interactions(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], agent: AgentState):
+def test_agent_interactions(disable_e2b_api_key, client: Union[LocalClient, RESTClient], agent: AgentState):
     # test that it is a LettaMessage
     message = "Hello again, agent!"
     print("Sending message", message)
@@ -182,7 +182,7 @@ def test_agent_interactions(mock_e2b_api_key_none, client: Union[LocalClient, RE
     # TODO: add streaming tests
 
 
-def test_archival_memory(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], agent: AgentState):
+def test_archival_memory(disable_e2b_api_key, client: Union[LocalClient, RESTClient], agent: AgentState):
     # _reset_config()
 
     memory_content = "Archival memory content"
@@ -216,7 +216,7 @@ def test_archival_memory(mock_e2b_api_key_none, client: Union[LocalClient, RESTC
     client.get_archival_memory(agent.id)
 
 
-def test_core_memory(mock_e2b_api_key_none, client: Union[LocalClient, RESTClient], agent: AgentState):
+def test_core_memory(disable_e2b_api_key, client: Union[LocalClient, RESTClient], agent: AgentState):
     response = client.send_message(agent_id=agent.id, message="Update your core memory to remember that my name is Timber!", role="user")
     print("Response", response)
 
@@ -234,7 +234,7 @@ def test_core_memory(mock_e2b_api_key_none, client: Union[LocalClient, RESTClien
     ],
 )
 def test_streaming_send_message(
-    mock_e2b_api_key_none,
+    disable_e2b_api_key,
     client: RESTClient,
     agent: AgentState,
     stream_tokens: bool,
diff --git a/tests/test_model_letta_performance.py b/tests/test_model_letta_performance.py
index ea9c30ea..41f2da64 100644
--- a/tests/test_model_letta_performance.py
+++ b/tests/test_model_letta_performance.py
@@ -32,7 +32,7 @@ def test_openai_gpt_4o_returns_valid_first_message():
 
 @pytest.mark.openai_basic
 @retry_until_success(max_attempts=5, sleep_time_seconds=2)
-def test_openai_gpt_4o_uses_external_tool(mock_e2b_api_key_none):
+def test_openai_gpt_4o_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "openai-gpt-4o.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
@@ -96,7 +96,7 @@ def test_azure_gpt_4o_mini_returns_valid_first_message():
 
 @pytest.mark.azure_basic
 @retry_until_success(max_attempts=5, sleep_time_seconds=2)
-def test_azure_gpt_4o_mini_uses_external_tool(mock_e2b_api_key_none):
+def test_azure_gpt_4o_mini_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
@@ -182,7 +182,7 @@ def test_claude_haiku_3_5_returns_valid_first_message():
 
 @pytest.mark.anthropic_basic
 @retry_until_success(max_attempts=5, sleep_time_seconds=2)
-def test_claude_haiku_3_5_uses_external_tool(mock_e2b_api_key_none):
+def test_claude_haiku_3_5_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "claude-3-5-haiku.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
@@ -226,7 +226,7 @@ def test_groq_llama31_70b_returns_valid_first_message():
     print(f"Got successful response from client: \n\n{response}")
 
 
-def test_groq_llama31_70b_uses_external_tool(mock_e2b_api_key_none):
+def test_groq_llama31_70b_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "groq.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
@@ -269,7 +269,7 @@ def test_gemini_pro_15_returns_valid_first_message():
 
 @pytest.mark.gemini_basic
 @retry_until_success(max_attempts=5, sleep_time_seconds=2)
-def test_gemini_pro_15_uses_external_tool(mock_e2b_api_key_none):
+def test_gemini_pro_15_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "gemini-pro.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
@@ -349,7 +349,7 @@ def test_together_llama_3_70b_returns_valid_first_message():
     print(f"Got successful response from client: \n\n{response}")
 
 
-def test_together_llama_3_70b_uses_external_tool(mock_e2b_api_key_none):
+def test_together_llama_3_70b_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
@@ -405,7 +405,7 @@ def test_bedrock_claude_sonnet_3_5_returns_valid_first_message():
 
 @pytest.mark.anthropic_bedrock_basic
 @retry_until_success(max_attempts=5, sleep_time_seconds=2)
-def test_bedrock_claude_sonnet_3_5_uses_external_tool(mock_e2b_api_key_none):
+def test_bedrock_claude_sonnet_3_5_uses_external_tool(disable_e2b_api_key):
     filename = os.path.join(llm_config_dir, "bedrock-claude-3-5-sonnet.json")
     response = check_agent_uses_external_tool(filename)
     # Log out successful response
diff --git a/tests/test_server.py b/tests/test_server.py
index c6db4da6..ec79fed5 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -801,7 +801,7 @@ def ingest(message: str):
 import pytest
 
 
-def test_tool_run_basic(server, mock_e2b_api_key_none, user):
+def test_tool_run_basic(server, disable_e2b_api_key, user):
     """Test running a simple tool from source"""
     result = server.run_tool_from_source(
         actor=user,
@@ -815,7 +815,7 @@ def test_tool_run_basic(server, mock_e2b_api_key_none, user):
     assert not result.stderr
 
 
-def test_tool_run_with_env_var(server, mock_e2b_api_key_none, user):
+def test_tool_run_with_env_var(server, disable_e2b_api_key, user):
     """Test running a tool that uses an environment variable"""
     result = server.run_tool_from_source(
         actor=user,
@@ -830,7 +830,7 @@ def test_tool_run_with_env_var(server, mock_e2b_api_key_none, user):
     assert not result.stderr
 
 
-def test_tool_run_invalid_args(server, mock_e2b_api_key_none, user):
+def test_tool_run_invalid_args(server, disable_e2b_api_key, user):
     """Test running a tool with incorrect arguments"""
     result = server.run_tool_from_source(
         actor=user,
@@ -846,7 +846,7 @@ def test_tool_run_invalid_args(server, mock_e2b_api_key_none, user):
     assert "missing 1 required positional argument" in result.stderr[0]
 
 
-def test_tool_run_with_distractor(server, mock_e2b_api_key_none, user):
+def test_tool_run_with_distractor(server, disable_e2b_api_key, user):
     """Test running a tool with a distractor function in the source"""
     result = server.run_tool_from_source(
         actor=user,
@@ -861,7 +861,7 @@ def test_tool_run_with_distractor(server, mock_e2b_api_key_none, user):
     assert not result.stderr
 
 
-def test_tool_run_explicit_tool_name(server, mock_e2b_api_key_none, user):
+def test_tool_run_explicit_tool_name(server, disable_e2b_api_key, user):
     """Test selecting a tool by name when multiple tools exist in the source"""
     result = server.run_tool_from_source(
         actor=user,
@@ -877,7 +877,7 @@ def test_tool_run_explicit_tool_name(server, mock_e2b_api_key_none, user):
     assert not result.stderr
 
 
-def test_tool_run_util_function(server, mock_e2b_api_key_none, user):
+def test_tool_run_util_function(server, disable_e2b_api_key, user):
     """Test selecting a utility function that does not return anything meaningful"""
     result = server.run_tool_from_source(
         actor=user,
@@ -893,7 +893,7 @@ def test_tool_run_util_function(server, mock_e2b_api_key_none, user):
     assert not result.stderr
 
 
-def test_tool_run_with_explicit_json_schema(server, mock_e2b_api_key_none, user):
+def test_tool_run_with_explicit_json_schema(server, disable_e2b_api_key, user):
     """Test overriding the autogenerated JSON schema with an explicit one"""
     explicit_json_schema = {
         "name": "ingest",
@@ -936,7 +936,7 @@ def test_composio_client_simple(server):
     assert len(actions) > 0
 
 
-def test_memory_rebuild_count(server, user, mock_e2b_api_key_none, base_tools, base_memory_tools):
+def test_memory_rebuild_count(server, user, disable_e2b_api_key, base_tools, base_memory_tools):
     """Test that the memory rebuild is generating the correct number of role=system messages"""
     actor = user
     # create agent
diff --git a/tests/test_streaming.py b/tests/test_streaming.py
index 55300ab5..18432250 100644
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@@ -59,7 +59,7 @@ def agent(client: Letta):
     ],
 )
 def test_streaming_send_message(
-    mock_e2b_api_key_none,
+    disable_e2b_api_key,
     client: Letta,
     agent: AgentState,
     stream_tokens: bool,