From 860cf1949ea59d995c56c4800a96c7164810fb7b Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Mon, 2 Dec 2024 11:08:44 -0800
Subject: [PATCH] feat: Improve error messages from tool sandbox (#2135)

---
 .github/workflows/integration_tests.yml       |  5 ++++-
 .github/workflows/tests.yml                   |  7 ++----
 letta/services/tool_execution_sandbox.py      | 13 +++++++++--
 ...ntegration_test_tool_execution_sandbox.py} | 22 +++++++++++++++++++
 4 files changed, 39 insertions(+), 8 deletions(-)
 rename tests/{test_tool_execution_sandbox.py => integration_test_tool_execution_sandbox.py} (93%)

diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
index e3c673a3..350c4c4d 100644
--- a/.github/workflows/integration_tests.yml
+++ b/.github/workflows/integration_tests.yml
@@ -8,6 +8,8 @@ env:
   GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
   AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
   AZURE_BASE_URL: ${{ secrets.AZURE_BASE_URL }}
+  E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
+  E2B_SANDBOX_TEMPLATE_ID: ${{ secrets.E2B_SANDBOX_TEMPLATE_ID }}
 
 on:
   push:
@@ -24,6 +26,7 @@ jobs:
       matrix:
         integration_test_suite:
           - "integration_test_summarizer.py"
+          - "integration_test_tool_execution_sandbox.py"
     services:
       qdrant:
         image: qdrant/qdrant
@@ -52,7 +55,7 @@ jobs:
         with:
           python-version: "3.12"
           poetry-version: "1.8.2"
-          install-args: "-E dev -E postgres -E milvus -E external-tools -E tests"
+          install-args: "-E dev -E postgres -E milvus -E external-tools -E tests -E cloud-tool-sandbox"
       - name: Migrate database
         env:
           LETTA_PG_PORT: 5432
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ffaa2704..d81681e4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,8 +6,6 @@ env:
   ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
   GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
   GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
-  E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
-  E2B_SANDBOX_TEMPLATE_ID: ${{ secrets.E2B_SANDBOX_TEMPLATE_ID }}
 
 on:
   push:
@@ -30,7 +28,6 @@ jobs:
           - "test_o1_agent.py"
           - "test_tool_rule_solver.py"
           - "test_agent_tool_graph.py"
-          - "test_tool_execution_sandbox.py"
           - "test_utils.py"
           - "test_tool_schema_parsing.py"
     services:
@@ -61,7 +58,7 @@ jobs:
         with:
           python-version: "3.12"
           poetry-version: "1.8.2"
-          install-args: "-E dev -E postgres -E milvus -E external-tools -E tests -E cloud-tool-sandbox"
+          install-args: "-E dev -E postgres -E milvus -E external-tools -E tests"
       - name: Migrate database
         env:
           LETTA_PG_PORT: 5432
@@ -135,4 +132,4 @@ jobs:
           LETTA_SERVER_PASS: test_server_token
           PYTHONPATH: ${{ github.workspace }}:${{ env.PYTHONPATH }}
         run: |
-          poetry run pytest -s -vv -k "not test_model_letta_perfomance.py and not test_utils.py and not test_client.py and not test_tool_execution_sandbox.py and not integration_test_summarizer.py and not test_agent_tool_graph.py and not test_tool_rule_solver.py and not test_local_client.py and not test_o1_agent.py and not test_cli.py and not test_concurrent_connections.py and not test_quickstart and not test_model_letta_performance and not test_storage and not test_server and not test_openai_client and not test_providers and not test_client_legacy.py" tests
+          poetry run pytest -s -vv -k "not test_model_letta_perfomance.py and not test_utils.py and not test_client.py and not integration_test_tool_execution_sandbox.py and not integration_test_summarizer.py and not test_agent_tool_graph.py and not test_tool_rule_solver.py and not test_local_client.py and not test_o1_agent.py and not test_cli.py and not test_concurrent_connections.py and not test_quickstart and not test_model_letta_performance and not test_storage and not test_server and not test_openai_client and not test_providers and not test_client_legacy.py" tests
diff --git a/letta/services/tool_execution_sandbox.py b/letta/services/tool_execution_sandbox.py
index 97a4903c..c1c48979 100644
--- a/letta/services/tool_execution_sandbox.py
+++ b/letta/services/tool_execution_sandbox.py
@@ -132,7 +132,8 @@ class ToolExecutionSandbox:
                 sandbox_config_fingerprint=sbx_config.fingerprint(),
             )
         except Exception as e:
-            raise RuntimeError(f"Executing tool {self.tool_name} has an unexpected error: {e}")
+            logger.error(f"Executing tool {self.tool_name} has an unexpected error: {e}")
+            raise e
         finally:
             # Clean up the temp file and restore stdout
             sys.stdout = old_stdout
@@ -154,7 +155,9 @@ class ToolExecutionSandbox:
         env_vars = self.sandbox_config_manager.get_sandbox_env_vars_as_dict(sandbox_config_id=sbx_config.id, actor=self.user, limit=100)
         execution = sbx.run_code(code, envs=env_vars)
         if execution.error is not None:
-            raise Exception(f"Executing tool {self.tool_name} failed with {execution.error}")
+            logger.error(f"Executing tool {self.tool_name} failed with {execution.error}")
+            # Raise a concise exception as this gets returned to the LLM
+            raise self.parse_exception_from_e2b_execution(execution)
         elif len(execution.results) == 0:
             return None
         else:
@@ -166,6 +169,12 @@ class ToolExecutionSandbox:
                 sandbox_config_fingerprint=sbx_config.fingerprint(),
             )
 
+    def parse_exception_from_e2b_execution(self, e2b_execution: "Execution") -> Exception:
+        """Rebuild an e2b error as a builtin exception; non-Exception names (e.g. SystemExit) fall back to Exception."""
+        candidate = (__builtins__ if isinstance(__builtins__, dict) else vars(__builtins__)).get(e2b_execution.error.name)
+        exception_class = candidate if isinstance(candidate, type) and issubclass(candidate, Exception) else Exception
+        return exception_class(e2b_execution.error.value)
+
     def get_running_e2b_sandbox_with_same_state(self, sandbox_config: SandboxConfig) -> Optional["Sandbox"]:
         from e2b_code_interpreter import Sandbox
 
diff --git a/tests/test_tool_execution_sandbox.py b/tests/integration_test_tool_execution_sandbox.py
similarity index 93%
rename from tests/test_tool_execution_sandbox.py
rename to tests/integration_test_tool_execution_sandbox.py
index 977a674e..1d5f556e 100644
--- a/tests/test_tool_execution_sandbox.py
+++ b/tests/integration_test_tool_execution_sandbox.py
@@ -266,6 +266,17 @@ def test_local_sandbox_core_memory_replace(mock_e2b_api_key_none, core_memory_re
     assert result.func_return is None
 
 
+@pytest.mark.local_sandbox
+def test_local_sandbox_core_memory_replace_errors(mock_e2b_api_key_none, core_memory_replace_tool, test_user, agent_state):
+    nonexistent_name = "Alexander Wang"
+    args = {"label": "human", "old_content": nonexistent_name, "new_content": "Matt"}
+    sandbox = ToolExecutionSandbox(core_memory_replace_tool.name, args, user_id=test_user.id)
+
+    # Replacing content that is absent from the block must surface as a ValueError with the "not found" message
+    with pytest.raises(ValueError, match=f"Old content '{nonexistent_name}' not found in memory block 'human'"):
+        sandbox.run(agent_state=agent_state)
+
+
 @pytest.mark.local_sandbox
 def test_local_sandbox_with_list_rv(mock_e2b_api_key_none, list_tool, test_user):
     sandbox = ToolExecutionSandbox(list_tool.name, {}, user_id=test_user.id)
@@ -390,6 +401,17 @@ def test_e2b_sandbox_core_memory_replace(check_e2b_key_is_set, core_memory_repla
     assert result.func_return is None
 
 
+@pytest.mark.e2b_sandbox
+def test_e2b_sandbox_core_memory_replace_errors(check_e2b_key_is_set, core_memory_replace_tool, test_user, agent_state):
+    nonexistent_name = "Alexander Wang"
+    args = {"label": "human", "old_content": nonexistent_name, "new_content": "Matt"}
+    sandbox = ToolExecutionSandbox(core_memory_replace_tool.name, args, user_id=test_user.id)
+
+    # run the sandbox; the failure is expected to surface as a ValueError carrying the "not found" message
+    with pytest.raises(ValueError, match=f"Old content '{nonexistent_name}' not found in memory block 'human'"):
+        sandbox.run(agent_state=agent_state)
+
+
 @pytest.mark.e2b_sandbox
 def test_e2b_sandbox_inject_env_var_existing_sandbox(check_e2b_key_is_set, get_env_tool, test_user):
     manager = SandboxConfigManager(tool_settings)