From 9a49f2991c8d853a6b18ff44c65e2d9e3f39a441 Mon Sep 17 00:00:00 2001
From: cthomas <caren@letta.com>
Date: Thu, 12 Dec 2024 14:36:21 -0800
Subject: [PATCH] feat: route all sandbox errors to stderr (#2222)

Co-authored-by: Caren Thomas <caren@caren-mac.local>
---
 letta/server/server.py                        | 40 +++++++--
 letta/services/tool_execution_sandbox.py      | 85 ++++++++-----------
 ...integration_test_tool_execution_sandbox.py | 20 +++--
 tests/test_server.py                          | 28 +++---
 4 files changed, 97 insertions(+), 76 deletions(-)

diff --git a/letta/server/server.py b/letta/server/server.py
index 2765b269..39536c66 100644
--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -1957,6 +1957,21 @@ class SyncServer(Server):
             if sandbox_run_result is None:
                 raise ValueError(f"Tool with id {tool.id} returned execution with None")
             function_response = str(sandbox_run_result.func_return)
+            stdout = [s for s in sandbox_run_result.stdout if s.strip()]
+            stderr = [s for s in sandbox_run_result.stderr if s.strip()]
+
+            # expected error
+            if stderr:
+                error_msg = self.get_error_msg_for_func_return(tool.name, stderr[-1])
+                return FunctionReturn(
+                    id="null",
+                    function_call_id="null",
+                    date=get_utc_time(),
+                    status="error",
+                    function_return=error_msg,
+                    stdout=stdout,
+                    stderr=stderr,
+                )
 
             return FunctionReturn(
                 id="null",
@@ -1964,17 +1979,13 @@ class SyncServer(Server):
                 date=get_utc_time(),
                 status="success",
                 function_return=function_response,
-                stdout=sandbox_run_result.stdout,
-                stderr=sandbox_run_result.stderr,
+                stdout=stdout,
+                stderr=stderr,
             )
+
+        # unexpected error TODO(@cthomas): consolidate error handling
         except Exception as e:
-            # same as agent.py
-            from letta.constants import MAX_ERROR_MESSAGE_CHAR_LIMIT
-
-            error_msg = f"Error executing tool {tool.name}: {e}"
-            if len(error_msg) > MAX_ERROR_MESSAGE_CHAR_LIMIT:
-                error_msg = error_msg[:MAX_ERROR_MESSAGE_CHAR_LIMIT]
-
+            error_msg = self.get_error_msg_for_func_return(tool.name, e)
             return FunctionReturn(
                 id="null",
                 function_call_id="null",
@@ -1985,6 +1996,17 @@ class SyncServer(Server):
                 stderr=[traceback.format_exc()],
             )
 
+
+    def get_error_msg_for_func_return(self, tool_name, exception_message):
+        # same as agent.py
+        from letta.constants import MAX_ERROR_MESSAGE_CHAR_LIMIT
+
+        error_msg = f"Error executing tool {tool_name}: {exception_message}"
+        if len(error_msg) > MAX_ERROR_MESSAGE_CHAR_LIMIT:
+            error_msg = error_msg[:MAX_ERROR_MESSAGE_CHAR_LIMIT]
+        return error_msg
+
+
     # Composio wrappers
     def get_composio_client(self, api_key: Optional[str] = None):
         if api_key:
diff --git a/letta/services/tool_execution_sandbox.py b/letta/services/tool_execution_sandbox.py
index e1698dfe..9adcff0b 100644
--- a/letta/services/tool_execution_sandbox.py
+++ b/letta/services/tool_execution_sandbox.py
@@ -7,6 +7,7 @@ import runpy
 import subprocess
 import sys
 import tempfile
+import traceback
 import uuid
 import venv
 from typing import Any, Dict, Optional, TextIO
@@ -174,41 +175,16 @@ class ToolExecutionSandbox:
                 capture_output=True,
                 text=True,
             )
-
-            # Handle error with optimistic error parsing from the string
-            # This is very brittle, so we fall back to a RuntimeError if parsing fails
-            if result.returncode != 0:
-                # Log the error
-                logger.error(f"Sandbox execution error:\n{result.stderr}")
-
-                # Parse and raise the actual error from stderr
-                tb_lines = result.stderr.strip().splitlines()
-                exception_line = tb_lines[-1]  # The last line contains the exception
-
-                try:
-                    # Split exception type and message
-                    exception_type, exception_message = exception_line.split(": ", 1)
-                    exception_type = exception_type.strip()
-                    exception_message = exception_message.strip()
-
-                    # Dynamically raise the exception
-                    exception_class = eval(exception_type)  # Look up the exception type
-
-                except Exception:
-                    # Fallback to RuntimeError if parsing fails
-                    raise RuntimeError(result.stderr)
-
-                raise exception_class(exception_message)
-
             func_result, stdout = self.parse_out_function_results_markers(result.stdout)
             func_return, agent_state = self.parse_best_effort(func_result)
             return SandboxRunResult(
-                func_return=func_return, 
+                func_return=func_return,
                 agent_state=agent_state,
                 stdout=[stdout],
                 stderr=[result.stderr],
                 sandbox_config_fingerprint=sbx_config.fingerprint(),
             )
+
         except subprocess.TimeoutExpired:
             raise TimeoutError(f"Executing tool {self.tool_name} has timed out.")
         except subprocess.CalledProcessError as e:
@@ -217,39 +193,49 @@ class ToolExecutionSandbox:
         except Exception as e:
             logger.error(f"Executing tool {self.tool_name} has an unexpected error: {e}")
             raise e
+        
 
     def run_local_dir_sandbox_runpy(
         self, sbx_config: SandboxConfig, env_vars: Dict[str, str], temp_file_path: str, old_stdout: TextIO, old_stderr: TextIO
     ) -> SandboxRunResult:
+        func_return, agent_state, error_msg = None, None, None
+
         # Redirect stdout and stderr to capture script output
-        captured_stdout = io.StringIO()
-        captured_stderr = io.StringIO()
+        captured_stdout, captured_stderr = io.StringIO(), io.StringIO()
         sys.stdout = captured_stdout
         sys.stderr = captured_stderr
 
-        # Execute the temp file
-        with self.temporary_env_vars(env_vars):
-            result = runpy.run_path(temp_file_path, init_globals=env_vars)
+        try:
+            # Execute the temp file
+            with self.temporary_env_vars(env_vars):
+                result = runpy.run_path(temp_file_path, init_globals=env_vars)
 
-        # Fetch the result
-        func_result = result.get(self.LOCAL_SANDBOX_RESULT_VAR_NAME)
-        func_return, agent_state = self.parse_best_effort(func_result)
+            # Fetch the result
+            func_result = result.get(self.LOCAL_SANDBOX_RESULT_VAR_NAME)
+            func_return, agent_state = self.parse_best_effort(func_result)
+
+        except Exception as e:
+            traceback.print_exc(file=sys.stderr)
+            error_msg = f"{type(e).__name__}: {str(e)}"
 
         # Restore stdout and stderr and collect captured output
         sys.stdout = old_stdout
         sys.stderr = old_stderr
-        stdout_output = captured_stdout.getvalue()
-        stderr_output = captured_stderr.getvalue()
+        stdout_output = [captured_stdout.getvalue()]
+        stderr_output = [captured_stderr.getvalue()]
+        stderr_output.append(error_msg if error_msg else '')
 
         return SandboxRunResult(
             func_return=func_return,
             agent_state=agent_state,
-            stdout=[stdout_output],
-            stderr=[stderr_output],
+            stdout=stdout_output,
+            stderr=stderr_output,
             sandbox_config_fingerprint=sbx_config.fingerprint(),
         )
 
     def parse_out_function_results_markers(self, text: str):
+        if self.LOCAL_SANDBOX_RESULT_START_MARKER not in text:
+            return '', text
         marker_len = len(self.LOCAL_SANDBOX_RESULT_START_MARKER)
         start_index = text.index(self.LOCAL_SANDBOX_RESULT_START_MARKER) + marker_len
         end_index = text.index(self.LOCAL_SANDBOX_RESULT_END_MARKER)
@@ -294,21 +280,22 @@ class ToolExecutionSandbox:
         env_vars = self.sandbox_config_manager.get_sandbox_env_vars_as_dict(sandbox_config_id=sbx_config.id, actor=self.user, limit=100)
         code = self.generate_execution_script(agent_state=agent_state)
         execution = sbx.run_code(code, envs=env_vars)
+        func_return, agent_state = None, None
         if execution.error is not None:
             logger.error(f"Executing tool {self.tool_name} failed with {execution.error}")
-            # Raise a concise exception as this gets returned to the LLM
-            raise self.parse_exception_from_e2b_execution(execution)
+            execution.logs.stderr.append(execution.error.traceback) 
+            execution.logs.stderr.append(f"{execution.error.name}: {execution.error.value}") 
         elif len(execution.results) == 0:
             return None
         else:
             func_return, agent_state = self.parse_best_effort(execution.results[0].text)
-            return SandboxRunResult(
-                func_return=func_return,
-                agent_state=agent_state,
-                stdout=execution.logs.stdout,
-                stderr=execution.logs.stderr,
-                sandbox_config_fingerprint=sbx_config.fingerprint(),
-            )
+        return SandboxRunResult(
+            func_return=func_return,
+            agent_state=agent_state,
+            stdout=execution.logs.stdout,
+            stderr=execution.logs.stderr,
+            sandbox_config_fingerprint=sbx_config.fingerprint(),
+        )
 
     def parse_exception_from_e2b_execution(self, e2b_execution: "Execution") -> Exception:
         builtins_dict = __builtins__ if isinstance(__builtins__, dict) else vars(__builtins__)
@@ -356,6 +343,8 @@ class ToolExecutionSandbox:
     # general utility functions
 
     def parse_best_effort(self, text: str) -> Any:
+        if not text:
+            return None, None
         result = pickle.loads(base64.b64decode(text))
         agent_state = None
         if not result["agent_state"] is None:
diff --git a/tests/integration_test_tool_execution_sandbox.py b/tests/integration_test_tool_execution_sandbox.py
index e13275b2..37597106 100644
--- a/tests/integration_test_tool_execution_sandbox.py
+++ b/tests/integration_test_tool_execution_sandbox.py
@@ -177,6 +177,7 @@ def always_err_tool(test_user):
             str: not important
         """
         # Raise a unusual error so we know it's from this function
+        print("Going to error now")
         raise ZeroDivisionError("This is an intentionally weird division!")
 
     tool = create_tool_from_func(error)
@@ -314,15 +315,16 @@ def test_local_sandbox_core_memory_replace(mock_e2b_api_key_none, core_memory_re
     assert result.func_return is None
 
 
-@pytest.mark.e2b_sandbox
+@pytest.mark.local_sandbox
 def test_local_sandbox_core_memory_replace_errors(mock_e2b_api_key_none, core_memory_replace_tool, test_user, agent_state):
     nonexistent_name = "Alexander Wang"
     args = {"label": "human", "old_content": nonexistent_name, "new_content": "Matt"}
     sandbox = ToolExecutionSandbox(core_memory_replace_tool.name, args, user_id=test_user.id)
 
     # run the sandbox
-    with pytest.raises(ValueError, match=f"Old content '{nonexistent_name}' not found in memory block 'human'"):
-        sandbox.run(agent_state=agent_state)
+    result = sandbox.run(agent_state=agent_state)
+    assert len(result.stderr) != 0, "stderr not empty"
+    assert f"ValueError: Old content '{nonexistent_name}' not found in memory block 'human'" in result.stderr[0], "stderr contains expected error"
 
 
 @pytest.mark.local_sandbox
@@ -402,8 +404,11 @@ def test_local_sandbox_with_venv_errors(mock_e2b_api_key_none, custom_test_sandb
     sandbox = ToolExecutionSandbox(always_err_tool.name, {}, user_id=test_user.id)
 
     # run the sandbox
-    with pytest.raises(ZeroDivisionError, match="This is an intentionally weird division!"):
-        sandbox.run()
+    result = sandbox.run()
+    assert len(result.stdout) != 0, "stdout not empty"
+    assert "error" in result.stdout[0], "stdout contains printed string"
+    assert len(result.stderr) != 0, "stderr not empty"
+    assert "ZeroDivisionError: This is an intentionally weird division!" in result.stderr[0], "stderr contains expected error"
 
 
 # E2B sandbox tests
@@ -500,8 +505,9 @@ def test_e2b_sandbox_core_memory_replace_errors(check_e2b_key_is_set, core_memor
     sandbox = ToolExecutionSandbox(core_memory_replace_tool.name, args, user_id=test_user.id)
 
     # run the sandbox
-    with pytest.raises(ValueError, match=f"Old content '{nonexistent_name}' not found in memory block 'human'"):
-        sandbox.run(agent_state=agent_state)
+    result = sandbox.run(agent_state=agent_state)
+    assert len(result.stderr) != 0, "stderr not empty"
+    assert f"ValueError: Old content '{nonexistent_name}' not found in memory block 'human'" in result.stderr[0], "stderr contains expected error"
 
 
 @pytest.mark.e2b_sandbox
diff --git a/tests/test_server.py b/tests/test_server.py
index 38404afc..482fe894 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -798,8 +798,8 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id):
     print(result)
     assert result.status == "success"
     assert result.function_return == "Ingested message Hello, world!", result.function_return
-    assert result.stdout == ['']
-    assert result.stderr == ['']
+    assert not result.stdout
+    assert not result.stderr
 
     result = server.run_tool_from_source(
         user_id=user_id,
@@ -811,8 +811,8 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id):
     print(result)
     assert result.status == "success"
     assert result.function_return == "Ingested message Well well well", result.function_return
-    assert result.stdout == ['']
-    assert result.stderr == ['']
+    assert not result.stdout
+    assert not result.stderr
 
     result = server.run_tool_from_source(
         user_id=user_id,
@@ -825,8 +825,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id):
     assert result.status == "error"
     assert "Error" in result.function_return, result.function_return
     assert "missing 1 required positional argument" in result.function_return, result.function_return
-    assert result.stdout == ['']
-    assert result.stderr != [''], "missing 1 required positional argument" in result.stderr[0]
+    assert not result.stdout
+    assert result.stderr
+    assert "missing 1 required positional argument" in result.stderr[0]
 
     # Test that we can still pull the tool out by default (pulls that last tool in the source)
     result = server.run_tool_from_source(
@@ -839,8 +840,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id):
     print(result)
     assert result.status == "success"
     assert result.function_return == "Ingested message Well well well", result.function_return
-    assert result.stdout != [''], "I'm a distractor" in result.stdout[0]
-    assert result.stderr == ['']
+    assert result.stdout
+    assert "I'm a distractor" in result.stdout[0]
+    assert not result.stderr
 
     # Test that we can pull the tool out by name
     result = server.run_tool_from_source(
@@ -853,8 +855,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id):
     print(result)
     assert result.status == "success"
     assert result.function_return == "Ingested message Well well well", result.function_return
-    assert result.stdout != [''], "I'm a distractor" in result.stdout[0]
-    assert result.stderr == ['']
+    assert result.stdout
+    assert "I'm a distractor" in result.stdout[0]
+    assert not result.stderr
 
     # Test that we can pull a different tool out by name
     result = server.run_tool_from_source(
@@ -867,8 +870,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id):
     print(result)
     assert result.status == "success"
     assert result.function_return == str(None), result.function_return
-    assert result.stdout != [''], "I'm a distractor" in result.stdout[0]
-    assert result.stderr == ['']
+    assert result.stdout
+    assert "I'm a distractor" in result.stdout[0]
+    assert not result.stderr
 
 
 def test_composio_client_simple(server):