From 9a49f2991c8d853a6b18ff44c65e2d9e3f39a441 Mon Sep 17 00:00:00 2001 From: cthomas Date: Thu, 12 Dec 2024 14:36:21 -0800 Subject: [PATCH] feat: route all sandbox errors to stderr (#2222) Co-authored-by: Caren Thomas --- letta/server/server.py | 40 +++++++-- letta/services/tool_execution_sandbox.py | 85 ++++++++----------- ...integration_test_tool_execution_sandbox.py | 20 +++-- tests/test_server.py | 28 +++--- 4 files changed, 97 insertions(+), 76 deletions(-) diff --git a/letta/server/server.py b/letta/server/server.py index 2765b269..39536c66 100644 --- a/letta/server/server.py +++ b/letta/server/server.py @@ -1957,6 +1957,21 @@ class SyncServer(Server): if sandbox_run_result is None: raise ValueError(f"Tool with id {tool.id} returned execution with None") function_response = str(sandbox_run_result.func_return) + stdout = [s for s in sandbox_run_result.stdout if s.strip()] + stderr = [s for s in sandbox_run_result.stderr if s.strip()] + + # expected error + if stderr: + error_msg = self.get_error_msg_for_func_return(tool.name, stderr[-1]) + return FunctionReturn( + id="null", + function_call_id="null", + date=get_utc_time(), + status="error", + function_return=error_msg, + stdout=stdout, + stderr=stderr, + ) return FunctionReturn( id="null", @@ -1964,17 +1979,13 @@ class SyncServer(Server): date=get_utc_time(), status="success", function_return=function_response, - stdout=sandbox_run_result.stdout, - stderr=sandbox_run_result.stderr, + stdout=stdout, + stderr=stderr, ) + + # unexpected error TODO(@cthomas): consolidate error handling except Exception as e: - # same as agent.py - from letta.constants import MAX_ERROR_MESSAGE_CHAR_LIMIT - - error_msg = f"Error executing tool {tool.name}: {e}" - if len(error_msg) > MAX_ERROR_MESSAGE_CHAR_LIMIT: - error_msg = error_msg[:MAX_ERROR_MESSAGE_CHAR_LIMIT] - + error_msg = self.get_error_msg_for_func_return(tool.name, e) return FunctionReturn( id="null", function_call_id="null", @@ -1985,6 +1996,17 @@ class SyncServer(Server): stderr=[traceback.format_exc()], ) + + def get_error_msg_for_func_return(self, tool_name, exception_message): + # same as agent.py + from letta.constants import MAX_ERROR_MESSAGE_CHAR_LIMIT + + error_msg = f"Error executing tool {tool_name}: {exception_message}" + if len(error_msg) > MAX_ERROR_MESSAGE_CHAR_LIMIT: + error_msg = error_msg[:MAX_ERROR_MESSAGE_CHAR_LIMIT] + return error_msg + + # Composio wrappers def get_composio_client(self, api_key: Optional[str] = None): if api_key: diff --git a/letta/services/tool_execution_sandbox.py b/letta/services/tool_execution_sandbox.py index e1698dfe..9adcff0b 100644 --- a/letta/services/tool_execution_sandbox.py +++ b/letta/services/tool_execution_sandbox.py @@ -7,6 +7,7 @@ import runpy import subprocess import sys import tempfile +import traceback import uuid import venv from typing import Any, Dict, Optional, TextIO @@ -174,41 +175,16 @@ class ToolExecutionSandbox: capture_output=True, text=True, ) - - # Handle error with optimistic error parsing from the string - # This is very brittle, so we fall back to a RuntimeError if parsing fails - if result.returncode != 0: - # Log the error - logger.error(f"Sandbox execution error:\n{result.stderr}") - - # Parse and raise the actual error from stderr - tb_lines = result.stderr.strip().splitlines() - exception_line = tb_lines[-1] # The last line contains the exception - - try: - # Split exception type and message - exception_type, exception_message = exception_line.split(": ", 1) - exception_type = exception_type.strip() - exception_message = exception_message.strip() - - # Dynamically raise the exception - exception_class = eval(exception_type) # Look up the exception type - - except Exception: - # Fallback to RuntimeError if parsing fails - raise RuntimeError(result.stderr) - - raise exception_class(exception_message) - func_result, stdout = self.parse_out_function_results_markers(result.stdout) func_return, agent_state = self.parse_best_effort(func_result) return SandboxRunResult( - func_return=func_return, + func_return=func_return, agent_state=agent_state, stdout=[stdout], stderr=[result.stderr], sandbox_config_fingerprint=sbx_config.fingerprint(), ) + except subprocess.TimeoutExpired: raise TimeoutError(f"Executing tool {self.tool_name} has timed out.") except subprocess.CalledProcessError as e: @@ -217,39 +193,49 @@ class ToolExecutionSandbox: except Exception as e: logger.error(f"Executing tool {self.tool_name} has an unexpected error: {e}") raise e + def run_local_dir_sandbox_runpy( self, sbx_config: SandboxConfig, env_vars: Dict[str, str], temp_file_path: str, old_stdout: TextIO, old_stderr: TextIO ) -> SandboxRunResult: + func_return, agent_state, error_msg = None, None, None + # Redirect stdout and stderr to capture script output - captured_stdout = io.StringIO() - captured_stderr = io.StringIO() + captured_stdout, captured_stderr = io.StringIO(), io.StringIO() sys.stdout = captured_stdout sys.stderr = captured_stderr - # Execute the temp file - with self.temporary_env_vars(env_vars): - result = runpy.run_path(temp_file_path, init_globals=env_vars) + try: + # Execute the temp file + with self.temporary_env_vars(env_vars): + result = runpy.run_path(temp_file_path, init_globals=env_vars) - # Fetch the result - func_result = result.get(self.LOCAL_SANDBOX_RESULT_VAR_NAME) - func_return, agent_state = self.parse_best_effort(func_result) + # Fetch the result + func_result = result.get(self.LOCAL_SANDBOX_RESULT_VAR_NAME) + func_return, agent_state = self.parse_best_effort(func_result) + + except Exception as e: + traceback.print_exc(file=sys.stderr) + error_msg = f"{type(e).__name__}: {str(e)}" # Restore stdout and stderr and collect captured output sys.stdout = old_stdout sys.stderr = old_stderr - stdout_output = captured_stdout.getvalue() - stderr_output = captured_stderr.getvalue() + stdout_output = [captured_stdout.getvalue()] + stderr_output = [captured_stderr.getvalue()] + stderr_output.append(error_msg if error_msg else '') return SandboxRunResult( func_return=func_return, agent_state=agent_state, - stdout=[stdout_output], - stderr=[stderr_output], + stdout=stdout_output, + stderr=stderr_output, sandbox_config_fingerprint=sbx_config.fingerprint(), ) def parse_out_function_results_markers(self, text: str): + if self.LOCAL_SANDBOX_RESULT_START_MARKER not in text: + return '', text marker_len = len(self.LOCAL_SANDBOX_RESULT_START_MARKER) start_index = text.index(self.LOCAL_SANDBOX_RESULT_START_MARKER) + marker_len end_index = text.index(self.LOCAL_SANDBOX_RESULT_END_MARKER) @@ -294,21 +280,22 @@ class ToolExecutionSandbox: env_vars = self.sandbox_config_manager.get_sandbox_env_vars_as_dict(sandbox_config_id=sbx_config.id, actor=self.user, limit=100) code = self.generate_execution_script(agent_state=agent_state) execution = sbx.run_code(code, envs=env_vars) + func_return, agent_state = None, None if execution.error is not None: logger.error(f"Executing tool {self.tool_name} failed with {execution.error}") - # Raise a concise exception as this gets returned to the LLM - raise self.parse_exception_from_e2b_execution(execution) + execution.logs.stderr.append(execution.error.traceback) + execution.logs.stderr.append(f"{execution.error.name}: {execution.error.value}") elif len(execution.results) == 0: return None else: func_return, agent_state = self.parse_best_effort(execution.results[0].text) - return SandboxRunResult( - func_return=func_return, - agent_state=agent_state, - stdout=execution.logs.stdout, - stderr=execution.logs.stderr, - sandbox_config_fingerprint=sbx_config.fingerprint(), - ) + return SandboxRunResult( + func_return=func_return, + agent_state=agent_state, + stdout=execution.logs.stdout, + stderr=execution.logs.stderr, + sandbox_config_fingerprint=sbx_config.fingerprint(), + ) def parse_exception_from_e2b_execution(self, e2b_execution: "Execution") -> Exception: builtins_dict = __builtins__ if isinstance(__builtins__, dict) else vars(__builtins__) @@ -356,6 +343,8 @@ class ToolExecutionSandbox: # general utility functions def parse_best_effort(self, text: str) -> Any: + if not text: + return None, None result = pickle.loads(base64.b64decode(text)) agent_state = None if not result["agent_state"] is None: diff --git a/tests/integration_test_tool_execution_sandbox.py b/tests/integration_test_tool_execution_sandbox.py index e13275b2..37597106 100644 --- a/tests/integration_test_tool_execution_sandbox.py +++ b/tests/integration_test_tool_execution_sandbox.py @@ -177,6 +177,7 @@ def always_err_tool(test_user): str: not important """ # Raise a unusual error so we know it's from this function + print("Going to error now") raise ZeroDivisionError("This is an intentionally weird division!") tool = create_tool_from_func(error) @@ -314,15 +315,16 @@ def test_local_sandbox_core_memory_replace(mock_e2b_api_key_none, core_memory_re assert result.func_return is None -@pytest.mark.e2b_sandbox +@pytest.mark.local_sandbox def test_local_sandbox_core_memory_replace_errors(mock_e2b_api_key_none, core_memory_replace_tool, test_user, agent_state): nonexistent_name = "Alexander Wang" args = {"label": "human", "old_content": nonexistent_name, "new_content": "Matt"} sandbox = ToolExecutionSandbox(core_memory_replace_tool.name, args, user_id=test_user.id) # run the sandbox - with pytest.raises(ValueError, match=f"Old content '{nonexistent_name}' not found in memory block 'human'"): - sandbox.run(agent_state=agent_state) + result = sandbox.run(agent_state=agent_state) + assert len(result.stderr) != 0, "stderr not empty" + assert f"ValueError: Old content '{nonexistent_name}' not found in memory block 'human'" in result.stderr[0], "stderr contains expected error" @pytest.mark.local_sandbox @@ -402,8 +404,11 @@ def test_local_sandbox_with_venv_errors(mock_e2b_api_key_none, custom_test_sandb sandbox = ToolExecutionSandbox(always_err_tool.name, {}, user_id=test_user.id) # run the sandbox - with pytest.raises(ZeroDivisionError, match="This is an intentionally weird division!"): - sandbox.run() + result = sandbox.run() + assert len(result.stdout) != 0, "stdout not empty" + assert "error" in result.stdout[0], "stdout contains printed string" + assert len(result.stderr) != 0, "stderr not empty" + assert "ZeroDivisionError: This is an intentionally weird division!" in result.stderr[0], "stderr contains expected error" # E2B sandbox tests @@ -500,8 +505,9 @@ def test_e2b_sandbox_core_memory_replace_errors(check_e2b_key_is_set, core_memor sandbox = ToolExecutionSandbox(core_memory_replace_tool.name, args, user_id=test_user.id) # run the sandbox - with pytest.raises(ValueError, match=f"Old content '{nonexistent_name}' not found in memory block 'human'"): - sandbox.run(agent_state=agent_state) + result = sandbox.run(agent_state=agent_state) + assert len(result.stderr) != 0, "stderr not empty" + assert f"ValueError: Old content '{nonexistent_name}' not found in memory block 'human'" in result.stderr[0], "stderr contains expected error" @pytest.mark.e2b_sandbox diff --git a/tests/test_server.py b/tests/test_server.py index 38404afc..482fe894 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -798,8 +798,8 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id): print(result) assert result.status == "success" assert result.function_return == "Ingested message Hello, world!", result.function_return - assert result.stdout == [''] - assert result.stderr == [''] + assert not result.stdout + assert not result.stderr result = server.run_tool_from_source( user_id=user_id, @@ -811,8 +811,8 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id): print(result) assert result.status == "success" assert result.function_return == "Ingested message Well well well", result.function_return - assert result.stdout == [''] - assert result.stderr == [''] + assert not result.stdout + assert not result.stderr result = server.run_tool_from_source( user_id=user_id, @@ -825,8 +825,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id): assert result.status == "error" assert "Error" in result.function_return, result.function_return assert "missing 1 required positional argument" in result.function_return, result.function_return - assert result.stdout == [''] - assert result.stderr != [''], "missing 1 required positional argument" in result.stderr[0] + assert not result.stdout + assert result.stderr + assert "missing 1 required positional argument" in result.stderr[0] # Test that we can still pull the tool out by default (pulls that last tool in the source) result = server.run_tool_from_source( @@ -839,8 +840,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id): print(result) assert result.status == "success" assert result.function_return == "Ingested message Well well well", result.function_return - assert result.stdout != [''], "I'm a distractor" in result.stdout[0] - assert result.stderr == [''] + assert result.stdout + assert "I'm a distractor" in result.stdout[0] + assert not result.stderr # Test that we can pull the tool out by name result = server.run_tool_from_source( @@ -853,8 +855,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id): print(result) assert result.status == "success" assert result.function_return == "Ingested message Well well well", result.function_return - assert result.stdout != [''], "I'm a distractor" in result.stdout[0] - assert result.stderr == [''] + assert result.stdout + assert "I'm a distractor" in result.stdout[0] + assert not result.stderr # Test that we can pull a different tool out by name result = server.run_tool_from_source( @@ -867,8 +870,9 @@ def test_tool_run(server, mock_e2b_api_key_none, user_id, agent_id): print(result) assert result.status == "success" assert result.function_return == str(None), result.function_return - assert result.stdout != [''], "I'm a distractor" in result.stdout[0] - assert result.stderr == [''] + assert result.stdout + assert "I'm a distractor" in result.stdout[0] + assert not result.stderr def test_composio_client_simple(server):