fix(core): catch all MCP tool execution errors instead of re-raising (#9419)

* fix(core): catch all MCP tool execution errors instead of re-raising MCP tools are external user-configured servers - any failure during tool execution is expected and should be returned as (error_msg, False) to the agent, not raised as an exception that hits Datadog as a 500. Previously: - base_client.py only caught McpError/ToolError, re-raised everything else - fastmcp_client.py (both SSE and StreamableHTTP) always re-raised Now all three execute_tool() methods catch all exceptions and return the error message to the agent conversation. The agent handles tool failures via the error message naturally. This silences ~15 Datadog issue types including: - fastmcp.exceptions.ToolError (validation, permissions) - mcp.shared.exceptions.McpError (connection closed, credentials) - httpx.HTTPStatusError (503 from Zapier, etc.) - httpx.ConnectError, ReadTimeout, RemoteProtocolError - requests.exceptions.ConnectionError - builtins.ConnectionError 🐾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * fix(core): log unexpected MCP errors at warning level with traceback Expected MCP errors (ToolError, McpError, httpx.*, ConnectionError, etc.) log at info level. Anything else (e.g. TypeError, AttributeError from our own code) logs at warning with exc_info=True so it still surfaces in Datadog without crashing the request. 🐾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> --------- Co-authored-by: Letta <noreply@letta.com>
2026-02-11 10:51:37 -08:00
parent 382e216cbb
commit ddcfeb26b1
2 changed files with 38 additions and 45 deletions
--- a/letta/services/mcp/base_client.py
+++ b/letta/services/mcp/base_client.py
@@ -11,6 +11,31 @@ from letta.log import get_logger

 logger = get_logger(__name__)

+EXPECTED_MCP_TOOL_ERRORS = (
+    "McpError",
+    "ToolError",
+    "HTTPStatusError",
+    "ConnectError",
+    "ConnectTimeout",
+    "ReadTimeout",
+    "ReadError",
+    "RemoteProtocolError",
+    "LocalProtocolError",
+    "ConnectionError",
+    "SSLError",
+    "MaxRetryError",
+    "ProtocolError",
+    "BrokenResourceError",
+)
+
+
+def _log_mcp_tool_error(log: "get_logger", tool_name: str, exc: Exception) -> None:
+    exc_name = type(exc).__name__
+    if exc_name in EXPECTED_MCP_TOOL_ERRORS:
+        log.info(f"MCP tool '{tool_name}' execution failed ({exc_name}): {exc}")
+    else:
+        log.warning(f"MCP tool '{tool_name}' execution failed with unexpected error ({exc_name}): {exc}", exc_info=True)
+

 # TODO: Get rid of Async prefix on this class name once we deprecate old sync code
 class AsyncBaseMCPClient:
@@ -81,24 +106,11 @@ class AsyncBaseMCPClient:
        try:
            result = await self.session.call_tool(tool_name, tool_args)
        except Exception as e:
-            # ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
-            # McpError is raised for other MCP-related errors
-            # Both are expected user-facing issues from external MCP servers
-            # Log at debug level to avoid triggering production alerts for expected failures
-
-            # Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
            exception_to_check = e
-            if hasattr(e, "exceptions") and e.exceptions:
-                # If it's an ExceptionGroup with a single wrapped exception, unwrap it
-                if len(e.exceptions) == 1:
-                    exception_to_check = e.exceptions[0]
-
-            if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
-                logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
-                # Return error message with failure status instead of raising to avoid Datadog alerts
-                return str(e), False
-            # Re-raise unexpected errors
-            raise
+            if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
+                exception_to_check = e.exceptions[0]
+            _log_mcp_tool_error(logger, tool_name, exception_to_check)
+            return str(exception_to_check), False

        parsed_content = []
        for content_piece in result.content:
--- a/letta/services/mcp/fastmcp_client.py
+++ b/letta/services/mcp/fastmcp_client.py
@@ -19,6 +19,7 @@ from mcp import Tool as MCPTool
 from letta.errors import LettaMCPConnectionError
 from letta.functions.mcp_client.types import SSEServerConfig, StreamableHTTPServerConfig
 from letta.log import get_logger
+from letta.services.mcp.base_client import _log_mcp_tool_error
 from letta.services.mcp.server_side_oauth import ServerSideOAuth

 logger = get_logger(__name__)
@@ -142,21 +143,11 @@ class AsyncFastMCPSSEClient:
        try:
            result = await self.client.call_tool(tool_name, tool_args)
        except Exception as e:
-            # ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
-            # McpError is raised for other MCP-related errors
-            # Both are expected user-facing issues from external MCP servers
-            # Log at debug level to avoid triggering production alerts for expected failures
-
-            # Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
            exception_to_check = e
-            if hasattr(e, "exceptions") and e.exceptions:
-                # If it's an ExceptionGroup with a single wrapped exception, unwrap it
-                if len(e.exceptions) == 1:
-                    exception_to_check = e.exceptions[0]
-
-            if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
-                logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
-            raise
+            if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
+                exception_to_check = e.exceptions[0]
+            _log_mcp_tool_error(logger, tool_name, exception_to_check)
+            return str(exception_to_check), False

        # Parse content from result
        parsed_content = []
@@ -309,21 +300,11 @@ class AsyncFastMCPStreamableHTTPClient:
        try:
            result = await self.client.call_tool(tool_name, tool_args)
        except Exception as e:
-            # ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
-            # McpError is raised for other MCP-related errors
-            # Both are expected user-facing issues from external MCP servers
-            # Log at debug level to avoid triggering production alerts for expected failures
-
-            # Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
            exception_to_check = e
-            if hasattr(e, "exceptions") and e.exceptions:
-                # If it's an ExceptionGroup with a single wrapped exception, unwrap it
-                if len(e.exceptions) == 1:
-                    exception_to_check = e.exceptions[0]
-
-            if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
-                logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
-            raise
+            if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
+                exception_to_check = e.exceptions[0]
+            _log_mcp_tool_error(logger, tool_name, exception_to_check)
+            return str(exception_to_check), False

        # Parse content from result
        parsed_content = []