From ddcfeb26b1f2de284ec862b70a1b30ad69a6f3f4 Mon Sep 17 00:00:00 2001
From: Kian Jones <11655409+kianjones9@users.noreply.github.com>
Date: Wed, 11 Feb 2026 10:51:37 -0800
Subject: [PATCH] fix(core): catch all MCP tool execution errors instead of
 re-raising (#9419)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(core): catch all MCP tool execution errors instead of re-raising

MCP tools are external user-configured servers - any failure during
tool execution is expected and should be returned as (error_msg, False)
to the agent, not raised as an exception that hits Datadog as a 500.

Previously:
- base_client.py only caught McpError/ToolError, re-raised everything else
- fastmcp_client.py (both SSE and StreamableHTTP) always re-raised

Now all three execute_tool() methods catch all exceptions and return
the error message to the agent conversation. The agent handles tool
failures via the error message naturally.

This silences ~15 Datadog issue types including:
- fastmcp.exceptions.ToolError (validation, permissions)
- mcp.shared.exceptions.McpError (connection closed, credentials)
- httpx.HTTPStatusError (503 from Zapier, etc.)
- httpx.ConnectError, ReadTimeout, RemoteProtocolError
- requests.exceptions.ConnectionError
- builtins.ConnectionError

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): log unexpected MCP errors at warning level with traceback

Expected MCP errors (ToolError, McpError, httpx.*, ConnectionError, etc.)
log at info level. Anything else (e.g. TypeError, AttributeError from
our own code) logs at warning with exc_info=True so it still surfaces
in Datadog without crashing the request.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
---
 letta/services/mcp/base_client.py    | 46 ++++++++++++++++++----------
 letta/services/mcp/fastmcp_client.py | 37 ++++++----------------
 2 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/letta/services/mcp/base_client.py b/letta/services/mcp/base_client.py
index 31b25e29..c53b6f99 100644
--- a/letta/services/mcp/base_client.py
+++ b/letta/services/mcp/base_client.py
@@ -11,6 +11,31 @@ from letta.log import get_logger
 
 logger = get_logger(__name__)
 
+EXPECTED_MCP_TOOL_ERRORS = (
+    "McpError",
+    "ToolError",
+    "HTTPStatusError",
+    "ConnectError",
+    "ConnectTimeout",
+    "ReadTimeout",
+    "ReadError",
+    "RemoteProtocolError",
+    "LocalProtocolError",
+    "ConnectionError",
+    "SSLError",
+    "MaxRetryError",
+    "ProtocolError",
+    "BrokenResourceError",
+)
+
+
+def _log_mcp_tool_error(log: "get_logger", tool_name: str, exc: Exception) -> None:
+    exc_name = type(exc).__name__
+    if exc_name in EXPECTED_MCP_TOOL_ERRORS:
+        log.info(f"MCP tool '{tool_name}' execution failed ({exc_name}): {exc}")
+    else:
+        log.warning(f"MCP tool '{tool_name}' execution failed with unexpected error ({exc_name}): {exc}", exc_info=True)
+
 
 # TODO: Get rid of Async prefix on this class name once we deprecate old sync code
 class AsyncBaseMCPClient:
@@ -81,24 +106,11 @@ class AsyncBaseMCPClient:
         try:
             result = await self.session.call_tool(tool_name, tool_args)
         except Exception as e:
-            # ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
-            # McpError is raised for other MCP-related errors
-            # Both are expected user-facing issues from external MCP servers
-            # Log at debug level to avoid triggering production alerts for expected failures
-
-            # Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
             exception_to_check = e
-            if hasattr(e, "exceptions") and e.exceptions:
-                # If it's an ExceptionGroup with a single wrapped exception, unwrap it
-                if len(e.exceptions) == 1:
-                    exception_to_check = e.exceptions[0]
-
-            if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
-                logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
-                # Return error message with failure status instead of raising to avoid Datadog alerts
-                return str(e), False
-            # Re-raise unexpected errors
-            raise
+            if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
+                exception_to_check = e.exceptions[0]
+            _log_mcp_tool_error(logger, tool_name, exception_to_check)
+            return str(exception_to_check), False
 
         parsed_content = []
         for content_piece in result.content:
diff --git a/letta/services/mcp/fastmcp_client.py b/letta/services/mcp/fastmcp_client.py
index 01190e56..2a2b05c2 100644
--- a/letta/services/mcp/fastmcp_client.py
+++ b/letta/services/mcp/fastmcp_client.py
@@ -19,6 +19,7 @@ from mcp import Tool as MCPTool
 from letta.errors import LettaMCPConnectionError
 from letta.functions.mcp_client.types import SSEServerConfig, StreamableHTTPServerConfig
 from letta.log import get_logger
+from letta.services.mcp.base_client import _log_mcp_tool_error
 from letta.services.mcp.server_side_oauth import ServerSideOAuth
 
 logger = get_logger(__name__)
@@ -142,21 +143,11 @@ class AsyncFastMCPSSEClient:
         try:
             result = await self.client.call_tool(tool_name, tool_args)
         except Exception as e:
-            # ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
-            # McpError is raised for other MCP-related errors
-            # Both are expected user-facing issues from external MCP servers
-            # Log at debug level to avoid triggering production alerts for expected failures
-
-            # Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
             exception_to_check = e
-            if hasattr(e, "exceptions") and e.exceptions:
-                # If it's an ExceptionGroup with a single wrapped exception, unwrap it
-                if len(e.exceptions) == 1:
-                    exception_to_check = e.exceptions[0]
-
-            if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
-                logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
-            raise
+            if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
+                exception_to_check = e.exceptions[0]
+            _log_mcp_tool_error(logger, tool_name, exception_to_check)
+            return str(exception_to_check), False
 
         # Parse content from result
         parsed_content = []
@@ -309,21 +300,11 @@ class AsyncFastMCPStreamableHTTPClient:
         try:
             result = await self.client.call_tool(tool_name, tool_args)
         except Exception as e:
-            # ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
-            # McpError is raised for other MCP-related errors
-            # Both are expected user-facing issues from external MCP servers
-            # Log at debug level to avoid triggering production alerts for expected failures
-
-            # Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
             exception_to_check = e
-            if hasattr(e, "exceptions") and e.exceptions:
-                # If it's an ExceptionGroup with a single wrapped exception, unwrap it
-                if len(e.exceptions) == 1:
-                    exception_to_check = e.exceptions[0]
-
-            if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
-                logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
-            raise
+            if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
+                exception_to_check = e.exceptions[0]
+            _log_mcp_tool_error(logger, tool_name, exception_to_check)
+            return str(exception_to_check), False
 
         # Parse content from result
         parsed_content = []