fix(core): catch all MCP tool execution errors instead of re-raising (#9419)

* fix(core): catch all MCP tool execution errors instead of re-raising

MCP tools are external user-configured servers - any failure during
tool execution is expected and should be returned as (error_msg, False)
to the agent, not raised as an exception that hits Datadog as a 500.

Previously:
- base_client.py only caught McpError/ToolError, re-raised everything else
- fastmcp_client.py (both SSE and StreamableHTTP) always re-raised

Now all three execute_tool() methods catch all exceptions and return
the error message to the agent conversation. The agent handles tool
failures via the error message naturally.

This silences ~15 Datadog issue types including:
- fastmcp.exceptions.ToolError (validation, permissions)
- mcp.shared.exceptions.McpError (connection closed, credentials)
- httpx.HTTPStatusError (503 from Zapier, etc.)
- httpx.ConnectError, ReadTimeout, RemoteProtocolError
- requests.exceptions.ConnectionError
- builtins.ConnectionError

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): log unexpected MCP errors at warning level with traceback

Expected MCP errors (ToolError, McpError, httpx.*, ConnectionError, etc.)
log at info level. Anything else (e.g. TypeError, AttributeError from
our own code) logs at warning with exc_info=True so it still surfaces
in Datadog without crashing the request.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Kian Jones
2026-02-11 10:51:37 -08:00
committed by Caren Thomas
parent 382e216cbb
commit ddcfeb26b1
2 changed files with 38 additions and 45 deletions

View File

@@ -11,6 +11,31 @@ from letta.log import get_logger
logger = get_logger(__name__)
EXPECTED_MCP_TOOL_ERRORS = (
"McpError",
"ToolError",
"HTTPStatusError",
"ConnectError",
"ConnectTimeout",
"ReadTimeout",
"ReadError",
"RemoteProtocolError",
"LocalProtocolError",
"ConnectionError",
"SSLError",
"MaxRetryError",
"ProtocolError",
"BrokenResourceError",
)
def _log_mcp_tool_error(log: "get_logger", tool_name: str, exc: Exception) -> None:
exc_name = type(exc).__name__
if exc_name in EXPECTED_MCP_TOOL_ERRORS:
log.info(f"MCP tool '{tool_name}' execution failed ({exc_name}): {exc}")
else:
log.warning(f"MCP tool '{tool_name}' execution failed with unexpected error ({exc_name}): {exc}", exc_info=True)
# TODO: Get rid of Async prefix on this class name once we deprecate old sync code
class AsyncBaseMCPClient:
@@ -81,24 +106,11 @@ class AsyncBaseMCPClient:
try:
result = await self.session.call_tool(tool_name, tool_args)
except Exception as e:
# ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
# McpError is raised for other MCP-related errors
# Both are expected user-facing issues from external MCP servers
# Log at debug level to avoid triggering production alerts for expected failures
# Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
exception_to_check = e
if hasattr(e, "exceptions") and e.exceptions:
# If it's an ExceptionGroup with a single wrapped exception, unwrap it
if len(e.exceptions) == 1:
exception_to_check = e.exceptions[0]
if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
# Return error message with failure status instead of raising to avoid Datadog alerts
return str(e), False
# Re-raise unexpected errors
raise
if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
exception_to_check = e.exceptions[0]
_log_mcp_tool_error(logger, tool_name, exception_to_check)
return str(exception_to_check), False
parsed_content = []
for content_piece in result.content:

View File

@@ -19,6 +19,7 @@ from mcp import Tool as MCPTool
from letta.errors import LettaMCPConnectionError
from letta.functions.mcp_client.types import SSEServerConfig, StreamableHTTPServerConfig
from letta.log import get_logger
from letta.services.mcp.base_client import _log_mcp_tool_error
from letta.services.mcp.server_side_oauth import ServerSideOAuth
logger = get_logger(__name__)
@@ -142,21 +143,11 @@ class AsyncFastMCPSSEClient:
try:
result = await self.client.call_tool(tool_name, tool_args)
except Exception as e:
# ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
# McpError is raised for other MCP-related errors
# Both are expected user-facing issues from external MCP servers
# Log at debug level to avoid triggering production alerts for expected failures
# Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
exception_to_check = e
if hasattr(e, "exceptions") and e.exceptions:
# If it's an ExceptionGroup with a single wrapped exception, unwrap it
if len(e.exceptions) == 1:
exception_to_check = e.exceptions[0]
if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
raise
if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
exception_to_check = e.exceptions[0]
_log_mcp_tool_error(logger, tool_name, exception_to_check)
return str(exception_to_check), False
# Parse content from result
parsed_content = []
@@ -309,21 +300,11 @@ class AsyncFastMCPStreamableHTTPClient:
try:
result = await self.client.call_tool(tool_name, tool_args)
except Exception as e:
# ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
# McpError is raised for other MCP-related errors
# Both are expected user-facing issues from external MCP servers
# Log at debug level to avoid triggering production alerts for expected failures
# Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
exception_to_check = e
if hasattr(e, "exceptions") and e.exceptions:
# If it's an ExceptionGroup with a single wrapped exception, unwrap it
if len(e.exceptions) == 1:
exception_to_check = e.exceptions[0]
if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
raise
if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
exception_to_check = e.exceptions[0]
_log_mcp_tool_error(logger, tool_name, exception_to_check)
return str(exception_to_check), False
# Parse content from result
parsed_content = []