fix(core): catch all MCP tool execution errors instead of re-raising (#9419)
* fix(core): catch all MCP tool execution errors instead of re-raising MCP tools are external user-configured servers - any failure during tool execution is expected and should be returned as (error_msg, False) to the agent, not raised as an exception that hits Datadog as a 500. Previously: - base_client.py only caught McpError/ToolError, re-raised everything else - fastmcp_client.py (both SSE and StreamableHTTP) always re-raised Now all three execute_tool() methods catch all exceptions and return the error message to the agent conversation. The agent handles tool failures via the error message naturally. This silences ~15 Datadog issue types including: - fastmcp.exceptions.ToolError (validation, permissions) - mcp.shared.exceptions.McpError (connection closed, credentials) - httpx.HTTPStatusError (503 from Zapier, etc.) - httpx.ConnectError, ReadTimeout, RemoteProtocolError - requests.exceptions.ConnectionError - builtins.ConnectionError 🐾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * fix(core): log unexpected MCP errors at warning level with traceback Expected MCP errors (ToolError, McpError, httpx.*, ConnectionError, etc.) log at info level. Anything else (e.g. TypeError, AttributeError from our own code) logs at warning with exc_info=True so it still surfaces in Datadog without crashing the request. 🐾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> --------- Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -11,6 +11,31 @@ from letta.log import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
EXPECTED_MCP_TOOL_ERRORS = (
|
||||
"McpError",
|
||||
"ToolError",
|
||||
"HTTPStatusError",
|
||||
"ConnectError",
|
||||
"ConnectTimeout",
|
||||
"ReadTimeout",
|
||||
"ReadError",
|
||||
"RemoteProtocolError",
|
||||
"LocalProtocolError",
|
||||
"ConnectionError",
|
||||
"SSLError",
|
||||
"MaxRetryError",
|
||||
"ProtocolError",
|
||||
"BrokenResourceError",
|
||||
)
|
||||
|
||||
|
||||
def _log_mcp_tool_error(log: "get_logger", tool_name: str, exc: Exception) -> None:
|
||||
exc_name = type(exc).__name__
|
||||
if exc_name in EXPECTED_MCP_TOOL_ERRORS:
|
||||
log.info(f"MCP tool '{tool_name}' execution failed ({exc_name}): {exc}")
|
||||
else:
|
||||
log.warning(f"MCP tool '{tool_name}' execution failed with unexpected error ({exc_name}): {exc}", exc_info=True)
|
||||
|
||||
|
||||
# TODO: Get rid of Async prefix on this class name once we deprecate old sync code
|
||||
class AsyncBaseMCPClient:
|
||||
@@ -81,24 +106,11 @@ class AsyncBaseMCPClient:
|
||||
try:
|
||||
result = await self.session.call_tool(tool_name, tool_args)
|
||||
except Exception as e:
|
||||
# ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
|
||||
# McpError is raised for other MCP-related errors
|
||||
# Both are expected user-facing issues from external MCP servers
|
||||
# Log at debug level to avoid triggering production alerts for expected failures
|
||||
|
||||
# Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
|
||||
exception_to_check = e
|
||||
if hasattr(e, "exceptions") and e.exceptions:
|
||||
# If it's an ExceptionGroup with a single wrapped exception, unwrap it
|
||||
if len(e.exceptions) == 1:
|
||||
exception_to_check = e.exceptions[0]
|
||||
|
||||
if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
|
||||
logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
|
||||
# Return error message with failure status instead of raising to avoid Datadog alerts
|
||||
return str(e), False
|
||||
# Re-raise unexpected errors
|
||||
raise
|
||||
if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
|
||||
exception_to_check = e.exceptions[0]
|
||||
_log_mcp_tool_error(logger, tool_name, exception_to_check)
|
||||
return str(exception_to_check), False
|
||||
|
||||
parsed_content = []
|
||||
for content_piece in result.content:
|
||||
|
||||
@@ -19,6 +19,7 @@ from mcp import Tool as MCPTool
|
||||
from letta.errors import LettaMCPConnectionError
|
||||
from letta.functions.mcp_client.types import SSEServerConfig, StreamableHTTPServerConfig
|
||||
from letta.log import get_logger
|
||||
from letta.services.mcp.base_client import _log_mcp_tool_error
|
||||
from letta.services.mcp.server_side_oauth import ServerSideOAuth
|
||||
|
||||
logger = get_logger(__name__)
|
||||
@@ -142,21 +143,11 @@ class AsyncFastMCPSSEClient:
|
||||
try:
|
||||
result = await self.client.call_tool(tool_name, tool_args)
|
||||
except Exception as e:
|
||||
# ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
|
||||
# McpError is raised for other MCP-related errors
|
||||
# Both are expected user-facing issues from external MCP servers
|
||||
# Log at debug level to avoid triggering production alerts for expected failures
|
||||
|
||||
# Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
|
||||
exception_to_check = e
|
||||
if hasattr(e, "exceptions") and e.exceptions:
|
||||
# If it's an ExceptionGroup with a single wrapped exception, unwrap it
|
||||
if len(e.exceptions) == 1:
|
||||
exception_to_check = e.exceptions[0]
|
||||
|
||||
if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
|
||||
logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
|
||||
raise
|
||||
if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
|
||||
exception_to_check = e.exceptions[0]
|
||||
_log_mcp_tool_error(logger, tool_name, exception_to_check)
|
||||
return str(exception_to_check), False
|
||||
|
||||
# Parse content from result
|
||||
parsed_content = []
|
||||
@@ -309,21 +300,11 @@ class AsyncFastMCPStreamableHTTPClient:
|
||||
try:
|
||||
result = await self.client.call_tool(tool_name, tool_args)
|
||||
except Exception as e:
|
||||
# ToolError is raised by fastmcp for input validation errors (e.g., missing required properties)
|
||||
# McpError is raised for other MCP-related errors
|
||||
# Both are expected user-facing issues from external MCP servers
|
||||
# Log at debug level to avoid triggering production alerts for expected failures
|
||||
|
||||
# Handle ExceptionGroup wrapping (Python 3.11+ async TaskGroup can wrap exceptions)
|
||||
exception_to_check = e
|
||||
if hasattr(e, "exceptions") and e.exceptions:
|
||||
# If it's an ExceptionGroup with a single wrapped exception, unwrap it
|
||||
if len(e.exceptions) == 1:
|
||||
exception_to_check = e.exceptions[0]
|
||||
|
||||
if exception_to_check.__class__.__name__ in ("McpError", "ToolError"):
|
||||
logger.debug(f"MCP tool '{tool_name}' execution failed: {str(exception_to_check)}")
|
||||
raise
|
||||
if hasattr(e, "exceptions") and e.exceptions and len(e.exceptions) == 1:
|
||||
exception_to_check = e.exceptions[0]
|
||||
_log_mcp_tool_error(logger, tool_name, exception_to_check)
|
||||
return str(exception_to_check), False
|
||||
|
||||
# Parse content from result
|
||||
parsed_content = []
|
||||
|
||||
Reference in New Issue
Block a user