chore: continued logging cleanup and bugfixes (#5882)

* gracefully handle MCP errors and runs/Temporal errors, and fix OTel exporter bugs

* move error handling to managers

* remove migrated error handling from routers

* change logger.error calls to logger.warning
This commit is contained in:
Kian Jones
2025-10-31 14:44:38 -07:00
committed by Caren Thomas
parent 381ca5bde8
commit 193c4f7c4a
5 changed files with 56 additions and 26 deletions

View File

@@ -23,7 +23,6 @@ from letta.server.rest_api.streaming_response import (
cancellation_aware_stream_wrapper, cancellation_aware_stream_wrapper,
) )
from letta.server.server import SyncServer from letta.server.server import SyncServer
from letta.services.lettuce import LettuceClient
from letta.services.run_manager import RunManager from letta.services.run_manager import RunManager
from letta.settings import settings from letta.settings import settings
@@ -150,26 +149,7 @@ async def retrieve_run(
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
runs_manager = RunManager() runs_manager = RunManager()
return await runs_manager.get_run_with_status(run_id=run_id, actor=actor)
run = await runs_manager.get_run_by_id(run_id=run_id, actor=actor)
use_lettuce = run.metadata and run.metadata.get("lettuce")
if use_lettuce and run.status not in [RunStatus.completed, RunStatus.failed, RunStatus.cancelled]:
lettuce_client = await LettuceClient.create()
status = await lettuce_client.get_status(run_id=run_id)
# Map the status to our enum
run_status = run.status
if status == "RUNNING":
run_status = RunStatus.running
elif status == "COMPLETED":
run_status = RunStatus.completed
elif status == "FAILED":
run_status = RunStatus.failed
elif status == "CANCELLED":
run_status = RunStatus.cancelled
run.status = run_status
return run
RunMessagesResponse = Annotated[ RunMessagesResponse = Annotated[

View File

@@ -38,8 +38,8 @@ class AsyncBaseMCPClient:
raise e raise e
except Exception as e: except Exception as e:
# MCP connection failures are often due to user misconfiguration, not system errors # MCP connection failures are often due to user misconfiguration, not system errors
# Log at info level to help with debugging without triggering Sentry alerts # Log as warning for visibility in monitoring
logger.info( logger.warning(
f"Connecting to MCP server failed. Please review your server config: {self.server_config.model_dump_json(indent=4)}. Error: {str(e)}" f"Connecting to MCP server failed. Please review your server config: {self.server_config.model_dump_json(indent=4)}. Error: {str(e)}"
) )
if hasattr(self.server_config, "server_url") and self.server_config.server_url: if hasattr(self.server_config, "server_url") and self.server_config.server_url:
@@ -78,7 +78,13 @@ class AsyncBaseMCPClient:
async def execute_tool(self, tool_name: str, tool_args: dict) -> Tuple[str, bool]: async def execute_tool(self, tool_name: str, tool_args: dict) -> Tuple[str, bool]:
self._check_initialized() self._check_initialized()
try:
result = await self.session.call_tool(tool_name, tool_args) result = await self.session.call_tool(tool_name, tool_args)
except Exception as e:
if e.__class__.__name__ == "McpError":
logger.warning(f"MCP tool '{tool_name}' execution failed: {str(e)}")
raise
parsed_content = [] parsed_content = []
for content_piece in result.content: for content_piece in result.content:
if isinstance(content_piece, TextContent): if isinstance(content_piece, TextContent):

View File

@@ -97,6 +97,34 @@ class RunManager:
raise NoResultFound(f"Run with id {run_id} not found") raise NoResultFound(f"Run with id {run_id} not found")
return run.to_pydantic() return run.to_pydantic()
@enforce_types
async def get_run_with_status(self, run_id: str, actor: PydanticUser) -> PydanticRun:
    """Get a run by its ID and update status from Lettuce if applicable.

    The run is loaded via ``get_run_by_id``. When the run's metadata has a
    truthy ``"lettuce"`` entry and the stored status is not terminal
    (completed / failed / cancelled), the live status is fetched from
    Lettuce and mapped onto ``RunStatus``. The refreshed status is set on
    the returned object only; this method does not write it back.

    Args:
        run_id: ID of the run to fetch.
        actor: User on whose behalf the run is looked up.

    Returns:
        The run, with ``status`` overridden by the Lettuce status when one
        of the known values ("RUNNING", "COMPLETED", "FAILED", "CANCELLED")
        was reported. Any other reported value leaves the stored status
        untouched.

    Raises:
        Whatever ``get_run_by_id`` raises propagates unchanged. Failures
        while talking to Lettuce are logged and swallowed (best effort).
    """
    run = await self.get_run_by_id(run_id=run_id, actor=actor)

    use_lettuce = run.metadata and run.metadata.get("lettuce")
    if use_lettuce and run.status not in [RunStatus.completed, RunStatus.failed, RunStatus.cancelled]:
        try:
            # Import from the same module path the router used before this
            # logic moved here. A wrong path would raise ImportError inside
            # this try block and be swallowed below, silently disabling the
            # status refresh on every call.
            # NOTE(review): presumably imported locally to avoid an import
            # cycle or a hard dependency at module load -- confirm.
            from letta.services.lettuce import LettuceClient

            lettuce_client = await LettuceClient.create()
            status = await lettuce_client.get_status(run_id=run_id)

            # Map the status to our enum
            if status == "RUNNING":
                run.status = RunStatus.running
            elif status == "COMPLETED":
                run.status = RunStatus.completed
            elif status == "FAILED":
                run.status = RunStatus.failed
            elif status == "CANCELLED":
                run.status = RunStatus.cancelled
        except Exception as e:
            logger.error(f"Failed to get status from Lettuce for run {run_id}: {str(e)}")
            # Return run with current status from DB if Lettuce fails

    return run
@enforce_types @enforce_types
async def list_runs( async def list_runs(
self, self,

View File

@@ -8,6 +8,8 @@ receivers:
filelog: filelog:
include: include:
- /root/.letta/logs/Letta.log - /root/.letta/logs/Letta.log
multiline:
line_start_pattern: '^[\{\[]|^[0-9]{4}-[0-9]{2}-[0-9]{2}'
operators: operators:
# Parse JSON logs (skip non-JSON lines) # Parse JSON logs (skip non-JSON lines)
- type: json_parser - type: json_parser
@@ -19,8 +21,14 @@ receivers:
layout_type: gotime layout_type: gotime
layout: '2006-01-02T15:04:05.999999Z07:00' layout: '2006-01-02T15:04:05.999999Z07:00'
on_error: send on_error: send
if: 'attributes.timestamp != nil'
processors: processors:
resource:
attributes:
- key: environment
value: ${env:LETTA_ENVIRONMENT}
action: upsert
memory_limiter: memory_limiter:
check_interval: 1s check_interval: 1s
limit_mib: 1024 limit_mib: 1024
@@ -64,7 +72,7 @@ service:
exporters: [clickhouse] exporters: [clickhouse]
logs: logs:
receivers: [filelog] receivers: [filelog]
processors: [memory_limiter, batch] processors: [resource, memory_limiter, batch]
exporters: [clickhouse] exporters: [clickhouse]
metrics: metrics:
receivers: [otlp] receivers: [otlp]

View File

@@ -8,6 +8,8 @@ receivers:
filelog: filelog:
include: include:
- /root/.letta/logs/Letta.log - /root/.letta/logs/Letta.log
multiline:
line_start_pattern: '^[\{\[]|^[0-9]{4}-[0-9]{2}-[0-9]{2}'
operators: operators:
# Parse JSON logs (skip non-JSON lines) # Parse JSON logs (skip non-JSON lines)
- type: json_parser - type: json_parser
@@ -19,8 +21,14 @@ receivers:
layout_type: gotime layout_type: gotime
layout: '2006-01-02T15:04:05.999999Z07:00' layout: '2006-01-02T15:04:05.999999Z07:00'
on_error: send on_error: send
if: 'attributes.timestamp != nil'
processors: processors:
resource:
attributes:
- key: environment
value: ${env:LETTA_ENVIRONMENT}
action: upsert
memory_limiter: memory_limiter:
check_interval: 1s check_interval: 1s
limit_mib: 1024 limit_mib: 1024
@@ -65,7 +73,7 @@ service:
exporters: [clickhouse] exporters: [clickhouse]
logs: logs:
receivers: [filelog] receivers: [filelog]
processors: [memory_limiter, batch] processors: [resource, memory_limiter, batch]
exporters: [clickhouse] exporters: [clickhouse]
metrics: metrics:
receivers: [otlp] receivers: [otlp]