* feat: add provider trace backend abstraction for multi-backend telemetry Introduces a pluggable backend system for provider traces: - Base class with async/sync create and read interfaces - PostgreSQL backend (existing behavior) - ClickHouse backend (via OTEL instrumentation) - Socket backend (writes to Unix socket for crouton sidecar) - Factory for instantiating backends from config Refactors TelemetryManager to use backends with support for: - Multi-backend writes (concurrent via asyncio.gather) - Primary backend for reads (first in config list) - Graceful error handling per backend Config: LETTA_TELEMETRY_PROVIDER_TRACE_BACKEND (comma-separated) Example: "postgres,socket" for dual-write to Postgres and crouton 🐙 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * feat: add protocol version to socket backend records Adds PROTOCOL_VERSION constant to socket backend: - Included in every telemetry record sent to crouton - Must match ProtocolVersion in apps/crouton/main.go - Enables crouton to detect and reject incompatible messages 🐙 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * fix: remove organization_id from ProviderTraceCreate calls The organization_id is now handled via the actor parameter in the telemetry manager, not through ProviderTraceCreate schema. This fixes validation errors after changing ProviderTraceCreate to inherit from BaseProviderTrace which forbids extra fields. 
🐙 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * consolidate provider trace * add clickhouse-connect to fix bug on main lmao * auto generated sdk changes, and deployment details, and clickhouse prefix bug and added fields to runs trace return api * auto generated sdk changes, and deployment details, and clickhouse prefix bug and added fields to runs trace return api * consolidate provider trace * consolidate provider trace bug fix --------- Co-authored-by: Letta <noreply@letta.com>
108 lines
3.8 KiB
Python
108 lines
3.8 KiB
Python
"""Unix socket provider trace backend."""
|
|
|
|
import json
|
|
import os
|
|
import socket as socket_module
|
|
import threading
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
from letta.log import get_logger
|
|
from letta.schemas.provider_trace import ProviderTrace
|
|
from letta.schemas.user import User
|
|
from letta.services.provider_trace_backends.base import ProviderTraceBackendClient
|
|
|
|
# Module-level logger, named after this module.
logger = get_logger(__name__)

# Protocol version for crouton communication.
# It is written into every telemetry record under the "protocol_version" key.
# Bump this when making breaking changes to the record schema.
# Must match ProtocolVersion in apps/crouton/main.go.
PROTOCOL_VERSION = 1
|
|
|
|
|
|
class SocketProviderTraceBackend(ProviderTraceBackendClient):
    """
    Store provider traces via Unix socket.

    Sends NDJSON telemetry records (one JSON object per line) to a Unix
    socket. The receiving service (sidecar) is responsible for storage
    (e.g., GCS, S3, local filesystem).

    This is a write-only backend - reads are not supported.

    Delivery is best-effort: each record is sent from its own daemon thread,
    and any failure is logged as a warning rather than raised to the caller.
    """

    def __init__(self, socket_path: str = "/var/run/telemetry/telemetry.sock"):
        """
        Args:
            socket_path: Filesystem path of the Unix socket to connect to
                for every record.
        """
        self.socket_path = socket_path

    async def create_async(
        self,
        actor: User,
        provider_trace: ProviderTrace,
    ) -> ProviderTrace | None:
        """
        Queue the trace for delivery and return a trace with the same ID.

        Args:
            actor: Not used by this backend.
            provider_trace: The trace to send.

        Returns:
            A new ProviderTrace built from the input's ``id``, ``step_id``,
            ``request_json`` and ``response_json`` (missing JSON payloads
            become ``{}``). Other fields of the input are not copied.
        """
        self._send_to_crouton(provider_trace)

        # Return a ProviderTrace with the same ID for consistency across backends
        return ProviderTrace(
            id=provider_trace.id,
            step_id=provider_trace.step_id,
            request_json=provider_trace.request_json or {},
            response_json=provider_trace.response_json or {},
        )

    def create_sync(
        self,
        actor: User,
        provider_trace: ProviderTrace,
    ) -> ProviderTrace | None:
        """
        Queue the trace for delivery.

        Args:
            actor: Not used by this backend.
            provider_trace: The trace to send.

        Returns:
            Always ``None``.
        """
        self._send_to_crouton(provider_trace)
        return None

    async def get_by_step_id_async(
        self,
        step_id: str,
        actor: User,
    ) -> ProviderTrace | None:
        """
        Reads are unsupported: log a warning and return ``None``.

        Args:
            step_id: Not used by this backend.
            actor: Not used by this backend.
        """
        # Socket backend is write-only - reads should go through the storage backend directly.
        logger.warning("Socket backend does not support reads")
        return None

    def _send_to_crouton(self, provider_trace: ProviderTrace) -> None:
        """
        Build telemetry record and send to Crouton sidecar (fire-and-forget).

        The record carries either the response or the error, not both: when
        ``response_json["error"]`` is a dict with a truthy ``"message"``,
        that message is sent as ``"error"`` and ``"response"`` is ``None``.
        In every other case ``"error"`` is ``None`` and the full response is
        sent.
        """
        response = provider_trace.response_json or {}
        request = provider_trace.request_json or {}

        # Extract error if present (only dict-shaped errors are recognised).
        error_info = response.get("error")
        error = error_info.get("message") if isinstance(error_info, dict) else None

        record = {
            "protocol_version": PROTOCOL_VERSION,
            "provider_trace_id": provider_trace.id,
            "agent_id": provider_trace.agent_id,
            "run_id": provider_trace.run_id,
            "step_id": provider_trace.step_id,
            "tags": provider_trace.agent_tags or [],
            "type": provider_trace.call_type or "agent_step",
            "request": request,
            "response": response if not error else None,
            "error": error,
            # Time the record was built, not the time of the provider call.
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

        # Fire-and-forget in background thread so the caller never blocks on
        # socket I/O; daemon=True keeps a pending send from delaying exit.
        thread = threading.Thread(target=self._send_async, args=(record,), daemon=True)
        thread.start()

    def _send_async(self, record: dict[str, Any]) -> None:
        """
        Send record to Unix socket (runs in background thread).

        Despite the name this is a plain blocking function; "async" refers to
        it running off the caller's thread. It never raises: every failure is
        logged as a warning, because there is nobody to propagate to.

        Args:
            record: JSON-serialisable mapping; values that ``json`` cannot
                encode natively are stringified via ``default=str``.
        """
        try:
            with socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) as sock:
                # Applies to both connect() and sendall() below.
                sock.settimeout(5.0)
                # Connect first and let a missing socket surface as
                # FileNotFoundError. A separate exists() pre-check is racy
                # (the path can vanish in between) and misreports
                # permission problems as "not found".
                sock.connect(self.socket_path)
                # Serialise only after connecting, so nothing is encoded
                # when the sidecar is absent. Trailing newline = NDJSON.
                payload = json.dumps(record, default=str) + "\n"
                sock.sendall(payload.encode())
        except FileNotFoundError:
            logger.warning(f"Crouton socket not found at {self.socket_path}")
        except Exception as e:
            # Thread boundary: swallow and log so telemetry can never crash
            # the process.
            logger.warning(f"Failed to send telemetry to Crouton: {e}")
|