From 0198201bbecc17f32d1be7c75458d57fb5b4aab4 Mon Sep 17 00:00:00 2001 From: cthomas Date: Fri, 14 Mar 2025 16:04:03 -0700 Subject: [PATCH] feat: bake otel collector into letta image (#1292) --- Dockerfile | 19 +++++-- compose.tracing.yaml | 19 ------- compose.yaml | 4 ++ letta/server/startup.sh | 20 ++++++++ letta/services/agent_manager.py | 3 -- letta/settings.py | 1 + letta/tracing.py | 2 +- otel-collector-config-clickhouse.yaml | 73 +++++++++++++++++++++++++++ otel-collector-config-file.yaml | 27 ++++++++++ otel-collector-config.yaml | 32 ------------ 10 files changed, 141 insertions(+), 59 deletions(-) delete mode 100644 compose.tracing.yaml create mode 100644 otel-collector-config-clickhouse.yaml create mode 100644 otel-collector-config-file.yaml delete mode 100644 otel-collector-config.yaml diff --git a/Dockerfile b/Dockerfile index c1abecc4..0e99ff19 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,12 +40,22 @@ RUN poetry lock --no-update && \ # Runtime stage FROM ankane/pgvector:v0.5.1 AS runtime -# Install Python packages +# Install Python packages and OpenTelemetry Collector RUN apt-get update && apt-get install -y \ python3 \ python3-venv \ + curl \ && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /app + && mkdir -p /app \ + # Install OpenTelemetry Collector + && curl -L https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.96.0/otelcol-contrib_0.96.0_linux_amd64.tar.gz -o /tmp/otel-collector.tar.gz \ + && tar xzf /tmp/otel-collector.tar.gz -C /usr/local/bin \ + && rm /tmp/otel-collector.tar.gz \ + && mkdir -p /etc/otel + +# Add OpenTelemetry Collector configs +COPY otel-collector-config-file.yaml /etc/otel/config-file.yaml +COPY otel-collector-config-clickhouse.yaml /etc/otel/config-clickhouse.yaml ARG LETTA_ENVIRONMENT=PRODUCTION ENV LETTA_ENVIRONMENT=${LETTA_ENVIRONMENT} \ @@ -54,7 +64,8 @@ ENV LETTA_ENVIRONMENT=${LETTA_ENVIRONMENT} \ POSTGRES_USER=letta \ POSTGRES_PASSWORD=letta \ POSTGRES_DB=letta \ - 
COMPOSIO_DISABLE_VERSION_CHECK=true + COMPOSIO_DISABLE_VERSION_CHECK=true \ + OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 WORKDIR /app @@ -64,7 +75,7 @@ COPY --from=builder /app . # Copy initialization SQL if it exists COPY init.sql /docker-entrypoint-initdb.d/ -EXPOSE 8283 5432 +EXPOSE 8283 5432 4317 4318 ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["./letta/server/startup.sh"] diff --git a/compose.tracing.yaml b/compose.tracing.yaml deleted file mode 100644 index 169ab517..00000000 --- a/compose.tracing.yaml +++ /dev/null @@ -1,19 +0,0 @@ -services: - letta_server: - environment: - - ENV_NAME=${ENV_NAME} # optional service name - - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 - - otel-collector: - image: otel/opentelemetry-collector-contrib:0.92.0 - command: ["--config=/etc/otel-collector-config.yaml"] - volumes: - - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml - environment: - - CLICKHOUSE_ENDPOINT=${CLICKHOUSE_ENDPOINT} - - CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE} - - CLICKHOUSE_USER=${CLICKHOUSE_USER} - - CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD} - ports: - - "4317:4317" - - "4318:4318" diff --git a/compose.yaml b/compose.yaml index f6d13abc..d7ce6e6d 100644 --- a/compose.yaml +++ b/compose.yaml @@ -49,6 +49,10 @@ services: - VLLM_API_BASE=${VLLM_API_BASE} - OPENLLM_AUTH_TYPE=${OPENLLM_AUTH_TYPE} - OPENLLM_API_KEY=${OPENLLM_API_KEY} + - CLICKHOUSE_ENDPOINT=${CLICKHOUSE_ENDPOINT} + - CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE} + - CLICKHOUSE_USERNAME=${CLICKHOUSE_USERNAME} + - CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD} # volumes: # - ./configs/server_config.yaml:/root/.letta/config # config file # - ~/.letta/credentials:/root/.letta/credentials # credentials file diff --git a/letta/server/startup.sh b/letta/server/startup.sh index 44b790f1..60f427ac 100755 --- a/letta/server/startup.sh +++ b/letta/server/startup.sh @@ -53,6 +53,26 @@ if [ "${SECURE:-false}" = "true" ]; then CMD="$CMD --secure" fi +# Start 
OpenTelemetry Collector in the background +if [ -n "$CLICKHOUSE_ENDPOINT" ] && [ -n "$CLICKHOUSE_PASSWORD" ]; then + echo "Starting OpenTelemetry Collector with Clickhouse export..." + CONFIG_FILE="/etc/otel/config-clickhouse.yaml" +else + echo "Starting OpenTelemetry Collector with file export only..." + CONFIG_FILE="/etc/otel/config-file.yaml" +fi + +/usr/local/bin/otelcol-contrib --config "$CONFIG_FILE" & +OTEL_PID=$! + +# Function to cleanup processes on exit +cleanup() { + echo "Shutting down..." + kill $OTEL_PID + wait $OTEL_PID +} +trap cleanup EXIT + echo "Starting Letta server at http://$HOST:$PORT..." echo "Executing: $CMD" exec $CMD diff --git a/letta/services/agent_manager.py b/letta/services/agent_manager.py index d751544c..6bf4a5cf 100644 --- a/letta/services/agent_manager.py +++ b/letta/services/agent_manager.py @@ -59,7 +59,6 @@ from letta.services.passage_manager import PassageManager from letta.services.source_manager import SourceManager from letta.services.tool_manager import ToolManager from letta.settings import settings -from letta.tracing import trace_method from letta.utils import enforce_types, united_diff logger = get_logger(__name__) @@ -83,7 +82,6 @@ class AgentManager: # ====================================================================================================================== # Basic CRUD operations # ====================================================================================================================== - @trace_method @enforce_types def create_agent( self, @@ -446,7 +444,6 @@ class AgentManager: agent = AgentModel.read(db_session=session, name=agent_name, actor=actor) return agent.to_pydantic() - @trace_method @enforce_types def delete_agent(self, agent_id: str, actor: PydanticUser) -> None: """ diff --git a/letta/settings.py b/letta/settings.py index 2aad93ba..bb8eae16 100644 --- a/letta/settings.py +++ b/letta/settings.py @@ -173,6 +173,7 @@ class Settings(BaseSettings): # telemetry logging 
verbose_telemetry_logging: bool = False + otel_exporter_otlp_endpoint: str = "http://localhost:4317" # uvicorn settings uvicorn_workers: int = 1 diff --git a/letta/tracing.py b/letta/tracing.py index 6971cad1..2275759c 100644 --- a/letta/tracing.py +++ b/letta/tracing.py @@ -207,7 +207,7 @@ def log_event(name: str, attributes: Optional[Dict[str, Any]] = None, timestamp: current_span = trace.get_current_span() if current_span: if timestamp is None: - timestamp = int(time.perf_counter_ns()) + timestamp = time.time_ns() def _safe_convert(v): if isinstance(v, (str, bool, int, float)): diff --git a/otel-collector-config-clickhouse.yaml b/otel-collector-config-clickhouse.yaml new file mode 100644 index 00000000..c18a1843 --- /dev/null +++ b/otel-collector-config-clickhouse.yaml @@ -0,0 +1,73 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + filelog: + include: + - /root/.letta/logs/Letta.log + multiline: + line_start_pattern: ^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} + operators: + # Extract timestamp and other fields + - type: regex_parser + regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3})\s+-\s+(?P<logger>[\w\.-]+)\s+-\s+(?P<severity>\w+)\s+-\s+(?P<body>.*)$' + # Parse the timestamp + - type: time_parser + parse_from: attributes.timestamp + layout: '%Y-%m-%d %H:%M:%S,%L' + # Set severity + - type: severity_parser + parse_from: attributes.severity + mapping: + debug: DEBUG + info: INFO + warning: WARN + error: ERROR + critical: FATAL + # Add resource attributes + - type: add + field: resource.service_name + value: letta-server + - type: add + field: resource.environment + value: ${ENV_NAME} + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + +exporters: + file: + path: /root/.letta/logs/traces.json + rotation: + max_megabytes: 100 + max_days: 7 + max_backups: 5 + clickhouse: + endpoint: ${CLICKHOUSE_ENDPOINT} + database: ${CLICKHOUSE_DATABASE} + username: ${CLICKHOUSE_USERNAME} + password: ${CLICKHOUSE_PASSWORD} + 
timeout: 5s + sending_queue: + queue_size: 100 + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [file, clickhouse] + logs: + receivers: [filelog] + processors: [batch] + exporters: [clickhouse] diff --git a/otel-collector-config-file.yaml b/otel-collector-config-file.yaml new file mode 100644 index 00000000..2552c0cc --- /dev/null +++ b/otel-collector-config-file.yaml @@ -0,0 +1,27 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + +exporters: + file: + path: /root/.letta/logs/traces.json + rotation: + max_megabytes: 100 + max_days: 7 + max_backups: 5 + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [file] diff --git a/otel-collector-config.yaml b/otel-collector-config.yaml deleted file mode 100644 index d13164ea..00000000 --- a/otel-collector-config.yaml +++ /dev/null @@ -1,32 +0,0 @@ -receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - -processors: - batch: - timeout: 1s - send_batch_size: 1024 - -exporters: - clickhouse: - endpoint: ${CLICKHOUSE_ENDPOINT} - username: ${CLICKHOUSE_USER} - password: ${CLICKHOUSE_PASSWORD} - database: ${CLICKHOUSE_DATABASE} - timeout: 10s - retry_on_failure: - enabled: true - initial_interval: 5s - max_interval: 30s - max_elapsed_time: 300s - -service: - pipelines: - traces: - receivers: [otlp] - processors: [batch] - exporters: [clickhouse]