feat: add metadata-only provider trace storage option (#9155)

* feat: add metadata-only provider trace storage option

Add support for writing provider traces to a lightweight metadata-only
table (~1.5GB) instead of the full table (~725GB) since request/response
JSON is now stored in GCS.

- Add `LETTA_TELEMETRY_PROVIDER_TRACE_PG_METADATA_ONLY` setting
- Create `provider_trace_metadata` table via alembic migration
- Conditionally write to new table when flag is enabled
- Include backfill script for migrating existing data

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* chore: regenerate API spec and SDK

* fix: use composite PK (created_at, id) for provider_trace_metadata

Aligns with GCS partitioning structure (raw/date=YYYY-MM-DD/{id}.json.gz)
and enables efficient date-range queries via the B-tree index.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* amendments

* fix: add bulk data copy to migration

Copy existing provider_traces metadata in-migration instead of separate
backfill script. Creates indexes after bulk insert for better performance.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: remove data copy from migration, create empty table only

Old data stays in provider_traces, new writes go to provider_trace_metadata
when flag is enabled. Full traces are in GCS anyway.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: address PR comments

- Remove GCS mention from ProviderTraceMetadata docstring
- Move metadata object creation outside session context

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: reads always use full provider_traces table

The metadata_only flag should only control writes. Reads always go to
the full table to avoid returning ProviderTraceMetadata where
ProviderTrace is expected.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* feat: enable metadata-only provider trace writes in prod

Add LETTA_TELEMETRY_PROVIDER_TRACE_PG_METADATA_ONLY=true to all
Helm values (memgpt-server and lettuce-py, prod and dev).

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Kian Jones
2026-01-28 15:56:12 -08:00
committed by Caren Thomas
parent 69cad47e6a
commit c1a02fa180
6 changed files with 189 additions and 1 deletions

View File

@@ -0,0 +1,59 @@
"""create provider_trace_metadata table
Revision ID: a1b2c3d4e5f8
Revises: 9275f62ad282
Create Date: 2026-01-28
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from letta.settings import settings
# Alembic revision identifiers: this revision and the revision it follows.
revision: str = "a1b2c3d4e5f8"
down_revision: Union[str, None] = "9275f62ad282"
# This revision carries no branch label and declares no extra dependencies.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the empty ``provider_trace_metadata`` table and its two indexes.

    The migration is a no-op when ``settings.letta_pg_uri_no_default`` is
    falsy. No rows are copied into the new table.
    """
    if not settings.letta_pg_uri_no_default:
        return

    table_name = "provider_trace_metadata"

    # Column order is kept exactly as declared so the resulting table layout
    # does not change.
    columns = (
        sa.Column("id", sa.String(), nullable=False),
        sa.Column("step_id", sa.String(), nullable=True),
        sa.Column("agent_id", sa.String(), nullable=True),
        sa.Column("agent_tags", sa.JSON(), nullable=True),
        sa.Column("call_type", sa.String(), nullable=True),
        sa.Column("run_id", sa.String(), nullable=True),
        sa.Column("source", sa.String(), nullable=True),
        sa.Column("org_id", sa.String(), nullable=True),
        sa.Column("user_id", sa.String(), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=True),
        sa.Column("is_deleted", sa.Boolean(), server_default=sa.text("FALSE"), nullable=False),
        sa.Column("_created_by_id", sa.String(), nullable=True),
        sa.Column("_last_updated_by_id", sa.String(), nullable=True),
        sa.Column("organization_id", sa.String(), nullable=False),
    )
    constraints = (
        sa.ForeignKeyConstraint(["organization_id"], ["organizations.id"]),
        # Composite primary key: timestamp first, then the trace id.
        sa.PrimaryKeyConstraint("created_at", "id"),
    )
    op.create_table(table_name, *columns, *constraints)

    # (index name, indexed column, unique?) -- created in this order.
    index_specs = (
        ("ix_provider_trace_metadata_step_id", "step_id", False),
        ("ix_provider_trace_metadata_id", "id", True),
    )
    for index_name, column_name, is_unique in index_specs:
        op.create_index(index_name, table_name, [column_name], unique=is_unique)
def downgrade() -> None:
    """Drop the ``provider_trace_metadata`` indexes and then the table itself.

    The migration is a no-op when ``settings.letta_pg_uri_no_default`` is
    falsy.
    """
    if not settings.letta_pg_uri_no_default:
        return

    table_name = "provider_trace_metadata"

    # Indexes are removed in the reverse of their creation order.
    for index_name in ("ix_provider_trace_metadata_id", "ix_provider_trace_metadata_step_id"):
        op.drop_index(index_name, table_name=table_name)
    op.drop_table(table_name)

View File

@@ -31,6 +31,7 @@ from letta.orm.prompt import Prompt
from letta.orm.provider import Provider
from letta.orm.provider_model import ProviderModel
from letta.orm.provider_trace import ProviderTrace
from letta.orm.provider_trace_metadata import ProviderTraceMetadata
from letta.orm.run import Run
from letta.orm.run_metrics import RunMetrics
from letta.orm.sandbox_config import AgentEnvironmentVariable, SandboxConfig, SandboxEnvironmentVariable

View File

@@ -0,0 +1,45 @@
import uuid
from datetime import datetime
from typing import Optional
from sqlalchemy import JSON, DateTime, Index, String, UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column, relationship
from letta.orm.mixins import OrganizationMixin
from letta.orm.sqlalchemy_base import SqlalchemyBase
from letta.schemas.provider_trace import ProviderTraceMetadata as PydanticProviderTraceMetadata
class ProviderTraceMetadata(SqlalchemyBase, OrganizationMixin):
    """Metadata-only provider trace storage (no request/response JSON).

    Rows are keyed by the composite primary key ``(created_at, id)``; ``id``
    is additionally unique on its own.
    """

    __tablename__ = "provider_trace_metadata"
    __pydantic_model__ = PydanticProviderTraceMetadata
    __table_args__ = (
        Index("ix_provider_trace_metadata_step_id", "step_id"),
        # Declared as a unique index (not a separately named UniqueConstraint)
        # so the model matches the ``ix_provider_trace_metadata_id`` unique
        # index that the table's migration creates; otherwise the ORM metadata
        # and the migrated schema drift apart.
        Index("ix_provider_trace_metadata_id", "id", unique=True),
    )

    # Composite primary key: creation timestamp first, then the trace id.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), primary_key=True, server_default=func.now(), doc="Timestamp when the trace was created"
    )
    id: Mapped[str] = mapped_column(
        String, primary_key=True, doc="Unique provider trace identifier", default=lambda: f"provider_trace-{uuid.uuid4()}"
    )
    step_id: Mapped[Optional[str]] = mapped_column(String, nullable=True, doc="ID of the step that this trace is associated with")

    # Telemetry context fields
    agent_id: Mapped[Optional[str]] = mapped_column(String, nullable=True, doc="ID of the agent that generated this trace")
    agent_tags: Mapped[Optional[list]] = mapped_column(JSON, nullable=True, doc="Tags associated with the agent for filtering")
    call_type: Mapped[Optional[str]] = mapped_column(String, nullable=True, doc="Type of call (agent_step, summarization, etc.)")
    run_id: Mapped[Optional[str]] = mapped_column(String, nullable=True, doc="ID of the run this trace is associated with")
    source: Mapped[Optional[str]] = mapped_column(
        String, nullable=True, doc="Source service that generated this trace (memgpt-server, lettuce-py)"
    )

    # v2 protocol fields
    org_id: Mapped[Optional[str]] = mapped_column(String, nullable=True, doc="ID of the organization")
    user_id: Mapped[Optional[str]] = mapped_column(String, nullable=True, doc="ID of the user who initiated the request")

    # Relationships
    organization: Mapped["Organization"] = relationship("Organization", lazy="selectin")

View File

@@ -54,3 +54,23 @@ class ProviderTrace(BaseProviderTrace):
llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)")
created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")
class ProviderTraceMetadata(BaseProviderTrace):
    """Provider trace reduced to its metadata: identifiers and telemetry
    context only, without the request/response JSON payloads."""

    # Identity and step linkage.
    id: str = BaseProviderTrace.generate_id_field()
    step_id: Optional[str] = Field(None, description="ID of the step that this trace is associated with")

    # Telemetry context fields
    agent_id: Optional[str] = Field(None, description="ID of the agent that generated this trace")
    agent_tags: Optional[list[str]] = Field(None, description="Tags associated with the agent for filtering")
    call_type: Optional[str] = Field(None, description="Type of call (agent_step, summarization, etc.)")
    run_id: Optional[str] = Field(None, description="ID of the run this trace is associated with")
    source: Optional[str] = Field(None, description="Source service that generated this trace (memgpt-server, lettuce-py)")

    # v2 protocol fields
    org_id: Optional[str] = Field(None, description="ID of the organization")
    user_id: Optional[str] = Field(None, description="ID of the user who initiated the request")

    # Filled in via get_utc_time when the caller does not supply a value.
    created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")

View File

@@ -2,10 +2,12 @@
from letta.helpers.json_helpers import json_dumps, json_loads
from letta.orm.provider_trace import ProviderTrace as ProviderTraceModel
from letta.schemas.provider_trace import ProviderTrace
from letta.orm.provider_trace_metadata import ProviderTraceMetadata as ProviderTraceMetadataModel
from letta.schemas.provider_trace import ProviderTrace, ProviderTraceMetadata
from letta.schemas.user import User
from letta.server.db import db_registry
from letta.services.provider_trace_backends.base import ProviderTraceBackendClient
from letta.settings import telemetry_settings
class PostgresProviderTraceBackend(ProviderTraceBackendClient):
@@ -15,7 +17,17 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
self,
actor: User,
provider_trace: ProviderTrace,
) -> ProviderTrace | ProviderTraceMetadata:
if telemetry_settings.provider_trace_pg_metadata_only:
return await self._create_metadata_only_async(actor, provider_trace)
return await self._create_full_async(actor, provider_trace)
async def _create_full_async(
self,
actor: User,
provider_trace: ProviderTrace,
) -> ProviderTrace:
"""Write full provider trace to provider_traces table."""
async with db_registry.async_session() as session:
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump())
provider_trace_model.organization_id = actor.organization_id
@@ -31,11 +43,44 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
await provider_trace_model.create_async(session, actor=actor, no_commit=True, no_refresh=True)
return provider_trace_model.to_pydantic()
async def _create_metadata_only_async(
    self,
    actor: User,
    provider_trace: ProviderTrace,
) -> ProviderTraceMetadata:
    """Write a metadata-only trace to the provider_trace_metadata table.

    Only identifiers and telemetry context are copied from ``provider_trace``;
    the request/response payloads are not written.

    Args:
        actor: User performing the write; its ``organization_id`` is stamped
            on the stored row.
        provider_trace: Full trace whose metadata fields are persisted.

    Returns:
        The Pydantic form of the stored metadata row.
    """
    # The schema/ORM objects are built before the session is opened.
    metadata = ProviderTraceMetadata(
        id=provider_trace.id,
        step_id=provider_trace.step_id,
        agent_id=provider_trace.agent_id,
        agent_tags=provider_trace.agent_tags,
        call_type=provider_trace.call_type,
        run_id=provider_trace.run_id,
        source=provider_trace.source,
        org_id=provider_trace.org_id,
        user_id=provider_trace.user_id,
        # Carry over the trace's own timestamp instead of letting the schema
        # default generate a new one: created_at is part of the composite
        # primary key, and the full-table write path keeps it as well.
        created_at=provider_trace.created_at,
    )
    metadata_model = ProviderTraceMetadataModel(**metadata.model_dump())
    metadata_model.organization_id = actor.organization_id

    async with db_registry.async_session() as session:
        await metadata_model.create_async(session, actor=actor, no_commit=True, no_refresh=True)
        return metadata_model.to_pydantic()
async def get_by_step_id_async(
    self,
    step_id: str,
    actor: User,
) -> ProviderTrace | None:
    """Look up the full provider trace for a step.

    Lookups always go to the provider_traces table; the metadata-only write
    flag is not consulted here.
    """
    full_trace = await self._get_full_by_step_id_async(step_id, actor)
    return full_trace
async def _get_full_by_step_id_async(
self,
step_id: str,
actor: User,
) -> ProviderTrace | None:
"""Read from provider_traces table."""
async with db_registry.async_session() as session:
provider_trace_model = await ProviderTraceModel.read_async(
db_session=session,
@@ -43,3 +88,17 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
actor=actor,
)
return provider_trace_model.to_pydantic() if provider_trace_model else None
async def _get_metadata_by_step_id_async(
    self,
    step_id: str,
    actor: User,
) -> ProviderTraceMetadata | None:
    """Fetch the metadata-only trace row for ``step_id``.

    Returns the Pydantic form of the row read from the
    provider_trace_metadata table, or ``None`` when the lookup yields
    nothing.
    """
    async with db_registry.async_session() as session:
        row = await ProviderTraceMetadataModel.read_async(
            db_session=session,
            step_id=step_id,
            actor=actor,
        )
        if not row:
            return None
        return row.to_pydantic()

View File

@@ -519,6 +519,10 @@ class TelemetrySettings(BaseSettings):
default=None,
description="Source identifier for telemetry (memgpt-server, lettuce-py, etc.).",
)
# Write-path switch for the Postgres provider-trace backend: when true, new
# traces are stored in provider_trace_metadata instead of provider_traces.
# Reads still go to the full provider_traces table.
provider_trace_pg_metadata_only: bool = Field(
    default=False,
    description="Write only metadata to Postgres (no request/response JSON). Requires provider_trace_metadata table to exist.",
)
@property
def provider_trace_backends(self) -> list[str]: