Files
letta-server/letta/agents/base_agent.py
cthomas 416ffc7cd7 Add billing context to LLM telemetry traces (#9745)
* feat: add billing context to LLM telemetry traces

Add billing metadata (plan type, cost source, customer ID) to LLM traces in ClickHouse for cost analytics and attribution.

**Data Flow:**
- Cloud-API: Extract billing info from subscription in rate limiting, set x-billing-* headers
- Core: Parse headers into BillingContext object via dependencies
- Adapters: Flow billing_context through all LLM adapters (blocking & streaming)
- Agent: Pass billing_context to step() and stream() methods
- ClickHouse: Store in billing_plan_type, billing_cost_source, billing_customer_id columns

**Changes:**
- Add BillingContext schema to provider_trace.py
- Add billing columns to llm_traces ClickHouse table DDL
- Update getCustomerSubscription to fetch stripeCustomerId from organization_billing_details
- Propagate billing_context through agent step flow, adapters, and streaming service
- Update ProviderTrace and LLMTrace to include billing metadata
- Regenerate SDK with autogen

**Production Deployment:**
Requires env vars: LETTA_PROVIDER_TRACE_BACKEND=clickhouse, LETTA_STORE_LLM_TRACES=true, CLICKHOUSE_*

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: add billing_context parameter to agent step methods

- Add billing_context to BaseAgent and BaseAgentV2 abstract methods
- Update LettaAgent, LettaAgentV2, LettaAgentV3 step methods
- Update multi-agent groups: SleeptimeMultiAgentV2, V3, V4
- Fix test_utils.py to include billing header parameters
- Import BillingContext in all affected files

* fix: add billing_context to stream methods

- Add billing_context parameter to BaseAgentV2.stream()
- Add billing_context parameter to LettaAgentV2.stream()
- LettaAgentV3.stream() already has it from previous commit

* fix: exclude billing headers from OpenAPI spec

Mark billing headers as internal (include_in_schema=False) so they don't appear in the public API.
These are internal headers between cloud-api and core, not part of the public SDK.

Regenerated SDK with stage-api - removes 10,650 lines of bloat that was causing OOM during Next.js build.

* refactor: return billing context from handleUnifiedRateLimiting instead of mutating req

Instead of passing req into handleUnifiedRateLimiting and mutating headers inside it:
- Return billing context fields (billingPlanType, billingCostSource, billingCustomerId) from handleUnifiedRateLimiting
- Set headers in handleMessageRateLimiting (middleware layer) after getting the result
- This fixes step-orchestrator compatibility since it doesn't have a real Express req object

* chore: remove extra gencode

* p

---------

Co-authored-by: Letta <noreply@letta.com>
2026-03-03 18:34:13 -08:00

192 lines
7.9 KiB
Python

from abc import ABC, abstractmethod
from typing import Any, AsyncGenerator, List, Optional, Union
import openai
from letta.constants import DEFAULT_MAX_STEPS
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time
from letta.log import get_logger
from letta.prompts.prompt_generator import PromptGenerator
from letta.schemas.agent import AgentState
from letta.schemas.enums import MessageStreamStatus
from letta.schemas.letta_message import LegacyLettaMessage, LettaMessage
from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, MessageUpdate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
from letta.services.message_manager import MessageManager
from letta.services.passage_manager import PassageManager
from letta.utils import united_diff
# Module-level logger for shared helpers; each instance also creates a
# per-agent logger keyed by agent_id in __init__.
logger = get_logger(__name__)
class BaseAgent(ABC):
    """
    Abstract base class for AI agents, handling message management, tool execution,
    and context tracking.

    Subclasses implement the blocking (`step`) and streaming (`step_stream`)
    execution loops; this base provides shared helpers for input pre-processing,
    system-prompt/memory rebuilding, and stream-termination chunks.
    """

    def __init__(
        self,
        agent_id: str,
        # TODO: Make required once client refactor hits
        openai_client: Optional[openai.AsyncClient],
        message_manager: MessageManager,
        agent_manager: AgentManager,
        actor: User,
    ):
        self.agent_id = agent_id
        self.openai_client = openai_client
        self.message_manager = message_manager
        self.agent_manager = agent_manager
        # TODO: Pass this in
        self.passage_manager = PassageManager()
        self.actor = actor
        # Per-agent logger so log lines can be attributed to a specific agent.
        self.logger = get_logger(agent_id)

    @abstractmethod
    async def step(
        self,
        input_messages: List[MessageCreate],
        max_steps: int = DEFAULT_MAX_STEPS,
        run_id: Optional[str] = None,
        billing_context: "BillingContext | None" = None,
    ) -> LettaResponse:
        """
        Main execution loop for the agent.

        Args:
            input_messages: Messages that initiate this step.
            max_steps: Upper bound on internal agent steps before stopping.
            run_id: Optional identifier associating this step with a run.
            billing_context: Optional billing metadata propagated into LLM
                telemetry traces.

        Returns:
            The full response produced by the step loop.
        """
        raise NotImplementedError

    @abstractmethod
    async def step_stream(
        self,
        input_messages: List[MessageCreate],
        max_steps: int = DEFAULT_MAX_STEPS,
        billing_context: "BillingContext | None" = None,
    ) -> AsyncGenerator[Union[LettaMessage, LegacyLettaMessage, MessageStreamStatus], None]:
        """
        Main streaming execution loop for the agent.

        Args:
            input_messages: Messages that initiate this step.
            max_steps: Upper bound on internal agent steps before stopping.
            billing_context: Optional billing metadata propagated into LLM
                telemetry traces. Added with a None default for parity with
                `step`, so existing subclasses and callers are unaffected.

        Yields:
            Streamed messages, followed by a terminal stream status.
        """
        raise NotImplementedError

    @staticmethod
    def pre_process_input_message(input_messages: List[MessageCreate]) -> Any:
        """
        Pre-process function to run on the input_message.

        Flattens each MessageCreate into a plain ``{"role", "content"}`` dict.
        Content is taken verbatim when it is a string, from the single
        TextContent part when there is exactly one, and is "" otherwise.
        """

        def get_content(message: MessageCreate) -> str:
            if isinstance(message.content, str):
                return message.content
            elif message.content and len(message.content) == 1 and isinstance(message.content[0], TextContent):
                return message.content[0].text
            else:
                # Multi-part or non-text content is dropped here.
                return ""

        return [{"role": input_message.role.value, "content": get_content(input_message)} for input_message in input_messages]

    async def _rebuild_memory_async(
        self,
        in_context_messages: List[Message],
        agent_state: AgentState,
        tool_rules_solver: Optional[ToolRulesSolver] = None,
        num_messages: Optional[int] = None,  # storing these calculations is specific to the voice agent
        num_archival_memories: Optional[int] = None,
    ) -> List[Message]:
        """
        Async version of function above. For now before breaking up components, changes should be made in both places.

        Refreshes the agent's memory blocks and, if the compiled memory or system
        prompt drifted from the current system message, rewrites the system
        message in place (persisting via the message manager).

        Args:
            in_context_messages: Current context window; index 0 must be the
                system message.
            agent_state: State whose memory/sources are compiled into the prompt.
            tool_rules_solver: Optional solver whose tool-rule prompts are
                appended to the compiled memory.
            num_messages: Precomputed message count; fetched if None.
            num_archival_memories: Precomputed archival passage count; fetched if None.

        Returns:
            The (possibly updated) in-context message list.
        """
        try:
            # [DB Call] loading blocks (modifies: agent_state.memory.blocks)
            agent_state = await self.agent_manager.refresh_memory_async(agent_state=agent_state, actor=self.actor)

            tool_constraint_block = None
            if tool_rules_solver is not None:
                tool_constraint_block = tool_rules_solver.compile_tool_rule_prompts()

            # compile archive tags if there's an attached archive
            # NOTE: local import, presumably to avoid a circular dependency — keep it here.
            from letta.services.archive_manager import ArchiveManager

            archive_manager = ArchiveManager()
            archive = await archive_manager.get_default_archive_for_agent_async(
                agent_id=agent_state.id,
                actor=self.actor,
            )
            if archive:
                archive_tags = await self.passage_manager.get_unique_tags_for_archive_async(
                    archive_id=archive.id,
                    actor=self.actor,
                )
            else:
                archive_tags = None

            # TODO: This is a pretty brittle pattern established all over our code, need to get rid of this
            curr_system_message = in_context_messages[0]
            curr_system_message_text = curr_system_message.content[0].text

            # generate memory string with current state for comparison
            curr_memory_str = agent_state.memory.compile(
                tool_usage_rules=tool_constraint_block,
                sources=agent_state.sources,
                max_files_open=agent_state.max_files_open,
                llm_config=agent_state.llm_config,
            )

            # Substring checks: the system message embeds both the raw system
            # prompt and the compiled memory, so absence means drift.
            system_prompt_changed = agent_state.system not in curr_system_message_text
            memory_changed = curr_memory_str not in curr_system_message_text
            if (not system_prompt_changed) and (not memory_changed):
                logger.debug(
                    f"Memory and sources haven't changed for agent id={agent_state.id} and actor=({self.actor.id}, {self.actor.name}), skipping system prompt rebuild"
                )
                return in_context_messages

            memory_edit_timestamp = get_utc_time()

            # size of messages and archival memories
            if num_messages is None:
                num_messages = await self.message_manager.size_async(actor=self.actor, agent_id=agent_state.id)
            if num_archival_memories is None:
                num_archival_memories = await self.passage_manager.agent_passage_size_async(actor=self.actor, agent_id=agent_state.id)

            new_system_message_str = PromptGenerator.get_system_message_from_compiled_memory(
                system_prompt=agent_state.system,
                memory_with_sources=curr_memory_str,
                in_context_memory_last_edit=memory_edit_timestamp,
                timezone=agent_state.timezone,
                previous_message_count=num_messages - len(in_context_messages),
                archival_memory_size=num_archival_memories,
                archive_tags=archive_tags,
            )

            diff = united_diff(curr_system_message_text, new_system_message_str)
            if len(diff) > 0:
                logger.debug(f"Rebuilding system with new memory...\nDiff:\n{diff}")

                # [DB Call] Update Messages
                new_system_message = await self.message_manager.update_message_by_id_async(
                    curr_system_message.id,
                    message_update=MessageUpdate(content=new_system_message_str),
                    actor=self.actor,
                    project_id=agent_state.project_id,
                )
                return [new_system_message, *in_context_messages[1:]]
            else:
                return in_context_messages
        # Was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt;
        # narrowed to Exception. Errors are still logged and re-raised.
        except Exception:
            logger.exception(f"Failed to rebuild memory for agent id={agent_state.id} and actor=({self.actor.id}, {self.actor.name})")
            raise

    def get_finish_chunks_for_stream(self, usage: LettaUsageStatistics, stop_reason: Optional[LettaStopReason] = None):
        """
        Build the trailing chunks emitted at the end of a stream: the stop
        reason (defaulting to end_turn), usage statistics, and the terminal
        `done` status marker, all serialized as strings.
        """
        if stop_reason is None:
            stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
        return [
            stop_reason.model_dump_json(),
            usage.model_dump_json(),
            MessageStreamStatus.done.value,
        ]