Files
letta-server/letta/schemas/usage.py
Kian Jones f5c4ab50f4 chore: add ty + pre-commit hook and repeal even more ruff rules (#9504)
* auto fixes

* auto fix pt2 and transitive deps and undefined var checking locals()

* manual fixes (ignored or letta-code fixed)

* fix circular import

* remove all ignores, add FastAPI rules and Ruff rules

* add ty and precommit

* ruff stuff

* ty check fixes

* ty check fixes pt 2

* error on invalid
2026-02-24 10:55:11 -08:00

185 lines
7.7 KiB
Python

from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel, Field
if TYPE_CHECKING:
from letta.schemas.enums import ProviderType
from letta.schemas.openai.chat_completion_response import (
UsageStatistics,
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
def normalize_cache_tokens(
    prompt_details: Union["UsageStatisticsPromptTokenDetails", Dict[str, Any], None],
) -> Tuple[int, int]:
    """
    Extract normalized cache token counts from provider-specific prompt details.

    Handles both Pydantic model objects (from adapters) and dict objects (from database).

    Provider mappings:
    - OpenAI/Gemini: cached_tokens -> cached_input_tokens
    - Anthropic: cache_read_tokens -> cached_input_tokens, cache_creation_tokens -> cache_write_tokens

    Args:
        prompt_details: Provider-specific prompt token details (model or dict)

    Returns:
        Tuple of (cached_input_tokens, cache_write_tokens)
    """
    if prompt_details is None:
        return 0, 0

    # Uniform field accessor so the fallback logic below is written once:
    # dicts come from database storage, Pydantic models from adapters.
    if isinstance(prompt_details, dict):

        def _get(name: str) -> Any:
            return prompt_details.get(name)

    else:

        def _get(name: str) -> Any:
            return getattr(prompt_details, name, None)

    # OpenAI/Gemini report `cached_tokens`; Anthropic reports `cache_read_tokens`.
    # Falsy values (None, 0, missing) fall through to the next candidate, then to 0.
    cached_input = _get("cached_tokens") or _get("cache_read_tokens") or 0
    # Only Anthropic-style providers report cache writes; coerce None to 0.
    cache_write = _get("cache_creation_tokens") or 0
    return cached_input, cache_write
def normalize_reasoning_tokens(
    completion_details: Union["UsageStatisticsCompletionTokenDetails", Dict[str, Any], None],
) -> int:
    """
    Extract normalized reasoning token count from provider-specific completion details.

    Handles both Pydantic model objects (from adapters) and dict objects (from database).

    Provider mappings:
    - OpenAI: completion_tokens_details.reasoning_tokens
    - Gemini: thoughts_token_count (mapped to reasoning_tokens in UsageStatistics)
    - Anthropic: thinking tokens are included in completion_tokens, not separately tracked

    Args:
        completion_details: Provider-specific completion token details (model or dict)

    Returns:
        The reasoning token count
    """
    if completion_details is None:
        return 0

    # Dicts come from database storage; anything else is a Pydantic model from adapters.
    if isinstance(completion_details, dict):
        raw_count = completion_details.get("reasoning_tokens", 0)
    else:
        raw_count = getattr(completion_details, "reasoning_tokens", 0)

    # Coerce None (provider present but value unset) to 0.
    return raw_count or 0
class LettaUsageStatistics(BaseModel):
    """
    Usage statistics for the agent interaction.

    Attributes:
        completion_tokens (int): The number of tokens generated by the agent.
        prompt_tokens (int): The number of tokens in the prompt.
        total_tokens (int): The total number of tokens processed by the agent.
        step_count (int): The number of steps taken by the agent.
        cached_input_tokens (Optional[int]): The number of input tokens served from cache. None if not reported.
        cache_write_tokens (Optional[int]): The number of input tokens written to cache. None if not reported.
        reasoning_tokens (Optional[int]): The number of reasoning/thinking tokens generated. None if not reported.
    """

    message_type: Literal["usage_statistics"] = "usage_statistics"
    completion_tokens: int = Field(0, description="The number of tokens generated by the agent.")
    prompt_tokens: int = Field(0, description="The number of tokens in the prompt.")
    total_tokens: int = Field(0, description="The total number of tokens processed by the agent.")
    step_count: int = Field(0, description="The number of steps taken by the agent.")
    # TODO: Optional for now. This field makes everyone's lives easier
    run_ids: Optional[List[str]] = Field(None, description="The background task run IDs associated with the agent interaction")
    # Cache tracking (common across providers).
    # None means the provider didn't report this data at all; 0 means it reported zero.
    cached_input_tokens: Optional[int] = Field(
        None, description="The number of input tokens served from cache. None if not reported by provider."
    )
    cache_write_tokens: Optional[int] = Field(
        None, description="The number of input tokens written to cache (Anthropic only). None if not reported by provider."
    )
    # Reasoning token tracking — same None-vs-0 convention as the cache fields.
    reasoning_tokens: Optional[int] = Field(
        None, description="The number of reasoning/thinking tokens generated. None if not reported by provider."
    )
    # Context window tracking
    context_tokens: Optional[int] = Field(
        None,
        description="Estimate of tokens currently in the context window.",
    )

    def to_usage(self, provider_type: Optional["ProviderType"] = None) -> "UsageStatistics":
        """Convert to UsageStatistics (OpenAI-compatible format).

        Args:
            provider_type: ProviderType enum indicating which provider format to use.
                Used to determine which cache field to populate.

        Returns:
            UsageStatistics object with nested prompt/completion token details.
        """
        # Imported locally to avoid a circular import at module load time.
        from letta.schemas.enums import ProviderType
        from letta.schemas.openai.chat_completion_response import (
            UsageStatistics,
            UsageStatisticsCompletionTokenDetails,
            UsageStatisticsPromptTokenDetails,
        )

        # Populate prompt details only when at least one cache counter was reported.
        prompt_details = None
        if self.cached_input_tokens is not None or self.cache_write_tokens is not None:
            # Anthropic and Bedrock use Anthropic-style cache fields
            # (cache_read_tokens / cache_creation_tokens); everyone else uses cached_tokens.
            if provider_type in (ProviderType.anthropic, ProviderType.bedrock):
                prompt_details = UsageStatisticsPromptTokenDetails(
                    cache_read_tokens=self.cached_input_tokens,
                    cache_creation_tokens=self.cache_write_tokens,
                )
            else:
                prompt_details = UsageStatisticsPromptTokenDetails(
                    cached_tokens=self.cached_input_tokens,
                )

        # Populate completion details only when reasoning tokens were reported.
        completion_details = (
            UsageStatisticsCompletionTokenDetails(reasoning_tokens=self.reasoning_tokens)
            if self.reasoning_tokens is not None
            else None
        )

        return UsageStatistics(
            prompt_tokens=self.prompt_tokens,
            completion_tokens=self.completion_tokens,
            total_tokens=self.total_tokens,
            prompt_tokens_details=prompt_details,
            completion_tokens_details=completion_details,
        )