# letta-server/tests/test_usage_parsing.py

"""
Tests for usage statistics parsing through the production adapter path.

These tests verify that SimpleLLMRequestAdapter correctly extracts usage statistics
from LLM responses, including:
1. Basic usage (prompt_tokens, completion_tokens, total_tokens)
2. Cache-related fields (cached_input_tokens, cache_write_tokens)
3. Reasoning tokens (for models that support it)

This tests the actual production code path:
  SimpleLLMRequestAdapter.invoke_llm()
    → llm_client.request_async_with_telemetry()
    → llm_client.convert_response_to_chat_completion()
    → adapter extracts from chat_completions_response.usage
    → normalize_cache_tokens() / normalize_reasoning_tokens()
"""

import os

import pytest

from letta.adapters.simple_llm_request_adapter import SimpleLLMRequestAdapter
from letta.errors import LLMAuthenticationError
from letta.llm_api.anthropic_client import AnthropicClient
from letta.llm_api.google_ai_client import GoogleAIClient
from letta.llm_api.openai_client import OpenAIClient
from letta.schemas.enums import AgentType, LLMCallType, MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.settings import model_settings


def _has_openai_credentials() -> bool:
    """Return True when an OpenAI API key is available.

    The key is looked up on ``model_settings.openai_api_key`` first and, if that
    is falsy, in the ``OPENAI_API_KEY`` environment variable.
    """
    api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
    return bool(api_key)


def _has_anthropic_credentials() -> bool:
    """Return True when an Anthropic API key is available.

    The key is looked up on ``model_settings.anthropic_api_key`` first and, if
    that is falsy, in the ``ANTHROPIC_API_KEY`` environment variable.
    """
    api_key = model_settings.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY")
    return bool(api_key)


def _has_gemini_credentials() -> bool:
    """Return True when a Gemini API key is available.

    The key is looked up on ``model_settings.gemini_api_key`` first and, if that
    is falsy, in the ``GEMINI_API_KEY`` environment variable.
    """
    api_key = model_settings.gemini_api_key or os.environ.get("GEMINI_API_KEY")
    return bool(api_key)


def _build_simple_messages(user_content: str) -> list[Message]:
    """Wrap ``user_content`` in a one-element message list for testing.

    The single message has the ``user`` role and carries the text as one
    ``TextContent`` part.
    """
    text_part = TextContent(text=user_content)
    user_message = Message(role=MessageRole.user, content=[text_part])
    return [user_message]


# Large system prompt used as the shared system message by the prefix-caching
# tests in this module. It is intended to exceed provider caching thresholds
# (>1024 tokens).
# NOTE(review): the actual token count depends on each provider's tokenizer, and
# the minimum cacheable prefix length differs per provider/model - confirm this
# text is really long enough, otherwise those tests can never observe a cache hit.
LARGE_SYSTEM_PROMPT = """You are an advanced AI assistant with extensive knowledge across multiple domains.

# Core Capabilities

## Technical Knowledge
- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages
- System Design: Deep understanding of distributed systems, microservices, and cloud architecture
- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code
- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases
- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks

## Problem Solving Approach
When tackling problems, you follow a structured methodology:
1. Understand the requirements thoroughly
2. Break down complex problems into manageable components
3. Consider multiple solution approaches
4. Evaluate trade-offs between different options
5. Implement solutions with clean, maintainable code
6. Test thoroughly and iterate based on feedback

## Communication Style
- Clear and concise explanations
- Use examples and analogies when helpful
- Adapt technical depth to the audience
- Ask clarifying questions when requirements are ambiguous
- Provide context and rationale for recommendations

# Domain Expertise

## Web Development
You have deep knowledge of:
- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks
- Backend: Node.js, Express, FastAPI, Django, Flask
- API Design: REST, GraphQL, gRPC
- Authentication: OAuth, JWT, session management
- Performance: Caching strategies, CDNs, lazy loading

## Data Engineering
You understand:
- ETL pipelines and data transformation
- Data warehousing concepts (Snowflake, BigQuery, Redshift)
- Stream processing (Kafka, Kinesis)
- Data modeling and schema design
- Data quality and validation

## Cloud Platforms
You're familiar with:
- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation
- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery
- Azure: Virtual Machines, Blob Storage, Azure Functions
- Serverless architectures and best practices
- Cost optimization strategies

## Security
You consider:
- Common vulnerabilities (OWASP Top 10)
- Secure coding practices
- Encryption and key management
- Access control and authorization patterns
- Security audit and compliance requirements

# Interaction Principles

## Helpfulness
- Provide actionable guidance
- Share relevant resources and documentation
- Offer multiple approaches when appropriate
- Point out potential pitfalls and edge cases
- Follow up to ensure understanding

## Accuracy
- Acknowledge limitations and uncertainties
- Distinguish between facts and opinions
- Cite sources when making specific claims
- Correct mistakes promptly when identified
- Stay current with latest developments

## Respect
- Value diverse perspectives and approaches
- Maintain professional boundaries
- Protect user privacy and confidentiality
- Avoid assumptions about user background
- Be patient with varying skill levels

Remember: Your goal is to empower users to solve problems and learn, not just to provide answers."""


@pytest.mark.asyncio
async def test_openai_usage_via_adapter():
    """Check that an OpenAI call made via SimpleLLMRequestAdapter yields usage stats.

    This exercises the actual production code path used by letta_agent_v3: a
    single short prompt is sent to gpt-4o-mini through ``adapter.invoke_llm()``.
    Afterwards the adapter's usage must be set, with positive prompt, completion
    and total token counts and a step_count of exactly 1. Cache and reasoning
    counters are only printed, not asserted.

    Skipped when no OpenAI key is configured or the credentials are rejected.
    """
    if not _has_openai_credentials():
        pytest.skip("OpenAI credentials not configured")

    openai_client = OpenAIClient()
    config = LLMConfig.default_config("gpt-4o-mini")
    usage_adapter = SimpleLLMRequestAdapter(
        llm_client=openai_client,
        llm_config=config,
        call_type=LLMCallType.agent_step,
    )

    prompt_messages = _build_simple_messages("Say hello in exactly 5 words.")
    payload = openai_client.build_request_data(AgentType.letta_v1_agent, prompt_messages, config)

    # Drain the adapter's stream (production path); only the side effect of
    # populating usage_adapter.usage matters here.
    try:
        response_stream = usage_adapter.invoke_llm(
            request_data=payload,
            messages=prompt_messages,
            tools=[],
            use_assistant_message=False,
        )
        async for _chunk in response_stream:
            pass
    except LLMAuthenticationError:
        pytest.skip("OpenAI credentials invalid")

    # Verify usage was extracted
    usage = usage_adapter.usage
    assert usage is not None, "adapter.usage should not be None"
    for field_name in ("prompt_tokens", "completion_tokens", "total_tokens"):
        token_count = getattr(usage, field_name)
        assert token_count > 0, f"{field_name} should be > 0, got {token_count}"
    assert usage.step_count == 1, f"step_count should be 1, got {usage.step_count}"

    print(f"OpenAI usage: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}")
    print(f"OpenAI cache: cached_input={usage.cached_input_tokens}, cache_write={usage.cache_write_tokens}")
    print(f"OpenAI reasoning: {usage.reasoning_tokens}")


@pytest.mark.asyncio
async def test_anthropic_usage_via_adapter():
    """Check that an Anthropic call made via SimpleLLMRequestAdapter yields usage stats.

    This exercises the actual production code path used by letta_agent_v3: a
    system + user message pair is sent through ``adapter.invoke_llm()``.
    Afterwards the adapter's usage must be set, with positive prompt, completion
    and total token counts and a step_count of exactly 1. Cache counters are
    only printed, not asserted.

    Note: Anthropic's input_tokens is NON-cached only. The adapter should
    compute total prompt_tokens = input_tokens + cache_read + cache_creation;
    this test does not verify that arithmetic, only that prompt_tokens > 0.

    Skipped when no Anthropic key is configured or the credentials are rejected.
    """
    if not _has_anthropic_credentials():
        pytest.skip("Anthropic credentials not configured")

    anthropic_client = AnthropicClient()
    config = LLMConfig(
        model="claude-haiku-4-5-20251001",
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=256,
    )
    usage_adapter = SimpleLLMRequestAdapter(
        llm_client=anthropic_client,
        llm_config=config,
        call_type=LLMCallType.agent_step,
    )

    # Anthropic requires a system message first
    system_message = Message(role=MessageRole.system, content=[TextContent(text="You are a helpful assistant.")])
    user_message = Message(role=MessageRole.user, content=[TextContent(text="Say hello in exactly 5 words.")])
    prompt_messages = [system_message, user_message]
    payload = anthropic_client.build_request_data(AgentType.letta_v1_agent, prompt_messages, config, tools=[])

    # Drain the adapter's stream (production path); only the side effect of
    # populating usage_adapter.usage matters here.
    try:
        response_stream = usage_adapter.invoke_llm(
            request_data=payload,
            messages=prompt_messages,
            tools=[],
            use_assistant_message=False,
        )
        async for _chunk in response_stream:
            pass
    except LLMAuthenticationError:
        pytest.skip("Anthropic credentials invalid")

    # Verify usage was extracted
    usage = usage_adapter.usage
    assert usage is not None, "adapter.usage should not be None"
    for field_name in ("prompt_tokens", "completion_tokens", "total_tokens"):
        token_count = getattr(usage, field_name)
        assert token_count > 0, f"{field_name} should be > 0, got {token_count}"
    assert usage.step_count == 1, f"step_count should be 1, got {usage.step_count}"

    print(f"Anthropic usage: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}")
    print(f"Anthropic cache: cached_input={usage.cached_input_tokens}, cache_write={usage.cache_write_tokens}")


@pytest.mark.asyncio
async def test_gemini_usage_via_adapter():
    """Check that a Gemini call made via SimpleLLMRequestAdapter yields usage stats.

    This exercises the actual production code path used by letta_agent_v3: a
    single short prompt is sent to gemini-2.0-flash through
    ``adapter.invoke_llm()``. Afterwards the adapter's usage must be set, with
    positive prompt, completion and total token counts and a step_count of
    exactly 1. Cache and reasoning counters are only printed, not asserted.

    Skipped when no Gemini key is configured or the credentials are rejected.
    """
    if not _has_gemini_credentials():
        pytest.skip("Gemini credentials not configured")

    gemini_client = GoogleAIClient()
    config = LLMConfig(
        model="gemini-2.0-flash",
        model_endpoint_type="google_ai",
        model_endpoint="https://generativelanguage.googleapis.com",
        context_window=1048576,
        max_tokens=256,
    )
    usage_adapter = SimpleLLMRequestAdapter(
        llm_client=gemini_client,
        llm_config=config,
        call_type=LLMCallType.agent_step,
    )

    prompt_messages = _build_simple_messages("Say hello in exactly 5 words.")
    payload = gemini_client.build_request_data(AgentType.letta_v1_agent, prompt_messages, config, tools=[])

    # Drain the adapter's stream (production path); only the side effect of
    # populating usage_adapter.usage matters here.
    try:
        response_stream = usage_adapter.invoke_llm(
            request_data=payload,
            messages=prompt_messages,
            tools=[],
            use_assistant_message=False,
        )
        async for _chunk in response_stream:
            pass
    except LLMAuthenticationError:
        pytest.skip("Gemini credentials invalid")

    # Verify usage was extracted
    usage = usage_adapter.usage
    assert usage is not None, "adapter.usage should not be None"
    for field_name in ("prompt_tokens", "completion_tokens", "total_tokens"):
        token_count = getattr(usage, field_name)
        assert token_count > 0, f"{field_name} should be > 0, got {token_count}"
    assert usage.step_count == 1, f"step_count should be 1, got {usage.step_count}"

    print(f"Gemini usage: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}")
    print(f"Gemini cache: cached_input={usage.cached_input_tokens}")
    print(f"Gemini reasoning: {usage.reasoning_tokens}")


@pytest.mark.asyncio
async def test_openai_prefix_caching_via_adapter():
    """Test OpenAI prefix caching through SimpleLLMRequestAdapter.

    Makes two requests that share LARGE_SYSTEM_PROMPT as the system message and
    differ only in the user question, then reports whether cached_input_tokens
    is populated on the second request.

    A cache hit is NOT asserted: prefix caching is probabilistic and depends on
    server-side state, so only the basic usage fields of the second request are
    checked and the cache outcome is printed for manual inspection.

    Skipped when no OpenAI key is configured or the first request is rejected
    with LLMAuthenticationError.
    """
    if not _has_openai_credentials():
        pytest.skip("OpenAI credentials not configured")

    client = OpenAIClient()
    llm_config = LLMConfig.default_config("gpt-4o-mini")

    # First request - should populate the cache
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config)

    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("OpenAI credentials invalid")

    # Fail with a clear message (instead of an AttributeError on None) if the
    # adapter did not extract usage, matching the non-caching usage tests.
    assert adapter1.usage is not None, "adapter1.usage should not be None"
    print(f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}")

    # Second request - same system prompt, should hit cache
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config)

    # No auth handling here: the first request already proved the credentials work.
    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    assert adapter2.usage is not None, "adapter2.usage should not be None"
    print(f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}")

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter2.usage.prompt_tokens}"
    assert adapter2.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter2.usage.completion_tokens}"

    # Note: We can't guarantee cache hit, but if it happened, cached_input_tokens should be > 0.
    # `or 0` treats an unset (None) counter the same as zero.
    if (adapter2.usage.cached_input_tokens or 0) > 0:
        print(f"SUCCESS: OpenAI cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    else:
        print("INFO: No cache hit (cache may not have been populated yet)")


@pytest.mark.asyncio
async def test_anthropic_prefix_caching_via_adapter():
    """Test Anthropic prefix caching through SimpleLLMRequestAdapter.

    Makes two requests that share LARGE_SYSTEM_PROMPT as the system message and
    differ only in the user question, then reports whether cache read or cache
    write tokens are populated on the second request.

    Cache activity is NOT asserted; only the basic usage fields of the second
    request are checked and the cache outcome is printed for manual inspection.

    Note: Anthropic requires explicit cache_control breakpoints. This test does
    not set any itself, so it presumably relies on the client's request builder
    adding them - TODO confirm.

    Skipped when no Anthropic key is configured or the first request is rejected
    with LLMAuthenticationError.
    """
    if not _has_anthropic_credentials():
        pytest.skip("Anthropic credentials not configured")

    client = AnthropicClient()
    llm_config = LLMConfig(
        model="claude-haiku-4-5-20251001",
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=256,
    )

    # First request
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config, tools=[])

    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("Anthropic credentials invalid")

    # Fail with a clear message (instead of an AttributeError on None) if the
    # adapter did not extract usage, matching the non-caching usage tests.
    assert adapter1.usage is not None, "adapter1.usage should not be None"
    print(
        f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}, cache_write={adapter1.usage.cache_write_tokens}"
    )

    # Second request
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config, tools=[])

    # No auth handling here: the first request already proved the credentials work.
    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    assert adapter2.usage is not None, "adapter2.usage should not be None"
    print(
        f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}, cache_write={adapter2.usage.cache_write_tokens}"
    )

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter2.usage.prompt_tokens}"
    assert adapter2.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter2.usage.completion_tokens}"

    # Check for cache activity; `or 0` treats an unset (None) counter the same as zero.
    if (adapter2.usage.cached_input_tokens or 0) > 0:
        print(f"SUCCESS: Anthropic cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    elif (adapter2.usage.cache_write_tokens or 0) > 0:
        print(f"INFO: Anthropic cache write! cache_write_tokens={adapter2.usage.cache_write_tokens}")
    else:
        print("INFO: No cache activity detected")


@pytest.mark.asyncio
async def test_gemini_prefix_caching_via_adapter():
    """Test Gemini prefix caching through SimpleLLMRequestAdapter.

    Makes two requests that share LARGE_SYSTEM_PROMPT as the system message and
    differ only in the user question, then reports whether cached_input_tokens
    is populated on the second request.

    A cache hit is NOT asserted; only the basic usage fields of the second
    request are checked and the cache outcome is printed for manual inspection.

    Note: Gemini 2.0+ has implicit caching (per the original test notes -
    NOTE(review): confirm this holds for the gemini-2.0-flash model used here).

    Skipped when no Gemini key is configured or the first request is rejected
    with LLMAuthenticationError.
    """
    if not _has_gemini_credentials():
        pytest.skip("Gemini credentials not configured")

    client = GoogleAIClient()
    llm_config = LLMConfig(
        model="gemini-2.0-flash",
        model_endpoint_type="google_ai",
        model_endpoint="https://generativelanguage.googleapis.com",
        context_window=1048576,
        max_tokens=256,
    )

    # First request
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config, tools=[])

    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("Gemini credentials invalid")

    # Fail with a clear message (instead of an AttributeError on None) if the
    # adapter did not extract usage, matching the non-caching usage tests.
    assert adapter1.usage is not None, "adapter1.usage should not be None"
    print(f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}")

    # Second request
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config, tools=[])

    # No auth handling here: the first request already proved the credentials work.
    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    assert adapter2.usage is not None, "adapter2.usage should not be None"
    print(f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}")

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter2.usage.prompt_tokens}"
    assert adapter2.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter2.usage.completion_tokens}"

    # `or 0` treats an unset (None) counter the same as zero.
    if (adapter2.usage.cached_input_tokens or 0) > 0:
        print(f"SUCCESS: Gemini cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    else:
        print("INFO: No cache hit detected")