refactor: add extract_usage_statistics returning LettaUsageStatistics (#9065)
👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>

committed by Caren Thomas
parent 2bccd36382
commit 221b4e6279

tests/test_usage_parsing.py (new file, 473 lines added)
@@ -0,0 +1,473 @@
"""
Tests for usage statistics parsing through the production adapter path.

These tests verify that SimpleLLMRequestAdapter correctly extracts usage statistics
from LLM responses, including:
1. Basic usage (prompt_tokens, completion_tokens, total_tokens)
2. Cache-related fields (cached_input_tokens, cache_write_tokens)
3. Reasoning tokens (for models that support it)

This tests the actual production code path:
SimpleLLMRequestAdapter.invoke_llm()
    → llm_client.request_async_with_telemetry()
    → llm_client.convert_response_to_chat_completion()
    → adapter extracts from chat_completions_response.usage
    → normalize_cache_tokens() / normalize_reasoning_tokens()
"""

import os

import pytest

from letta.adapters.simple_llm_request_adapter import SimpleLLMRequestAdapter
from letta.errors import LLMAuthenticationError
from letta.llm_api.anthropic_client import AnthropicClient
from letta.llm_api.google_ai_client import GoogleAIClient
from letta.llm_api.openai_client import OpenAIClient
from letta.schemas.enums import AgentType, MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.settings import model_settings


def _has_openai_credentials() -> bool:
    return bool(model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY"))


def _has_anthropic_credentials() -> bool:
    return bool(model_settings.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY"))


def _has_gemini_credentials() -> bool:
    return bool(model_settings.gemini_api_key or os.environ.get("GEMINI_API_KEY"))
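

# Hedged, illustrative sketch only (NOT the production extract_usage_statistics):
# it shows how an OpenAI-style usage payload could map onto the normalized fields
# asserted throughout these tests. The input field names are OpenAI's documented
# chat-completions usage fields; the output keys mirror the LettaUsageStatistics
# fields used below.
def _normalization_sketch(raw_usage: dict) -> dict:
    """Hypothetical normalization, for illustration only."""
    prompt_details = raw_usage.get("prompt_tokens_details") or {}
    completion_details = raw_usage.get("completion_tokens_details") or {}
    return {
        "prompt_tokens": raw_usage.get("prompt_tokens", 0),
        "completion_tokens": raw_usage.get("completion_tokens", 0),
        "total_tokens": raw_usage.get("total_tokens", 0),
        # OpenAI reports cache reads here; it has no separate cache-write field.
        "cached_input_tokens": prompt_details.get("cached_tokens"),
        "cache_write_tokens": None,
        "reasoning_tokens": completion_details.get("reasoning_tokens"),
    }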


def _build_simple_messages(user_content: str) -> list[Message]:
    """Build a minimal message list for testing."""
    return [
        Message(
            role=MessageRole.user,
            content=[TextContent(text=user_content)],
        )
    ]


# Large system prompt to exceed caching thresholds (>1024 tokens)
LARGE_SYSTEM_PROMPT = """You are an advanced AI assistant with extensive knowledge across multiple domains.

# Core Capabilities

## Technical Knowledge
- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages
- System Design: Deep understanding of distributed systems, microservices, and cloud architecture
- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code
- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases
- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks

## Problem Solving Approach
When tackling problems, you follow a structured methodology:
1. Understand the requirements thoroughly
2. Break down complex problems into manageable components
3. Consider multiple solution approaches
4. Evaluate trade-offs between different options
5. Implement solutions with clean, maintainable code
6. Test thoroughly and iterate based on feedback

## Communication Style
- Clear and concise explanations
- Use examples and analogies when helpful
- Adapt technical depth to the audience
- Ask clarifying questions when requirements are ambiguous
- Provide context and rationale for recommendations

# Domain Expertise

## Web Development
You have deep knowledge of:
- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks
- Backend: Node.js, Express, FastAPI, Django, Flask
- API Design: REST, GraphQL, gRPC
- Authentication: OAuth, JWT, session management
- Performance: Caching strategies, CDNs, lazy loading

## Data Engineering
You understand:
- ETL pipelines and data transformation
- Data warehousing concepts (Snowflake, BigQuery, Redshift)
- Stream processing (Kafka, Kinesis)
- Data modeling and schema design
- Data quality and validation

## Cloud Platforms
You're familiar with:
- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation
- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery
- Azure: Virtual Machines, Blob Storage, Azure Functions
- Serverless architectures and best practices
- Cost optimization strategies

## Security
You consider:
- Common vulnerabilities (OWASP Top 10)
- Secure coding practices
- Encryption and key management
- Access control and authorization patterns
- Security audit and compliance requirements

# Interaction Principles

## Helpfulness
- Provide actionable guidance
- Share relevant resources and documentation
- Offer multiple approaches when appropriate
- Point out potential pitfalls and edge cases
- Follow up to ensure understanding

## Accuracy
- Acknowledge limitations and uncertainties
- Distinguish between facts and opinions
- Cite sources when making specific claims
- Correct mistakes promptly when identified
- Stay current with latest developments

## Respect
- Value diverse perspectives and approaches
- Maintain professional boundaries
- Protect user privacy and confidentiality
- Avoid assumptions about user background
- Be patient with varying skill levels

Remember: Your goal is to empower users to solve problems and learn, not just to provide answers."""
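
# Rough, hedged sanity estimate for the >1024-token threshold mentioned above,
# using the common ~4-characters-per-token heuristic; a real tokenizer (not used
# here to avoid extra dependencies) would give exact counts.
_APPROX_PROMPT_TOKENS = len(LARGE_SYSTEM_PROMPT) // 4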


@pytest.mark.asyncio
async def test_openai_usage_via_adapter():
    """Test OpenAI usage extraction through SimpleLLMRequestAdapter.

    This tests the actual production code path used by letta_agent_v3.
    """
    if not _has_openai_credentials():
        pytest.skip("OpenAI credentials not configured")

    client = OpenAIClient()
    llm_config = LLMConfig.default_config("gpt-4o-mini")

    adapter = SimpleLLMRequestAdapter(
        llm_client=client,
        llm_config=llm_config,
    )

    messages = _build_simple_messages("Say hello in exactly 5 words.")
    request_data = client.build_request_data(AgentType.letta_v1_agent, messages, llm_config)

    # Call through the adapter (production path)
    try:
        async for _ in adapter.invoke_llm(
            request_data=request_data,
            messages=messages,
            tools=[],
            use_assistant_message=False,
        ):
            pass
    except LLMAuthenticationError:
        pytest.skip("OpenAI credentials invalid")

    # Verify usage was extracted
    assert adapter.usage is not None, "adapter.usage should not be None"
    assert adapter.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter.usage.prompt_tokens}"
    assert adapter.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter.usage.completion_tokens}"
    assert adapter.usage.total_tokens > 0, f"total_tokens should be > 0, got {adapter.usage.total_tokens}"
    assert adapter.usage.step_count == 1, f"step_count should be 1, got {adapter.usage.step_count}"

    print(f"OpenAI usage: prompt={adapter.usage.prompt_tokens}, completion={adapter.usage.completion_tokens}")
    print(f"OpenAI cache: cached_input={adapter.usage.cached_input_tokens}, cache_write={adapter.usage.cache_write_tokens}")
    print(f"OpenAI reasoning: {adapter.usage.reasoning_tokens}")


@pytest.mark.asyncio
async def test_anthropic_usage_via_adapter():
    """Test Anthropic usage extraction through SimpleLLMRequestAdapter.

    This tests the actual production code path used by letta_agent_v3.

    Note: Anthropic's input_tokens is NON-cached only. The adapter should
    compute total prompt_tokens = input_tokens + cache_read + cache_creation.
    """
    if not _has_anthropic_credentials():
        pytest.skip("Anthropic credentials not configured")

    client = AnthropicClient()
    llm_config = LLMConfig(
        model="claude-3-5-haiku-20241022",
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=256,
    )

    adapter = SimpleLLMRequestAdapter(
        llm_client=client,
        llm_config=llm_config,
    )

    # Anthropic requires a system message first
    messages = [
        Message(role=MessageRole.system, content=[TextContent(text="You are a helpful assistant.")]),
        Message(role=MessageRole.user, content=[TextContent(text="Say hello in exactly 5 words.")]),
    ]
    request_data = client.build_request_data(AgentType.letta_v1_agent, messages, llm_config, tools=[])

    # Call through the adapter (production path)
    try:
        async for _ in adapter.invoke_llm(
            request_data=request_data,
            messages=messages,
            tools=[],
            use_assistant_message=False,
        ):
            pass
    except LLMAuthenticationError:
        pytest.skip("Anthropic credentials invalid")

    # Verify usage was extracted
    assert adapter.usage is not None, "adapter.usage should not be None"
    assert adapter.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter.usage.prompt_tokens}"
    assert adapter.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter.usage.completion_tokens}"
    assert adapter.usage.total_tokens > 0, f"total_tokens should be > 0, got {adapter.usage.total_tokens}"
    assert adapter.usage.step_count == 1, f"step_count should be 1, got {adapter.usage.step_count}"

    print(f"Anthropic usage: prompt={adapter.usage.prompt_tokens}, completion={adapter.usage.completion_tokens}")
    print(f"Anthropic cache: cached_input={adapter.usage.cached_input_tokens}, cache_write={adapter.usage.cache_write_tokens}")


@pytest.mark.asyncio
async def test_gemini_usage_via_adapter():
    """Test Gemini usage extraction through SimpleLLMRequestAdapter.

    This tests the actual production code path used by letta_agent_v3.
    """
    if not _has_gemini_credentials():
        pytest.skip("Gemini credentials not configured")

    client = GoogleAIClient()
    llm_config = LLMConfig(
        model="gemini-2.0-flash",
        model_endpoint_type="google_ai",
        model_endpoint="https://generativelanguage.googleapis.com",
        context_window=1048576,
        max_tokens=256,
    )

    adapter = SimpleLLMRequestAdapter(
        llm_client=client,
        llm_config=llm_config,
    )

    messages = _build_simple_messages("Say hello in exactly 5 words.")
    request_data = client.build_request_data(AgentType.letta_v1_agent, messages, llm_config, tools=[])

    # Call through the adapter (production path)
    try:
        async for _ in adapter.invoke_llm(
            request_data=request_data,
            messages=messages,
            tools=[],
            use_assistant_message=False,
        ):
            pass
    except LLMAuthenticationError:
        pytest.skip("Gemini credentials invalid")

    # Verify usage was extracted
    assert adapter.usage is not None, "adapter.usage should not be None"
    assert adapter.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter.usage.prompt_tokens}"
    assert adapter.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter.usage.completion_tokens}"
    assert adapter.usage.total_tokens > 0, f"total_tokens should be > 0, got {adapter.usage.total_tokens}"
    assert adapter.usage.step_count == 1, f"step_count should be 1, got {adapter.usage.step_count}"

    print(f"Gemini usage: prompt={adapter.usage.prompt_tokens}, completion={adapter.usage.completion_tokens}")
    print(f"Gemini cache: cached_input={adapter.usage.cached_input_tokens}")
    print(f"Gemini reasoning: {adapter.usage.reasoning_tokens}")


@pytest.mark.asyncio
async def test_openai_prefix_caching_via_adapter():
    """Test OpenAI prefix caching through SimpleLLMRequestAdapter.

    Makes two requests with the same large system prompt to verify
    cached_input_tokens is populated on the second request.

    Note: Prefix caching is probabilistic and depends on server-side state.
    """
    if not _has_openai_credentials():
        pytest.skip("OpenAI credentials not configured")

    client = OpenAIClient()
    llm_config = LLMConfig.default_config("gpt-4o-mini")

    # First request - should populate the cache
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config)

    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("OpenAI credentials invalid")

    print(f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}")

    # Second request - same system prompt, should hit cache
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config)

    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    print(f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}")

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0
    assert adapter2.usage.completion_tokens > 0

    # Note: We can't guarantee cache hit, but if it happened, cached_input_tokens should be > 0
    if adapter2.usage.cached_input_tokens and adapter2.usage.cached_input_tokens > 0:
        print(f"SUCCESS: OpenAI cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    else:
        print("INFO: No cache hit (cache may not have been populated yet)")


@pytest.mark.asyncio
async def test_anthropic_prefix_caching_via_adapter():
    """Test Anthropic prefix caching through SimpleLLMRequestAdapter.

    Makes two requests with the same large system prompt using cache_control
    to verify cache tokens are populated.

    Note: Anthropic requires explicit cache_control breakpoints.
    """
    if not _has_anthropic_credentials():
        pytest.skip("Anthropic credentials not configured")

    client = AnthropicClient()
    llm_config = LLMConfig(
        model="claude-3-5-haiku-20241022",
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=256,
    )

    # First request
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config, tools=[])

    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("Anthropic credentials invalid")

    print(
        f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}, cache_write={adapter1.usage.cache_write_tokens}"
    )

    # Second request
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config, tools=[])

    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    print(
        f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}, cache_write={adapter2.usage.cache_write_tokens}"
    )

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0
    assert adapter2.usage.completion_tokens > 0

    # Check for cache activity
    if adapter2.usage.cached_input_tokens and adapter2.usage.cached_input_tokens > 0:
        print(f"SUCCESS: Anthropic cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    elif adapter2.usage.cache_write_tokens and adapter2.usage.cache_write_tokens > 0:
        print(f"INFO: Anthropic cache write! cache_write_tokens={adapter2.usage.cache_write_tokens}")
    else:
        print("INFO: No cache activity detected")


@pytest.mark.asyncio
async def test_gemini_prefix_caching_via_adapter():
    """Test Gemini prefix caching through SimpleLLMRequestAdapter.

    Makes two requests with the same large system prompt to verify
    cached_input_tokens is populated.

    Note: Gemini 2.0+ has implicit caching.
    """
    if not _has_gemini_credentials():
        pytest.skip("Gemini credentials not configured")

    client = GoogleAIClient()
    llm_config = LLMConfig(
        model="gemini-2.0-flash",
        model_endpoint_type="google_ai",
        model_endpoint="https://generativelanguage.googleapis.com",
        context_window=1048576,
        max_tokens=256,
    )

    # First request
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config, tools=[])

    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("Gemini credentials invalid")

    print(f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}")

    # Second request
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config, tools=[])

    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    print(f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}")

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0
    assert adapter2.usage.completion_tokens > 0

    if adapter2.usage.cached_input_tokens and adapter2.usage.cached_input_tokens > 0:
        print(f"SUCCESS: Gemini cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    else:
        print("INFO: No cache hit detected")