Files
letta-server/tests/test_usage_parsing.py
jnjpng f10440b49c fix: update Anthropic Haiku test model after 3.5 retirement (#9569)
* fix: migrate Anthropic Haiku test model off retired release

Update Anthropic Haiku references in integration and usage parsing tests to a supported model id so test requests stop failing with 404 model not found errors.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: use canonical Anthropic Haiku handle in tests

Replace dated Anthropic Haiku handle references with the canonical provider handle so handle-based model resolution does not fail in batch and client tests.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
2026-02-24 10:55:12 -08:00

477 lines
19 KiB
Python

"""
Tests for usage statistics parsing through the production adapter path.

These tests verify that SimpleLLMRequestAdapter correctly extracts usage statistics
from LLM responses, including:
1. Basic usage (prompt_tokens, completion_tokens, total_tokens)
2. Cache-related fields (cached_input_tokens, cache_write_tokens)
3. Reasoning tokens (for models that support it)

This tests the actual production code path:
    SimpleLLMRequestAdapter.invoke_llm()
      → llm_client.request_async_with_telemetry()
      → llm_client.convert_response_to_chat_completion()
      → adapter extracts from chat_completions_response.usage
      → normalize_cache_tokens() / normalize_reasoning_tokens()
"""
import os
import pytest
from letta.adapters.simple_llm_request_adapter import SimpleLLMRequestAdapter
from letta.errors import LLMAuthenticationError
from letta.llm_api.anthropic_client import AnthropicClient
from letta.llm_api.google_ai_client import GoogleAIClient
from letta.llm_api.openai_client import OpenAIClient
from letta.schemas.enums import AgentType, LLMCallType, MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.settings import model_settings
def _has_openai_credentials() -> bool:
    """Report whether an OpenAI API key is available.

    The key is looked up first on ``model_settings.openai_api_key`` and then in
    the ``OPENAI_API_KEY`` environment variable; any truthy value counts.
    """
    configured_key = model_settings.openai_api_key
    if configured_key:
        return True
    # Fall back to the process environment when the settings object has no key.
    return bool(os.environ.get("OPENAI_API_KEY"))
def _has_anthropic_credentials() -> bool:
    """Report whether an Anthropic API key is available.

    The key is looked up first on ``model_settings.anthropic_api_key`` and then
    in the ``ANTHROPIC_API_KEY`` environment variable; any truthy value counts.
    """
    configured_key = model_settings.anthropic_api_key
    if configured_key:
        return True
    # Fall back to the process environment when the settings object has no key.
    return bool(os.environ.get("ANTHROPIC_API_KEY"))
def _has_gemini_credentials() -> bool:
    """Report whether a Gemini API key is available.

    The key is looked up first on ``model_settings.gemini_api_key`` and then in
    the ``GEMINI_API_KEY`` environment variable; any truthy value counts.
    """
    configured_key = model_settings.gemini_api_key
    if configured_key:
        return True
    # Fall back to the process environment when the settings object has no key.
    return bool(os.environ.get("GEMINI_API_KEY"))
def _build_simple_messages(user_content: str) -> list[Message]:
    """Return a one-element conversation holding ``user_content`` as a user message.

    Args:
        user_content: Text placed in the single ``TextContent`` part of the message.

    Returns:
        A list containing exactly one ``Message`` with ``role=MessageRole.user``.
    """
    text_part = TextContent(text=user_content)
    user_message = Message(role=MessageRole.user, content=[text_part])
    return [user_message]
# Shared system prompt reused verbatim by the prefix-caching tests in this file so
# that consecutive requests start with an identical prefix.
# Large system prompt to exceed caching thresholds (>1024 tokens)
# NOTE(review): the token count is not checked anywhere in this file, and minimum
# cacheable prompt lengths differ by provider and model - confirm the prompt is
# long enough for every model the caching tests target.
LARGE_SYSTEM_PROMPT = """You are an advanced AI assistant with extensive knowledge across multiple domains.
# Core Capabilities
## Technical Knowledge
- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages
- System Design: Deep understanding of distributed systems, microservices, and cloud architecture
- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code
- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases
- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks
## Problem Solving Approach
When tackling problems, you follow a structured methodology:
1. Understand the requirements thoroughly
2. Break down complex problems into manageable components
3. Consider multiple solution approaches
4. Evaluate trade-offs between different options
5. Implement solutions with clean, maintainable code
6. Test thoroughly and iterate based on feedback
## Communication Style
- Clear and concise explanations
- Use examples and analogies when helpful
- Adapt technical depth to the audience
- Ask clarifying questions when requirements are ambiguous
- Provide context and rationale for recommendations
# Domain Expertise
## Web Development
You have deep knowledge of:
- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks
- Backend: Node.js, Express, FastAPI, Django, Flask
- API Design: REST, GraphQL, gRPC
- Authentication: OAuth, JWT, session management
- Performance: Caching strategies, CDNs, lazy loading
## Data Engineering
You understand:
- ETL pipelines and data transformation
- Data warehousing concepts (Snowflake, BigQuery, Redshift)
- Stream processing (Kafka, Kinesis)
- Data modeling and schema design
- Data quality and validation
## Cloud Platforms
You're familiar with:
- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation
- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery
- Azure: Virtual Machines, Blob Storage, Azure Functions
- Serverless architectures and best practices
- Cost optimization strategies
## Security
You consider:
- Common vulnerabilities (OWASP Top 10)
- Secure coding practices
- Encryption and key management
- Access control and authorization patterns
- Security audit and compliance requirements
# Interaction Principles
## Helpfulness
- Provide actionable guidance
- Share relevant resources and documentation
- Offer multiple approaches when appropriate
- Point out potential pitfalls and edge cases
- Follow up to ensure understanding
## Accuracy
- Acknowledge limitations and uncertainties
- Distinguish between facts and opinions
- Cite sources when making specific claims
- Correct mistakes promptly when identified
- Stay current with latest developments
## Respect
- Value diverse perspectives and approaches
- Maintain professional boundaries
- Protect user privacy and confidentiality
- Avoid assumptions about user background
- Be patient with varying skill levels
Remember: Your goal is to empower users to solve problems and learn, not just to provide answers."""
@pytest.mark.asyncio
async def test_openai_usage_via_adapter():
    """Check that an OpenAI call routed through SimpleLLMRequestAdapter yields usage stats.

    One short prompt is sent to ``gpt-4o-mini`` via ``adapter.invoke_llm`` (the
    path this file describes as the production one used by letta_agent_v3).
    The test then requires positive prompt, completion and total token counts
    and a step count of exactly one. Cache and reasoning counters are printed
    for inspection only.

    Skips when OpenAI credentials are missing or rejected.
    """
    if not _has_openai_credentials():
        pytest.skip("OpenAI credentials not configured")

    openai_client = OpenAIClient()
    config = LLMConfig.default_config("gpt-4o-mini")
    adapter = SimpleLLMRequestAdapter(llm_client=openai_client, llm_config=config, call_type=LLMCallType.agent_step)

    prompt_messages = _build_simple_messages("Say hello in exactly 5 words.")
    invoke_kwargs = dict(
        request_data=openai_client.build_request_data(AgentType.letta_v1_agent, prompt_messages, config),
        messages=prompt_messages,
        tools=[],
        use_assistant_message=False,
    )

    # Drain the adapter's async iterator; only its effect on adapter.usage is checked.
    try:
        async for _chunk in adapter.invoke_llm(**invoke_kwargs):
            continue
    except LLMAuthenticationError:
        pytest.skip("OpenAI credentials invalid")

    usage = adapter.usage
    assert usage is not None, "adapter.usage should not be None"

    # The three token counters must each be strictly positive.
    for field in ("prompt_tokens", "completion_tokens", "total_tokens"):
        value = getattr(usage, field)
        assert value > 0, f"{field} should be > 0, got {value}"
    assert usage.step_count == 1, f"step_count should be 1, got {usage.step_count}"

    print(f"OpenAI usage: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}")
    print(f"OpenAI cache: cached_input={usage.cached_input_tokens}, cache_write={usage.cache_write_tokens}")
    print(f"OpenAI reasoning: {usage.reasoning_tokens}")
@pytest.mark.asyncio
async def test_anthropic_usage_via_adapter():
    """Check that an Anthropic call routed through SimpleLLMRequestAdapter yields usage stats.

    A system message plus one short user prompt are sent to
    ``claude-haiku-4-5-20251001`` via ``adapter.invoke_llm`` (the path this file
    describes as the production one used by letta_agent_v3). The test then
    requires positive prompt, completion and total token counts and a step
    count of exactly one. Cache counters are printed for inspection only.

    Note: Anthropic's input_tokens is NON-cached only. The adapter should
    compute total prompt_tokens = input_tokens + cache_read + cache_creation.
    That relationship is not asserted here.

    Skips when Anthropic credentials are missing or rejected.
    """
    if not _has_anthropic_credentials():
        pytest.skip("Anthropic credentials not configured")

    anthropic_client = AnthropicClient()
    config = LLMConfig(
        model="claude-haiku-4-5-20251001",
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=256,
    )
    adapter = SimpleLLMRequestAdapter(llm_client=anthropic_client, llm_config=config, call_type=LLMCallType.agent_step)

    # Anthropic requires a system message first
    system_message = Message(role=MessageRole.system, content=[TextContent(text="You are a helpful assistant.")])
    user_message = Message(role=MessageRole.user, content=[TextContent(text="Say hello in exactly 5 words.")])
    conversation = [system_message, user_message]

    invoke_kwargs = dict(
        request_data=anthropic_client.build_request_data(AgentType.letta_v1_agent, conversation, config, tools=[]),
        messages=conversation,
        tools=[],
        use_assistant_message=False,
    )

    # Drain the adapter's async iterator; only its effect on adapter.usage is checked.
    try:
        async for _chunk in adapter.invoke_llm(**invoke_kwargs):
            continue
    except LLMAuthenticationError:
        pytest.skip("Anthropic credentials invalid")

    usage = adapter.usage
    assert usage is not None, "adapter.usage should not be None"

    # The three token counters must each be strictly positive.
    for field in ("prompt_tokens", "completion_tokens", "total_tokens"):
        value = getattr(usage, field)
        assert value > 0, f"{field} should be > 0, got {value}"
    assert usage.step_count == 1, f"step_count should be 1, got {usage.step_count}"

    print(f"Anthropic usage: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}")
    print(f"Anthropic cache: cached_input={usage.cached_input_tokens}, cache_write={usage.cache_write_tokens}")
@pytest.mark.asyncio
async def test_gemini_usage_via_adapter():
    """Check that a Gemini call routed through SimpleLLMRequestAdapter yields usage stats.

    One short prompt is sent to ``gemini-2.0-flash`` via ``adapter.invoke_llm``
    (the path this file describes as the production one used by
    letta_agent_v3). The test then requires positive prompt, completion and
    total token counts and a step count of exactly one. Cache and reasoning
    counters are printed for inspection only.

    Skips when Gemini credentials are missing or rejected.
    """
    if not _has_gemini_credentials():
        pytest.skip("Gemini credentials not configured")

    gemini_client = GoogleAIClient()
    config = LLMConfig(
        model="gemini-2.0-flash",
        model_endpoint_type="google_ai",
        model_endpoint="https://generativelanguage.googleapis.com",
        context_window=1048576,
        max_tokens=256,
    )
    adapter = SimpleLLMRequestAdapter(llm_client=gemini_client, llm_config=config, call_type=LLMCallType.agent_step)

    prompt_messages = _build_simple_messages("Say hello in exactly 5 words.")
    invoke_kwargs = dict(
        request_data=gemini_client.build_request_data(AgentType.letta_v1_agent, prompt_messages, config, tools=[]),
        messages=prompt_messages,
        tools=[],
        use_assistant_message=False,
    )

    # Drain the adapter's async iterator; only its effect on adapter.usage is checked.
    try:
        async for _chunk in adapter.invoke_llm(**invoke_kwargs):
            continue
    except LLMAuthenticationError:
        pytest.skip("Gemini credentials invalid")

    usage = adapter.usage
    assert usage is not None, "adapter.usage should not be None"

    # The three token counters must each be strictly positive.
    for field in ("prompt_tokens", "completion_tokens", "total_tokens"):
        value = getattr(usage, field)
        assert value > 0, f"{field} should be > 0, got {value}"
    assert usage.step_count == 1, f"step_count should be 1, got {usage.step_count}"

    print(f"Gemini usage: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}")
    print(f"Gemini cache: cached_input={usage.cached_input_tokens}")
    print(f"Gemini reasoning: {usage.reasoning_tokens}")
@pytest.mark.asyncio
async def test_openai_prefix_caching_via_adapter():
    """Test OpenAI prefix caching through SimpleLLMRequestAdapter.

    Makes two requests with the same large system prompt (LARGE_SYSTEM_PROMPT)
    and inspects ``cached_input_tokens`` on the second request.

    A cache hit is only reported, never asserted: prefix caching is
    probabilistic and depends on server-side state. What is asserted is that
    both requests produce usage statistics and that the second request has
    positive prompt and completion token counts.

    Skips when OpenAI credentials are missing, or are rejected on the first
    request.
    """
    if not _has_openai_credentials():
        pytest.skip("OpenAI credentials not configured")

    client = OpenAIClient()
    llm_config = LLMConfig.default_config("gpt-4o-mini")

    # First request - should populate the cache
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config)
    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("OpenAI credentials invalid")

    # Guard before dereferencing, matching the basic usage tests: a missing usage
    # should fail as an assertion, not as an AttributeError in the print below.
    assert adapter1.usage is not None, "adapter1.usage should not be None"
    print(f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}")

    # Second request - same system prompt, should hit cache
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config)
    # Credentials were accepted on the first request, so no skip handling here.
    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    assert adapter2.usage is not None, "adapter2.usage should not be None"
    print(f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}")

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter2.usage.prompt_tokens}"
    assert adapter2.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter2.usage.completion_tokens}"

    # Note: We can't guarantee cache hit, but if it happened, cached_input_tokens should be > 0
    if adapter2.usage.cached_input_tokens and adapter2.usage.cached_input_tokens > 0:
        print(f"SUCCESS: OpenAI cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    else:
        print("INFO: No cache hit (cache may not have been populated yet)")
@pytest.mark.asyncio
async def test_anthropic_prefix_caching_via_adapter():
    """Test Anthropic prefix caching through SimpleLLMRequestAdapter.

    Makes two requests with the same large system prompt (LARGE_SYSTEM_PROMPT)
    and inspects ``cached_input_tokens`` / ``cache_write_tokens`` on the second
    request.

    Cache activity is only reported, never asserted. What is asserted is that
    both requests produce usage statistics and that the second request has
    positive prompt and completion token counts.

    Note: Anthropic requires explicit cache_control breakpoints. None are set
    in this test, so they are presumably added by
    ``AnthropicClient.build_request_data`` - TODO confirm.

    NOTE(review): Anthropic documents a model-specific minimum cacheable prompt
    length. Confirm LARGE_SYSTEM_PROMPT meets it for the configured model;
    otherwise the "No cache activity detected" branch is the expected outcome.

    Skips when Anthropic credentials are missing, or are rejected on the first
    request.
    """
    if not _has_anthropic_credentials():
        pytest.skip("Anthropic credentials not configured")

    client = AnthropicClient()
    llm_config = LLMConfig(
        model="claude-haiku-4-5-20251001",
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=256,
    )

    # First request
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config, tools=[])
    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("Anthropic credentials invalid")

    # Guard before dereferencing, matching the basic usage tests: a missing usage
    # should fail as an assertion, not as an AttributeError in the print below.
    assert adapter1.usage is not None, "adapter1.usage should not be None"
    print(
        f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}, cache_write={adapter1.usage.cache_write_tokens}"
    )

    # Second request
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config, tools=[])
    # Credentials were accepted on the first request, so no skip handling here.
    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    assert adapter2.usage is not None, "adapter2.usage should not be None"
    print(
        f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}, cache_write={adapter2.usage.cache_write_tokens}"
    )

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter2.usage.prompt_tokens}"
    assert adapter2.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter2.usage.completion_tokens}"

    # Check for cache activity (a read takes precedence over a write in the report)
    if adapter2.usage.cached_input_tokens and adapter2.usage.cached_input_tokens > 0:
        print(f"SUCCESS: Anthropic cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    elif adapter2.usage.cache_write_tokens and adapter2.usage.cache_write_tokens > 0:
        print(f"INFO: Anthropic cache write! cache_write_tokens={adapter2.usage.cache_write_tokens}")
    else:
        print("INFO: No cache activity detected")
@pytest.mark.asyncio
async def test_gemini_prefix_caching_via_adapter():
    """Test Gemini prefix caching through SimpleLLMRequestAdapter.

    Makes two requests with the same large system prompt (LARGE_SYSTEM_PROMPT)
    and inspects ``cached_input_tokens`` on the second request.

    A cache hit is only reported, never asserted. What is asserted is that both
    requests produce usage statistics and that the second request has positive
    prompt and completion token counts.

    Note: Gemini 2.0+ has implicit caching.
    NOTE(review): confirm against Google's caching documentation that implicit
    caching applies to the ``gemini-2.0-flash`` model configured here.

    Skips when Gemini credentials are missing, or are rejected on the first
    request.
    """
    if not _has_gemini_credentials():
        pytest.skip("Gemini credentials not configured")

    client = GoogleAIClient()
    llm_config = LLMConfig(
        model="gemini-2.0-flash",
        model_endpoint_type="google_ai",
        model_endpoint="https://generativelanguage.googleapis.com",
        context_window=1048576,
        max_tokens=256,
    )

    # First request
    adapter1 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages1 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 2+2?")]),
    ]
    request_data1 = client.build_request_data(AgentType.letta_v1_agent, messages1, llm_config, tools=[])
    try:
        async for _ in adapter1.invoke_llm(request_data=request_data1, messages=messages1, tools=[], use_assistant_message=False):
            pass
    except LLMAuthenticationError:
        pytest.skip("Gemini credentials invalid")

    # Guard before dereferencing, matching the basic usage tests: a missing usage
    # should fail as an assertion, not as an AttributeError in the print below.
    assert adapter1.usage is not None, "adapter1.usage should not be None"
    print(f"Request 1 - prompt={adapter1.usage.prompt_tokens}, cached={adapter1.usage.cached_input_tokens}")

    # Second request
    adapter2 = SimpleLLMRequestAdapter(llm_client=client, llm_config=llm_config, call_type=LLMCallType.agent_step)
    messages2 = [
        Message(role=MessageRole.system, content=[TextContent(text=LARGE_SYSTEM_PROMPT)]),
        Message(role=MessageRole.user, content=[TextContent(text="What is 3+3?")]),
    ]
    request_data2 = client.build_request_data(AgentType.letta_v1_agent, messages2, llm_config, tools=[])
    # Credentials were accepted on the first request, so no skip handling here.
    async for _ in adapter2.invoke_llm(request_data=request_data2, messages=messages2, tools=[], use_assistant_message=False):
        pass

    assert adapter2.usage is not None, "adapter2.usage should not be None"
    print(f"Request 2 - prompt={adapter2.usage.prompt_tokens}, cached={adapter2.usage.cached_input_tokens}")

    # Verify basic usage
    assert adapter2.usage.prompt_tokens > 0, f"prompt_tokens should be > 0, got {adapter2.usage.prompt_tokens}"
    assert adapter2.usage.completion_tokens > 0, f"completion_tokens should be > 0, got {adapter2.usage.completion_tokens}"

    if adapter2.usage.cached_input_tokens and adapter2.usage.cached_input_tokens > 0:
        print(f"SUCCESS: Gemini cache hit! cached_input_tokens={adapter2.usage.cached_input_tokens}")
    else:
        print("INFO: No cache hit detected")