diff --git a/letta/adapters/letta_llm_request_adapter.py b/letta/adapters/letta_llm_request_adapter.py
index fdcf050b..f045c0c2 100644
--- a/letta/adapters/letta_llm_request_adapter.py
+++ b/letta/adapters/letta_llm_request_adapter.py
@@ -6,6 +6,7 @@ from letta.otel.tracing import log_attributes, log_event, safe_json_dumps, trace
 from letta.schemas.letta_message import LettaMessage
 from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, TextContent
 from letta.schemas.provider_trace import ProviderTraceCreate
+from letta.schemas.usage import normalize_cache_tokens, normalize_reasoning_tokens
 from letta.schemas.user import User
 from letta.settings import settings
 from letta.utils import safe_create_task
@@ -82,6 +83,11 @@ class LettaLLMRequestAdapter(LettaLLMAdapter):
         self.usage.prompt_tokens = self.chat_completions_response.usage.prompt_tokens
         self.usage.total_tokens = self.chat_completions_response.usage.total_tokens
 
+        # Extract cache and reasoning token details using normalized helpers
+        usage = self.chat_completions_response.usage
+        self.usage.cached_input_tokens, self.usage.cache_write_tokens = normalize_cache_tokens(usage.prompt_tokens_details)
+        self.usage.reasoning_tokens = normalize_reasoning_tokens(usage.completion_tokens_details)
+
         self.log_provider_trace(step_id=step_id, actor=actor)
 
         yield None
diff --git a/letta/adapters/letta_llm_stream_adapter.py b/letta/adapters/letta_llm_stream_adapter.py
index 4ad4bf92..0c3c4ae2 100644
--- a/letta/adapters/letta_llm_stream_adapter.py
+++ b/letta/adapters/letta_llm_stream_adapter.py
@@ -117,15 +117,47 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
             if not output_tokens and hasattr(self.interface, "fallback_output_tokens"):
                 output_tokens = self.interface.fallback_output_tokens
 
-            # NOTE: For Anthropic, input_tokens is NON-cached only, so total_tokens here
-            # undercounts the actual total (missing cache_read + cache_creation tokens).
-            # For OpenAI/Gemini, input_tokens is already the total, so this is correct.
-            # See simple_llm_stream_adapter.py for the proper provider-aware calculation.
+            # Extract cache token data (OpenAI/Gemini use cached_tokens, Anthropic uses cache_read_tokens)
+            # None means provider didn't report, 0 means provider reported 0
+            cached_input_tokens = None
+            if hasattr(self.interface, "cached_tokens") and self.interface.cached_tokens is not None:
+                cached_input_tokens = self.interface.cached_tokens
+            elif hasattr(self.interface, "cache_read_tokens") and self.interface.cache_read_tokens is not None:
+                cached_input_tokens = self.interface.cache_read_tokens
+
+            # Extract cache write tokens (Anthropic only)
+            cache_write_tokens = None
+            if hasattr(self.interface, "cache_creation_tokens") and self.interface.cache_creation_tokens is not None:
+                cache_write_tokens = self.interface.cache_creation_tokens
+
+            # Extract reasoning tokens (OpenAI o1/o3 models use reasoning_tokens, Gemini uses thinking_tokens)
+            reasoning_tokens = None
+            if hasattr(self.interface, "reasoning_tokens") and self.interface.reasoning_tokens is not None:
+                reasoning_tokens = self.interface.reasoning_tokens
+            elif hasattr(self.interface, "thinking_tokens") and self.interface.thinking_tokens is not None:
+                reasoning_tokens = self.interface.thinking_tokens
+
+            # Calculate actual total input tokens
+            #
+            # ANTHROPIC: input_tokens is NON-cached only, must add cache tokens
+            #   Total = input_tokens + cache_read_input_tokens + cache_creation_input_tokens
+            #
+            # OPENAI/GEMINI: input_tokens is already TOTAL
+            #   cached_tokens is a subset, NOT additive
+            is_anthropic = hasattr(self.interface, "cache_read_tokens") or hasattr(self.interface, "cache_creation_tokens")
+            if is_anthropic:
+                actual_input_tokens = (input_tokens or 0) + (cached_input_tokens or 0) + (cache_write_tokens or 0)
+            else:
+                actual_input_tokens = input_tokens or 0
+
             self.usage = LettaUsageStatistics(
                 step_count=1,
                 completion_tokens=output_tokens or 0,
-                prompt_tokens=input_tokens or 0,
-                total_tokens=(input_tokens or 0) + (output_tokens or 0),
+                prompt_tokens=actual_input_tokens,
+                total_tokens=actual_input_tokens + (output_tokens or 0),
+                cached_input_tokens=cached_input_tokens,
+                cache_write_tokens=cache_write_tokens,
+                reasoning_tokens=reasoning_tokens,
             )
         else:
             # Default usage statistics if not available
diff --git a/letta/adapters/simple_llm_stream_adapter.py b/letta/adapters/simple_llm_stream_adapter.py
index 91d5e211..89f94099 100644
--- a/letta/adapters/simple_llm_stream_adapter.py
+++ b/letta/adapters/simple_llm_stream_adapter.py
@@ -200,7 +200,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
             self.usage = LettaUsageStatistics(
                 step_count=1,
                 completion_tokens=output_tokens or 0,
-                prompt_tokens=input_tokens or 0,
+                prompt_tokens=actual_input_tokens,
                 total_tokens=actual_input_tokens + (output_tokens or 0),
                 cached_input_tokens=cached_input_tokens,
                 cache_write_tokens=cache_write_tokens,
diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py
index 17829366..25bf2ddd 100644
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -896,7 +896,7 @@ class AnthropicClient(LLMClientBase):
             created=get_utc_time_int(),
             model=response.model,
             usage=UsageStatistics(
-                prompt_tokens=prompt_tokens,
+                prompt_tokens=actual_input_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=actual_input_tokens + completion_tokens,
                 prompt_tokens_details=prompt_tokens_details,
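For context, the provider-aware accounting the hunks above rely on can be exercised in isolation. The sketch below is illustrative only, not code from this PR: total_input_tokens and its parameters are hypothetical names chosen to mirror the diff's comments (Anthropic's input_tokens excludes cache reads/writes and must have them added back; OpenAI/Gemini's input_tokens is already the total, with cached_tokens as a subset).

# Illustrative sketch only -- not the PR's implementation.
from typing import Optional

def total_input_tokens(
    input_tokens: Optional[int],
    cache_read_tokens: Optional[int] = None,      # Anthropic: prompt tokens served from cache
    cache_creation_tokens: Optional[int] = None,  # Anthropic: prompt tokens written to cache
    is_anthropic: bool = False,
) -> int:
    # Anthropic reports input_tokens EXCLUDING cache reads/writes,
    # so both cache counts must be added back in.
    if is_anthropic:
        return (input_tokens or 0) + (cache_read_tokens or 0) + (cache_creation_tokens or 0)
    # OpenAI/Gemini report input_tokens as the full total;
    # cached_tokens is a subset of it, never additive.
    return input_tokens or 0

# Anthropic step: 120 uncached + 2000 cache-read + 300 cache-write = 2420
assert total_input_tokens(120, 2000, 300, is_anthropic=True) == 2420
# OpenAI step: 2420 total prompt tokens (2000 of them cached) stays 2420
assert total_input_tokens(2420) == 2420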