fix(core): patch usage data tracking for anthropic when context caching is on (#6997)

Author:    Charles Packer
Date:      2025-12-14 18:14:09 -08:00
Committer: Caren Thomas
Parent:    a731e01e88
Commit:    33d39f4643
4 changed files with 46 additions and 8 deletions
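
Background for the fix: Anthropic's usage payloads report input_tokens as the NON-cached portion of the prompt only, with cache reads and writes broken out into cache_read_input_tokens and cache_creation_input_tokens, whereas OpenAI and Gemini report the prompt total directly (cached tokens are a subset of it, not additive). A minimal sketch of the corrected arithmetic, with invented token counts purely for illustration:

    # Field names follow the providers' public usage payloads; the numbers are made up.
    anthropic_usage = {
        "input_tokens": 12,                   # non-cached portion only
        "cache_read_input_tokens": 4000,      # tokens read from the prompt cache
        "cache_creation_input_tokens": 1000,  # tokens written to the prompt cache
    }
    openai_usage = {
        "prompt_tokens": 5012,  # already the full total
        "cached_tokens": 4000,  # subset of prompt_tokens, NOT additive
    }

    # Anthropic: total input = non-cached + cache reads + cache writes
    anthropic_total = (
        anthropic_usage["input_tokens"]
        + anthropic_usage["cache_read_input_tokens"]
        + anthropic_usage["cache_creation_input_tokens"]
    )

    # OpenAI/Gemini: prompt_tokens is already the total
    openai_total = openai_usage["prompt_tokens"]

    assert anthropic_total == openai_total == 5012  # the old code reported 12 for Anthropic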


@@ -117,15 +117,47 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
             if not output_tokens and hasattr(self.interface, "fallback_output_tokens"):
                 output_tokens = self.interface.fallback_output_tokens
 
-            # NOTE: For Anthropic, input_tokens is NON-cached only, so total_tokens here
-            # undercounts the actual total (missing cache_read + cache_creation tokens).
-            # For OpenAI/Gemini, input_tokens is already the total, so this is correct.
-            # See simple_llm_stream_adapter.py for the proper provider-aware calculation.
+            # Extract cache token data (OpenAI/Gemini use cached_tokens, Anthropic uses cache_read_tokens)
+            # None means provider didn't report, 0 means provider reported 0
+            cached_input_tokens = None
+            if hasattr(self.interface, "cached_tokens") and self.interface.cached_tokens is not None:
+                cached_input_tokens = self.interface.cached_tokens
+            elif hasattr(self.interface, "cache_read_tokens") and self.interface.cache_read_tokens is not None:
+                cached_input_tokens = self.interface.cache_read_tokens
+
+            # Extract cache write tokens (Anthropic only)
+            cache_write_tokens = None
+            if hasattr(self.interface, "cache_creation_tokens") and self.interface.cache_creation_tokens is not None:
+                cache_write_tokens = self.interface.cache_creation_tokens
+
+            # Extract reasoning tokens (OpenAI o1/o3 models use reasoning_tokens, Gemini uses thinking_tokens)
+            reasoning_tokens = None
+            if hasattr(self.interface, "reasoning_tokens") and self.interface.reasoning_tokens is not None:
+                reasoning_tokens = self.interface.reasoning_tokens
+            elif hasattr(self.interface, "thinking_tokens") and self.interface.thinking_tokens is not None:
+                reasoning_tokens = self.interface.thinking_tokens
+
+            # Calculate actual total input tokens
+            #
+            # ANTHROPIC: input_tokens is NON-cached only, must add cache tokens
+            #   Total = input_tokens + cache_read_input_tokens + cache_creation_input_tokens
+            #
+            # OPENAI/GEMINI: input_tokens is already TOTAL
+            #   cached_tokens is a subset, NOT additive
+            is_anthropic = hasattr(self.interface, "cache_read_tokens") or hasattr(self.interface, "cache_creation_tokens")
+            if is_anthropic:
+                actual_input_tokens = (input_tokens or 0) + (cached_input_tokens or 0) + (cache_write_tokens or 0)
+            else:
+                actual_input_tokens = input_tokens or 0
+
             self.usage = LettaUsageStatistics(
                 step_count=1,
                 completion_tokens=output_tokens or 0,
-                prompt_tokens=input_tokens or 0,
-                total_tokens=(input_tokens or 0) + (output_tokens or 0),
+                prompt_tokens=actual_input_tokens,
+                total_tokens=actual_input_tokens + (output_tokens or 0),
+                cached_input_tokens=cached_input_tokens,
+                cache_write_tokens=cache_write_tokens,
+                reasoning_tokens=reasoning_tokens,
             )
         else:
             # Default usage statistics if not available
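
For a quick sanity check of the new Anthropic branch outside the adapter, a stub can stand in for self.interface (the SimpleNamespace below is an illustration, not part of the codebase, and the token counts are invented):

    from types import SimpleNamespace

    # Hypothetical post-stream state for an Anthropic run; attribute names match
    # the ones the adapter probes with hasattr() above.
    iface = SimpleNamespace(cache_read_tokens=4000, cache_creation_tokens=1000)
    input_tokens, output_tokens = 12, 250

    is_anthropic = hasattr(iface, "cache_read_tokens") or hasattr(iface, "cache_creation_tokens")
    if is_anthropic:
        actual_input_tokens = (input_tokens or 0) + (iface.cache_read_tokens or 0) + (iface.cache_creation_tokens or 0)
    else:
        actual_input_tokens = input_tokens or 0

    assert actual_input_tokens == 5012                  # the old code would have reported 12
    assert actual_input_tokens + output_tokens == 5262  # total_tokens now includes cache traffic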