From c801866d89f1a74393cd223f00939cee1e182d3e Mon Sep 17 00:00:00 2001
From: jnjpng <jin@letta.com>
Date: Wed, 4 Feb 2026 18:14:32 -0800
Subject: [PATCH] feat: add context token estimates to llm usage (#9295)

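* base: add optional `context_tokens` field to `LettaUsageStatistics`

* generate: regenerate `fern/openapi.json` with the new field

* update: set `usage.context_tokens` from `context_token_estimate` in `LettaAgentV3`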
---
 fern/openapi.json              | 12 ++++++++++++
 letta/agents/letta_agent_v3.py |  5 +++++
 letta/schemas/usage.py         |  6 ++++++
 3 files changed, 23 insertions(+)

diff --git a/fern/openapi.json b/fern/openapi.json
index 3230ece0..2135a1f2 100644
--- a/fern/openapi.json
+++ b/fern/openapi.json
@@ -37646,6 +37646,18 @@
             ],
             "title": "Reasoning Tokens",
             "description": "The number of reasoning/thinking tokens generated. None if not reported by provider."
+          },
+          "context_tokens": {
+            "anyOf": [
+              {
+                "type": "integer"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "title": "Context Tokens",
+            "description": "Estimate of tokens currently in the context window."
           }
         },
         "type": "object",
diff --git a/letta/agents/letta_agent_v3.py b/letta/agents/letta_agent_v3.py
index 9e52e0df..bd111eee 100644
--- a/letta/agents/letta_agent_v3.py
+++ b/letta/agents/letta_agent_v3.py
@@ -290,6 +290,8 @@ class LettaAgentV3(LettaAgentV2):
         )
         if include_return_message_types:
             response_letta_messages = [m for m in response_letta_messages if m.message_type in include_return_message_types]
+        # Report the estimated current context window size as context_tokens (vs accumulated prompt_tokens)
+        self.usage.context_tokens = self.context_token_estimate
         result = LettaResponse(messages=response_letta_messages, stop_reason=self.stop_reason, usage=self.usage)
         if run_id:
             if self.job_update_metadata is None:
@@ -480,6 +482,9 @@ class LettaAgentV3(LettaAgentV2):
 
         # Cleanup and finalize (only runs if no exception occurred)
         try:
+            # Report the estimated current context window size as context_tokens (vs accumulated prompt_tokens)
+            self.usage.context_tokens = self.context_token_estimate
+
             if run_id:
                 # Filter out LettaStopReason from messages (only valid in LettaStreamingResponse, not LettaResponse)
                 filtered_messages = [m for m in response_letta_messages if not isinstance(m, LettaStopReason)]
diff --git a/letta/schemas/usage.py b/letta/schemas/usage.py
index d2f5191d..c066423f 100644
--- a/letta/schemas/usage.py
+++ b/letta/schemas/usage.py
@@ -127,6 +127,12 @@ class LettaUsageStatistics(BaseModel):
         None, description="The number of reasoning/thinking tokens generated. None if not reported by provider."
     )
 
+    # Context window tracking
+    context_tokens: Optional[int] = Field(
+        None,
+        description="Estimate of tokens currently in the context window.",
+    )
+
     def to_usage(self, provider_type: Optional["ProviderType"] = None) -> "UsageStatistics":
         """Convert to UsageStatistics (OpenAI-compatible format).