fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492)

* fix(core): apply OpenAI prompt cache settings to request payloads

Set prompt_cache_key from agent and conversation context in both the Responses and Chat Completions request builders, and enable 24h prompt cache retention only for supported OpenAI models, excluding OpenRouter and Letta inference endpoint paths.
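As a rough illustration (not the literal diff), assuming an agent id and conversation id are available and the model supports extended retention, the outgoing OpenAI payload gains two extra fields shaped roughly like this (ids are made up):

```python
# Illustrative only: the extra fields this change adds to an OpenAI request payload.
request_data = {
    "model": "gpt-4.1",
    "messages": ["..."],
    # Stable per-agent / per-conversation key so repeated requests hit the same prompt cache.
    "prompt_cache_key": "letta:agent-123:conv-456",
    # Only set for models that support extended prompt cache retention.
    "prompt_cache_retention": "24h",
}
```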

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): prefix prompt cache key with letta tag

Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics.
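For example (with made-up ids), generated keys now look like:

```python
# Illustrative key shapes after this change; agent/conversation ids are hypothetical.
key_with_conversation = "letta:agent-123:conv-456"
key_without_conversation = "letta:agent-123:defaultconv"  # missing or "default" conversation falls back
```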

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* add integration test

* skip test

---------

Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Ari Webb <ari@letta.com>
Author: Charles Packer
Date: 2026-02-16 13:27:42 -08:00
Committed by: Caren Thomas
Parent: 5b001a7749
Commit: 619e81ed1e
5 changed files with 444 additions and 3 deletions

@@ -247,6 +247,81 @@ class OpenAIClient(LLMClientBase):
     def supports_structured_output(self, llm_config: LLMConfig) -> bool:
         return supports_structured_output(llm_config)
 
+    def _is_openrouter_request(self, llm_config: LLMConfig) -> bool:
+        return (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
+            llm_config.provider_name == "openrouter"
+        )
+
+    def _is_true_openai_request(self, llm_config: LLMConfig) -> bool:
+        if llm_config.model_endpoint_type != "openai":
+            return False
+        if self._is_openrouter_request(llm_config):
+            return False
+        # Keep Letta inference endpoint behavior unchanged.
+        if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT:
+            return False
+        # If provider_name is explicitly set and not openai, don't apply OpenAI-specific prompt caching fields.
+        if llm_config.provider_name and llm_config.provider_name != "openai":
+            return False
+        return True
+
+    def _normalize_model_name(self, model: Optional[str]) -> Optional[str]:
+        if not model:
+            return None
+        return model.split("/", 1)[-1]
+
+    def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool:
+        normalized_model = self._normalize_model_name(model)
+        if not normalized_model:
+            return False
+        # Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models but not gpt-5-mini or gpt-5.2-codex.
+        exceptions = ["gpt-5-mini", "gpt-5.2-codex"]
+        return normalized_model == "gpt-4.1" or (normalized_model.startswith("gpt-5") and normalized_model not in exceptions)
+
+    def _build_prompt_cache_key(self, messages: List[PydanticMessage]) -> Optional[str]:
+        agent_id = None
+        conversation_id = None
+        for message in reversed(messages):
+            if agent_id is None and getattr(message, "agent_id", None):
+                agent_id = message.agent_id
+            if conversation_id is None and getattr(message, "conversation_id", None):
+                conversation_id = message.conversation_id
+            if agent_id is not None and conversation_id is not None:
+                break
+        if agent_id is None:
+            agent_id = self._telemetry_agent_id
+        if agent_id is None:
+            return None
+        # Use requested fallback string for non-conversation/default-conversation paths.
+        if not conversation_id or conversation_id == "default":
+            conversation_id = "defaultconv"
+        return f"letta:{agent_id}:{conversation_id}"
+
+    def _apply_prompt_cache_settings(
+        self,
+        llm_config: LLMConfig,
+        model: Optional[str],
+        messages: List[PydanticMessage],
+        request_obj: Any,
+    ) -> None:
+        if not self._is_true_openai_request(llm_config):
+            return
+        prompt_cache_key = self._build_prompt_cache_key(messages)
+        if prompt_cache_key:
+            request_obj.prompt_cache_key = prompt_cache_key
+        if self._supports_extended_prompt_cache_retention(model):
+            request_obj.prompt_cache_retention = "24h"
+
     @trace_method
     def build_request_data_responses(
         self,
@@ -387,6 +462,13 @@ class OpenAIClient(LLMClientBase):
             data.model = "memgpt-openai"
 
+        self._apply_prompt_cache_settings(
+            llm_config=llm_config,
+            model=model,
+            messages=messages,
+            request_obj=data,
+        )
+
         request_data = data.model_dump(exclude_unset=True)
         # print("responses request data", request_data)
 
         return request_data
@@ -455,9 +537,7 @@ class OpenAIClient(LLMClientBase):
         model = None
 
         # TODO: we may need to extend this to more models using proxy?
-        is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
-            llm_config.provider_name == "openrouter"
-        )
+        is_openrouter = self._is_openrouter_request(llm_config)
         if is_openrouter:
             try:
                 model = llm_config.handle.split("/", 1)[-1]
@@ -560,6 +640,13 @@ class OpenAIClient(LLMClientBase):
                 new_tools.append(tool.model_copy(deep=True))
             data.tools = new_tools
 
+        self._apply_prompt_cache_settings(
+            llm_config=llm_config,
+            model=model,
+            messages=messages,
+            request_obj=data,
+        )
+
         # Note: Tools are already processed by enable_strict_mode() in the workflow/agent code
         # (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here.
         # enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields
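
The integration test mentioned in the commit message presumably lives in one of the other changed files and is not shown above. As a standalone sketch of the retention gate introduced in this diff, mirroring `_normalize_model_name` and `_supports_extended_prompt_cache_retention` without importing the real client:

```python
# Standalone sketch, not the repository's test: mirrors the retention-gating logic above.
from typing import Optional


def supports_24h_retention(model: Optional[str]) -> bool:
    if not model:
        return False
    name = model.split("/", 1)[-1]  # normalize e.g. "openai/gpt-5" -> "gpt-5"
    exceptions = ["gpt-5-mini", "gpt-5.2-codex"]
    return name == "gpt-4.1" or (name.startswith("gpt-5") and name not in exceptions)


assert supports_24h_retention("gpt-4.1")
assert supports_24h_retention("openai/gpt-5")
assert not supports_24h_retention("gpt-5-mini")
assert not supports_24h_retention("gpt-4o")
```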