fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492)

* fix(core): apply OpenAI prompt cache settings to request payloads

Set prompt_cache_key from agent and conversation context in both the Responses and Chat Completions request builders, and enable 24h prompt cache retention only for supported OpenAI models, excluding OpenRouter and Letta inference endpoint paths.
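As a rough illustration (not the literal diff), assuming an agent id and conversation id are available and the model supports extended retention, the outgoing OpenAI payload gains two extra fields shaped roughly like this (ids are made up):

```python
# Illustrative only: the extra fields this change adds to an OpenAI request payload.
request_data = {
    "model": "gpt-4.1",
    "messages": ["..."],
    # Stable per-agent / per-conversation key so repeated requests hit the same prompt cache.
    "prompt_cache_key": "letta:agent-123:conv-456",
    # Only set for models that support extended prompt cache retention.
    "prompt_cache_retention": "24h",
}
```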

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): prefix prompt cache key with letta tag

Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics.
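For example (with made-up ids), generated keys now look like:

```python
# Illustrative key shapes after this change; agent/conversation ids are hypothetical.
key_with_conversation = "letta:agent-123:conv-456"
key_without_conversation = "letta:agent-123:defaultconv"  # missing or "default" conversation falls back
```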

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* add integration test

* skip test

---------

Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Ari Webb <ari@letta.com>
Author: Charles Packer
Date: 2026-02-16 13:27:42 -08:00
Committed by: Caren Thomas
Parent: 5b001a7749
Commit: 619e81ed1e
5 changed files with 444 additions and 3 deletions

@@ -247,6 +247,81 @@ class OpenAIClient(LLMClientBase):
     def supports_structured_output(self, llm_config: LLMConfig) -> bool:
         return supports_structured_output(llm_config)
 
+    def _is_openrouter_request(self, llm_config: LLMConfig) -> bool:
+        return (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
+            llm_config.provider_name == "openrouter"
+        )
+
+    def _is_true_openai_request(self, llm_config: LLMConfig) -> bool:
+        if llm_config.model_endpoint_type != "openai":
+            return False
+        if self._is_openrouter_request(llm_config):
+            return False
+        # Keep Letta inference endpoint behavior unchanged.
+        if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT:
+            return False
+        # If provider_name is explicitly set and not openai, don't apply OpenAI-specific prompt caching fields.
+        if llm_config.provider_name and llm_config.provider_name != "openai":
+            return False
+        return True
+
+    def _normalize_model_name(self, model: Optional[str]) -> Optional[str]:
+        if not model:
+            return None
+        return model.split("/", 1)[-1]
+
+    def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool:
+        normalized_model = self._normalize_model_name(model)
+        if not normalized_model:
+            return False
+        # Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models but not gpt-5-mini or gpt-5.2-codex.
+        exceptions = ["gpt-5-mini", "gpt-5.2-codex"]
+        return normalized_model == "gpt-4.1" or (normalized_model.startswith("gpt-5") and normalized_model not in exceptions)
+
+    def _build_prompt_cache_key(self, messages: List[PydanticMessage]) -> Optional[str]:
+        agent_id = None
+        conversation_id = None
+        for message in reversed(messages):
+            if agent_id is None and getattr(message, "agent_id", None):
+                agent_id = message.agent_id
+            if conversation_id is None and getattr(message, "conversation_id", None):
+                conversation_id = message.conversation_id
+            if agent_id is not None and conversation_id is not None:
+                break
+        if agent_id is None:
+            agent_id = self._telemetry_agent_id
+        if agent_id is None:
+            return None
+        # Use requested fallback string for non-conversation/default-conversation paths.
+        if not conversation_id or conversation_id == "default":
+            conversation_id = "defaultconv"
+        return f"letta:{agent_id}:{conversation_id}"
+
+    def _apply_prompt_cache_settings(
+        self,
+        llm_config: LLMConfig,
+        model: Optional[str],
+        messages: List[PydanticMessage],
+        request_obj: Any,
+    ) -> None:
+        if not self._is_true_openai_request(llm_config):
+            return
+        prompt_cache_key = self._build_prompt_cache_key(messages)
+        if prompt_cache_key:
+            request_obj.prompt_cache_key = prompt_cache_key
+        if self._supports_extended_prompt_cache_retention(model):
+            request_obj.prompt_cache_retention = "24h"
+
     @trace_method
     def build_request_data_responses(
         self,
@@ -387,6 +462,13 @@ class OpenAIClient(LLMClientBase):
             data.model = "memgpt-openai"
 
+        self._apply_prompt_cache_settings(
+            llm_config=llm_config,
+            model=model,
+            messages=messages,
+            request_obj=data,
+        )
+
         request_data = data.model_dump(exclude_unset=True)
         # print("responses request data", request_data)
 
         return request_data
@@ -455,9 +537,7 @@ class OpenAIClient(LLMClientBase):
         model = None
 
         # TODO: we may need to extend this to more models using proxy?
-        is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
-            llm_config.provider_name == "openrouter"
-        )
+        is_openrouter = self._is_openrouter_request(llm_config)
         if is_openrouter:
             try:
                 model = llm_config.handle.split("/", 1)[-1]
@@ -560,6 +640,13 @@ class OpenAIClient(LLMClientBase):
                 new_tools.append(tool.model_copy(deep=True))
             data.tools = new_tools
 
+        self._apply_prompt_cache_settings(
+            llm_config=llm_config,
+            model=model,
+            messages=messages,
+            request_obj=data,
+        )
+
         # Note: Tools are already processed by enable_strict_mode() in the workflow/agent code
         # (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here.
         # enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields
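
The integration test mentioned in the commit message presumably lives in one of the other changed files and is not shown above. As a standalone sketch of the retention gate introduced in this diff, mirroring `_normalize_model_name` and `_supports_extended_prompt_cache_retention` without importing the real client:

```python
# Standalone sketch, not the repository's test: mirrors the retention-gating logic above.
from typing import Optional


def supports_24h_retention(model: Optional[str]) -> bool:
    if not model:
        return False
    name = model.split("/", 1)[-1]  # normalize e.g. "openai/gpt-5" -> "gpt-5"
    exceptions = ["gpt-5-mini", "gpt-5.2-codex"]
    return name == "gpt-4.1" or (name.startswith("gpt-5") and name not in exceptions)


assert supports_24h_retention("gpt-4.1")
assert supports_24h_retention("openai/gpt-5")
assert not supports_24h_retention("gpt-5-mini")
assert not supports_24h_retention("gpt-4o")
```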