fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492)
* fix(core): apply OpenAI prompt cache settings to request payloads Set prompt_cache_key using agent and conversation context on both Responses and Chat Completions request builders, and enable 24h retention only for supported OpenAI models while excluding OpenRouter paths. 👾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * fix(core): prefix prompt cache key with letta tag Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics. 👾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * add integration test * skip test --------- Co-authored-by: Letta <noreply@letta.com> Co-authored-by: Ari Webb <ari@letta.com>
This commit is contained in:
committed by
Caren Thomas
parent
5b001a7749
commit
619e81ed1e
@@ -247,6 +247,81 @@ class OpenAIClient(LLMClientBase):
|
||||
def supports_structured_output(self, llm_config: LLMConfig) -> bool:
    """Report whether the configured model supports structured output.

    Thin wrapper that delegates to the module-level
    ``supports_structured_output`` helper with this request's config.
    """
    return supports_structured_output(llm_config)
|
||||
|
||||
def _is_openrouter_request(self, llm_config: LLMConfig) -> bool:
    """Return True when this request is routed through OpenRouter.

    Detected either by the endpoint host containing ``openrouter.ai`` or
    by an explicit ``provider_name`` of ``"openrouter"``.
    """
    endpoint = llm_config.model_endpoint
    if endpoint and "openrouter.ai" in endpoint:
        return True
    return llm_config.provider_name == "openrouter"
|
||||
|
||||
def _is_true_openai_request(self, llm_config: LLMConfig) -> bool:
    """Return True only for requests that go straight to OpenAI's own API.

    OpenAI-specific prompt-caching fields must not be sent to OpenRouter,
    to the Letta inference endpoint, or to any other provider that merely
    speaks the OpenAI-compatible protocol.
    """
    return (
        llm_config.model_endpoint_type == "openai"
        and not self._is_openrouter_request(llm_config)
        # Keep Letta inference endpoint behavior unchanged.
        and llm_config.model_endpoint != LETTA_MODEL_ENDPOINT
        # If provider_name is explicitly set and not openai, don't apply
        # OpenAI-specific prompt caching fields.
        and not (llm_config.provider_name and llm_config.provider_name != "openai")
    )
|
||||
|
||||
def _normalize_model_name(self, model: Optional[str]) -> Optional[str]:
|
||||
if not model:
|
||||
return None
|
||||
return model.split("/", 1)[-1]
|
||||
|
||||
def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool:
|
||||
normalized_model = self._normalize_model_name(model)
|
||||
if not normalized_model:
|
||||
return False
|
||||
|
||||
# Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models but not gpt-5-mini or gpt-5.2-codex.
|
||||
exceptions = ["gpt-5-mini", "gpt-5.2-codex"]
|
||||
return normalized_model == "gpt-4.1" or normalized_model.startswith("gpt-5") and normalized_model not in exceptions
|
||||
|
||||
def _build_prompt_cache_key(self, messages: List[PydanticMessage]) -> Optional[str]:
    """Derive a stable OpenAI ``prompt_cache_key`` from message context.

    Scans the messages newest-first for the most recent non-empty agent_id
    and conversation_id, falling back to the telemetry agent id when no
    message carries one. Returns None if no agent id can be determined.
    """
    agent_id: Optional[str] = None
    conversation_id: Optional[str] = None

    for candidate in reversed(messages):
        if agent_id is None:
            found_agent = getattr(candidate, "agent_id", None)
            if found_agent:
                agent_id = found_agent
        if conversation_id is None:
            found_conv = getattr(candidate, "conversation_id", None)
            if found_conv:
                conversation_id = found_conv
        if agent_id is not None and conversation_id is not None:
            break

    if agent_id is None:
        agent_id = self._telemetry_agent_id
    if agent_id is None:
        return None

    # Use requested fallback string for non-conversation/default-conversation paths.
    if not conversation_id or conversation_id == "default":
        conversation_id = "defaultconv"

    # "letta:" prefix makes cache entries identifiable in provider-side logs.
    return f"letta:{agent_id}:{conversation_id}"
|
||||
|
||||
def _apply_prompt_cache_settings(
    self,
    llm_config: LLMConfig,
    model: Optional[str],
    messages: List[PydanticMessage],
    request_obj: Any,
) -> None:
    """Attach OpenAI prompt-caching fields to an outgoing request object.

    No-op unless the request targets OpenAI's API directly. Sets
    ``prompt_cache_key`` when one can be derived from the message context,
    and opts into 24h cache retention on models that support it.
    """
    if not self._is_true_openai_request(llm_config):
        return

    cache_key = self._build_prompt_cache_key(messages)
    if cache_key:
        request_obj.prompt_cache_key = cache_key

    if self._supports_extended_prompt_cache_retention(model):
        request_obj.prompt_cache_retention = "24h"
|
||||
|
||||
@trace_method
|
||||
def build_request_data_responses(
|
||||
self,
|
||||
@@ -387,6 +462,13 @@ class OpenAIClient(LLMClientBase):
|
||||
|
||||
data.model = "memgpt-openai"
|
||||
|
||||
self._apply_prompt_cache_settings(
|
||||
llm_config=llm_config,
|
||||
model=model,
|
||||
messages=messages,
|
||||
request_obj=data,
|
||||
)
|
||||
|
||||
request_data = data.model_dump(exclude_unset=True)
|
||||
# print("responses request data", request_data)
|
||||
return request_data
|
||||
@@ -455,9 +537,7 @@ class OpenAIClient(LLMClientBase):
|
||||
model = None
|
||||
|
||||
# TODO: we may need to extend this to more models using proxy?
|
||||
is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
|
||||
llm_config.provider_name == "openrouter"
|
||||
)
|
||||
is_openrouter = self._is_openrouter_request(llm_config)
|
||||
if is_openrouter:
|
||||
try:
|
||||
model = llm_config.handle.split("/", 1)[-1]
|
||||
@@ -560,6 +640,13 @@ class OpenAIClient(LLMClientBase):
|
||||
new_tools.append(tool.model_copy(deep=True))
|
||||
data.tools = new_tools
|
||||
|
||||
self._apply_prompt_cache_settings(
|
||||
llm_config=llm_config,
|
||||
model=model,
|
||||
messages=messages,
|
||||
request_obj=data,
|
||||
)
|
||||
|
||||
# Note: Tools are already processed by enable_strict_mode() in the workflow/agent code
|
||||
# (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here.
|
||||
# enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields
|
||||
|
||||
Reference in New Issue
Block a user