Files
letta-server/tests/test_openai_prompt_cache_request_fields.py
Ari Webb 21765d16c9 fix(core): add OpenAI 24h prompt cache retention for supported models (#9509)
* fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492)

* fix(core): apply OpenAI prompt cache settings to request payloads

Set prompt_cache_key using agent and conversation context on both Responses and Chat Completions request builders, and enable 24h retention only for supported OpenAI models while excluding OpenRouter paths.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): prefix prompt cache key with letta tag

Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* add integration test

* skip test

---------

Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Ari Webb <ari@letta.com>

* fix(core): only set prompt_cache_retention, drop prompt_cache_key

Two issues with the original prompt_cache_key approach:
1. Key exceeded 64-char max (agent-<uuid>:conv-<uuid> = 90 chars)
2. Setting an explicit key disrupted OpenAI's default prefix-hash
   routing, dropping cache hit rates from 40-45% to 10-13%

OpenAI's default routing (hash of first ~256 tokens) already provides
good cache affinity since each agent has a unique system prompt.
We only need prompt_cache_retention="24h" for extended retention.

Also fixes:
- Operator precedence bug in _supports_extended_prompt_cache_retention
- Removes incorrect gpt-5.2-codex exclusion (it IS supported per docs)

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Charles Packer <packercharles@gmail.com>
Co-authored-by: Letta <noreply@letta.com>
2026-02-24 10:55:11 -08:00

151 lines
4.6 KiB
Python

from letta.llm_api.openai_client import OpenAIClient
from letta.schemas.enums import AgentType, MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
def _message(text: str = "hello") -> Message:
return Message(
role=MessageRole.user,
content=[TextContent(text=text)],
agent_id="agent-abc",
)
def _openai_config(model: str, endpoint_type: str = "openai", provider_name: str | None = "openai") -> LLMConfig:
return LLMConfig(
model=model,
model_endpoint_type=endpoint_type,
model_endpoint="https://api.openai.com/v1",
context_window=256000,
provider_name=provider_name,
)
def test_responses_request_sets_24h_retention_for_supported_model():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-5.1")
messages = [_message()]
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "input" in request_data
assert "prompt_cache_key" not in request_data
assert request_data.get("prompt_cache_retention") == "24h"
def test_responses_request_omits_24h_for_unsupported_model():
client = OpenAIClient()
llm_config = _openai_config(model="o3-mini")
messages = [_message()]
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "prompt_cache_key" not in request_data
assert "prompt_cache_retention" not in request_data
def test_chat_completions_request_sets_24h_retention_for_supported_model():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-4.1")
messages = [_message()]
request_data = client.build_request_data(
agent_type=AgentType.memgpt_v2_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "messages" in request_data
assert "prompt_cache_key" not in request_data
assert request_data.get("prompt_cache_retention") == "24h"
def test_chat_completions_request_omits_24h_for_unsupported_model():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-4o-mini")
messages = [_message()]
request_data = client.build_request_data(
agent_type=AgentType.memgpt_v2_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "prompt_cache_key" not in request_data
assert "prompt_cache_retention" not in request_data
def test_openrouter_request_omits_all_prompt_cache_fields():
client = OpenAIClient()
llm_config = LLMConfig(
model="gpt-5.1",
handle="openrouter/gpt-5.1",
model_endpoint_type="openai",
model_endpoint="https://openrouter.ai/api/v1",
context_window=256000,
provider_name="openrouter",
)
messages = [_message()]
responses_request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
chat_request_data = client.build_request_data(
agent_type=AgentType.memgpt_v2_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "prompt_cache_key" not in responses_request_data
assert "prompt_cache_retention" not in responses_request_data
assert "prompt_cache_key" not in chat_request_data
assert "prompt_cache_retention" not in chat_request_data
def test_gpt5_family_gets_24h_retention():
"""gpt-5, gpt-5-codex, gpt-5.1, gpt-5.2 all get 24h retention."""
client = OpenAIClient()
for model in ["gpt-5", "gpt-5-codex", "gpt-5.1", "gpt-5.1-codex", "gpt-5.2"]:
llm_config = _openai_config(model=model)
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=[_message()],
llm_config=llm_config,
tools=[],
)
assert request_data.get("prompt_cache_retention") == "24h", f"{model} should get 24h retention"
def test_gpt5_mini_excluded_from_24h_retention():
"""gpt-5-mini is not listed in OpenAI docs for extended retention."""
client = OpenAIClient()
llm_config = _openai_config(model="gpt-5-mini")
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=[_message()],
llm_config=llm_config,
tools=[],
)
assert "prompt_cache_retention" not in request_data