fix(core): add OpenAI 24h prompt cache retention for supported models (#9509)
* fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492)

* fix(core): apply OpenAI prompt cache settings to request payloads

Set prompt_cache_key using agent and conversation context on both Responses and Chat Completions request builders, and enable 24h retention only for supported OpenAI models while excluding OpenRouter paths.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): prefix prompt cache key with letta tag

Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* add integration test

* skip test

---------

Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Ari Webb <ari@letta.com>

* fix(core): only set prompt_cache_retention, drop prompt_cache_key

Two issues with the original prompt_cache_key approach:

1. Key exceeded 64-char max (agent-<uuid>:conv-<uuid> = 90 chars)
2. Setting an explicit key disrupted OpenAI's default prefix-hash routing, dropping cache hit rates from 40-45% to 10-13%

OpenAI's default routing (hash of first ~256 tokens) already provides good cache affinity since each agent has a unique system prompt. We only need prompt_cache_retention="24h" for extended retention.

Also fixes:

- Operator precedence bug in _supports_extended_prompt_cache_retention
- Removes incorrect gpt-5.2-codex exclusion (it IS supported per docs)

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Charles Packer <packercharles@gmail.com>
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -245,6 +245,64 @@ class OpenAIClient(LLMClientBase):
|
||||
def supports_structured_output(self, llm_config: LLMConfig) -> bool:
|
||||
return supports_structured_output(llm_config)
|
||||
|
||||
def _is_openrouter_request(self, llm_config: LLMConfig) -> bool:
|
||||
return (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (llm_config.provider_name == "openrouter")
|
||||
|
||||
def _is_true_openai_request(self, llm_config: LLMConfig) -> bool:
|
||||
if llm_config.model_endpoint_type != "openai":
|
||||
return False
|
||||
|
||||
if self._is_openrouter_request(llm_config):
|
||||
return False
|
||||
|
||||
# Keep Letta inference endpoint behavior unchanged.
|
||||
if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT:
|
||||
return False
|
||||
|
||||
# If provider_name is explicitly set and not openai, don't apply OpenAI-specific prompt caching fields.
|
||||
if llm_config.provider_name and llm_config.provider_name != "openai":
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _normalize_model_name(self, model: Optional[str]) -> Optional[str]:
|
||||
if not model:
|
||||
return None
|
||||
return model.split("/", 1)[-1]
|
||||
|
||||
def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool:
|
||||
normalized_model = self._normalize_model_name(model)
|
||||
if not normalized_model:
|
||||
return False
|
||||
|
||||
# Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models.
|
||||
# gpt-5-mini is excluded (not listed in docs).
|
||||
return normalized_model == "gpt-4.1" or (normalized_model.startswith("gpt-5") and normalized_model != "gpt-5-mini")
|
||||
|
||||
def _apply_prompt_cache_settings(
|
||||
self,
|
||||
llm_config: LLMConfig,
|
||||
model: Optional[str],
|
||||
messages: List[PydanticMessage],
|
||||
request_obj: Any,
|
||||
) -> None:
|
||||
"""Apply OpenAI prompt cache settings to the request.
|
||||
|
||||
We intentionally do NOT set prompt_cache_key. OpenAI's default routing
|
||||
(based on a hash of the first ~256 tokens of the prompt) already provides
|
||||
good cache affinity for Letta agents, since each agent has a unique system
|
||||
prompt. Setting an explicit key can disrupt existing warm caches and reduce
|
||||
hit rates.
|
||||
|
||||
We only set prompt_cache_retention to "24h" for models that support extended
|
||||
retention, which keeps cached prefixes active longer (up to 24h vs 5-10min).
|
||||
"""
|
||||
if not self._is_true_openai_request(llm_config):
|
||||
return
|
||||
|
||||
if self._supports_extended_prompt_cache_retention(model):
|
||||
request_obj.prompt_cache_retention = "24h"
|
||||
|
||||
@trace_method
|
||||
def build_request_data_responses(
|
||||
self,
|
||||
@@ -385,6 +443,13 @@ class OpenAIClient(LLMClientBase):
|
||||
|
||||
data.model = "memgpt-openai"
|
||||
|
||||
self._apply_prompt_cache_settings(
|
||||
llm_config=llm_config,
|
||||
model=model,
|
||||
messages=messages,
|
||||
request_obj=data,
|
||||
)
|
||||
|
||||
request_data = data.model_dump(exclude_unset=True)
|
||||
# print("responses request data", request_data)
|
||||
return request_data
|
||||
@@ -453,9 +518,7 @@ class OpenAIClient(LLMClientBase):
|
||||
model = None
|
||||
|
||||
# TODO: we may need to extend this to more models using proxy?
|
||||
is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
|
||||
llm_config.provider_name == "openrouter"
|
||||
)
|
||||
is_openrouter = self._is_openrouter_request(llm_config)
|
||||
if is_openrouter:
|
||||
try:
|
||||
model = llm_config.handle.split("/", 1)[-1]
|
||||
@@ -558,6 +621,13 @@ class OpenAIClient(LLMClientBase):
|
||||
new_tools.append(tool.model_copy(deep=True))
|
||||
data.tools = new_tools
|
||||
|
||||
self._apply_prompt_cache_settings(
|
||||
llm_config=llm_config,
|
||||
model=model,
|
||||
messages=messages,
|
||||
request_obj=data,
|
||||
)
|
||||
|
||||
# Note: Tools are already processed by enable_strict_mode() in the workflow/agent code
|
||||
# (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here.
|
||||
# enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields
|
||||
|
||||
@@ -143,6 +143,7 @@ class ChatCompletionRequest(BaseModel):
|
||||
temperature: Optional[float] = 1
|
||||
top_p: Optional[float] = 1
|
||||
user: Optional[str] = None # unique ID of the end-user (for monitoring)
|
||||
prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = None
|
||||
parallel_tool_calls: Optional[bool] = None
|
||||
instructions: Optional[str] = None
|
||||
verbosity: Optional[Literal["low", "medium", "high"]] = None # For verbosity control in GPT-5 models
|
||||
|
||||
@@ -29,7 +29,7 @@ class ResponsesRequest(BaseModel):
|
||||
parallel_tool_calls: Optional[bool] = Field(default=NOT_GIVEN)
|
||||
previous_response_id: Optional[str] = Field(default=NOT_GIVEN)
|
||||
prompt: Optional[ResponsePromptParam] = Field(default=NOT_GIVEN)
|
||||
prompt_cache_key: Optional[str] = Field(default=NOT_GIVEN)
|
||||
prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = Field(default=NOT_GIVEN)
|
||||
reasoning: Optional[Reasoning] = Field(default=NOT_GIVEN)
|
||||
safety_identifier: Optional[str] = Field(default=NOT_GIVEN)
|
||||
service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = Field(default=NOT_GIVEN)
|
||||
|
||||
@@ -1193,3 +1193,217 @@ async def test_json_schema_response_format(
|
||||
finally:
|
||||
# Cleanup
|
||||
await client.agents.delete(agent_state.id)
|
||||
|
||||
|
||||
# Persona text used as a memory block by the prompt cache integration test.
# It is deliberately large so the prompt exceeds OpenAI's 1024 token caching threshold.
|
||||
# Below that threshold OpenAI would not cache the system prompt at all.
|
||||
_LARGE_PERSONA_BLOCK = """
|
||||
You are an advanced AI assistant with extensive knowledge across multiple domains.
|
||||
|
||||
# Core Capabilities
|
||||
|
||||
## Technical Knowledge
|
||||
- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages
|
||||
- System Design: Deep understanding of distributed systems, microservices, and cloud architecture
|
||||
- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code
|
||||
- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases
|
||||
- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks
|
||||
|
||||
## Problem Solving Approach
|
||||
When tackling problems, you follow a structured methodology:
|
||||
1. Understand the requirements thoroughly
|
||||
2. Break down complex problems into manageable components
|
||||
3. Consider multiple solution approaches
|
||||
4. Evaluate trade-offs between different options
|
||||
5. Implement solutions with clean, maintainable code
|
||||
6. Test thoroughly and iterate based on feedback
|
||||
|
||||
## Communication Style
|
||||
- Clear and concise explanations
|
||||
- Use examples and analogies when helpful
|
||||
- Adapt technical depth to the audience
|
||||
- Ask clarifying questions when requirements are ambiguous
|
||||
- Provide context and rationale for recommendations
|
||||
|
||||
# Domain Expertise
|
||||
|
||||
## Web Development
|
||||
You have deep knowledge of:
|
||||
- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks
|
||||
- Backend: Node.js, Express, FastAPI, Django, Flask
|
||||
- API Design: REST, GraphQL, gRPC
|
||||
- Authentication: OAuth, JWT, session management
|
||||
- Performance: Caching strategies, CDNs, lazy loading
|
||||
|
||||
## Data Engineering
|
||||
You understand:
|
||||
- ETL pipelines and data transformation
|
||||
- Data warehousing concepts (Snowflake, BigQuery, Redshift)
|
||||
- Stream processing (Kafka, Kinesis)
|
||||
- Data modeling and schema design
|
||||
- Data quality and validation
|
||||
|
||||
## Cloud Platforms
|
||||
You're familiar with:
|
||||
- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation
|
||||
- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery
|
||||
- Azure: Virtual Machines, Blob Storage, Azure Functions
|
||||
- Serverless architectures and best practices
|
||||
- Cost optimization strategies
|
||||
|
||||
## Security
|
||||
You consider:
|
||||
- Common vulnerabilities (OWASP Top 10)
|
||||
- Secure coding practices
|
||||
- Encryption and key management
|
||||
- Access control and authorization patterns
|
||||
- Security audit and compliance requirements
|
||||
|
||||
# Interaction Principles
|
||||
|
||||
## Helpfulness
|
||||
- Provide actionable guidance
|
||||
- Share relevant resources and documentation
|
||||
- Offer multiple approaches when appropriate
|
||||
- Point out potential pitfalls and edge cases
|
||||
|
||||
## Accuracy
|
||||
- Verify information before sharing
|
||||
- Acknowledge uncertainty when appropriate
|
||||
- Correct mistakes promptly
|
||||
- Stay up-to-date with best practices
|
||||
|
||||
## Efficiency
|
||||
- Get to the point quickly
|
||||
- Avoid unnecessary verbosity
|
||||
- Focus on what's most relevant
|
||||
- Provide code examples when they clarify concepts
|
||||
""" + "\n\n".join(
|
||||
[
|
||||
# Each repeated section gets its own "Section N: " label; the body text is the same every time.
|
||||
f"Section {i + 1}: "
|
||||
+ """
|
||||
You have deep expertise in software development, including but not limited to:
|
||||
- Programming languages: Python, JavaScript, TypeScript, Java, C++, Rust, Go, Swift, Kotlin, Ruby, PHP, Scala
|
||||
- Web frameworks: React, Vue, Angular, Django, Flask, FastAPI, Express, Next.js, Nuxt, SvelteKit, Remix, Astro
|
||||
- Databases: PostgreSQL, MySQL, MongoDB, Redis, Cassandra, DynamoDB, ElasticSearch, Neo4j, InfluxDB, TimescaleDB
|
||||
- Cloud platforms: AWS (EC2, S3, Lambda, ECS, EKS, RDS), GCP (Compute Engine, Cloud Run, GKE), Azure (VMs, Functions, AKS)
|
||||
- DevOps tools: Docker, Kubernetes, Terraform, Ansible, Jenkins, GitHub Actions, GitLab CI, CircleCI, ArgoCD
|
||||
- Testing frameworks: pytest, Jest, Mocha, JUnit, unittest, Cypress, Playwright, Selenium, TestNG, RSpec
|
||||
- Architecture patterns: Microservices, Event-driven, Serverless, Monolithic, CQRS, Event Sourcing, Hexagonal
|
||||
- API design: REST, GraphQL, gRPC, WebSockets, Server-Sent Events, tRPC, JSON-RPC
|
||||
"""
|
||||
# Four copies of the section are appended to pad the block further.
|
||||
for i in range(4)
|
||||
]
|
||||
)
|
||||
|
||||
# Models that support prompt_cache_retention="24h":
# gpt-4.1, gpt-5 family (but not gpt-5-mini).
# The prefixes are only a coarse first filter; _supports_24h_prompt_cache_retention
# below applies the exact rule.
_PROMPT_CACHE_RETENTION_PREFIXES = ("gpt-4.1", "gpt-5")


def _supports_24h_prompt_cache_retention(handle: str) -> bool:
    """Return True when the model behind `handle` is expected to get 24h retention.

    Uses the same rule as the OpenAI client: exactly "gpt-4.1", or a gpt-5
    family model other than gpt-5-mini. A bare prefix match is not enough,
    because it would also admit gpt-4.1-mini / gpt-4.1-nano and gpt-5-mini,
    none of which the rule covers. Dated gpt-5-mini snapshots are treated as
    gpt-5-mini.
    """
    model_name = handle.split("/")[-1]
    if not model_name.startswith(_PROMPT_CACHE_RETENTION_PREFIXES):
        return False
    if model_name.startswith("gpt-4.1"):
        return model_name == "gpt-4.1"
    return model_name != "gpt-5-mini" and not model_name.startswith("gpt-5-mini-")


# OpenAI model configs from TESTED_MODEL_CONFIGS that should receive 24h retention.
PROMPT_CACHE_MODEL_CONFIGS: List[Tuple[str, dict]] = [
    (handle, settings)
    for handle, settings in TESTED_MODEL_CONFIGS
    if settings.get("provider_type") == "openai" and _supports_24h_prompt_cache_retention(handle)
]
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="the prompt caching is flaky")
@pytest.mark.parametrize(
    "model_config",
    PROMPT_CACHE_MODEL_CONFIGS,
    ids=[config[0] for config in PROMPT_CACHE_MODEL_CONFIGS],
)
@pytest.mark.asyncio(loop_scope="function")
async def test_openai_prompt_cache_integration(
    disable_e2b_api_key: Any,
    client: AsyncLetta,
    model_config: Tuple[str, dict],
) -> None:
    """
    End-to-end check that OpenAI prompt caching produces cache hits.

    Runs against models that support prompt_cache_retention="24h" and validates
    that the field is accepted by OpenAI's API and produces cache hits.

    Steps:
    1. Create an agent whose persona block is large (>1024 tokens, OpenAI's caching threshold)
    2. Send a first message to prime the cache (cached_input_tokens is expected to be 0 or small)
    3. Send a second message, which should hit the cache (cached_input_tokens > 0)

    No prompt_cache_key is used: the test relies on OpenAI's default prefix-hash
    routing, which gives natural cache affinity because each agent has a unique
    system prompt.
    """
    from letta_client.types import CreateBlockParam

    model_handle, model_settings = model_config
    persona_block = CreateBlockParam(label="persona", value=_LARGE_PERSONA_BLOCK)

    agent = await client.agents.create(
        name=f"prompt-cache-test-{uuid.uuid4().hex[:8]}",
        agent_type="letta_v1_agent",
        model=model_handle,
        model_settings=model_settings,
        embedding="openai/text-embedding-3-small",
        include_base_tools=False,
        memory_blocks=[persona_block],
    )

    async def send_user_message(text: str):
        # Both turns go to the same agent so they share the same prompt prefix.
        return await client.agents.messages.create(
            agent_id=agent.id,
            messages=[MessageCreateParam(role="user", content=text)],
        )

    try:
        # Turn 1 primes the cache; it typically reports cached_input_tokens=0.
        response1 = await send_user_message("Hello! Please introduce yourself briefly.")
        assert response1.usage is not None, "First message should return usage data"
        assert response1.usage.prompt_tokens > 0, "First message should have prompt_tokens > 0"

        logger.info(
            f"[{model_handle}] Message 1 usage: "
            f"prompt={response1.usage.prompt_tokens}, "
            f"completion={response1.usage.completion_tokens}, "
            f"cached_input={response1.usage.cached_input_tokens}"
        )

        # The prompt has to reach 1024 tokens or OpenAI will not cache it at all.
        total_input_tokens = response1.usage.prompt_tokens + (response1.usage.cached_input_tokens or 0)
        assert total_input_tokens >= 1024, f"Total input tokens ({total_input_tokens}) must be >= 1024 for OpenAI caching to activate"

        # Turn 2 is expected to be served from the cache via prefix-hash routing.
        response2 = await send_user_message("What are your main areas of expertise?")
        assert response2.usage is not None, "Second message should return usage data"
        assert response2.usage.prompt_tokens > 0, "Second message should have prompt_tokens > 0"

        logger.info(
            f"[{model_handle}] Message 2 usage: "
            f"prompt={response2.usage.prompt_tokens}, "
            f"completion={response2.usage.completion_tokens}, "
            f"cached_input={response2.usage.cached_input_tokens}"
        )

        # Key assertion: a cache miss on the second turn fails the test.
        cached_tokens = response2.usage.cached_input_tokens
        assert cached_tokens is not None and cached_tokens > 0, (
            f"[{model_handle}] Expected cached_input_tokens > 0 on second message, got {cached_tokens}. "
            "This means prompt caching is not working (cache miss occurred)."
        )

        # A meaningful share of the second turn's input should come from the cache.
        second_turn_input = response2.usage.prompt_tokens + (response2.usage.cached_input_tokens or 0)
        if second_turn_input > 0:
            cache_hit_ratio = cached_tokens / second_turn_input
        else:
            cache_hit_ratio = 0
        logger.info(f"[{model_handle}] Cache hit ratio: {cache_hit_ratio:.2%}")

        assert cache_hit_ratio >= 0.20, (
            f"[{model_handle}] Expected cache hit ratio >= 20%, got {cache_hit_ratio:.2%}. The large persona block should be mostly cached."
        )

    finally:
        # Always remove the throwaway agent, even when an assertion fails.
        await client.agents.delete(agent.id)
|
||||
|
||||
150
tests/test_openai_prompt_cache_request_fields.py
Normal file
150
tests/test_openai_prompt_cache_request_fields.py
Normal file
@@ -0,0 +1,150 @@
|
||||
from letta.llm_api.openai_client import OpenAIClient
|
||||
from letta.schemas.enums import AgentType, MessageRole
|
||||
from letta.schemas.letta_message_content import TextContent
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message
|
||||
|
||||
|
||||
def _message(text: str = "hello") -> Message:
    """Build a user-role message holding one text part, attributed to agent "agent-abc"."""
    parts = [TextContent(text=text)]
    return Message(role=MessageRole.user, content=parts, agent_id="agent-abc")
|
||||
|
||||
|
||||
def _openai_config(model: str, endpoint_type: str = "openai", provider_name: str | None = "openai") -> LLMConfig:
    """Build an LLMConfig aimed at the public OpenAI endpoint.

    Only the model, endpoint type and provider name vary; the endpoint URL and
    the context window are fixed.
    """
    fields = {
        "model": model,
        "model_endpoint_type": endpoint_type,
        "model_endpoint": "https://api.openai.com/v1",
        "context_window": 256000,
        "provider_name": provider_name,
    }
    return LLMConfig(**fields)
|
||||
|
||||
|
||||
def test_responses_request_sets_24h_retention_for_supported_model():
    """A letta_v1_agent request for gpt-5.1 carries 24h retention and no cache key."""
    payload = OpenAIClient().build_request_data(
        agent_type=AgentType.letta_v1_agent,
        messages=[_message()],
        llm_config=_openai_config(model="gpt-5.1"),
        tools=[],
    )

    # An "input" key is what this test expects from the letta_v1_agent request path.
    assert "input" in payload
    assert "prompt_cache_key" not in payload
    assert payload.get("prompt_cache_retention") == "24h"
|
||||
|
||||
|
||||
def test_responses_request_omits_24h_for_unsupported_model():
    """A letta_v1_agent request for o3-mini carries no prompt cache fields at all."""
    payload = OpenAIClient().build_request_data(
        agent_type=AgentType.letta_v1_agent,
        messages=[_message()],
        llm_config=_openai_config(model="o3-mini"),
        tools=[],
    )

    for absent_field in ("prompt_cache_key", "prompt_cache_retention"):
        assert absent_field not in payload
|
||||
|
||||
|
||||
def test_chat_completions_request_sets_24h_retention_for_supported_model():
    """A memgpt_v2_agent request for gpt-4.1 carries 24h retention and no cache key."""
    payload = OpenAIClient().build_request_data(
        agent_type=AgentType.memgpt_v2_agent,
        messages=[_message()],
        llm_config=_openai_config(model="gpt-4.1"),
        tools=[],
    )

    # A "messages" key is what this test expects from the memgpt_v2_agent request path.
    assert "messages" in payload
    assert "prompt_cache_key" not in payload
    assert payload.get("prompt_cache_retention") == "24h"
|
||||
|
||||
|
||||
def test_chat_completions_request_omits_24h_for_unsupported_model():
    """A memgpt_v2_agent request for gpt-4o-mini carries no prompt cache fields at all."""
    payload = OpenAIClient().build_request_data(
        agent_type=AgentType.memgpt_v2_agent,
        messages=[_message()],
        llm_config=_openai_config(model="gpt-4o-mini"),
        tools=[],
    )

    for absent_field in ("prompt_cache_key", "prompt_cache_retention"):
        assert absent_field not in payload
|
||||
|
||||
|
||||
def test_openrouter_request_omits_all_prompt_cache_fields():
    """OpenRouter traffic gets no prompt cache fields, even for a model that would qualify on OpenAI."""
    openrouter_config = LLMConfig(
        model="gpt-5.1",
        handle="openrouter/gpt-5.1",
        model_endpoint_type="openai",
        model_endpoint="https://openrouter.ai/api/v1",
        context_window=256000,
        provider_name="openrouter",
    )
    conversation = [_message()]
    build = OpenAIClient().build_request_data

    # Build one payload per agent type, letta_v1_agent first, then memgpt_v2_agent.
    payloads = [
        build(agent_type=kind, messages=conversation, llm_config=openrouter_config, tools=[])
        for kind in (AgentType.letta_v1_agent, AgentType.memgpt_v2_agent)
    ]

    for payload in payloads:
        assert "prompt_cache_key" not in payload
        assert "prompt_cache_retention" not in payload
|
||||
|
||||
|
||||
def test_gpt5_family_gets_24h_retention():
    """gpt-5, gpt-5-codex, gpt-5.1, gpt-5.1-codex, gpt-5.2 and gpt-5.2-codex all get 24h retention.

    gpt-5.2-codex is listed explicitly as a regression guard: it was once
    excluded by mistake even though it supports extended retention.
    """
    client = OpenAIClient()
    gpt5_models = ["gpt-5", "gpt-5-codex", "gpt-5.1", "gpt-5.1-codex", "gpt-5.2", "gpt-5.2-codex"]

    # Check every model before asserting, so one failure does not hide the rest.
    missing_retention = []
    for model in gpt5_models:
        request_data = client.build_request_data(
            agent_type=AgentType.letta_v1_agent,
            messages=[_message()],
            llm_config=_openai_config(model=model),
            tools=[],
        )
        if request_data.get("prompt_cache_retention") != "24h":
            missing_retention.append(model)

    assert not missing_retention, f"{missing_retention} should get 24h retention"
|
||||
|
||||
|
||||
def test_gpt5_mini_excluded_from_24h_retention():
    """gpt-5-mini must not get extended retention, since OpenAI's docs do not list it."""
    payload = OpenAIClient().build_request_data(
        agent_type=AgentType.letta_v1_agent,
        messages=[_message()],
        llm_config=_openai_config(model="gpt-5-mini"),
        tools=[],
    )

    assert "prompt_cache_retention" not in payload
|
||||
Reference in New Issue
Block a user