From 21765d16c9993c195ef60f6679fbc717042c049d Mon Sep 17 00:00:00 2001 From: Ari Webb Date: Tue, 17 Feb 2026 15:07:21 -0800 Subject: [PATCH] fix(core): add OpenAI 24h prompt cache retention for supported models (#9509) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492) * fix(core): apply OpenAI prompt cache settings to request payloads Set prompt_cache_key using agent and conversation context on both Responses and Chat Completions request builders, and enable 24h retention only for supported OpenAI models while excluding OpenRouter paths. 👾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * fix(core): prefix prompt cache key with letta tag Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics. 👾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta * add integration test * skip test --------- Co-authored-by: Letta Co-authored-by: Ari Webb * fix(core): only set prompt_cache_retention, drop prompt_cache_key Two issues with the original prompt_cache_key approach: 1. Key exceeded 64-char max (agent-:conv- = 90 chars) 2. Setting an explicit key disrupted OpenAI's default prefix-hash routing, dropping cache hit rates from 40-45% to 10-13% OpenAI's default routing (hash of first ~256 tokens) already provides good cache affinity since each agent has a unique system prompt. We only need prompt_cache_retention="24h" for extended retention. Also fixes: - Operator precedence bug in _supports_extended_prompt_cache_retention - Removes incorrect gpt-5.2-codex exclusion (it IS supported per docs) 🐾 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta --------- Co-authored-by: Charles Packer Co-authored-by: Letta --- letta/llm_api/openai_client.py | 76 ++++++- .../schemas/openai/chat_completion_request.py | 1 + letta/schemas/openai/responses_request.py | 2 +- tests/integration_test_send_message_v2.py | 214 ++++++++++++++++++ ...test_openai_prompt_cache_request_fields.py | 150 ++++++++++++ 5 files changed, 439 insertions(+), 4 deletions(-) create mode 100644 tests/test_openai_prompt_cache_request_fields.py diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index 82672cc9..0367072f 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -245,6 +245,64 @@ class OpenAIClient(LLMClientBase): def supports_structured_output(self, llm_config: LLMConfig) -> bool: return supports_structured_output(llm_config) + def _is_openrouter_request(self, llm_config: LLMConfig) -> bool: + return (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (llm_config.provider_name == "openrouter") + + def _is_true_openai_request(self, llm_config: LLMConfig) -> bool: + if llm_config.model_endpoint_type != "openai": + return False + + if self._is_openrouter_request(llm_config): + return False + + # Keep Letta inference endpoint behavior unchanged. + if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT: + return False + + # If provider_name is explicitly set and not openai, don't apply OpenAI-specific prompt caching fields. + if llm_config.provider_name and llm_config.provider_name != "openai": + return False + + return True + + def _normalize_model_name(self, model: Optional[str]) -> Optional[str]: + if not model: + return None + return model.split("/", 1)[-1] + + def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool: + normalized_model = self._normalize_model_name(model) + if not normalized_model: + return False + + # Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models. + # gpt-5-mini is excluded (not listed in docs). + return normalized_model == "gpt-4.1" or (normalized_model.startswith("gpt-5") and normalized_model != "gpt-5-mini") + + def _apply_prompt_cache_settings( + self, + llm_config: LLMConfig, + model: Optional[str], + messages: List[PydanticMessage], + request_obj: Any, + ) -> None: + """Apply OpenAI prompt cache settings to the request. + + We intentionally do NOT set prompt_cache_key. OpenAI's default routing + (based on a hash of the first ~256 tokens of the prompt) already provides + good cache affinity for Letta agents, since each agent has a unique system + prompt. Setting an explicit key can disrupt existing warm caches and reduce + hit rates. + + We only set prompt_cache_retention to "24h" for models that support extended + retention, which keeps cached prefixes active longer (up to 24h vs 5-10min). + """ + if not self._is_true_openai_request(llm_config): + return + + if self._supports_extended_prompt_cache_retention(model): + request_obj.prompt_cache_retention = "24h" + @trace_method def build_request_data_responses( self, @@ -385,6 +443,13 @@ class OpenAIClient(LLMClientBase): data.model = "memgpt-openai" + self._apply_prompt_cache_settings( + llm_config=llm_config, + model=model, + messages=messages, + request_obj=data, + ) + request_data = data.model_dump(exclude_unset=True) # print("responses request data", request_data) return request_data @@ -453,9 +518,7 @@ class OpenAIClient(LLMClientBase): model = None # TODO: we may need to extend this to more models using proxy? - is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or ( - llm_config.provider_name == "openrouter" - ) + is_openrouter = self._is_openrouter_request(llm_config) if is_openrouter: try: model = llm_config.handle.split("/", 1)[-1] @@ -558,6 +621,13 @@ class OpenAIClient(LLMClientBase): new_tools.append(tool.model_copy(deep=True)) data.tools = new_tools + self._apply_prompt_cache_settings( + llm_config=llm_config, + model=model, + messages=messages, + request_obj=data, + ) + # Note: Tools are already processed by enable_strict_mode() in the workflow/agent code # (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here. # enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields diff --git a/letta/schemas/openai/chat_completion_request.py b/letta/schemas/openai/chat_completion_request.py index 9188eb93..8eb2ce9e 100644 --- a/letta/schemas/openai/chat_completion_request.py +++ b/letta/schemas/openai/chat_completion_request.py @@ -143,6 +143,7 @@ class ChatCompletionRequest(BaseModel): temperature: Optional[float] = 1 top_p: Optional[float] = 1 user: Optional[str] = None # unique ID of the end-user (for monitoring) + prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = None parallel_tool_calls: Optional[bool] = None instructions: Optional[str] = None verbosity: Optional[Literal["low", "medium", "high"]] = None # For verbosity control in GPT-5 models diff --git a/letta/schemas/openai/responses_request.py b/letta/schemas/openai/responses_request.py index aeeefa23..112fcb91 100644 --- a/letta/schemas/openai/responses_request.py +++ b/letta/schemas/openai/responses_request.py @@ -29,7 +29,7 @@ class ResponsesRequest(BaseModel): parallel_tool_calls: Optional[bool] = Field(default=NOT_GIVEN) previous_response_id: Optional[str] = Field(default=NOT_GIVEN) prompt: Optional[ResponsePromptParam] = Field(default=NOT_GIVEN) - prompt_cache_key: Optional[str] = Field(default=NOT_GIVEN) + prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = Field(default=NOT_GIVEN) reasoning: Optional[Reasoning] = Field(default=NOT_GIVEN) safety_identifier: Optional[str] = Field(default=NOT_GIVEN) service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = Field(default=NOT_GIVEN) diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py index b91db8cf..fb19126d 100644 --- a/tests/integration_test_send_message_v2.py +++ b/tests/integration_test_send_message_v2.py @@ -1193,3 +1193,217 @@ async def test_json_schema_response_format( finally: # Cleanup await client.agents.delete(agent_state.id) + + +# Large memory block to exceed OpenAI's 1024 token caching threshold. +# This ensures the system prompt is large enough for OpenAI to cache it. +_LARGE_PERSONA_BLOCK = """ +You are an advanced AI assistant with extensive knowledge across multiple domains. + +# Core Capabilities + +## Technical Knowledge +- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages +- System Design: Deep understanding of distributed systems, microservices, and cloud architecture +- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code +- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases +- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks + +## Problem Solving Approach +When tackling problems, you follow a structured methodology: +1. Understand the requirements thoroughly +2. Break down complex problems into manageable components +3. Consider multiple solution approaches +4. Evaluate trade-offs between different options +5. Implement solutions with clean, maintainable code +6. Test thoroughly and iterate based on feedback + +## Communication Style +- Clear and concise explanations +- Use examples and analogies when helpful +- Adapt technical depth to the audience +- Ask clarifying questions when requirements are ambiguous +- Provide context and rationale for recommendations + +# Domain Expertise + +## Web Development +You have deep knowledge of: +- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks +- Backend: Node.js, Express, FastAPI, Django, Flask +- API Design: REST, GraphQL, gRPC +- Authentication: OAuth, JWT, session management +- Performance: Caching strategies, CDNs, lazy loading + +## Data Engineering +You understand: +- ETL pipelines and data transformation +- Data warehousing concepts (Snowflake, BigQuery, Redshift) +- Stream processing (Kafka, Kinesis) +- Data modeling and schema design +- Data quality and validation + +## Cloud Platforms +You're familiar with: +- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation +- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery +- Azure: Virtual Machines, Blob Storage, Azure Functions +- Serverless architectures and best practices +- Cost optimization strategies + +## Security +You consider: +- Common vulnerabilities (OWASP Top 10) +- Secure coding practices +- Encryption and key management +- Access control and authorization patterns +- Security audit and compliance requirements + +# Interaction Principles + +## Helpfulness +- Provide actionable guidance +- Share relevant resources and documentation +- Offer multiple approaches when appropriate +- Point out potential pitfalls and edge cases + +## Accuracy +- Verify information before sharing +- Acknowledge uncertainty when appropriate +- Correct mistakes promptly +- Stay up-to-date with best practices + +## Efficiency +- Get to the point quickly +- Avoid unnecessary verbosity +- Focus on what's most relevant +- Provide code examples when they clarify concepts +""" + "\n\n".join( + [ + f"Section {i + 1}: " + + """ +You have deep expertise in software development, including but not limited to: +- Programming languages: Python, JavaScript, TypeScript, Java, C++, Rust, Go, Swift, Kotlin, Ruby, PHP, Scala +- Web frameworks: React, Vue, Angular, Django, Flask, FastAPI, Express, Next.js, Nuxt, SvelteKit, Remix, Astro +- Databases: PostgreSQL, MySQL, MongoDB, Redis, Cassandra, DynamoDB, ElasticSearch, Neo4j, InfluxDB, TimescaleDB +- Cloud platforms: AWS (EC2, S3, Lambda, ECS, EKS, RDS), GCP (Compute Engine, Cloud Run, GKE), Azure (VMs, Functions, AKS) +- DevOps tools: Docker, Kubernetes, Terraform, Ansible, Jenkins, GitHub Actions, GitLab CI, CircleCI, ArgoCD +- Testing frameworks: pytest, Jest, Mocha, JUnit, unittest, Cypress, Playwright, Selenium, TestNG, RSpec +- Architecture patterns: Microservices, Event-driven, Serverless, Monolithic, CQRS, Event Sourcing, Hexagonal +- API design: REST, GraphQL, gRPC, WebSockets, Server-Sent Events, tRPC, JSON-RPC +""" + for i in range(4) + ] +) + +# Models that support prompt_cache_retention="24h": +# gpt-4.1, gpt-5 family (but not gpt-5-mini). +_PROMPT_CACHE_RETENTION_PREFIXES = ("gpt-4.1", "gpt-5") + +PROMPT_CACHE_MODEL_CONFIGS: List[Tuple[str, dict]] = [ + (handle, settings) + for handle, settings in TESTED_MODEL_CONFIGS + if settings.get("provider_type") == "openai" and any(handle.split("/")[-1].startswith(p) for p in _PROMPT_CACHE_RETENTION_PREFIXES) +] + + +@pytest.mark.skip(reason="the prompt caching is flaky") +@pytest.mark.parametrize( + "model_config", + PROMPT_CACHE_MODEL_CONFIGS, + ids=[handle for handle, _ in PROMPT_CACHE_MODEL_CONFIGS], +) +@pytest.mark.asyncio(loop_scope="function") +async def test_openai_prompt_cache_integration( + disable_e2b_api_key: Any, + client: AsyncLetta, + model_config: Tuple[str, dict], +) -> None: + """ + Integration test verifying OpenAI prompt caching works end-to-end. + + Tests models that support prompt_cache_retention="24h". + Validates that this field is accepted by OpenAI's API and produce cache hits. + + Strategy: + 1. Create an agent with a large persona block (>1024 tokens, OpenAI's caching threshold) + 2. Send message 1 -> primes the cache (cached_input_tokens should be 0 or small) + 3. Send message 2 -> should hit the cache (cached_input_tokens > 0) + + We rely on OpenAI's default prefix-hash routing (no prompt_cache_key) since each + agent has a unique system prompt, providing natural cache affinity. + """ + from letta_client.types import CreateBlockParam + + model_handle, model_settings = model_config + + agent = await client.agents.create( + name=f"prompt-cache-test-{uuid.uuid4().hex[:8]}", + agent_type="letta_v1_agent", + model=model_handle, + model_settings=model_settings, + embedding="openai/text-embedding-3-small", + include_base_tools=False, + memory_blocks=[ + CreateBlockParam( + label="persona", + value=_LARGE_PERSONA_BLOCK, + ) + ], + ) + + try: + # Message 1: Prime the cache. First request typically has cached_input_tokens=0. + response1 = await client.agents.messages.create( + agent_id=agent.id, + messages=[MessageCreateParam(role="user", content="Hello! Please introduce yourself briefly.")], + ) + assert response1.usage is not None, "First message should return usage data" + assert response1.usage.prompt_tokens > 0, "First message should have prompt_tokens > 0" + + logger.info( + f"[{model_handle}] Message 1 usage: " + f"prompt={response1.usage.prompt_tokens}, " + f"completion={response1.usage.completion_tokens}, " + f"cached_input={response1.usage.cached_input_tokens}" + ) + + # Verify we exceeded the 1024 token threshold for OpenAI caching + total_input_tokens = response1.usage.prompt_tokens + (response1.usage.cached_input_tokens or 0) + assert total_input_tokens >= 1024, f"Total input tokens ({total_input_tokens}) must be >= 1024 for OpenAI caching to activate" + + # Message 2: Should hit the cache thanks to prefix-hash routing. + response2 = await client.agents.messages.create( + agent_id=agent.id, + messages=[MessageCreateParam(role="user", content="What are your main areas of expertise?")], + ) + assert response2.usage is not None, "Second message should return usage data" + assert response2.usage.prompt_tokens > 0, "Second message should have prompt_tokens > 0" + + logger.info( + f"[{model_handle}] Message 2 usage: " + f"prompt={response2.usage.prompt_tokens}, " + f"completion={response2.usage.completion_tokens}, " + f"cached_input={response2.usage.cached_input_tokens}" + ) + + # CRITICAL: The second message should show cached_input_tokens > 0. + # This proves that prompt_cache_retention is being sent correctly + # and OpenAI is caching the prompt prefix. + cached_tokens = response2.usage.cached_input_tokens + assert cached_tokens is not None and cached_tokens > 0, ( + f"[{model_handle}] Expected cached_input_tokens > 0 on second message, got {cached_tokens}. " + "This means prompt caching is not working (cache miss occurred)." + ) + + # Cache hit ratio should be significant (most of the system prompt should be cached) + total_input_msg2 = response2.usage.prompt_tokens + (response2.usage.cached_input_tokens or 0) + cache_hit_ratio = cached_tokens / total_input_msg2 if total_input_msg2 > 0 else 0 + logger.info(f"[{model_handle}] Cache hit ratio: {cache_hit_ratio:.2%}") + + assert cache_hit_ratio >= 0.20, ( + f"[{model_handle}] Expected cache hit ratio >= 20%, got {cache_hit_ratio:.2%}. The large persona block should be mostly cached." + ) + + finally: + await client.agents.delete(agent.id) diff --git a/tests/test_openai_prompt_cache_request_fields.py b/tests/test_openai_prompt_cache_request_fields.py new file mode 100644 index 00000000..b0a06c47 --- /dev/null +++ b/tests/test_openai_prompt_cache_request_fields.py @@ -0,0 +1,150 @@ +from letta.llm_api.openai_client import OpenAIClient +from letta.schemas.enums import AgentType, MessageRole +from letta.schemas.letta_message_content import TextContent +from letta.schemas.llm_config import LLMConfig +from letta.schemas.message import Message + + +def _message(text: str = "hello") -> Message: + return Message( + role=MessageRole.user, + content=[TextContent(text=text)], + agent_id="agent-abc", + ) + + +def _openai_config(model: str, endpoint_type: str = "openai", provider_name: str | None = "openai") -> LLMConfig: + return LLMConfig( + model=model, + model_endpoint_type=endpoint_type, + model_endpoint="https://api.openai.com/v1", + context_window=256000, + provider_name=provider_name, + ) + + +def test_responses_request_sets_24h_retention_for_supported_model(): + client = OpenAIClient() + llm_config = _openai_config(model="gpt-5.1") + messages = [_message()] + + request_data = client.build_request_data( + agent_type=AgentType.letta_v1_agent, + messages=messages, + llm_config=llm_config, + tools=[], + ) + + assert "input" in request_data + assert "prompt_cache_key" not in request_data + assert request_data.get("prompt_cache_retention") == "24h" + + +def test_responses_request_omits_24h_for_unsupported_model(): + client = OpenAIClient() + llm_config = _openai_config(model="o3-mini") + messages = [_message()] + + request_data = client.build_request_data( + agent_type=AgentType.letta_v1_agent, + messages=messages, + llm_config=llm_config, + tools=[], + ) + + assert "prompt_cache_key" not in request_data + assert "prompt_cache_retention" not in request_data + + +def test_chat_completions_request_sets_24h_retention_for_supported_model(): + client = OpenAIClient() + llm_config = _openai_config(model="gpt-4.1") + messages = [_message()] + + request_data = client.build_request_data( + agent_type=AgentType.memgpt_v2_agent, + messages=messages, + llm_config=llm_config, + tools=[], + ) + + assert "messages" in request_data + assert "prompt_cache_key" not in request_data + assert request_data.get("prompt_cache_retention") == "24h" + + +def test_chat_completions_request_omits_24h_for_unsupported_model(): + client = OpenAIClient() + llm_config = _openai_config(model="gpt-4o-mini") + messages = [_message()] + + request_data = client.build_request_data( + agent_type=AgentType.memgpt_v2_agent, + messages=messages, + llm_config=llm_config, + tools=[], + ) + + assert "prompt_cache_key" not in request_data + assert "prompt_cache_retention" not in request_data + + +def test_openrouter_request_omits_all_prompt_cache_fields(): + client = OpenAIClient() + llm_config = LLMConfig( + model="gpt-5.1", + handle="openrouter/gpt-5.1", + model_endpoint_type="openai", + model_endpoint="https://openrouter.ai/api/v1", + context_window=256000, + provider_name="openrouter", + ) + messages = [_message()] + + responses_request_data = client.build_request_data( + agent_type=AgentType.letta_v1_agent, + messages=messages, + llm_config=llm_config, + tools=[], + ) + chat_request_data = client.build_request_data( + agent_type=AgentType.memgpt_v2_agent, + messages=messages, + llm_config=llm_config, + tools=[], + ) + + assert "prompt_cache_key" not in responses_request_data + assert "prompt_cache_retention" not in responses_request_data + assert "prompt_cache_key" not in chat_request_data + assert "prompt_cache_retention" not in chat_request_data + + +def test_gpt5_family_gets_24h_retention(): + """gpt-5, gpt-5-codex, gpt-5.1, gpt-5.2 all get 24h retention.""" + client = OpenAIClient() + + for model in ["gpt-5", "gpt-5-codex", "gpt-5.1", "gpt-5.1-codex", "gpt-5.2"]: + llm_config = _openai_config(model=model) + request_data = client.build_request_data( + agent_type=AgentType.letta_v1_agent, + messages=[_message()], + llm_config=llm_config, + tools=[], + ) + assert request_data.get("prompt_cache_retention") == "24h", f"{model} should get 24h retention" + + +def test_gpt5_mini_excluded_from_24h_retention(): + """gpt-5-mini is not listed in OpenAI docs for extended retention.""" + client = OpenAIClient() + llm_config = _openai_config(model="gpt-5-mini") + + request_data = client.build_request_data( + agent_type=AgentType.letta_v1_agent, + messages=[_message()], + llm_config=llm_config, + tools=[], + ) + + assert "prompt_cache_retention" not in request_data