fix(core): add OpenAI prompt cache key and model-gated 24h retention (#9492)

* fix(core): apply OpenAI prompt cache settings to request payloads

Set prompt_cache_key using agent and conversation context on both Responses and Chat Completions request builders, and enable 24h retention only for supported OpenAI models while excluding OpenRouter paths.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix(core): prefix prompt cache key with letta tag

Add a `letta:` prefix to generated OpenAI prompt_cache_key values so cache-related entries are easier to identify in provider-side logs and diagnostics.

👾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* add integration test

* skip test

---------

Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Ari Webb <ari@letta.com>
Charles Packer
2026-02-16 13:27:42 -08:00
committed by Caren Thomas
parent 5b001a7749
commit 619e81ed1e
5 changed files with 444 additions and 3 deletions


@@ -1189,3 +1189,217 @@ async def test_json_schema_response_format(
finally:
# Cleanup
await client.agents.delete(agent_state.id)

# Large memory block to exceed OpenAI's 1024 token caching threshold.
# This ensures the system prompt is large enough for OpenAI to cache it.
_LARGE_PERSONA_BLOCK = """
You are an advanced AI assistant with extensive knowledge across multiple domains.
# Core Capabilities
## Technical Knowledge
- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages
- System Design: Deep understanding of distributed systems, microservices, and cloud architecture
- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code
- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases
- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks
## Problem Solving Approach
When tackling problems, you follow a structured methodology:
1. Understand the requirements thoroughly
2. Break down complex problems into manageable components
3. Consider multiple solution approaches
4. Evaluate trade-offs between different options
5. Implement solutions with clean, maintainable code
6. Test thoroughly and iterate based on feedback
## Communication Style
- Clear and concise explanations
- Use examples and analogies when helpful
- Adapt technical depth to the audience
- Ask clarifying questions when requirements are ambiguous
- Provide context and rationale for recommendations
# Domain Expertise
## Web Development
You have deep knowledge of:
- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks
- Backend: Node.js, Express, FastAPI, Django, Flask
- API Design: REST, GraphQL, gRPC
- Authentication: OAuth, JWT, session management
- Performance: Caching strategies, CDNs, lazy loading
## Data Engineering
You understand:
- ETL pipelines and data transformation
- Data warehousing concepts (Snowflake, BigQuery, Redshift)
- Stream processing (Kafka, Kinesis)
- Data modeling and schema design
- Data quality and validation
## Cloud Platforms
You're familiar with:
- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation
- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery
- Azure: Virtual Machines, Blob Storage, Azure Functions
- Serverless architectures and best practices
- Cost optimization strategies
## Security
You consider:
- Common vulnerabilities (OWASP Top 10)
- Secure coding practices
- Encryption and key management
- Access control and authorization patterns
- Security audit and compliance requirements
# Interaction Principles
## Helpfulness
- Provide actionable guidance
- Share relevant resources and documentation
- Offer multiple approaches when appropriate
- Point out potential pitfalls and edge cases
## Accuracy
- Verify information before sharing
- Acknowledge uncertainty when appropriate
- Correct mistakes promptly
- Stay up-to-date with best practices
## Efficiency
- Get to the point quickly
- Avoid unnecessary verbosity
- Focus on what's most relevant
- Provide code examples when they clarify concepts
""" + "\n\n".join(
[
f"Section {i + 1}: "
+ """
You have deep expertise in software development, including but not limited to:
- Programming languages: Python, JavaScript, TypeScript, Java, C++, Rust, Go, Swift, Kotlin, Ruby, PHP, Scala
- Web frameworks: React, Vue, Angular, Django, Flask, FastAPI, Express, Next.js, Nuxt, SvelteKit, Remix, Astro
- Databases: PostgreSQL, MySQL, MongoDB, Redis, Cassandra, DynamoDB, ElasticSearch, Neo4j, InfluxDB, TimescaleDB
- Cloud platforms: AWS (EC2, S3, Lambda, ECS, EKS, RDS), GCP (Compute Engine, Cloud Run, GKE), Azure (VMs, Functions, AKS)
- DevOps tools: Docker, Kubernetes, Terraform, Ansible, Jenkins, GitHub Actions, GitLab CI, CircleCI, ArgoCD
- Testing frameworks: pytest, Jest, Mocha, JUnit, unittest, Cypress, Playwright, Selenium, TestNG, RSpec
- Architecture patterns: Microservices, Event-driven, Serverless, Monolithic, CQRS, Event Sourcing, Hexagonal
- API design: REST, GraphQL, gRPC, WebSockets, Server-Sent Events, tRPC, JSON-RPC
"""
for i in range(4)
]
)

# Models that support prompt_cache_key + prompt_cache_retention="24h":
# gpt-4.1, gpt-5 family (but not gpt-5-mini or gpt-5.2-codex).
_PROMPT_CACHE_RETENTION_PREFIXES = ("gpt-4.1", "gpt-5")
PROMPT_CACHE_MODEL_CONFIGS: List[Tuple[str, dict]] = [
(handle, settings)
for handle, settings in TESTED_MODEL_CONFIGS
if settings.get("provider_type") == "openai" and any(handle.split("/")[-1].startswith(p) for p in _PROMPT_CACHE_RETENTION_PREFIXES)
]


@pytest.mark.skip(reason="the prompt caching is flaky")
@pytest.mark.parametrize(
"model_config",
PROMPT_CACHE_MODEL_CONFIGS,
ids=[handle for handle, _ in PROMPT_CACHE_MODEL_CONFIGS],
)
@pytest.mark.asyncio(loop_scope="function")
async def test_openai_prompt_cache_integration(
disable_e2b_api_key: Any,
client: AsyncLetta,
model_config: Tuple[str, dict],
) -> None:
"""
Integration test verifying OpenAI prompt caching works end-to-end.
Tests models that support both prompt_cache_key and prompt_cache_retention="24h".
Validates that these fields are accepted by OpenAI's API and produce cache hits.
Strategy:
1. Create an agent with a large persona block (>1024 tokens, OpenAI's caching threshold)
2. Send message 1 -> primes the cache (cached_input_tokens should be 0 or small)
3. Send message 2 -> should hit the cache (cached_input_tokens > 0)
The prompt_cache_key (letta:{agent_id}:{conversation_id}) improves cache routing
so that subsequent requests land on the same machine with warm KV tensors.
"""
from letta_client.types import CreateBlockParam
model_handle, model_settings = model_config
agent = await client.agents.create(
name=f"prompt-cache-test-{uuid.uuid4().hex[:8]}",
agent_type="letta_v1_agent",
model=model_handle,
model_settings=model_settings,
embedding="openai/text-embedding-3-small",
include_base_tools=False,
memory_blocks=[
CreateBlockParam(
label="persona",
value=_LARGE_PERSONA_BLOCK,
)
],
)
try:
# Message 1: Prime the cache. First request typically has cached_input_tokens=0.
response1 = await client.agents.messages.create(
agent_id=agent.id,
messages=[MessageCreateParam(role="user", content="Hello! Please introduce yourself briefly.")],
)
assert response1.usage is not None, "First message should return usage data"
assert response1.usage.prompt_tokens > 0, "First message should have prompt_tokens > 0"
logger.info(
f"[{model_handle}] Message 1 usage: "
f"prompt={response1.usage.prompt_tokens}, "
f"completion={response1.usage.completion_tokens}, "
f"cached_input={response1.usage.cached_input_tokens}"
)
# Verify we exceeded the 1024 token threshold for OpenAI caching
total_input_tokens = response1.usage.prompt_tokens + (response1.usage.cached_input_tokens or 0)
assert total_input_tokens >= 1024, f"Total input tokens ({total_input_tokens}) must be >= 1024 for OpenAI caching to activate"
# Message 2: Should hit the cache thanks to prompt_cache_key routing.
response2 = await client.agents.messages.create(
agent_id=agent.id,
messages=[MessageCreateParam(role="user", content="What are your main areas of expertise?")],
)
assert response2.usage is not None, "Second message should return usage data"
assert response2.usage.prompt_tokens > 0, "Second message should have prompt_tokens > 0"
logger.info(
f"[{model_handle}] Message 2 usage: "
f"prompt={response2.usage.prompt_tokens}, "
f"completion={response2.usage.completion_tokens}, "
f"cached_input={response2.usage.cached_input_tokens}"
)
# CRITICAL: The second message should show cached_input_tokens > 0.
# This proves that prompt_cache_key and prompt_cache_retention are being
# sent correctly and OpenAI is caching the prompt prefix.
cached_tokens = response2.usage.cached_input_tokens
assert cached_tokens is not None and cached_tokens > 0, (
f"[{model_handle}] Expected cached_input_tokens > 0 on second message, got {cached_tokens}. "
"This means prompt caching is not working (prompt_cache_key may not be sent or cache miss occurred)."
)
# Cache hit ratio should be significant (most of the system prompt should be cached)
total_input_msg2 = response2.usage.prompt_tokens + (response2.usage.cached_input_tokens or 0)
cache_hit_ratio = cached_tokens / total_input_msg2 if total_input_msg2 > 0 else 0
logger.info(f"[{model_handle}] Cache hit ratio: {cache_hit_ratio:.2%}")
assert cache_hit_ratio >= 0.20, (
f"[{model_handle}] Expected cache hit ratio >= 20%, got {cache_hit_ratio:.2%}. The large persona block should be mostly cached."
)
finally:
await client.agents.delete(agent.id)


@@ -0,0 +1,137 @@
from letta.llm_api.openai_client import OpenAIClient
from letta.schemas.enums import AgentType, MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message


def _message_with_ids(agent_id: str, conversation_id: str | None, text: str = "hello") -> Message:
return Message(
role=MessageRole.user,
content=[TextContent(text=text)],
agent_id=agent_id,
conversation_id=conversation_id,
)


def _openai_config(model: str, endpoint_type: str = "openai", provider_name: str | None = "openai") -> LLMConfig:
return LLMConfig(
model=model,
model_endpoint_type=endpoint_type,
model_endpoint="https://api.openai.com/v1",
context_window=256000,
provider_name=provider_name,
)


def test_responses_request_sets_prompt_cache_fields_for_supported_openai_model():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-5.1")
messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "input" in request_data
assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
assert request_data.get("prompt_cache_retention") == "24h"


def test_responses_request_uses_defaultconv_when_conversation_missing():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-5.1")
messages = [_message_with_ids(agent_id="agent-abc", conversation_id=None)]
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert request_data.get("prompt_cache_key") == "letta:agent-abc:defaultconv"
assert request_data.get("prompt_cache_retention") == "24h"


def test_responses_request_omits_24h_for_unsupported_extended_retention_model():
client = OpenAIClient()
llm_config = _openai_config(model="o3-mini")
messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
assert "prompt_cache_retention" not in request_data


def test_chat_completions_request_sets_prompt_cache_fields_for_supported_openai_model():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-4.1")
messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
request_data = client.build_request_data(
agent_type=AgentType.memgpt_v2_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "messages" in request_data
assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
assert request_data.get("prompt_cache_retention") == "24h"


def test_chat_completions_request_omits_24h_for_unsupported_extended_retention_model():
client = OpenAIClient()
llm_config = _openai_config(model="gpt-4o-mini")
messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
request_data = client.build_request_data(
agent_type=AgentType.memgpt_v2_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
assert "prompt_cache_retention" not in request_data


def test_openrouter_request_omits_prompt_cache_fields_on_both_paths():
client = OpenAIClient()
llm_config = LLMConfig(
model="gpt-5.1",
handle="openrouter/gpt-5.1",
model_endpoint_type="openai",
model_endpoint="https://openrouter.ai/api/v1",
context_window=256000,
provider_name="openrouter",
)
messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
responses_request_data = client.build_request_data(
agent_type=AgentType.letta_v1_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
chat_request_data = client.build_request_data(
agent_type=AgentType.memgpt_v2_agent,
messages=messages,
llm_config=llm_config,
tools=[],
)
assert "prompt_cache_key" not in responses_request_data
assert "prompt_cache_retention" not in responses_request_data
assert "prompt_cache_key" not in chat_request_data
assert "prompt_cache_retention" not in chat_request_data