diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 0d58fb9f..e4b78736 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -247,81 +247,6 @@ class OpenAIClient(LLMClientBase):
     def supports_structured_output(self, llm_config: LLMConfig) -> bool:
         return supports_structured_output(llm_config)
 
-    def _is_openrouter_request(self, llm_config: LLMConfig) -> bool:
-        return (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (llm_config.provider_name == "openrouter")
-
-    def _is_true_openai_request(self, llm_config: LLMConfig) -> bool:
-        if llm_config.model_endpoint_type != "openai":
-            return False
-
-        if self._is_openrouter_request(llm_config):
-            return False
-
-        # Keep Letta inference endpoint behavior unchanged.
-        if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT:
-            return False
-
-        # If provider_name is explicitly set and not openai, don't apply OpenAI-specific prompt caching fields.
-        if llm_config.provider_name and llm_config.provider_name != "openai":
-            return False
-
-        return True
-
-    def _normalize_model_name(self, model: Optional[str]) -> Optional[str]:
-        if not model:
-            return None
-        return model.split("/", 1)[-1]
-
-    def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool:
-        normalized_model = self._normalize_model_name(model)
-        if not normalized_model:
-            return False
-
-        # Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models but not gpt-5-mini or gpt-5.2-codex.
-        exceptions = ["gpt-5-mini", "gpt-5.2-codex"]
-        return normalized_model == "gpt-4.1" or normalized_model.startswith("gpt-5") and normalized_model not in exceptions
-
-    def _build_prompt_cache_key(self, messages: List[PydanticMessage]) -> Optional[str]:
-        agent_id = None
-        conversation_id = None
-
-        for message in reversed(messages):
-            if agent_id is None and getattr(message, "agent_id", None):
-                agent_id = message.agent_id
-            if conversation_id is None and getattr(message, "conversation_id", None):
-                conversation_id = message.conversation_id
-            if agent_id is not None and conversation_id is not None:
-                break
-
-        if agent_id is None:
-            agent_id = self._telemetry_agent_id
-
-        if agent_id is None:
-            return None
-
-        # Use requested fallback string for non-conversation/default-conversation paths.
-        if not conversation_id or conversation_id == "default":
-            conversation_id = "defaultconv"
-
-        return f"letta:{agent_id}:{conversation_id}"
-
-    def _apply_prompt_cache_settings(
-        self,
-        llm_config: LLMConfig,
-        model: Optional[str],
-        messages: List[PydanticMessage],
-        request_obj: Any,
-    ) -> None:
-        if not self._is_true_openai_request(llm_config):
-            return
-
-        prompt_cache_key = self._build_prompt_cache_key(messages)
-        if prompt_cache_key:
-            request_obj.prompt_cache_key = prompt_cache_key
-
-        if self._supports_extended_prompt_cache_retention(model):
-            request_obj.prompt_cache_retention = "24h"
-
     @trace_method
     def build_request_data_responses(
         self,
@@ -462,13 +387,6 @@ class OpenAIClient(LLMClientBase):
 
             data.model = "memgpt-openai"
 
-        self._apply_prompt_cache_settings(
-            llm_config=llm_config,
-            model=model,
-            messages=messages,
-            request_obj=data,
-        )
-
         request_data = data.model_dump(exclude_unset=True)
         # print("responses request data", request_data)
         return request_data
@@ -537,7 +455,9 @@ class OpenAIClient(LLMClientBase):
             model = None
 
         # TODO: we may need to extend this to more models using proxy?
-        is_openrouter = self._is_openrouter_request(llm_config)
+        is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
+            llm_config.provider_name == "openrouter"
+        )
         if is_openrouter:
             try:
                 model = llm_config.handle.split("/", 1)[-1]
@@ -640,13 +560,6 @@ class OpenAIClient(LLMClientBase):
                     new_tools.append(tool.model_copy(deep=True))
                 data.tools = new_tools
 
-        self._apply_prompt_cache_settings(
-            llm_config=llm_config,
-            model=model,
-            messages=messages,
-            request_obj=data,
-        )
-
         # Note: Tools are already processed by enable_strict_mode() in the workflow/agent code
         # (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here.
         # enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields
diff --git a/letta/schemas/openai/chat_completion_request.py b/letta/schemas/openai/chat_completion_request.py
index 2b271ee3..c0939257 100644
--- a/letta/schemas/openai/chat_completion_request.py
+++ b/letta/schemas/openai/chat_completion_request.py
@@ -143,8 +143,6 @@ class ChatCompletionRequest(BaseModel):
     temperature: Optional[float] = 1
     top_p: Optional[float] = 1
     user: Optional[str] = None  # unique ID of the end-user (for monitoring)
-    prompt_cache_key: Optional[str] = None
-    prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = None
     parallel_tool_calls: Optional[bool] = None
     instructions: Optional[str] = None
     verbosity: Optional[Literal["low", "medium", "high"]] = None  # For verbosity control in GPT-5 models
diff --git a/letta/schemas/openai/responses_request.py b/letta/schemas/openai/responses_request.py
index 5c4fbfe7..aeeefa23 100644
--- a/letta/schemas/openai/responses_request.py
+++ b/letta/schemas/openai/responses_request.py
@@ -30,7 +30,6 @@ class ResponsesRequest(BaseModel):
     previous_response_id: Optional[str] = Field(default=NOT_GIVEN)
     prompt: Optional[ResponsePromptParam] = Field(default=NOT_GIVEN)
     prompt_cache_key: Optional[str] = Field(default=NOT_GIVEN)
-    prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = Field(default=NOT_GIVEN)
     reasoning: Optional[Reasoning] = Field(default=NOT_GIVEN)
     safety_identifier: Optional[str] = Field(default=NOT_GIVEN)
     service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = Field(default=NOT_GIVEN)
diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py
index c59afad2..4ef0cd02 100644
--- a/tests/integration_test_send_message_v2.py
+++ b/tests/integration_test_send_message_v2.py
@@ -1189,217 +1189,3 @@ async def test_json_schema_response_format(
     finally:
         # Cleanup
         await client.agents.delete(agent_state.id)
-
-
-# Large memory block to exceed OpenAI's 1024 token caching threshold.
-# This ensures the system prompt is large enough for OpenAI to cache it.
-_LARGE_PERSONA_BLOCK = """
-You are an advanced AI assistant with extensive knowledge across multiple domains.
-
-# Core Capabilities
-
-## Technical Knowledge
-- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages
-- System Design: Deep understanding of distributed systems, microservices, and cloud architecture
-- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code
-- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases
-- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks
-
-## Problem Solving Approach
-When tackling problems, you follow a structured methodology:
-1. Understand the requirements thoroughly
-2. Break down complex problems into manageable components
-3. Consider multiple solution approaches
-4. Evaluate trade-offs between different options
-5. Implement solutions with clean, maintainable code
-6. Test thoroughly and iterate based on feedback
-
-## Communication Style
-- Clear and concise explanations
-- Use examples and analogies when helpful
-- Adapt technical depth to the audience
-- Ask clarifying questions when requirements are ambiguous
-- Provide context and rationale for recommendations
-
-# Domain Expertise
-
-## Web Development
-You have deep knowledge of:
-- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks
-- Backend: Node.js, Express, FastAPI, Django, Flask
-- API Design: REST, GraphQL, gRPC
-- Authentication: OAuth, JWT, session management
-- Performance: Caching strategies, CDNs, lazy loading
-
-## Data Engineering
-You understand:
-- ETL pipelines and data transformation
-- Data warehousing concepts (Snowflake, BigQuery, Redshift)
-- Stream processing (Kafka, Kinesis)
-- Data modeling and schema design
-- Data quality and validation
-
-## Cloud Platforms
-You're familiar with:
-- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation
-- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery
-- Azure: Virtual Machines, Blob Storage, Azure Functions
-- Serverless architectures and best practices
-- Cost optimization strategies
-
-## Security
-You consider:
-- Common vulnerabilities (OWASP Top 10)
-- Secure coding practices
-- Encryption and key management
-- Access control and authorization patterns
-- Security audit and compliance requirements
-
-# Interaction Principles
-
-## Helpfulness
-- Provide actionable guidance
-- Share relevant resources and documentation
-- Offer multiple approaches when appropriate
-- Point out potential pitfalls and edge cases
-
-## Accuracy
-- Verify information before sharing
-- Acknowledge uncertainty when appropriate
-- Correct mistakes promptly
-- Stay up-to-date with best practices
-
-## Efficiency
-- Get to the point quickly
-- Avoid unnecessary verbosity
-- Focus on what's most relevant
-- Provide code examples when they clarify concepts
-""" + "\n\n".join(
-    [
-        f"Section {i + 1}: "
-        + """
-You have deep expertise in software development, including but not limited to:
-- Programming languages: Python, JavaScript, TypeScript, Java, C++, Rust, Go, Swift, Kotlin, Ruby, PHP, Scala
-- Web frameworks: React, Vue, Angular, Django, Flask, FastAPI, Express, Next.js, Nuxt, SvelteKit, Remix, Astro
-- Databases: PostgreSQL, MySQL, MongoDB, Redis, Cassandra, DynamoDB, ElasticSearch, Neo4j, InfluxDB, TimescaleDB
-- Cloud platforms: AWS (EC2, S3, Lambda, ECS, EKS, RDS), GCP (Compute Engine, Cloud Run, GKE), Azure (VMs, Functions, AKS)
-- DevOps tools: Docker, Kubernetes, Terraform, Ansible, Jenkins, GitHub Actions, GitLab CI, CircleCI, ArgoCD
-- Testing frameworks: pytest, Jest, Mocha, JUnit, unittest, Cypress, Playwright, Selenium, TestNG, RSpec
-- Architecture patterns: Microservices, Event-driven, Serverless, Monolithic, CQRS, Event Sourcing, Hexagonal
-- API design: REST, GraphQL, gRPC, WebSockets, Server-Sent Events, tRPC, JSON-RPC
-"""
-        for i in range(4)
-    ]
-)
-
-# Models that support prompt_cache_key + prompt_cache_retention="24h":
-# gpt-4.1, gpt-5 family (but not gpt-5-mini or gpt-5.2-codex).
-_PROMPT_CACHE_RETENTION_PREFIXES = ("gpt-4.1", "gpt-5")
-
-PROMPT_CACHE_MODEL_CONFIGS: List[Tuple[str, dict]] = [
-    (handle, settings)
-    for handle, settings in TESTED_MODEL_CONFIGS
-    if settings.get("provider_type") == "openai" and any(handle.split("/")[-1].startswith(p) for p in _PROMPT_CACHE_RETENTION_PREFIXES)
-]
-
-
-@pytest.mark.skip(reason="the prompt caching is flaky")
-@pytest.mark.parametrize(
-    "model_config",
-    PROMPT_CACHE_MODEL_CONFIGS,
-    ids=[handle for handle, _ in PROMPT_CACHE_MODEL_CONFIGS],
-)
-@pytest.mark.asyncio(loop_scope="function")
-async def test_openai_prompt_cache_integration(
-    disable_e2b_api_key: Any,
-    client: AsyncLetta,
-    model_config: Tuple[str, dict],
-) -> None:
-    """
-    Integration test verifying OpenAI prompt caching works end-to-end.
-
-    Tests models that support both prompt_cache_key and prompt_cache_retention="24h".
-    Validates that these fields are accepted by OpenAI's API and produce cache hits.
-
-    Strategy:
-    1. Create an agent with a large persona block (>1024 tokens, OpenAI's caching threshold)
-    2. Send message 1 -> primes the cache (cached_input_tokens should be 0 or small)
-    3. Send message 2 -> should hit the cache (cached_input_tokens > 0)
-
-    The prompt_cache_key (letta:{agent_id}:{conversation_id}) improves cache routing
-    so that subsequent requests land on the same machine with warm KV tensors.
-    """
-    from letta_client.types import CreateBlockParam
-
-    model_handle, model_settings = model_config
-
-    agent = await client.agents.create(
-        name=f"prompt-cache-test-{uuid.uuid4().hex[:8]}",
-        agent_type="letta_v1_agent",
-        model=model_handle,
-        model_settings=model_settings,
-        embedding="openai/text-embedding-3-small",
-        include_base_tools=False,
-        memory_blocks=[
-            CreateBlockParam(
-                label="persona",
-                value=_LARGE_PERSONA_BLOCK,
-            )
-        ],
-    )
-
-    try:
-        # Message 1: Prime the cache. First request typically has cached_input_tokens=0.
-        response1 = await client.agents.messages.create(
-            agent_id=agent.id,
-            messages=[MessageCreateParam(role="user", content="Hello! Please introduce yourself briefly.")],
-        )
-        assert response1.usage is not None, "First message should return usage data"
-        assert response1.usage.prompt_tokens > 0, "First message should have prompt_tokens > 0"
-
-        logger.info(
-            f"[{model_handle}] Message 1 usage: "
-            f"prompt={response1.usage.prompt_tokens}, "
-            f"completion={response1.usage.completion_tokens}, "
-            f"cached_input={response1.usage.cached_input_tokens}"
-        )
-
-        # Verify we exceeded the 1024 token threshold for OpenAI caching
-        total_input_tokens = response1.usage.prompt_tokens + (response1.usage.cached_input_tokens or 0)
-        assert total_input_tokens >= 1024, f"Total input tokens ({total_input_tokens}) must be >= 1024 for OpenAI caching to activate"
-
-        # Message 2: Should hit the cache thanks to prompt_cache_key routing.
-        response2 = await client.agents.messages.create(
-            agent_id=agent.id,
-            messages=[MessageCreateParam(role="user", content="What are your main areas of expertise?")],
-        )
-        assert response2.usage is not None, "Second message should return usage data"
-        assert response2.usage.prompt_tokens > 0, "Second message should have prompt_tokens > 0"
-
-        logger.info(
-            f"[{model_handle}] Message 2 usage: "
-            f"prompt={response2.usage.prompt_tokens}, "
-            f"completion={response2.usage.completion_tokens}, "
-            f"cached_input={response2.usage.cached_input_tokens}"
-        )
-
-        # CRITICAL: The second message should show cached_input_tokens > 0.
-        # This proves that prompt_cache_key and prompt_cache_retention are being
-        # sent correctly and OpenAI is caching the prompt prefix.
-        cached_tokens = response2.usage.cached_input_tokens
-        assert cached_tokens is not None and cached_tokens > 0, (
-            f"[{model_handle}] Expected cached_input_tokens > 0 on second message, got {cached_tokens}. "
-            "This means prompt caching is not working (prompt_cache_key may not be sent or cache miss occurred)."
-        )
-
-        # Cache hit ratio should be significant (most of the system prompt should be cached)
-        total_input_msg2 = response2.usage.prompt_tokens + (response2.usage.cached_input_tokens or 0)
-        cache_hit_ratio = cached_tokens / total_input_msg2 if total_input_msg2 > 0 else 0
-        logger.info(f"[{model_handle}] Cache hit ratio: {cache_hit_ratio:.2%}")
-
-        assert cache_hit_ratio >= 0.20, (
-            f"[{model_handle}] Expected cache hit ratio >= 20%, got {cache_hit_ratio:.2%}. The large persona block should be mostly cached."
-        )
-
-    finally:
-        await client.agents.delete(agent.id)
diff --git a/tests/test_openai_prompt_cache_request_fields.py b/tests/test_openai_prompt_cache_request_fields.py
deleted file mode 100644
index 044f9e7a..00000000
--- a/tests/test_openai_prompt_cache_request_fields.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from letta.llm_api.openai_client import OpenAIClient
-from letta.schemas.enums import AgentType, MessageRole
-from letta.schemas.letta_message_content import TextContent
-from letta.schemas.llm_config import LLMConfig
-from letta.schemas.message import Message
-
-
-def _message_with_ids(agent_id: str, conversation_id: str | None, text: str = "hello") -> Message:
-    return Message(
-        role=MessageRole.user,
-        content=[TextContent(text=text)],
-        agent_id=agent_id,
-        conversation_id=conversation_id,
-    )
-
-
-def _openai_config(model: str, endpoint_type: str = "openai", provider_name: str | None = "openai") -> LLMConfig:
-    return LLMConfig(
-        model=model,
-        model_endpoint_type=endpoint_type,
-        model_endpoint="https://api.openai.com/v1",
-        context_window=256000,
-        provider_name=provider_name,
-    )
-
-
-def test_responses_request_sets_prompt_cache_fields_for_supported_openai_model():
-    client = OpenAIClient()
-    llm_config = _openai_config(model="gpt-5.1")
-    messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
-
-    request_data = client.build_request_data(
-        agent_type=AgentType.letta_v1_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-
-    assert "input" in request_data
-    assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
-    assert request_data.get("prompt_cache_retention") == "24h"
-
-
-def test_responses_request_uses_defaultconv_when_conversation_missing():
-    client = OpenAIClient()
-    llm_config = _openai_config(model="gpt-5.1")
-    messages = [_message_with_ids(agent_id="agent-abc", conversation_id=None)]
-
-    request_data = client.build_request_data(
-        agent_type=AgentType.letta_v1_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-
-    assert request_data.get("prompt_cache_key") == "letta:agent-abc:defaultconv"
-    assert request_data.get("prompt_cache_retention") == "24h"
-
-
-def test_responses_request_omits_24h_for_unsupported_extended_retention_model():
-    client = OpenAIClient()
-    llm_config = _openai_config(model="o3-mini")
-    messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
-
-    request_data = client.build_request_data(
-        agent_type=AgentType.letta_v1_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-
-    assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
-    assert "prompt_cache_retention" not in request_data
-
-
-def test_chat_completions_request_sets_prompt_cache_fields_for_supported_openai_model():
-    client = OpenAIClient()
-    llm_config = _openai_config(model="gpt-4.1")
-    messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
-
-    request_data = client.build_request_data(
-        agent_type=AgentType.memgpt_v2_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-
-    assert "messages" in request_data
-    assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
-    assert request_data.get("prompt_cache_retention") == "24h"
-
-
-def test_chat_completions_request_omits_24h_for_unsupported_extended_retention_model():
-    client = OpenAIClient()
-    llm_config = _openai_config(model="gpt-4o-mini")
-    messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
-
-    request_data = client.build_request_data(
-        agent_type=AgentType.memgpt_v2_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-
-    assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123"
-    assert "prompt_cache_retention" not in request_data
-
-
-def test_openrouter_request_omits_prompt_cache_fields_on_both_paths():
-    client = OpenAIClient()
-    llm_config = LLMConfig(
-        model="gpt-5.1",
-        handle="openrouter/gpt-5.1",
-        model_endpoint_type="openai",
-        model_endpoint="https://openrouter.ai/api/v1",
-        context_window=256000,
-        provider_name="openrouter",
-    )
-    messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")]
-
-    responses_request_data = client.build_request_data(
-        agent_type=AgentType.letta_v1_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-    chat_request_data = client.build_request_data(
-        agent_type=AgentType.memgpt_v2_agent,
-        messages=messages,
-        llm_config=llm_config,
-        tools=[],
-    )
-
-    assert "prompt_cache_key" not in responses_request_data
-    assert "prompt_cache_retention" not in responses_request_data
-    assert "prompt_cache_key" not in chat_request_data
-    assert "prompt_cache_retention" not in chat_request_data