diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index 0d58fb9f..e4b78736 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -247,81 +247,6 @@ class OpenAIClient(LLMClientBase): def supports_structured_output(self, llm_config: LLMConfig) -> bool: return supports_structured_output(llm_config) - def _is_openrouter_request(self, llm_config: LLMConfig) -> bool: - return (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (llm_config.provider_name == "openrouter") - - def _is_true_openai_request(self, llm_config: LLMConfig) -> bool: - if llm_config.model_endpoint_type != "openai": - return False - - if self._is_openrouter_request(llm_config): - return False - - # Keep Letta inference endpoint behavior unchanged. - if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT: - return False - - # If provider_name is explicitly set and not openai, don't apply OpenAI-specific prompt caching fields. - if llm_config.provider_name and llm_config.provider_name != "openai": - return False - - return True - - def _normalize_model_name(self, model: Optional[str]) -> Optional[str]: - if not model: - return None - return model.split("/", 1)[-1] - - def _supports_extended_prompt_cache_retention(self, model: Optional[str]) -> bool: - normalized_model = self._normalize_model_name(model) - if not normalized_model: - return False - - # Per OpenAI docs: extended retention is available on gpt-4.1 and gpt-5 family models but not gpt-5-mini or gpt-5.2-codex. - exceptions = ["gpt-5-mini", "gpt-5.2-codex"] - return normalized_model == "gpt-4.1" or normalized_model.startswith("gpt-5") and normalized_model not in exceptions - - def _build_prompt_cache_key(self, messages: List[PydanticMessage]) -> Optional[str]: - agent_id = None - conversation_id = None - - for message in reversed(messages): - if agent_id is None and getattr(message, "agent_id", None): - agent_id = message.agent_id - if conversation_id is None and getattr(message, "conversation_id", None): - conversation_id = message.conversation_id - if agent_id is not None and conversation_id is not None: - break - - if agent_id is None: - agent_id = self._telemetry_agent_id - - if agent_id is None: - return None - - # Use requested fallback string for non-conversation/default-conversation paths. - if not conversation_id or conversation_id == "default": - conversation_id = "defaultconv" - - return f"letta:{agent_id}:{conversation_id}" - - def _apply_prompt_cache_settings( - self, - llm_config: LLMConfig, - model: Optional[str], - messages: List[PydanticMessage], - request_obj: Any, - ) -> None: - if not self._is_true_openai_request(llm_config): - return - - prompt_cache_key = self._build_prompt_cache_key(messages) - if prompt_cache_key: - request_obj.prompt_cache_key = prompt_cache_key - - if self._supports_extended_prompt_cache_retention(model): - request_obj.prompt_cache_retention = "24h" - @trace_method def build_request_data_responses( self, @@ -462,13 +387,6 @@ class OpenAIClient(LLMClientBase): data.model = "memgpt-openai" - self._apply_prompt_cache_settings( - llm_config=llm_config, - model=model, - messages=messages, - request_obj=data, - ) - request_data = data.model_dump(exclude_unset=True) # print("responses request data", request_data) return request_data @@ -537,7 +455,9 @@ class OpenAIClient(LLMClientBase): model = None # TODO: we may need to extend this to more models using proxy? - is_openrouter = self._is_openrouter_request(llm_config) + is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or ( + llm_config.provider_name == "openrouter" + ) if is_openrouter: try: model = llm_config.handle.split("/", 1)[-1] @@ -640,13 +560,6 @@ class OpenAIClient(LLMClientBase): new_tools.append(tool.model_copy(deep=True)) data.tools = new_tools - self._apply_prompt_cache_settings( - llm_config=llm_config, - model=model, - messages=messages, - request_obj=data, - ) - # Note: Tools are already processed by enable_strict_mode() in the workflow/agent code # (temporal_letta_v1_agent_workflow.py or letta_agent_v3.py) before reaching here. # enable_strict_mode() handles: strict flag, additionalProperties, required array, nullable fields diff --git a/letta/schemas/openai/chat_completion_request.py b/letta/schemas/openai/chat_completion_request.py index 2b271ee3..c0939257 100644 --- a/letta/schemas/openai/chat_completion_request.py +++ b/letta/schemas/openai/chat_completion_request.py @@ -143,8 +143,6 @@ class ChatCompletionRequest(BaseModel): temperature: Optional[float] = 1 top_p: Optional[float] = 1 user: Optional[str] = None # unique ID of the end-user (for monitoring) - prompt_cache_key: Optional[str] = None - prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = None parallel_tool_calls: Optional[bool] = None instructions: Optional[str] = None verbosity: Optional[Literal["low", "medium", "high"]] = None # For verbosity control in GPT-5 models diff --git a/letta/schemas/openai/responses_request.py b/letta/schemas/openai/responses_request.py index 5c4fbfe7..aeeefa23 100644 --- a/letta/schemas/openai/responses_request.py +++ b/letta/schemas/openai/responses_request.py @@ -30,7 +30,6 @@ class ResponsesRequest(BaseModel): previous_response_id: Optional[str] = Field(default=NOT_GIVEN) prompt: Optional[ResponsePromptParam] = Field(default=NOT_GIVEN) prompt_cache_key: Optional[str] = Field(default=NOT_GIVEN) - prompt_cache_retention: Optional[Literal["in_memory", "24h"]] = Field(default=NOT_GIVEN) reasoning: Optional[Reasoning] = Field(default=NOT_GIVEN) safety_identifier: Optional[str] = Field(default=NOT_GIVEN) service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = Field(default=NOT_GIVEN) diff --git a/tests/integration_test_send_message_v2.py b/tests/integration_test_send_message_v2.py index c59afad2..4ef0cd02 100644 --- a/tests/integration_test_send_message_v2.py +++ b/tests/integration_test_send_message_v2.py @@ -1189,217 +1189,3 @@ async def test_json_schema_response_format( finally: # Cleanup await client.agents.delete(agent_state.id) - - -# Large memory block to exceed OpenAI's 1024 token caching threshold. -# This ensures the system prompt is large enough for OpenAI to cache it. -_LARGE_PERSONA_BLOCK = """ -You are an advanced AI assistant with extensive knowledge across multiple domains. - -# Core Capabilities - -## Technical Knowledge -- Software Engineering: Expert in Python, JavaScript, TypeScript, Go, Rust, and many other languages -- System Design: Deep understanding of distributed systems, microservices, and cloud architecture -- DevOps: Proficient in Docker, Kubernetes, CI/CD pipelines, and infrastructure as code -- Databases: Experience with SQL (PostgreSQL, MySQL) and NoSQL (MongoDB, Redis, Cassandra) databases -- Machine Learning: Knowledge of neural networks, transformers, and modern ML frameworks - -## Problem Solving Approach -When tackling problems, you follow a structured methodology: -1. Understand the requirements thoroughly -2. Break down complex problems into manageable components -3. Consider multiple solution approaches -4. Evaluate trade-offs between different options -5. Implement solutions with clean, maintainable code -6. Test thoroughly and iterate based on feedback - -## Communication Style -- Clear and concise explanations -- Use examples and analogies when helpful -- Adapt technical depth to the audience -- Ask clarifying questions when requirements are ambiguous -- Provide context and rationale for recommendations - -# Domain Expertise - -## Web Development -You have deep knowledge of: -- Frontend: React, Vue, Angular, Next.js, modern CSS frameworks -- Backend: Node.js, Express, FastAPI, Django, Flask -- API Design: REST, GraphQL, gRPC -- Authentication: OAuth, JWT, session management -- Performance: Caching strategies, CDNs, lazy loading - -## Data Engineering -You understand: -- ETL pipelines and data transformation -- Data warehousing concepts (Snowflake, BigQuery, Redshift) -- Stream processing (Kafka, Kinesis) -- Data modeling and schema design -- Data quality and validation - -## Cloud Platforms -You're familiar with: -- AWS: EC2, S3, Lambda, RDS, DynamoDB, CloudFormation -- GCP: Compute Engine, Cloud Storage, Cloud Functions, BigQuery -- Azure: Virtual Machines, Blob Storage, Azure Functions -- Serverless architectures and best practices -- Cost optimization strategies - -## Security -You consider: -- Common vulnerabilities (OWASP Top 10) -- Secure coding practices -- Encryption and key management -- Access control and authorization patterns -- Security audit and compliance requirements - -# Interaction Principles - -## Helpfulness -- Provide actionable guidance -- Share relevant resources and documentation -- Offer multiple approaches when appropriate -- Point out potential pitfalls and edge cases - -## Accuracy -- Verify information before sharing -- Acknowledge uncertainty when appropriate -- Correct mistakes promptly -- Stay up-to-date with best practices - -## Efficiency -- Get to the point quickly -- Avoid unnecessary verbosity -- Focus on what's most relevant -- Provide code examples when they clarify concepts -""" + "\n\n".join( - [ - f"Section {i + 1}: " - + """ -You have deep expertise in software development, including but not limited to: -- Programming languages: Python, JavaScript, TypeScript, Java, C++, Rust, Go, Swift, Kotlin, Ruby, PHP, Scala -- Web frameworks: React, Vue, Angular, Django, Flask, FastAPI, Express, Next.js, Nuxt, SvelteKit, Remix, Astro -- Databases: PostgreSQL, MySQL, MongoDB, Redis, Cassandra, DynamoDB, ElasticSearch, Neo4j, InfluxDB, TimescaleDB -- Cloud platforms: AWS (EC2, S3, Lambda, ECS, EKS, RDS), GCP (Compute Engine, Cloud Run, GKE), Azure (VMs, Functions, AKS) -- DevOps tools: Docker, Kubernetes, Terraform, Ansible, Jenkins, GitHub Actions, GitLab CI, CircleCI, ArgoCD -- Testing frameworks: pytest, Jest, Mocha, JUnit, unittest, Cypress, Playwright, Selenium, TestNG, RSpec -- Architecture patterns: Microservices, Event-driven, Serverless, Monolithic, CQRS, Event Sourcing, Hexagonal -- API design: REST, GraphQL, gRPC, WebSockets, Server-Sent Events, tRPC, JSON-RPC -""" - for i in range(4) - ] -) - -# Models that support prompt_cache_key + prompt_cache_retention="24h": -# gpt-4.1, gpt-5 family (but not gpt-5-mini or gpt-5.2-codex). -_PROMPT_CACHE_RETENTION_PREFIXES = ("gpt-4.1", "gpt-5") - -PROMPT_CACHE_MODEL_CONFIGS: List[Tuple[str, dict]] = [ - (handle, settings) - for handle, settings in TESTED_MODEL_CONFIGS - if settings.get("provider_type") == "openai" and any(handle.split("/")[-1].startswith(p) for p in _PROMPT_CACHE_RETENTION_PREFIXES) -] - - -@pytest.mark.skip(reason="the prompt caching is flaky") -@pytest.mark.parametrize( - "model_config", - PROMPT_CACHE_MODEL_CONFIGS, - ids=[handle for handle, _ in PROMPT_CACHE_MODEL_CONFIGS], -) -@pytest.mark.asyncio(loop_scope="function") -async def test_openai_prompt_cache_integration( - disable_e2b_api_key: Any, - client: AsyncLetta, - model_config: Tuple[str, dict], -) -> None: - """ - Integration test verifying OpenAI prompt caching works end-to-end. - - Tests models that support both prompt_cache_key and prompt_cache_retention="24h". - Validates that these fields are accepted by OpenAI's API and produce cache hits. - - Strategy: - 1. Create an agent with a large persona block (>1024 tokens, OpenAI's caching threshold) - 2. Send message 1 -> primes the cache (cached_input_tokens should be 0 or small) - 3. Send message 2 -> should hit the cache (cached_input_tokens > 0) - - The prompt_cache_key (letta:{agent_id}:{conversation_id}) improves cache routing - so that subsequent requests land on the same machine with warm KV tensors. - """ - from letta_client.types import CreateBlockParam - - model_handle, model_settings = model_config - - agent = await client.agents.create( - name=f"prompt-cache-test-{uuid.uuid4().hex[:8]}", - agent_type="letta_v1_agent", - model=model_handle, - model_settings=model_settings, - embedding="openai/text-embedding-3-small", - include_base_tools=False, - memory_blocks=[ - CreateBlockParam( - label="persona", - value=_LARGE_PERSONA_BLOCK, - ) - ], - ) - - try: - # Message 1: Prime the cache. First request typically has cached_input_tokens=0. - response1 = await client.agents.messages.create( - agent_id=agent.id, - messages=[MessageCreateParam(role="user", content="Hello! Please introduce yourself briefly.")], - ) - assert response1.usage is not None, "First message should return usage data" - assert response1.usage.prompt_tokens > 0, "First message should have prompt_tokens > 0" - - logger.info( - f"[{model_handle}] Message 1 usage: " - f"prompt={response1.usage.prompt_tokens}, " - f"completion={response1.usage.completion_tokens}, " - f"cached_input={response1.usage.cached_input_tokens}" - ) - - # Verify we exceeded the 1024 token threshold for OpenAI caching - total_input_tokens = response1.usage.prompt_tokens + (response1.usage.cached_input_tokens or 0) - assert total_input_tokens >= 1024, f"Total input tokens ({total_input_tokens}) must be >= 1024 for OpenAI caching to activate" - - # Message 2: Should hit the cache thanks to prompt_cache_key routing. - response2 = await client.agents.messages.create( - agent_id=agent.id, - messages=[MessageCreateParam(role="user", content="What are your main areas of expertise?")], - ) - assert response2.usage is not None, "Second message should return usage data" - assert response2.usage.prompt_tokens > 0, "Second message should have prompt_tokens > 0" - - logger.info( - f"[{model_handle}] Message 2 usage: " - f"prompt={response2.usage.prompt_tokens}, " - f"completion={response2.usage.completion_tokens}, " - f"cached_input={response2.usage.cached_input_tokens}" - ) - - # CRITICAL: The second message should show cached_input_tokens > 0. - # This proves that prompt_cache_key and prompt_cache_retention are being - # sent correctly and OpenAI is caching the prompt prefix. - cached_tokens = response2.usage.cached_input_tokens - assert cached_tokens is not None and cached_tokens > 0, ( - f"[{model_handle}] Expected cached_input_tokens > 0 on second message, got {cached_tokens}. " - "This means prompt caching is not working (prompt_cache_key may not be sent or cache miss occurred)." - ) - - # Cache hit ratio should be significant (most of the system prompt should be cached) - total_input_msg2 = response2.usage.prompt_tokens + (response2.usage.cached_input_tokens or 0) - cache_hit_ratio = cached_tokens / total_input_msg2 if total_input_msg2 > 0 else 0 - logger.info(f"[{model_handle}] Cache hit ratio: {cache_hit_ratio:.2%}") - - assert cache_hit_ratio >= 0.20, ( - f"[{model_handle}] Expected cache hit ratio >= 20%, got {cache_hit_ratio:.2%}. The large persona block should be mostly cached." - ) - - finally: - await client.agents.delete(agent.id) diff --git a/tests/test_openai_prompt_cache_request_fields.py b/tests/test_openai_prompt_cache_request_fields.py deleted file mode 100644 index 044f9e7a..00000000 --- a/tests/test_openai_prompt_cache_request_fields.py +++ /dev/null @@ -1,137 +0,0 @@ -from letta.llm_api.openai_client import OpenAIClient -from letta.schemas.enums import AgentType, MessageRole -from letta.schemas.letta_message_content import TextContent -from letta.schemas.llm_config import LLMConfig -from letta.schemas.message import Message - - -def _message_with_ids(agent_id: str, conversation_id: str | None, text: str = "hello") -> Message: - return Message( - role=MessageRole.user, - content=[TextContent(text=text)], - agent_id=agent_id, - conversation_id=conversation_id, - ) - - -def _openai_config(model: str, endpoint_type: str = "openai", provider_name: str | None = "openai") -> LLMConfig: - return LLMConfig( - model=model, - model_endpoint_type=endpoint_type, - model_endpoint="https://api.openai.com/v1", - context_window=256000, - provider_name=provider_name, - ) - - -def test_responses_request_sets_prompt_cache_fields_for_supported_openai_model(): - client = OpenAIClient() - llm_config = _openai_config(model="gpt-5.1") - messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")] - - request_data = client.build_request_data( - agent_type=AgentType.letta_v1_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - - assert "input" in request_data - assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123" - assert request_data.get("prompt_cache_retention") == "24h" - - -def test_responses_request_uses_defaultconv_when_conversation_missing(): - client = OpenAIClient() - llm_config = _openai_config(model="gpt-5.1") - messages = [_message_with_ids(agent_id="agent-abc", conversation_id=None)] - - request_data = client.build_request_data( - agent_type=AgentType.letta_v1_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - - assert request_data.get("prompt_cache_key") == "letta:agent-abc:defaultconv" - assert request_data.get("prompt_cache_retention") == "24h" - - -def test_responses_request_omits_24h_for_unsupported_extended_retention_model(): - client = OpenAIClient() - llm_config = _openai_config(model="o3-mini") - messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")] - - request_data = client.build_request_data( - agent_type=AgentType.letta_v1_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - - assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123" - assert "prompt_cache_retention" not in request_data - - -def test_chat_completions_request_sets_prompt_cache_fields_for_supported_openai_model(): - client = OpenAIClient() - llm_config = _openai_config(model="gpt-4.1") - messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")] - - request_data = client.build_request_data( - agent_type=AgentType.memgpt_v2_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - - assert "messages" in request_data - assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123" - assert request_data.get("prompt_cache_retention") == "24h" - - -def test_chat_completions_request_omits_24h_for_unsupported_extended_retention_model(): - client = OpenAIClient() - llm_config = _openai_config(model="gpt-4o-mini") - messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")] - - request_data = client.build_request_data( - agent_type=AgentType.memgpt_v2_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - - assert request_data.get("prompt_cache_key") == "letta:agent-abc:conversation-123" - assert "prompt_cache_retention" not in request_data - - -def test_openrouter_request_omits_prompt_cache_fields_on_both_paths(): - client = OpenAIClient() - llm_config = LLMConfig( - model="gpt-5.1", - handle="openrouter/gpt-5.1", - model_endpoint_type="openai", - model_endpoint="https://openrouter.ai/api/v1", - context_window=256000, - provider_name="openrouter", - ) - messages = [_message_with_ids(agent_id="agent-abc", conversation_id="conversation-123")] - - responses_request_data = client.build_request_data( - agent_type=AgentType.letta_v1_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - chat_request_data = client.build_request_data( - agent_type=AgentType.memgpt_v2_agent, - messages=messages, - llm_config=llm_config, - tools=[], - ) - - assert "prompt_cache_key" not in responses_request_data - assert "prompt_cache_retention" not in responses_request_data - assert "prompt_cache_key" not in chat_request_data - assert "prompt_cache_retention" not in chat_request_data