diff --git a/fern/openapi.json b/fern/openapi.json index ede210fe..3284662a 100644 --- a/fern/openapi.json +++ b/fern/openapi.json @@ -8003,6 +8003,62 @@ } } }, + "/v1/agents/{agent_id}/generate": { + "post": { + "tags": ["agents"], + "summary": "Generate Completion", + "description": "Generate a completion directly from the LLM provider using the agent's configuration.\n\nThis endpoint makes a direct request to the LLM provider without any agent processing:\n- No memory or context retrieval\n- No tool calling\n- No message persistence\n- No agent state modification\n\nSimply provide a prompt, and the endpoint formats it as a user message.\nOptionally include a system_prompt for context/instructions.\n\nThe agent's LLM configuration (model, credentials, settings) is used by default.\nUse override_model to switch to a different model/provider while still using\nthe organization's configured providers.\n\nExample use cases:\n- Quick LLM queries without agent overhead\n- Testing different models with the same prompt\n- Simple chat completions using agent's credentials\n- Comparing model outputs on identical prompts", + "operationId": "generate_completion", + "parameters": [ + { + "name": "agent_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "minLength": 42, + "maxLength": 42, + "pattern": "^agent-[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$", + "description": "The ID of the agent in the format 'agent-'", + "examples": ["agent-123e4567-e89b-42d3-8456-426614174000"], + "title": "Agent Id" + }, + "description": "The ID of the agent in the format 'agent-'" + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GenerateRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful generation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GenerateResponse" + } + } + } + }, + "404": { + "description": "Agent not found" + }, + "422": { + "description": "Invalid request parameters" + }, + "502": { + "description": "LLM provider error" + } + } + } + }, "/v1/agents/messages/search": { "post": { "tags": ["agents"], @@ -33584,6 +33640,66 @@ "type": "object", "title": "GeminiThinkingConfig" }, + "GenerateRequest": { + "properties": { + "prompt": { + "type": "string", + "minLength": 1, + "title": "Prompt", + "description": "The prompt/message to send to the LLM" + }, + "system_prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "System Prompt", + "description": "Optional system prompt to prepend to the conversation" + }, + "override_model": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Override Model", + "description": "Model handle to use instead of agent's default (e.g., 'openai/gpt-4', 'anthropic/claude-3-5-sonnet')" + } + }, + "type": "object", + "required": ["prompt"], + "title": "GenerateRequest", + "description": "Request for direct LLM generation without agent processing." 
+ }, + "GenerateResponse": { + "properties": { + "content": { + "type": "string", + "title": "Content", + "description": "The LLM's response text" + }, + "model": { + "type": "string", + "title": "Model", + "description": "The model that generated this response" + }, + "usage": { + "$ref": "#/components/schemas/LettaUsageStatistics", + "description": "Token usage statistics" + } + }, + "type": "object", + "required": ["content", "model", "usage"], + "title": "GenerateResponse", + "description": "Response from direct LLM generation." + }, "GenerateToolInput": { "properties": { "tool_name": { diff --git a/letta/server/rest_api/routers/v1/agents.py b/letta/server/rest_api/routers/v1/agents.py index 4ee04ef4..34f4a27e 100644 --- a/letta/server/rest_api/routers/v1/agents.py +++ b/letta/server/rest_api/routers/v1/agents.py @@ -8,7 +8,7 @@ from fastapi import APIRouter, Body, Depends, File, Form, Header, HTTPException, from fastapi.responses import JSONResponse from marshmallow import ValidationError from orjson import orjson -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, field_validator from sqlalchemy.exc import IntegrityError, OperationalError from starlette.responses import Response, StreamingResponse @@ -24,11 +24,14 @@ from letta.errors import ( AgentExportProcessingError, AgentFileImportError, AgentNotFoundForExportError, + HandleNotFoundError, + LLMError, NoActiveRunsToCancelError, PendingApprovalError, ) from letta.groups.sleeptime_multi_agent_v4 import SleeptimeMultiAgentV4 from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns +from letta.llm_api.llm_client import LLMClient from letta.log import get_logger from letta.orm.errors import NoResultFound from letta.otel.context import get_ctx_attributes @@ -59,6 +62,7 @@ from letta.schemas.run import Run as PydanticRun, RunUpdate from letta.schemas.source import BaseSource, Source from letta.schemas.tool import BaseTool, Tool from letta.schemas.tool_execution_result import ToolExecutionResult +from letta.schemas.usage import LettaUsageStatistics from letta.schemas.user import User from letta.serialize_schemas.pydantic_agent_schema import AgentSchema from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server @@ -79,6 +83,43 @@ router = APIRouter(prefix="/agents", tags=["agents"]) logger = get_logger(__name__) +# Schemas for direct LLM generation endpoint +class GenerateRequest(BaseModel): + """Request for direct LLM generation without agent processing.""" + + prompt: str = Field( + ..., + description="The prompt/message to send to the LLM", + min_length=1, + ) + + system_prompt: Optional[str] = Field( + None, + description="Optional system prompt to prepend to the conversation", + ) + + override_model: Optional[str] = Field( + None, + description="Model handle to use instead of agent's default (e.g., 'openai/gpt-4', 'anthropic/claude-3-5-sonnet')", + ) + + @field_validator("prompt") + @classmethod + def validate_prompt_not_empty(cls, v: str) -> str: + """Ensure prompt is not empty or whitespace-only.""" + if not v or not v.strip(): + raise ValueError("prompt cannot be empty or whitespace-only") + return v + + +class GenerateResponse(BaseModel): + """Response from direct LLM generation.""" + + content: str = Field(..., description="The LLM's response text") + model: str = Field(..., description="The model that generated this response") + usage: LettaUsageStatistics = Field(..., description="Token usage statistics") + + 
@router.get("/", response_model=list[AgentState], operation_id="list_agents") async def list_agents( name: str | None = Query(None, description="Name of the agent"), @@ -1785,6 +1826,75 @@ async def cancel_message( return results +@router.post( + "/{agent_id}/generate", + response_model=GenerateResponse, + operation_id="generate_completion", + responses={ + 200: {"description": "Successful generation"}, + 404: {"description": "Agent not found"}, + 422: {"description": "Invalid request parameters"}, + 502: {"description": "LLM provider error"}, + }, +) +async def generate_completion( + agent_id: AgentId, + server: SyncServer = Depends(get_letta_server), + request: GenerateRequest = Body(...), + headers: HeaderParams = Depends(get_headers), +) -> GenerateResponse: + """ + Generate a completion directly from the LLM provider using the agent's configuration. + + This endpoint makes a direct request to the LLM provider without any agent processing: + - No memory or context retrieval + - No tool calling + - No message persistence + - No agent state modification + + Simply provide a prompt, and the endpoint formats it as a user message. + Optionally include a system_prompt for context/instructions. + + The agent's LLM configuration (model, credentials, settings) is used by default. + Use override_model to switch to a different model/provider while still using + the organization's configured providers. + + Example use cases: + - Quick LLM queries without agent overhead + - Testing different models with the same prompt + - Simple chat completions using agent's credentials + - Comparing model outputs on identical prompts + """ + # Get actor for permissions + actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) + + # Call the manager to generate the completion + try: + service_response = await server.agent_generate_completion_manager.generate_completion_with_agent_config_async( + agent_id=str(agent_id), + prompt=request.prompt, + system_prompt=request.system_prompt, + actor=actor, + override_model=request.override_model, + ) + except NoResultFound: + raise HTTPException(status_code=404, detail=f"Agent with ID {agent_id} not found") + except HandleNotFoundError: + raise HTTPException(status_code=404, detail=f"Model '{request.override_model}' not found or not accessible") + except LLMError as e: + raise HTTPException(status_code=502, detail=f"LLM provider error: {str(e)}") + except Exception as e: + logger.error(f"Failed to process LLM response: {str(e)}") + raise HTTPException(status_code=502, detail=f"Failed to process LLM response: {str(e)}") + + # Convert service response to API response model + return GenerateResponse( + content=service_response.content, + model=service_response.model, + usage=service_response.usage, + ) + + @router.post("/messages/search", response_model=List[MessageSearchResult], operation_id="search_messages") async def search_messages( request: MessageSearchRequest = Body(...), diff --git a/letta/server/server.py b/letta/server/server.py index 064767cc..405ecb59 100644 --- a/letta/server/server.py +++ b/letta/server/server.py @@ -191,6 +191,11 @@ class SyncServer(object): self.file_agent_manager = FileAgentManager() self.file_manager = FileManager() + # Import and initialize the agent generate completion manager + from letta.services.agent_generate_completion_manager import AgentGenerateCompletionManager + + self.agent_generate_completion_manager = AgentGenerateCompletionManager(server=self) + self.agent_serialization_manager = 
AgentSerializationManager( agent_manager=self.agent_manager, tool_manager=self.tool_manager, diff --git a/letta/services/agent_generate_completion_manager.py b/letta/services/agent_generate_completion_manager.py new file mode 100644 index 00000000..e11ee8bd --- /dev/null +++ b/letta/services/agent_generate_completion_manager.py @@ -0,0 +1,168 @@ +"""Manager for handling direct LLM completions using agent configuration.""" + +from typing import TYPE_CHECKING, Optional + +from letta.errors import HandleNotFoundError, LLMError +from letta.llm_api.llm_client import LLMClient +from letta.log import get_logger +from letta.orm.errors import NoResultFound +from letta.schemas.enums import MessageRole +from letta.schemas.letta_message_content import TextContent +from letta.schemas.message import Message +from letta.schemas.usage import LettaUsageStatistics + +if TYPE_CHECKING: + from letta.orm import User + from letta.schemas.llm_config import LLMConfig + from letta.server.server import SyncServer + +logger = get_logger(__name__) + + +class GenerateResponse: + """Response from direct LLM generation.""" + + def __init__(self, content: str, model: str, usage: LettaUsageStatistics): + self.content = content + self.model = model + self.usage = usage + + +class AgentGenerateCompletionManager: + """Manager for handling direct LLM completions using agent configuration.""" + + def __init__(self, server: "SyncServer"): + """ + Initialize the agent generate completion manager. + + Args: + server: The SyncServer instance for accessing managers + """ + self.server = server + self.agent_manager = server.agent_manager + self.provider_manager = server.provider_manager + + async def generate_completion_with_agent_config_async( + self, + agent_id: str, + prompt: str, + actor: "User", + system_prompt: Optional[str] = None, + override_model: Optional[str] = None, + ) -> GenerateResponse: + """ + Generate a completion directly from the LLM provider using the agent's configuration. + + This method makes a direct request to the LLM provider without any agent processing: + - No memory or context retrieval + - No tool calling + - No message persistence + - No agent state modification + + Args: + agent_id: The agent ID whose configuration to use + prompt: The prompt/message to send to the LLM + actor: The user making the request + system_prompt: Optional system prompt to prepend to the conversation + override_model: Optional model handle to override the agent's default + (e.g., 'openai/gpt-4', 'anthropic/claude-3-5-sonnet') + + Returns: + GenerateResponse with content, model, and usage statistics + + Raises: + NoResultFound: If agent not found + HandleNotFoundError: If override_model is invalid + LLMError: If LLM provider error occurs + """ + # 1. Validate agent exists and user has access + agent = await self.agent_manager.get_agent_by_id_async( + agent_id, + actor, + include_relationships=[], + ) + + # 2. Get LLM config (with optional override) + llm_config: "LLMConfig" = agent.llm_config + if override_model: + # Get full LLM config for the override model + # This ensures we get the right provider, endpoint, credentials, etc. + llm_config = await self.server.get_llm_config_from_handle_async( + actor=actor, + handle=override_model, + ) + + logger.info( + f"Generating completion for agent {agent_id}", + extra={ + "agent_id": str(agent_id), + "override_model": override_model, + "prompt_length": len(prompt), + "has_system_prompt": system_prompt is not None, + "model": llm_config.model, + }, + ) + + # 3. 
Build messages from prompt and optional system_prompt + letta_messages = [] + + # Always add a system message (required by some providers like Anthropic) + # Use provided system_prompt or minimal default (empty strings not allowed with cache_control) + letta_messages.append( + Message( + role=MessageRole.system, + content=[TextContent(text=system_prompt if system_prompt else "You are a helpful assistant.")], + ) + ) + + # Add user prompt + letta_messages.append( + Message( + role=MessageRole.user, + content=[TextContent(text=prompt)], + ) + ) + + # 4. Create LLM client for the provider + llm_client = LLMClient.create( + provider_type=llm_config.model_endpoint_type, + actor=actor, + ) + + if llm_client is None: + raise LLMError(f"Unsupported provider type: {llm_config.model_endpoint_type}") + + # 5. Build request data (no tools, no function calling) + request_data = llm_client.build_request_data( + agent_type=agent.agent_type, + messages=letta_messages, + llm_config=llm_config, + tools=None, # No tools for direct generation + force_tool_call=None, + ) + + # 6. Make direct LLM request + response_data = await llm_client.request_async(request_data, llm_config) + + # 7. Convert to standard chat completion format + chat_completion = await llm_client.convert_response_to_chat_completion( + response_data, + letta_messages, + llm_config, + ) + + # 8. Extract response content + content = "" + if chat_completion.choices and len(chat_completion.choices) > 0: + message = chat_completion.choices[0].message + content = message.content or "" + + # 9. Extract usage statistics + usage = llm_client.extract_usage_statistics(response_data, llm_config) + + # 10. Build and return response + return GenerateResponse( + content=content, + model=llm_config.model, + usage=usage, + ) diff --git a/tests/test_client.py b/tests/test_client.py index 811241a9..0ee428ec 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -4,6 +4,7 @@ import threading import uuid from http.server import BaseHTTPRequestHandler, HTTPServer +import httpx import pytest from dotenv import load_dotenv from letta_client import APIError, Letta @@ -895,3 +896,208 @@ def test_attach_sleeptime_block(client: Letta): # cleanup client.agents.delete(agent.id) + + +# -------------------------------------------------------------------------------------------------------------------- +# Agent Generate Endpoint Tests +# -------------------------------------------------------------------------------------------------------------------- + + +def test_agent_generate_basic(client: Letta, agent: AgentState): + """Test basic generate endpoint with simple prompt.""" + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={"prompt": "What is 2+2?"}, + timeout=30.0, + ) + + # Verify successful response + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + response_data = response.json() + + # Verify response structure + assert response_data is not None + assert "content" in response_data + assert "model" in response_data + assert "usage" in response_data + + # Verify content is returned + assert response_data["content"] is not None + assert len(response_data["content"]) > 0 + assert isinstance(response_data["content"], str) + + # Verify model is set + assert response_data["model"] is not None + assert isinstance(response_data["model"], str) + + # Verify usage statistics + assert response_data["usage"] is not None + assert response_data["usage"]["total_tokens"] > 0 + 
assert response_data["usage"]["prompt_tokens"] > 0 + assert response_data["usage"]["completion_tokens"] > 0 + + +def test_agent_generate_with_system_prompt(client: Letta, agent: AgentState): + """Test generate endpoint with system prompt.""" + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={ + "prompt": "What is your role?", + "system_prompt": "You are a helpful math tutor who always responds with exactly 5 words.", + }, + timeout=30.0, + ) + + # Verify successful response + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + response_data = response.json() + + # Verify response + assert response_data is not None + assert response_data["content"] is not None + assert len(response_data["content"]) > 0 + + # Verify usage includes system prompt tokens + assert response_data["usage"]["prompt_tokens"] > 10 # Should include system prompt tokens + + +def test_agent_generate_with_model_override(client: Letta, agent: AgentState): + """Test generate endpoint with model override.""" + # Get the agent's current model + original_model = agent.llm_config.model + + # Use OpenAI model (more likely to be available in test environment) + override_model_handle = "openai/gpt-4o-mini" + + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={ + "prompt": "Say hello", + "override_model": override_model_handle, + }, + timeout=30.0, + ) + + # Verify successful response + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + response_data = response.json() + + # Verify response + assert response_data is not None + assert response_data["content"] is not None + + # Verify the override model was used (model name should be different from original) + # Note: The actual model name in response might be the full model name, not the handle + assert response_data["model"] is not None + + +def test_agent_generate_empty_prompt_error(client: Letta, agent: AgentState): + """Test that empty prompt returns validation error.""" + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={"prompt": ""}, # Empty prompt should fail validation + timeout=30.0, + ) + + # Verify it's a validation error (422) + assert response.status_code == 422, f"Expected 422, got {response.status_code}: {response.text}" + + +def test_agent_generate_whitespace_prompt_error(client: Letta, agent: AgentState): + """Test that whitespace-only prompt returns validation error.""" + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={"prompt": " \n\t "}, # Whitespace-only prompt should fail validation + timeout=30.0, + ) + + # Verify it's a validation error (422) + assert response.status_code == 422, f"Expected 422, got {response.status_code}: {response.text}" + + +def test_agent_generate_invalid_agent_id(client: Letta): + """Test that invalid agent ID returns 404.""" + # Use properly formatted agent ID that doesn't exist + fake_agent_id = "agent-00000000-0000-4000-8000-000000000000" + + response = httpx.post( + f"{client._client._base_url}/v1/agents/{fake_agent_id}/generate", + json={"prompt": "Hello"}, + timeout=30.0, + ) + + # Verify it's a not found error (404) + assert response.status_code == 404, f"Expected 404, got {response.status_code}: {response.text}" + assert "not found" in response.text.lower() + + +def test_agent_generate_invalid_model_override(client: Letta, agent: AgentState): + """Test that 
invalid model override returns 404.""" + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={ + "prompt": "Hello", + "override_model": "invalid/model-that-does-not-exist", + }, + timeout=30.0, + ) + + # Verify it's a not found error (404) + assert response.status_code == 404, f"Expected 404, got {response.status_code}: {response.text}" + assert "not found" in response.text.lower() or "not accessible" in response.text.lower() + + +def test_agent_generate_long_prompt(client: Letta, agent: AgentState): + """Test generate endpoint with a longer prompt.""" + # Create a longer prompt + long_prompt = " ".join(["This is a test sentence."] * 50) + + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={"prompt": long_prompt}, + timeout=30.0, + ) + + # Verify successful response + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + response_data = response.json() + + # Verify response + assert response_data is not None + assert response_data["content"] is not None + + # Verify token usage reflects the longer prompt + assert response_data["usage"]["prompt_tokens"] > 100 # Should have substantial prompt tokens + + +def test_agent_generate_no_persistence(client: Letta, agent: AgentState): + """Test that generate endpoint does not persist messages to agent.""" + # Get initial message count + initial_messages = client.agents.messages.list(agent_id=agent.id).items + initial_count = len(initial_messages) + + # Make a generate request + response = httpx.post( + f"{client._client._base_url}/v1/agents/{agent.id}/generate", + json={"prompt": "This should not be saved to agent memory"}, + timeout=30.0, + ) + + # Verify successful response + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + response_data = response.json() + + # Verify response was generated + assert response_data is not None + assert response_data["content"] is not None + + # Verify no new messages were added to the agent + final_messages = client.agents.messages.list(agent_id=agent.id).items + final_count = len(final_messages) + + assert final_count == initial_count, "Generate endpoint should not persist messages"
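
For reference, a minimal sketch of calling the new /v1/agents/{agent_id}/generate endpoint outside the test suite, mirroring the request/response shapes added in this diff. The base URL, agent ID, and the commented-out override_model handle are placeholder assumptions (the tests above reach the same route via client._client._base_url); adjust them, and add any auth headers your deployment requires.

import httpx

BASE_URL = "http://localhost:8283"  # assumption: locally running Letta server
AGENT_ID = "agent-123e4567-e89b-42d3-8456-426614174000"  # example ID from the OpenAPI spec

response = httpx.post(
    f"{BASE_URL}/v1/agents/{AGENT_ID}/generate",
    json={
        "prompt": "Summarize the plot of Hamlet in two sentences.",
        "system_prompt": "You are a concise literary assistant.",  # optional
        # "override_model": "openai/gpt-4o-mini",  # optional: use another configured provider/model
    },
    timeout=30.0,
)
response.raise_for_status()

data = response.json()
print(data["model"])                  # model that produced the completion
print(data["content"])                # raw LLM response text (no agent processing or persistence)
print(data["usage"]["total_tokens"])  # LettaUsageStatistics: prompt/completion/total token counts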