"""Manager for handling direct LLM completions using agent configuration."""
|
|
|
|
import json
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Optional

from letta.errors import HandleNotFoundError, LLMError
from letta.llm_api.llm_client import LLMClient
from letta.log import get_logger
from letta.orm.errors import NoResultFound
from letta.schemas.enums import AgentType, MessageRole
from letta.schemas.letta_message_content import TextContent
from letta.schemas.message import Message
from letta.schemas.usage import LettaUsageStatistics
|
|
|
|
# Tool name used for structured output via tool forcing: when a caller supplies
# a response_schema, the schema is wrapped in a tool definition with this name
# and the model is forced to call it, so the tool-call arguments become the
# structured JSON output.
STRUCTURED_OUTPUT_TOOL_NAME = "structured_output"
|
|
|
|
# These imports are needed only for static type annotations; guarding them
# behind TYPE_CHECKING keeps them out of the runtime import graph. All
# annotations that use them below are written as string literals.
if TYPE_CHECKING:
    from letta.orm import User
    from letta.schemas.llm_config import LLMConfig
    from letta.server.server import SyncServer


# Module-scoped logger, namespaced by this module's name.
logger = get_logger(__name__)
|
|
|
|
|
|
def _schema_to_tool_definition(schema: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a JSON schema into a tool definition for forced tool calling.

    Args:
        schema: JSON schema object with 'properties' and optionally 'required'

    Returns:
        Tool definition dict compatible with OpenAI/Anthropic function calling format
    """
    properties = schema.get("properties", {})
    # When the schema omits 'required', treat every declared property as required.
    required = schema.get("required", list(properties.keys()))
    return {
        "name": STRUCTURED_OUTPUT_TOOL_NAME,
        "description": "Returns a structured response matching the requested schema.",
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }
|
|
|
|
|
|
@dataclass
class GenerateResponse:
    """Response from direct LLM generation.

    Plain data container; the dataclass decorator supplies the same
    ``__init__(content, model, usage)`` signature as before plus ``__repr__``
    and ``__eq__`` for debuggability.
    """

    # Response text; a JSON string when a response_schema forced structured output.
    content: str
    # Model identifier from the LLM config used for the request.
    model: str
    # Usage statistics extracted from the provider response.
    usage: "LettaUsageStatistics"
|
|
|
|
|
|
class AgentGenerateCompletionManager:
    """Manager for handling direct LLM completions using agent configuration."""

    def __init__(self, server: "SyncServer"):
        """
        Initialize the agent generate completion manager.

        Args:
            server: The SyncServer instance for accessing managers
        """
        self.server = server
        self.agent_manager = server.agent_manager
        self.provider_manager = server.provider_manager

    @staticmethod
    def _build_messages(prompt: str, system_prompt: Optional[str]) -> "list[Message]":
        """
        Build the system + user message pair for a direct completion request.

        A system message is always included because some providers (e.g. Anthropic)
        require one; empty strings are not allowed with cache_control, so a minimal
        default is used when no system_prompt is provided.

        Args:
            prompt: The user prompt text
            system_prompt: Optional system prompt; falls back to a minimal default

        Returns:
            Two-element list: [system message, user message]
        """
        return [
            Message(
                role=MessageRole.system,
                content=[TextContent(text=system_prompt if system_prompt else "You are a helpful assistant.")],
            ),
            Message(
                role=MessageRole.user,
                content=[TextContent(text=prompt)],
            ),
        ]

    @staticmethod
    def _extract_content(chat_completion, response_schema: Optional[Dict[str, Any]], agent_id: str) -> str:
        """
        Extract the response text (or structured-output JSON) from a chat completion.

        Args:
            chat_completion: Provider response in standard chat completion format
            response_schema: The schema used for tool forcing, if any
            agent_id: Agent ID, used only for warning-log context

        Returns:
            The message content, or the first tool call's JSON arguments string
            when structured output was requested. Empty string if no choices.
        """
        if not chat_completion.choices:
            return ""

        message = chat_completion.choices[0].message

        if response_schema:
            # When using structured output, extract from tool call arguments
            if message.tool_calls:
                # The tool call arguments contain the structured output as JSON string
                return message.tool_calls[0].function.arguments
            # Fallback: some providers may return in content even with tool forcing
            content = message.content or ""
            logger.warning(
                "Expected tool call for structured output but got content response",
                extra={"agent_id": str(agent_id), "content_length": len(content)},
            )
            return content

        return message.content or ""

    async def generate_completion_with_agent_config_async(
        self,
        agent_id: str,
        prompt: str,
        actor: "User",
        system_prompt: Optional[str] = None,
        override_model: Optional[str] = None,
        response_schema: Optional[Dict[str, Any]] = None,
    ) -> "GenerateResponse":
        """
        Generate a completion directly from the LLM provider using the agent's configuration.

        This method makes a direct request to the LLM provider without any agent processing:
        - No memory or context retrieval
        - No tool calling (unless response_schema is provided)
        - No message persistence
        - No agent state modification

        Args:
            agent_id: The agent ID whose configuration to use
            prompt: The prompt/message to send to the LLM
            actor: The user making the request
            system_prompt: Optional system prompt to prepend to the conversation
            override_model: Optional model handle to override the agent's default
                            (e.g., 'openai/gpt-4', 'anthropic/claude-3-5-sonnet')
            response_schema: Optional JSON schema for structured output. When provided,
                             the LLM will be forced to return a response matching this
                             schema via tool calling.

        Returns:
            GenerateResponse with content, model, and usage statistics.
            When response_schema is provided, content will be the JSON string
            matching the schema.

        Raises:
            NoResultFound: If agent not found
            HandleNotFoundError: If override_model is invalid
            LLMError: If LLM provider error occurs
        """
        # 1. Validate agent exists and user has access
        agent = await self.agent_manager.get_agent_by_id_async(
            agent_id,
            actor,
            include_relationships=[],
        )

        # 2. Get LLM config (with optional override)
        llm_config: "LLMConfig" = agent.llm_config
        if override_model:
            # Get full LLM config for the override model
            # This ensures we get the right provider, endpoint, credentials, etc.
            llm_config = await self.server.get_llm_config_from_handle_async(
                actor=actor,
                handle=override_model,
            )

        logger.info(
            f"Generating completion for agent {agent_id}",
            extra={
                "agent_id": str(agent_id),
                "override_model": override_model,
                "prompt_length": len(prompt),
                "has_system_prompt": system_prompt is not None,
                "has_response_schema": response_schema is not None,
                "model": llm_config.model,
            },
        )

        # 3. Build messages from prompt and optional system_prompt
        letta_messages = self._build_messages(prompt, system_prompt)

        # 4. Create LLM client for the provider
        llm_client = LLMClient.create(
            provider_type=llm_config.model_endpoint_type,
            actor=actor,
        )
        if llm_client is None:
            raise LLMError(f"Unsupported provider type: {llm_config.model_endpoint_type}")

        # 5. Build request data
        # If response_schema is provided, create a tool and force the model to call it
        tools = None
        force_tool_call = None
        if response_schema:
            tools = [_schema_to_tool_definition(response_schema)]
            force_tool_call = STRUCTURED_OUTPUT_TOOL_NAME

        # TODO: create a separate agent type
        effective_agent_type = AgentType.split_thread_agent if response_schema else agent.agent_type

        request_data = llm_client.build_request_data(
            agent_type=effective_agent_type,
            messages=letta_messages,
            llm_config=llm_config,
            tools=tools,
            force_tool_call=force_tool_call,
        )

        # 6. Make direct LLM request
        response_data = await llm_client.request_async(request_data, llm_config)

        # 7. Convert to standard chat completion format
        chat_completion = await llm_client.convert_response_to_chat_completion(
            response_data,
            letta_messages,
            llm_config,
        )

        # 8. Extract response content
        content = self._extract_content(chat_completion, response_schema, agent_id)

        # 9. Extract usage statistics
        usage = llm_client.extract_usage_statistics(response_data, llm_config)

        # 10. Build and return response
        return GenerateResponse(
            content=content,
            model=llm_config.model,
            usage=usage,
        )
|