Files
letta-server/letta/schemas/model.py
jnjpng a5bac26556 fix: add "max" to AnthropicModelSettings effort type (#9754)
* fix: add "max" to AnthropicModelSettings effort type

The effort field on AnthropicModelSettings only accepted
"low" | "medium" | "high", but the LLMConfig.effort field and the
Anthropic API both support "max" for Opus 4.6. This type mismatch
caused Pydantic validation to reject conversation updates that set
effort to "max" (mapped from xhigh in letta-code).

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta Code <noreply@letta.com>

* generate

---------

Co-authored-by: Letta Code <noreply@letta.com>
2026-03-03 18:34:15 -08:00

534 lines
25 KiB
Python

from typing import Annotated, Literal, Optional, Union
from pydantic import BaseModel, Field
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import ProviderCategory, ProviderType
from letta.schemas.llm_config import LLMConfig
from letta.schemas.response_format import ResponseFormatUnion
class ModelBase(BaseModel):
    """Common identity fields shared by LLM and embedding model schemas."""

    # Unique, human-readable identifier used to reference the model via the API.
    handle: str = Field(..., description="Unique handle for API reference (format: provider_display_name/model_display_name)")
    # The provider-side model identifier (e.g. what gets sent in API requests).
    name: str = Field(..., description="The actual model name used by the provider")
    display_name: str = Field(..., description="Display name for the model shown in UI")
    provider_type: ProviderType = Field(..., description="The type of the provider")
    provider_name: str = Field(..., description="The name of the provider")
    # Discriminator narrowed to "llm" / "embedding" in the concrete subclasses.
    model_type: Literal["llm", "embedding"] = Field(..., description="Type of model (llm or embedding)")
class Model(LLMConfig, ModelBase):
    """An LLM model as exposed by the API.

    Combines the new ``ModelBase`` identity fields (``handle``, ``name``,
    ``provider_type``, ...) with every legacy ``LLMConfig`` field. The legacy
    fields are redeclared here (marked ``deprecated``) so older clients that
    still read them keep working.
    """

    model_type: Literal["llm"] = Field("llm", description="Type of model (llm or embedding)")
    max_context_window: int = Field(..., description="The maximum context window for the model")
    # supports_token_streaming: Optional[bool] = Field(None, description="Whether token streaming is supported")
    # supports_tool_calling: Optional[bool] = Field(None, description="Whether tool calling is supported")

    # Deprecated fields from LLMConfig - use new field names instead
    model: str = Field(..., description="Deprecated: Use 'name' field instead. LLM model name.", deprecated=True)
    model_endpoint_type: Literal[
        "openai",
        "anthropic",
        "google_ai",
        "google_vertex",
        "azure",
        "groq",
        "ollama",
        "webui",
        "webui-legacy",
        "lmstudio",
        "lmstudio-legacy",
        "lmstudio-chatcompletions",
        "llamacpp",
        "koboldcpp",
        "vllm",
        "hugging-face",
        "minimax",
        "mistral",
        "together",
        "bedrock",
        "deepseek",
        "xai",
        "zai",
        "openrouter",
        "chatgpt_oauth",
    ] = Field(..., description="Deprecated: Use 'provider_type' field instead. The endpoint type for the model.", deprecated=True)
    context_window: int = Field(
        ..., description="Deprecated: Use 'max_context_window' field instead. The context window size for the model.", deprecated=True
    )
    # Additional deprecated LLMConfig fields - kept for backward compatibility
    model_endpoint: Optional[str] = Field(None, description="Deprecated: The endpoint for the model.", deprecated=True)
    model_wrapper: Optional[str] = Field(None, description="Deprecated: The wrapper for the model.", deprecated=True)
    put_inner_thoughts_in_kwargs: Optional[bool] = Field(
        True, description="Deprecated: Puts 'inner_thoughts' as a kwarg in the function call.", deprecated=True
    )
    temperature: float = Field(0.7, description="Deprecated: The temperature to use when generating text with the model.", deprecated=True)
    max_tokens: Optional[int] = Field(None, description="Deprecated: The maximum number of tokens to generate.", deprecated=True)
    enable_reasoner: bool = Field(
        True,
        description="Deprecated: Whether or not the model should use extended thinking if it is a 'reasoning' style model.",
        deprecated=True,
    )
    reasoning_effort: Optional[Literal["none", "minimal", "low", "medium", "high", "xhigh"]] = Field(
        None, description="Deprecated: The reasoning effort to use when generating text reasoning models.", deprecated=True
    )
    max_reasoning_tokens: int = Field(0, description="Deprecated: Configurable thinking budget for extended thinking.", deprecated=True)
    frequency_penalty: Optional[float] = Field(
        None,
        description="Deprecated: Positive values penalize new tokens based on their existing frequency in the text so far.",
        deprecated=True,
    )
    compatibility_type: Optional[Literal["gguf", "mlx"]] = Field(
        None, description="Deprecated: The framework compatibility type for the model.", deprecated=True
    )
    verbosity: Optional[Literal["low", "medium", "high"]] = Field(
        None, description="Deprecated: Soft control for how verbose model output should be.", deprecated=True
    )
    tier: Optional[str] = Field(None, description="Deprecated: The cost tier for the model (cloud only).", deprecated=True)
    parallel_tool_calls: Optional[bool] = Field(
        False, description="Deprecated: If set to True, enables parallel tool calling.", deprecated=True
    )
    provider_category: Optional[ProviderCategory] = Field(
        None, description="Deprecated: The provider category for the model.", deprecated=True
    )

    @classmethod
    def from_llm_config(cls, llm_config: "LLMConfig") -> "Model":
        """Create a Model instance from an LLMConfig.

        New identity fields are derived from the config, falling back to
        "<provider_name>/<model>" for the handle and to the model name /
        endpoint type when display_name / provider_name are unset. All
        deprecated fields are copied through verbatim for backward
        compatibility.
        """
        return cls(
            # New fields
            handle=llm_config.handle or f"{llm_config.provider_name}/{llm_config.model}",
            name=llm_config.model,
            display_name=llm_config.display_name or llm_config.model,
            provider_type=llm_config.model_endpoint_type,
            provider_name=llm_config.provider_name or llm_config.model_endpoint_type,
            model_type="llm",
            max_context_window=llm_config.context_window,
            # Deprecated fields (copy from LLMConfig for backward compatibility)
            model=llm_config.model,
            model_endpoint_type=llm_config.model_endpoint_type,
            model_endpoint=llm_config.model_endpoint,
            model_wrapper=llm_config.model_wrapper,
            context_window=llm_config.context_window,
            put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
            temperature=llm_config.temperature,
            max_tokens=llm_config.max_tokens,
            enable_reasoner=llm_config.enable_reasoner,
            reasoning_effort=llm_config.reasoning_effort,
            max_reasoning_tokens=llm_config.max_reasoning_tokens,
            effort=llm_config.effort,
            frequency_penalty=llm_config.frequency_penalty,
            compatibility_type=llm_config.compatibility_type,
            verbosity=llm_config.verbosity,
            tier=llm_config.tier,
            parallel_tool_calls=llm_config.parallel_tool_calls,
            provider_category=llm_config.provider_category,
        )

    @property
    def model_settings_schema(self) -> Optional[dict]:
        """Returns the JSON schema for the ModelSettings class corresponding to this model's provider.

        Returns None when the provider has no dedicated ModelSettings class
        (e.g. ollama, vllm, and other local/OpenAI-compatible endpoints).
        """
        PROVIDER_SETTINGS_MAP = {
            ProviderType.openai: OpenAIModelSettings,
            ProviderType.anthropic: AnthropicModelSettings,
            ProviderType.google_ai: GoogleAIModelSettings,
            ProviderType.google_vertex: GoogleVertexModelSettings,
            ProviderType.azure: AzureModelSettings,
            ProviderType.xai: XAIModelSettings,
            ProviderType.zai: ZAIModelSettings,
            ProviderType.groq: GroqModelSettings,
            ProviderType.deepseek: DeepseekModelSettings,
            ProviderType.together: TogetherModelSettings,
            ProviderType.bedrock: BedrockModelSettings,
            ProviderType.openrouter: OpenRouterModelSettings,
            # Fix: chatgpt_oauth is in ModelSettingsUnion and the endpoint-type
            # literal, but was missing here, so its settings schema was
            # silently reported as None.
            ProviderType.chatgpt_oauth: ChatGPTOAuthModelSettings,
        }
        settings_class = PROVIDER_SETTINGS_MAP.get(self.provider_type)
        return settings_class.model_json_schema() if settings_class else None
class EmbeddingModel(EmbeddingConfig, ModelBase):
    """An embedding model as exposed by the API.

    Combines the new ``ModelBase`` identity fields with every legacy
    ``EmbeddingConfig`` field. The legacy fields are redeclared here (marked
    ``deprecated``) so older clients that still read them keep working.
    """

    model_type: Literal["embedding"] = Field("embedding", description="Type of model (llm or embedding)")
    embedding_dim: int = Field(..., description="The dimension of the embedding")
    # Deprecated fields from EmbeddingConfig - use new field names instead
    embedding_model: str = Field(..., description="Deprecated: Use 'name' field instead. Embedding model name.", deprecated=True)
    embedding_endpoint_type: Literal[
        "openai",
        "anthropic",
        "bedrock",
        "google_ai",
        "google_vertex",
        "azure",
        "groq",
        "ollama",
        "webui",
        "webui-legacy",
        "lmstudio",
        "lmstudio-legacy",
        "llamacpp",
        "koboldcpp",
        "vllm",
        "hugging-face",
        "mistral",
        "together",
        "pinecone",
    ] = Field(..., description="Deprecated: Use 'provider_type' field instead. The endpoint type for the embedding model.", deprecated=True)
    # Additional deprecated EmbeddingConfig fields - kept for backward compatibility
    embedding_endpoint: Optional[str] = Field(None, description="Deprecated: The endpoint for the model.", deprecated=True)
    embedding_chunk_size: Optional[int] = Field(300, description="Deprecated: The chunk size of the embedding.", deprecated=True)
    batch_size: int = Field(32, description="Deprecated: The maximum batch size for processing embeddings.", deprecated=True)
    azure_endpoint: Optional[str] = Field(None, description="Deprecated: The Azure endpoint for the model.", deprecated=True)
    azure_version: Optional[str] = Field(None, description="Deprecated: The Azure version for the model.", deprecated=True)
    azure_deployment: Optional[str] = Field(None, description="Deprecated: The Azure deployment for the model.", deprecated=True)

    @classmethod
    def from_embedding_config(cls, embedding_config: "EmbeddingConfig") -> "EmbeddingModel":
        """Create an EmbeddingModel instance from an EmbeddingConfig.

        New identity fields are derived from the config (the handle falls back
        to "<endpoint_type>/<model>"; display/provider names fall back to the
        model name and endpoint type). Deprecated fields are copied through
        verbatim for backward compatibility.
        """
        return cls(
            # New fields
            handle=embedding_config.handle or f"{embedding_config.embedding_endpoint_type}/{embedding_config.embedding_model}",
            name=embedding_config.embedding_model,
            display_name=embedding_config.embedding_model,
            provider_type=embedding_config.embedding_endpoint_type,
            provider_name=embedding_config.embedding_endpoint_type,
            model_type="embedding",
            embedding_dim=embedding_config.embedding_dim,
            # Deprecated fields (copy from EmbeddingConfig for backward compatibility)
            embedding_model=embedding_config.embedding_model,
            embedding_endpoint_type=embedding_config.embedding_endpoint_type,
            embedding_endpoint=embedding_config.embedding_endpoint,
            embedding_chunk_size=embedding_config.embedding_chunk_size,
            batch_size=embedding_config.batch_size,
            azure_endpoint=embedding_config.azure_endpoint,
            azure_version=embedding_config.azure_version,
            azure_deployment=embedding_config.azure_deployment,
        )
class ModelSettings(BaseModel):
    """Schema for defining settings for a model.

    Base class for the per-provider *ModelSettings classes below; holds the
    knobs common to all providers.
    """

    # model: str = Field(..., description="The name of the model.")
    max_output_tokens: int = Field(4096, description="The maximum number of tokens the model can generate.")
    parallel_tool_calls: bool = Field(True, description="Whether to enable parallel tool calling.")
class OpenAIReasoning(BaseModel):
    """Reasoning configuration for OpenAI reasoning-capable models."""

    reasoning_effort: Literal["none", "minimal", "low", "medium", "high", "xhigh"] = Field(
        "minimal", description="The reasoning effort to use when generating text reasoning models"
    )
    # TODO: implement support for this
    # summary: Optional[Literal["auto", "detailed"]] = Field(
    #     None, description="The reasoning summary level to use when generating text reasoning models"
    # )
class OpenAIModelSettings(ModelSettings):
    """OpenAI model configuration."""

    provider_type: Literal[ProviderType.openai] = Field(ProviderType.openai, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    # NOTE(review): the default is a shared OpenAIReasoning instance rather than a
    # default_factory — presumably safe because pydantic copies defaults per
    # instance; confirm before mutating nested defaults in place.
    reasoning: OpenAIReasoning = Field(OpenAIReasoning(reasoning_effort="high"), description="The reasoning configuration for the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")
    # OpenAI supports strict mode for tool calling - defaults to True
    strict: bool = Field(
        True,
        description="Enable strict mode for tool calling. When true, tool outputs are guaranteed to match JSON schemas.",
    )
    # TODO: implement support for these
    # reasoning_summary: Optional[Literal["none", "short", "detailed"]] = Field(
    #     None, description="The reasoning summary level to use when generating text reasoning models"
    # )
    # max_tool_calls: int = Field(10, description="The maximum number of tool calls the model can make.")
    # parallel_tool_calls: bool = Field(False, description="Whether the model supports parallel tool calls.")
    # top_logprobs: int = Field(10, description="The number of top logprobs to return.")
    # top_p: float = Field(1.0, description="The top-p value to use when generating text.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "reasoning_effort": self.reasoning.reasoning_effort,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": self.strict,
        }
# "thinking": {
# "type": "enabled",
# "budget_tokens": 10000
# }
class AnthropicThinking(BaseModel):
    """Extended-thinking configuration for Anthropic models (mirrors the API's `thinking` payload)."""

    type: Literal["enabled", "disabled"] = Field("enabled", description="The type of thinking to use.")
    budget_tokens: int = Field(1024, description="The maximum number of tokens the model can use for extended thinking.")
class AnthropicModelSettings(ModelSettings):
    """Anthropic model configuration."""

    provider_type: Literal[ProviderType.anthropic] = Field(ProviderType.anthropic, description="The type of the provider.")
    temperature: float = Field(1.0, description="The temperature of the model.")
    thinking: AnthropicThinking = Field(
        AnthropicThinking(type="enabled", budget_tokens=1024), description="The thinking configuration for the model."
    )
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")
    # gpt-5 models only
    # NOTE(review): a GPT-5 verbosity knob on the Anthropic settings class looks
    # carried over from OpenAIModelSettings — confirm it is intentional.
    verbosity: Optional[Literal["low", "medium", "high"]] = Field(
        None,
        description="Soft control for how verbose model output should be, used for GPT-5 models.",
    )
    # Effort parameter for Opus 4.5, Opus 4.6, and Sonnet 4.6
    # "max" must stay in this literal: LLMConfig.effort and the Anthropic API
    # accept it for Opus 4.6 (see commit a5bac26556).
    effort: Optional[Literal["low", "medium", "high", "max"]] = Field(
        None,
        description="Effort level for supported Anthropic models (controls token spending). 'max' is only available on Opus 4.6. Not setting this gives similar performance to 'high'.",
    )
    # Anthropic supports strict mode for tool calling - defaults to False
    strict: bool = Field(
        False,
        description="Enable strict mode for tool calling. When true, tool outputs are guaranteed to match JSON schemas.",
    )
    # TODO: implement support for these
    # top_k: Optional[int] = Field(None, description="The number of top tokens to return.")
    # top_p: Optional[float] = Field(None, description="The top-p value to use when generating text.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            # Flatten the nested thinking config into the two legacy fields.
            "extended_thinking": self.thinking.type == "enabled",
            "max_reasoning_tokens": self.thinking.budget_tokens,
            "verbosity": self.verbosity,
            "parallel_tool_calls": self.parallel_tool_calls,
            "effort": self.effort,
            "response_format": self.response_format,
            "strict": self.strict,
        }
class GeminiThinkingConfig(BaseModel):
    """Thinking configuration for Google Gemini models."""

    include_thoughts: bool = Field(True, description="Whether to include thoughts in the model's response.")
    thinking_budget: int = Field(1024, description="The thinking budget for the model.")
class GoogleAIModelSettings(ModelSettings):
    """Google AI (Gemini API) model configuration."""

    provider_type: Literal[ProviderType.google_ai] = Field(ProviderType.google_ai, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    thinking_config: GeminiThinkingConfig = Field(
        GeminiThinkingConfig(include_thoughts=True, thinking_budget=1024), description="The thinking configuration for the model."
    )
    response_schema: Optional[ResponseFormatUnion] = Field(None, description="The response schema for the model.")
    # Overrides the base-class default of 4096.
    max_output_tokens: int = Field(65536, description="The maximum number of tokens the model can generate.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            # Budget only applies when thoughts are included; otherwise report 0.
            "max_reasoning_tokens": self.thinking_config.thinking_budget if self.thinking_config.include_thoughts else 0,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # Google AI does not support strict mode
        }
class GoogleVertexModelSettings(GoogleAIModelSettings):
    """Google Vertex AI model configuration (same settings as Google AI, different provider type)."""

    provider_type: Literal[ProviderType.google_vertex] = Field(ProviderType.google_vertex, description="The type of the provider.")
class AzureModelSettings(ModelSettings):
    """Azure OpenAI model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.azure] = Field(ProviderType.azure, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # Azure does not support strict mode
        }
class XAIModelSettings(ModelSettings):
    """xAI model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.xai] = Field(ProviderType.xai, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # xAI does not support strict mode
        }
class ZAIThinking(BaseModel):
    """Thinking configuration for ZAI GLM-4.5+ models."""

    type: Literal["enabled", "disabled"] = Field("enabled", description="Whether thinking is enabled or disabled.")
    clear_thinking: bool = Field(False, description="If False, preserved thinking is used (recommended for agents).")
class ZAIModelSettings(ModelSettings):
    """Z.ai (ZhipuAI) model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.zai] = Field(ProviderType.zai, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")
    thinking: ZAIThinking = Field(
        ZAIThinking(type="enabled", clear_thinking=False), description="The thinking configuration for GLM-4.5+ models."
    )

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # ZAI does not support strict mode
            "extended_thinking": self.thinking.type == "enabled",
        }
class GroqModelSettings(ModelSettings):
    """Groq model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.groq] = Field(ProviderType.groq, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # Groq does not support strict mode
        }
class DeepseekModelSettings(ModelSettings):
    """Deepseek model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.deepseek] = Field(ProviderType.deepseek, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # Deepseek does not support strict mode
        }
class TogetherModelSettings(ModelSettings):
    """Together AI model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.together] = Field(ProviderType.together, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # Together does not support strict mode
        }
class BedrockModelSettings(ModelSettings):
    """AWS Bedrock model configuration."""

    provider_type: Literal[ProviderType.bedrock] = Field(ProviderType.bedrock, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # Bedrock does not support strict mode
        }
class OpenRouterModelSettings(ModelSettings):
    """OpenRouter model configuration (OpenAI-compatible)."""

    provider_type: Literal[ProviderType.openrouter] = Field(ProviderType.openrouter, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.")

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "response_format": self.response_format,
            "parallel_tool_calls": self.parallel_tool_calls,
            "strict": False,  # OpenRouter does not support strict mode
        }
class ChatGPTOAuthReasoning(BaseModel):
    """Reasoning configuration for ChatGPT OAuth models (GPT-5.x, o-series)."""

    # NOTE(review): unlike OpenAIReasoning, this literal omits "minimal" —
    # presumably the ChatGPT backend does not accept it; confirm.
    reasoning_effort: Literal["none", "low", "medium", "high", "xhigh"] = Field(
        "medium", description="The reasoning effort level for GPT-5.x and o-series models."
    )
class ChatGPTOAuthModelSettings(ModelSettings):
    """ChatGPT OAuth model configuration (uses ChatGPT backend API)."""

    provider_type: Literal[ProviderType.chatgpt_oauth] = Field(ProviderType.chatgpt_oauth, description="The type of the provider.")
    temperature: float = Field(0.7, description="The temperature of the model.")
    reasoning: ChatGPTOAuthReasoning = Field(
        ChatGPTOAuthReasoning(reasoning_effort="medium"), description="The reasoning configuration for the model."
    )

    def _to_legacy_config_params(self) -> dict:
        """Map these settings onto the legacy LLMConfig parameter names."""
        return {
            "temperature": self.temperature,
            "max_tokens": self.max_output_tokens,
            "reasoning_effort": self.reasoning.reasoning_effort,
            "parallel_tool_calls": self.parallel_tool_calls,
        }
# Discriminated union over every per-provider settings class; pydantic selects
# the concrete class from the "provider_type" field when validating payloads.
ModelSettingsUnion = Annotated[
    Union[
        OpenAIModelSettings,
        AnthropicModelSettings,
        GoogleAIModelSettings,
        GoogleVertexModelSettings,
        AzureModelSettings,
        XAIModelSettings,
        ZAIModelSettings,
        GroqModelSettings,
        DeepseekModelSettings,
        TogetherModelSettings,
        BedrockModelSettings,
        OpenRouterModelSettings,
        ChatGPTOAuthModelSettings,
    ],
    Field(discriminator="provider_type"),
]