letta-server/letta/services/file_manager.py

import asyncio
import os
from datetime import datetime, timedelta, timezone
from typing import List, Optional
from sqlalchemy import func, select, update
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import selectinload
from letta.constants import MAX_FILENAME_LENGTH
from letta.helpers.pinecone_utils import list_pinecone_index_for_files, should_use_pinecone
from letta.log import get_logger
from letta.orm.errors import NoResultFound
from letta.orm.file import FileContent as FileContentModel
from letta.orm.file import FileMetadata as FileMetadataModel
from letta.orm.sqlalchemy_base import AccessType
from letta.otel.tracing import trace_method
from letta.schemas.enums import FileProcessingStatus
from letta.schemas.file import FileMetadata as PydanticFileMetadata
from letta.schemas.source import Source as PydanticSource
from letta.schemas.source_metadata import FileStats, OrganizationSourcesStats, SourceStats
from letta.schemas.user import User as PydanticUser
from letta.server.db import db_registry
from letta.settings import settings
from letta.utils import enforce_types
logger = get_logger(__name__)
class DuplicateFileError(Exception):
"""Raised when a duplicate file is encountered and error handling is specified"""
def __init__(self, filename: str, source_name: str):
self.filename = filename
self.source_name = source_name
super().__init__(f"File '{filename}' already exists in source '{source_name}'")
class FileManager:
"""Manager class to handle business logic related to files."""
async def _invalidate_file_caches(self, file_id: str, actor: PydanticUser, original_filename: Optional[str] = None, source_id: Optional[str] = None):
"""Invalidate all caches related to a file."""
# TEMPORARILY DISABLED - caching is disabled
# # invalidate file content cache (all variants)
# await self.get_file_by_id.cache_invalidate(self, file_id, actor, include_content=True)
# await self.get_file_by_id.cache_invalidate(self, file_id, actor, include_content=False)
# # invalidate filename-based cache if we have the info
# if original_filename and source_id:
# await self.get_file_by_original_name_and_source.cache_invalidate(self, original_filename, source_id, actor)
@enforce_types
@trace_method
async def create_file(
self,
file_metadata: PydanticFileMetadata,
actor: PydanticUser,
*,
text: Optional[str] = None,
) -> PydanticFileMetadata:
# short-circuit if it already exists
existing = await self.get_file_by_id(file_metadata.id, actor=actor)
if existing:
return existing
async with db_registry.async_session() as session:
try:
file_metadata.organization_id = actor.organization_id
file_orm = FileMetadataModel(**file_metadata.model_dump(to_orm=True, exclude_none=True))
await file_orm.create_async(session, actor=actor, no_commit=True)
if text is not None:
content_orm = FileContentModel(file_id=file_orm.id, text=text)
await content_orm.create_async(session, actor=actor, no_commit=True)
await session.commit()
await session.refresh(file_orm)
# invalidate cache for this new file
await self._invalidate_file_caches(file_orm.id, actor, file_orm.original_file_name, file_orm.source_id)
return await file_orm.to_pydantic_async()
except IntegrityError:
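# a concurrent request may have inserted the same file id; fall back to returning the existing row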
await session.rollback()
return await self.get_file_by_id(file_metadata.id, actor=actor)
# TODO: actor is optional for now, but it should most likely be required for security reasons
@enforce_types
@trace_method
# @async_redis_cache(
# key_func=lambda self, file_id, actor=None, include_content=False, strip_directory_prefix=False: f"{file_id}:{actor.organization_id if actor else 'none'}:{include_content}:{strip_directory_prefix}",
# prefix="file_content",
# ttl_s=3600,
# model_class=PydanticFileMetadata,
# )
async def get_file_by_id(
self, file_id: str, actor: Optional[PydanticUser] = None, *, include_content: bool = False, strip_directory_prefix: bool = False
) -> Optional[PydanticFileMetadata]:
"""Retrieve a file by its ID.
If `include_content=True`, the FileContent relationship is eagerly
loaded so `to_pydantic(include_content=True)` never triggers a
lazy SELECT (avoids MissingGreenlet).
"""
async with db_registry.async_session() as session:
try:
if include_content:
# explicit eager load
query = (
select(FileMetadataModel).where(FileMetadataModel.id == file_id).options(selectinload(FileMetadataModel.content))
)
# apply org-scoping if actor provided
if actor:
query = FileMetadataModel.apply_access_predicate(
query,
actor,
access=["read"],
access_type=AccessType.ORGANIZATION,
)
result = await session.execute(query)
file_orm = result.scalar_one()
else:
# fast path (metadata only)
file_orm = await FileMetadataModel.read_async(
db_session=session,
identifier=file_id,
actor=actor,
)
return await file_orm.to_pydantic_async(include_content=include_content, strip_directory_prefix=strip_directory_prefix)
except NoResultFound:
return None
@enforce_types
@trace_method
async def update_file_status(
self,
*,
file_id: str,
actor: PydanticUser,
processing_status: Optional[FileProcessingStatus] = None,
error_message: Optional[str] = None,
total_chunks: Optional[int] = None,
chunks_embedded: Optional[int] = None,
enforce_state_transitions: bool = True,
) -> Optional[PydanticFileMetadata]:
"""
Update processing_status, error_message, total_chunks, and/or chunks_embedded on a FileMetadata row.
Enforces state transition rules (when enforce_state_transitions=True):
- PENDING -> PARSING -> EMBEDDING -> COMPLETED (normal flow)
- Any non-terminal state -> ERROR
- Same-state transitions are allowed (e.g., EMBEDDING -> EMBEDDING)
- ERROR and COMPLETED are terminal (no status transitions allowed, metadata updates blocked)
Args:
file_id: ID of the file to update
actor: User performing the update
processing_status: New processing status to set
error_message: Error message to set (if any)
total_chunks: Total number of chunks in the file
chunks_embedded: Number of chunks already embedded
enforce_state_transitions: Whether to enforce state transition rules (default: True).
Set to False to bypass validation for testing or special cases.
Returns:
Updated file metadata, or None if the update was blocked
* 1st round-trip → UPDATE with optional state validation
* 2nd round-trip → SELECT fresh row (same as read_async) if update succeeded
"""
if processing_status is None and error_message is None and total_chunks is None and chunks_embedded is None:
raise ValueError("Nothing to update")
# validate that ERROR status must have an error message
if processing_status == FileProcessingStatus.ERROR and not error_message:
raise ValueError("Error message is required when setting processing status to ERROR")
values: dict[str, object] = {"updated_at": datetime.utcnow()}
if processing_status is not None:
values["processing_status"] = processing_status
if error_message is not None:
values["error_message"] = error_message
if total_chunks is not None:
values["total_chunks"] = total_chunks
if chunks_embedded is not None:
values["chunks_embedded"] = chunks_embedded
# validate state transitions before making any database calls
if enforce_state_transitions and processing_status == FileProcessingStatus.PENDING:
# PENDING cannot be set after initial creation
raise ValueError(f"Cannot transition to PENDING state for file {file_id} - PENDING is only valid as initial state")
async with db_registry.async_session() as session:
# build where conditions
where_conditions = [
FileMetadataModel.id == file_id,
FileMetadataModel.organization_id == actor.organization_id,
]
# only add state transition validation if enforce_state_transitions is True
if enforce_state_transitions and processing_status is not None:
# enforce specific transitions based on target status
if processing_status == FileProcessingStatus.PARSING:
where_conditions.append(
FileMetadataModel.processing_status.in_([FileProcessingStatus.PENDING, FileProcessingStatus.PARSING])
)
elif processing_status == FileProcessingStatus.EMBEDDING:
where_conditions.append(
FileMetadataModel.processing_status.in_([FileProcessingStatus.PARSING, FileProcessingStatus.EMBEDDING])
)
elif processing_status == FileProcessingStatus.COMPLETED:
where_conditions.append(
FileMetadataModel.processing_status.in_([FileProcessingStatus.EMBEDDING, FileProcessingStatus.COMPLETED])
)
elif processing_status == FileProcessingStatus.ERROR:
# ERROR can be set from any non-terminal state
where_conditions.append(
FileMetadataModel.processing_status.notin_([FileProcessingStatus.ERROR, FileProcessingStatus.COMPLETED])
)
elif enforce_state_transitions and processing_status is None:
# If only updating metadata fields (not status), prevent updates to terminal states
where_conditions.append(
FileMetadataModel.processing_status.notin_([FileProcessingStatus.ERROR, FileProcessingStatus.COMPLETED])
)
# fast in-place update with state validation
stmt = (
update(FileMetadataModel)
.where(*where_conditions)
.values(**values)
.returning(FileMetadataModel.id) # return id if update succeeded
)
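# e.g. requesting COMPLETED while the row is still PARSING matches no row here, so
# updated_id comes back empty and an informative ValueError is raised below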
result = await session.execute(stmt)
updated_id = result.scalar()
if not updated_id:
# update was blocked
await session.commit()
if enforce_state_transitions:
# update was blocked by state transition rules - raise error
# fetch current state to provide informative error
current_file = await FileMetadataModel.read_async(
db_session=session,
identifier=file_id,
actor=actor,
)
current_status = current_file.processing_status
# build informative error message
if processing_status is not None:
if current_status in [FileProcessingStatus.ERROR, FileProcessingStatus.COMPLETED]:
raise ValueError(
f"Cannot update file {file_id} status from terminal state {current_status} to {processing_status}"
)
else:
raise ValueError(f"Invalid state transition for file {file_id}: {current_status} -> {processing_status}")
else:
raise ValueError(f"Cannot update file {file_id} in terminal state {current_status}")
else:
# validation was bypassed but update still failed (e.g., file doesn't exist)
return None
await session.commit()
# invalidate cache for this file
await self._invalidate_file_caches(file_id, actor)
# reload via normal accessor so we return a fully-attached object
file_orm = await FileMetadataModel.read_async(
db_session=session,
identifier=file_id,
actor=actor,
)
return await file_orm.to_pydantic_async()
@enforce_types
@trace_method
async def check_and_update_file_status(
self,
file_metadata: PydanticFileMetadata,
actor: PydanticUser,
) -> PydanticFileMetadata:
"""
Check and update file status for timeout and embedding completion.
This method consolidates logic for:
1. Checking if a file has timed out during processing
2. Checking Pinecone embedding status and updating counts
Args:
file_metadata: The file metadata to check
actor: User performing the check
Returns:
Updated file metadata with current status
"""
# check for timeout if status is not terminal
if not file_metadata.processing_status.is_terminal_state():
if file_metadata.created_at:
# handle timezone differences between PostgreSQL (timezone-aware) and SQLite (timezone-naive)
if settings.letta_pg_uri_no_default:
# postgresql: both datetimes are timezone-aware
timeout_threshold = datetime.now(timezone.utc) - timedelta(minutes=settings.file_processing_timeout_minutes)
file_created_at = file_metadata.created_at
else:
# sqlite: both datetimes should be timezone-naive
timeout_threshold = datetime.utcnow() - timedelta(minutes=settings.file_processing_timeout_minutes)
file_created_at = file_metadata.created_at
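# comparing a naive datetime with an aware one raises TypeError, hence the separate branches above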
if file_created_at < timeout_threshold:
# move file to error status with timeout message
timeout_message = settings.file_processing_timeout_error_message.format(settings.file_processing_timeout_minutes)
try:
file_metadata = await self.update_file_status(
file_id=file_metadata.id,
actor=actor,
processing_status=FileProcessingStatus.ERROR,
error_message=timeout_message,
)
except ValueError as e:
# state transition was blocked - log it but don't fail
logger.warning(f"Could not update file to timeout error state: {str(e)}")
# continue with existing file_metadata
# check pinecone embedding status
if should_use_pinecone() and file_metadata.processing_status == FileProcessingStatus.EMBEDDING:
ids = await list_pinecone_index_for_files(file_id=file_metadata.id, actor=actor)
logger.info(
f"Embedded chunks {len(ids)}/{file_metadata.total_chunks} for {file_metadata.id} ({file_metadata.file_name}) in organization {actor.organization_id}"
)
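# update when the embedded count has changed, or when every chunk is accounted for;
# only the latter case advances the status to COMPLETED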
if len(ids) != file_metadata.chunks_embedded or len(ids) == file_metadata.total_chunks:
if len(ids) != file_metadata.total_chunks:
file_status = file_metadata.processing_status
else:
file_status = FileProcessingStatus.COMPLETED
try:
file_metadata = await self.update_file_status(
file_id=file_metadata.id, actor=actor, chunks_embedded=len(ids), processing_status=file_status
)
except ValueError as e:
# state transition was blocked - this is a race condition
# log it but don't fail since we're just checking status
logger.warning(f"Race condition detected in check_and_update_file_status: {str(e)}")
# return the current file state without updating
return file_metadata
@enforce_types
@trace_method
async def upsert_file_content(
self,
*,
file_id: str,
text: str,
actor: PydanticUser,
) -> PydanticFileMetadata:
async with db_registry.async_session() as session:
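# existence check: read_async raises NoResultFound if the file is missing (org-scoped access is assumed to be enforced inside read_async)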
await FileMetadataModel.read_async(session, file_id, actor)
dialect_name = session.bind.dialect.name
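# the ON CONFLICT path below assumes FileContent.file_id carries a unique constraint or index;
# without one, PostgreSQL rejects the on_conflict_do_update statement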
if dialect_name == "postgresql":
stmt = (
pg_insert(FileContentModel)
.values(file_id=file_id, text=text)
.on_conflict_do_update(
index_elements=[FileContentModel.file_id],
set_={"text": text},
)
)
await session.execute(stmt)
else:
# Emulate upsert for SQLite and others
stmt = select(FileContentModel).where(FileContentModel.file_id == file_id)
result = await session.execute(stmt)
existing = result.scalar_one_or_none()
if existing:
await session.execute(update(FileContentModel).where(FileContentModel.file_id == file_id).values(text=text))
else:
session.add(FileContentModel(file_id=file_id, text=text))
await session.commit()
# invalidate cache for this file since content changed
await self._invalidate_file_caches(file_id, actor)
# Reload with content
query = select(FileMetadataModel).options(selectinload(FileMetadataModel.content)).where(FileMetadataModel.id == file_id)
result = await session.execute(query)
return await result.scalar_one().to_pydantic_async(include_content=True)
@enforce_types
@trace_method
async def list_files(
self,
source_id: str,
actor: PydanticUser,
after: Optional[str] = None,
limit: Optional[int] = 50,
include_content: bool = False,
strip_directory_prefix: bool = False,
check_status_updates: bool = False,
) -> List[PydanticFileMetadata]:
"""List all files with optional pagination and status checking.
Args:
source_id: Source to list files from
actor: User performing the request
after: Pagination cursor
limit: Maximum number of files to return
include_content: Whether to include file content
strip_directory_prefix: Whether to strip directory prefix from filenames
check_status_updates: Whether to check and update status for timeout and embedding completion
Returns:
List of file metadata
"""
async with db_registry.async_session() as session:
options = [selectinload(FileMetadataModel.content)] if include_content else None
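# selectinload fetches content for the whole page in one extra query instead of one lazy SELECT per file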
files = await FileMetadataModel.list_async(
db_session=session,
after=after,
limit=limit,
organization_id=actor.organization_id,
source_id=source_id,
query_options=options,
)
# convert all files to pydantic models
file_metadatas = await asyncio.gather(
*[file.to_pydantic_async(include_content=include_content, strip_directory_prefix=strip_directory_prefix) for file in files]
)
# if status checking is enabled, check all files concurrently
if check_status_updates:
file_metadatas = await asyncio.gather(
*[self.check_and_update_file_status(file_metadata, actor) for file_metadata in file_metadatas]
)
return file_metadatas
@enforce_types
@trace_method
async def delete_file(self, file_id: str, actor: PydanticUser) -> PydanticFileMetadata:
"""Delete a file by its ID."""
async with db_registry.async_session() as session:
file = await FileMetadataModel.read_async(db_session=session, identifier=file_id)
# invalidate cache for this file before deletion
await self._invalidate_file_caches(file_id, actor, file.original_file_name, file.source_id)
await file.hard_delete_async(db_session=session, actor=actor)
return await file.to_pydantic_async()
@enforce_types
@trace_method
async def generate_unique_filename(self, original_filename: str, source: PydanticSource, organization_id: str) -> str:
"""
Generate a unique filename by adding a numeric suffix if duplicates exist.
Always returns a unique filename - does not handle duplicate policies.
Parameters:
original_filename (str): The original filename as uploaded.
source (PydanticSource): Source to check for duplicates within.
organization_id (str): Organization ID to check for duplicates within.
Returns:
str: A unique filename with source.name prefix and numeric suffix if needed.
"""
base, ext = os.path.splitext(original_filename)
# Reserve space for potential suffix: "_(999)" = 6 characters
max_base_length = MAX_FILENAME_LENGTH - len(ext) - 6
if len(base) > max_base_length:
base = base[:max_base_length]
original_filename = f"{base}{ext}"
async with db_registry.async_session() as session:
# Count existing files with the same original_file_name in this source
query = select(func.count(FileMetadataModel.id)).where(
FileMetadataModel.original_file_name == original_filename,
FileMetadataModel.source_id == source.id,
FileMetadataModel.organization_id == organization_id,
FileMetadataModel.is_deleted == False,
)
result = await session.execute(query)
count = result.scalar() or 0
if count == 0:
# No duplicates, return original filename with source.name
return f"{source.name}/{original_filename}"
else:
# Add numeric suffix to make unique
return f"{source.name}/{base}_({count}){ext}"
@enforce_types
@trace_method
# @async_redis_cache(
# key_func=lambda self, original_filename, source_id, actor: f"{original_filename}:{source_id}:{actor.organization_id}",
# prefix="file_by_name",
# ttl_s=3600,
# model_class=PydanticFileMetadata,
# )
async def get_file_by_original_name_and_source(
self, original_filename: str, source_id: str, actor: PydanticUser
) -> Optional[PydanticFileMetadata]:
"""
Get a file by its original filename and source ID.
Parameters:
original_filename (str): The original filename to search for.
source_id (str): The source ID to search within.
actor (PydanticUser): The actor performing the request.
Returns:
Optional[PydanticFileMetadata]: The file metadata if found, None otherwise.
"""
async with db_registry.async_session() as session:
query = (
select(FileMetadataModel)
.where(
FileMetadataModel.original_file_name == original_filename,
FileMetadataModel.source_id == source_id,
FileMetadataModel.organization_id == actor.organization_id,
FileMetadataModel.is_deleted == False,
)
.limit(1)
)
result = await session.execute(query)
file_orm = result.scalar_one_or_none()
if file_orm:
return await file_orm.to_pydantic_async()
return None
@enforce_types
@trace_method
async def get_organization_sources_metadata(
self, actor: PydanticUser, include_detailed_per_source_metadata: bool = False
) -> OrganizationSourcesStats:
"""
Get aggregated metadata for all sources in an organization with optimized queries.
Returns structured metadata including:
- Total number of sources
- Total number of files across all sources
- Total size of all files
- Per-source breakdown with file details (if include_detailed_per_source_metadata is True)
"""
async with db_registry.async_session() as session:
# Import here to avoid circular imports
from letta.orm.source import Source as SourceModel
# Single optimized query to get all sources with their file aggregations
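# the LEFT OUTER JOIN keeps sources with no files in the result (file_count 0, total_size 0 via coalesce)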
query = (
select(
SourceModel.id,
SourceModel.name,
func.count(FileMetadataModel.id).label("file_count"),
func.coalesce(func.sum(FileMetadataModel.file_size), 0).label("total_size"),
)
.outerjoin(FileMetadataModel, (FileMetadataModel.source_id == SourceModel.id) & (FileMetadataModel.is_deleted == False))
.where(SourceModel.organization_id == actor.organization_id)
.where(SourceModel.is_deleted == False)
.group_by(SourceModel.id, SourceModel.name)
.order_by(SourceModel.name)
)
result = await session.execute(query)
source_aggregations = result.fetchall()
# Build response
metadata = OrganizationSourcesStats()
for row in source_aggregations:
source_id, source_name, file_count, total_size = row
if include_detailed_per_source_metadata:
# Get individual file details for this source
files_query = (
select(FileMetadataModel.id, FileMetadataModel.file_name, FileMetadataModel.file_size)
.where(
FileMetadataModel.source_id == source_id,
FileMetadataModel.organization_id == actor.organization_id,
FileMetadataModel.is_deleted == False,
)
.order_by(FileMetadataModel.file_name)
)
files_result = await session.execute(files_query)
files_rows = files_result.fetchall()
# Build file stats
files = [FileStats(file_id=file_row[0], file_name=file_row[1], file_size=file_row[2]) for file_row in files_rows]
# Build source metadata
source_metadata = SourceStats(
source_id=source_id, source_name=source_name, file_count=file_count, total_size=total_size, files=files
)
metadata.sources.append(source_metadata)
metadata.total_files += file_count
metadata.total_size += total_size
metadata.total_sources = len(source_aggregations)
return metadata
@enforce_types
@trace_method
async def get_files_by_ids_async(
self, file_ids: List[str], actor: PydanticUser, *, include_content: bool = False
) -> List[PydanticFileMetadata]:
"""
Get multiple files by their IDs in a single query.
Args:
file_ids: List of file IDs to retrieve
actor: User performing the action
include_content: Whether to include file content in the response
Returns:
List[PydanticFileMetadata]: List of files (may be fewer than requested if some don't exist)
"""
if not file_ids:
return []
async with db_registry.async_session() as session:
query = select(FileMetadataModel).where(
FileMetadataModel.id.in_(file_ids),
FileMetadataModel.organization_id == actor.organization_id,
FileMetadataModel.is_deleted == False,
)
# Eagerly load content if requested
if include_content:
query = query.options(selectinload(FileMetadataModel.content))
result = await session.execute(query)
files_orm = result.scalars().all()
return await asyncio.gather(*[file.to_pydantic_async(include_content=include_content) for file in files_orm])
@enforce_types
@trace_method
async def get_files_for_agents_async(
self, agent_ids: List[str], actor: PydanticUser, *, include_content: bool = False
) -> List[PydanticFileMetadata]:
"""
Get all files associated with the given agents via file-agent relationships.
Args:
agent_ids: List of agent IDs to find files for
actor: User performing the action
include_content: Whether to include file content in the response
Returns:
List[PydanticFileMetadata]: List of unique files associated with these agents
"""
if not agent_ids:
return []
async with db_registry.async_session() as session:
# We need to import FileAgent here to avoid circular imports
from letta.orm.file_agent import FileAgent as FileAgentModel
# Join through file-agent relationships
query = (
select(FileMetadataModel)
.join(FileAgentModel, FileMetadataModel.id == FileAgentModel.file_id)
.where(
FileAgentModel.agent_id.in_(agent_ids),
FileMetadataModel.organization_id == actor.organization_id,
FileMetadataModel.is_deleted == False,
FileAgentModel.is_deleted == False,
)
.distinct() # Ensure we don't get duplicate files
)
# Eagerly load content if requested
if include_content:
query = query.options(selectinload(FileMetadataModel.content))
result = await session.execute(query)
files_orm = result.scalars().all()
return await asyncio.gather(*[file.to_pydantic_async(include_content=include_content) for file in files_orm])