letta-server/letta/helpers/message_helper.py

import asyncio
import base64
import mimetypes
from urllib.parse import unquote, urlparse

import httpx

from letta import __version__, system
from letta.errors import LettaImageFetchError
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message_content import Base64Image, ImageContent, ImageSourceType, TextContent
from letta.schemas.message import Message, MessageCreate


async def _fetch_image_from_url(url: str, max_retries: int = 1, timeout_seconds: float = 5.0) -> tuple[bytes, str | None]:
    """
    Async helper to fetch image from URL without blocking the event loop.
    Retries once on timeout to handle transient network issues.

    Args:
        url: URL of the image to fetch
        max_retries: Number of retry attempts (default: 1)
        timeout_seconds: Total timeout in seconds (default: 5.0)

    Returns:
        Tuple of (image_bytes, media_type)

    Raises:
        LettaImageFetchError: If image fetch fails after all retries
    """
    # Connect timeout is half of total timeout, capped at 3 seconds
    connect_timeout = min(timeout_seconds / 2, 3.0)
    timeout = httpx.Timeout(timeout_seconds, connect=connect_timeout)
    headers = {"User-Agent": f"Letta/{__version__}"}

    last_exception = None
    for attempt in range(max_retries + 1):
        try:
            async with httpx.AsyncClient(timeout=timeout, headers=headers) as client:
                image_response = await client.get(url, follow_redirects=True)
                image_response.raise_for_status()
                image_bytes = image_response.content
                image_media_type = image_response.headers.get("content-type")
                return image_bytes, image_media_type
        except httpx.TimeoutException as e:
            last_exception = e
            if attempt < max_retries:
                # Brief delay before retry
                await asyncio.sleep(0.5)
                continue
            # Final attempt failed
            raise LettaImageFetchError(url=url, reason=f"Timeout after {max_retries + 1} attempts: {e}")
        except (httpx.RemoteProtocolError, httpx.HTTPStatusError) as e:
            # Don't retry on protocol errors or HTTP errors (4xx, 5xx)
            raise LettaImageFetchError(url=url, reason=str(e))
        except Exception as e:
            raise LettaImageFetchError(url=url, reason=f"Unexpected error: {e}")

    # Should never reach here, but just in case
    raise LettaImageFetchError(url=url, reason=f"Failed after {max_retries + 1} attempts: {last_exception}")


async def convert_message_creates_to_messages(
    message_creates: list[MessageCreate],
    agent_id: str,
    timezone: str,
    run_id: str,
    wrap_user_message: bool = True,
    wrap_system_message: bool = True,
) -> list[Message]:
    # Process all messages concurrently
    tasks = [
        _convert_message_create_to_message(
            message_create=create,
            agent_id=agent_id,
            timezone=timezone,
            run_id=run_id,
            wrap_user_message=wrap_user_message,
            wrap_system_message=wrap_system_message,
        )
        for create in message_creates
    ]
    return await asyncio.gather(*tasks)


async def _convert_message_create_to_message(
    message_create: MessageCreate,
    agent_id: str,
    timezone: str,
    run_id: str,
    wrap_user_message: bool = True,
    wrap_system_message: bool = True,
) -> Message:
    """Converts a MessageCreate object into a Message object, applying wrapping if needed."""
    # TODO: This seems like extra boilerplate with little benefit
    assert isinstance(message_create, MessageCreate)

    # Extract message content
    if isinstance(message_create.content, str) and message_create.content != "":
        message_content = [TextContent(text=message_create.content)]
    elif isinstance(message_create.content, list) and len(message_create.content) > 0:
        message_content = message_create.content
    else:
        raise ValueError("Message content is empty or invalid")

    # Validate message role (assistant messages are allowed but won't be wrapped)
    assert message_create.role in {
        MessageRole.user,
        MessageRole.system,
        MessageRole.assistant,
    }, f"Invalid message role: {message_create.role}"

    for content in message_content:
        if isinstance(content, TextContent):
            # Apply wrapping only to user and system messages
            if message_create.role == MessageRole.user and wrap_user_message:
                content.text = system.package_user_message(user_message=content.text, timezone=timezone)
            elif message_create.role == MessageRole.system and wrap_system_message:
                content.text = system.package_system_message(system_message=content.text, timezone=timezone)
        elif isinstance(content, ImageContent):
            if content.source.type == ImageSourceType.url:
                # Convert URL image to Base64Image if needed
                url = content.source.url

                # Handle file:// URLs for local filesystem access
                if url.startswith("file://"):
                    # Parse file path from file:// URL
                    parsed = urlparse(url)
                    file_path = unquote(parsed.path)

                    # Read file directly from filesystem (wrapped to avoid blocking event loop)
                    def _read_file():
                        with open(file_path, "rb") as f:
                            return f.read()

                    image_bytes = await asyncio.to_thread(_read_file)

                    # Guess media type from file extension
                    image_media_type, _ = mimetypes.guess_type(file_path)
                    if not image_media_type:
                        image_media_type = "image/jpeg"  # default fallback
                elif url.startswith("data:"):
                    # Handle data: URLs (inline base64 encoded images)
                    # Format: data:[<mediatype>][;base64],<data>
                    try:
                        # Split header from data
                        header, image_data = url.split(",", 1)
                        # Extract media type from header (e.g., "data:image/jpeg;base64")
                        header_parts = header.split(";")
                        image_media_type = header_parts[0].replace("data:", "") or "image/jpeg"
                        # Data is already base64 encoded, set directly and continue
                        content.source = Base64Image(media_type=image_media_type, data=image_data)
                        continue  # Skip the common conversion path below
                    except ValueError:
                        raise LettaImageFetchError(url=url[:100] + "...", reason="Invalid data URL format")
                else:
                    # Handle http(s):// URLs using async httpx
                    image_bytes, image_media_type = await _fetch_image_from_url(url)
                    if not image_media_type:
                        image_media_type, _ = mimetypes.guess_type(url)

                # Convert to base64 (common path for both file:// and http(s)://)
                image_data = base64.standard_b64encode(image_bytes).decode("utf-8")
                content.source = Base64Image(media_type=image_media_type, data=image_data)
            if content.source.type == ImageSourceType.letta and not content.source.data:
                # TODO: hydrate letta image with data from db
                pass

    return Message(
        agent_id=agent_id,
        role=message_create.role,
        content=message_content,
        name=message_create.name,
        model=None,  # assigned later?
        tool_calls=None,  # irrelevant
        tool_call_id=None,
        otid=message_create.otid,
        sender_id=message_create.sender_id,
        group_id=message_create.group_id,
        batch_item_id=message_create.batch_item_id,
        run_id=run_id,
    )


async def _resolve_url_to_base64(url: str) -> tuple[str, str]:
    """Resolve URL to base64 data and media type."""
    if url.startswith("file://"):
        parsed = urlparse(url)
        file_path = unquote(parsed.path)
        image_bytes = await asyncio.to_thread(lambda: open(file_path, "rb").read())
        media_type, _ = mimetypes.guess_type(file_path)
        media_type = media_type or "image/jpeg"
    else:
        image_bytes, media_type = await _fetch_image_from_url(url)
        media_type = media_type or mimetypes.guess_type(url)[0] or "image/png"

    image_data = base64.standard_b64encode(image_bytes).decode("utf-8")
    return image_data, media_type


async def resolve_tool_return_images(func_response: str | list) -> str | list:
    """Resolve URL and LettaImage sources to base64 for tool returns."""
    if isinstance(func_response, str):
        return func_response

    resolved = []
    for part in func_response:
        if isinstance(part, ImageContent):
            if part.source.type == ImageSourceType.url:
                image_data, media_type = await _resolve_url_to_base64(part.source.url)
                part.source = Base64Image(media_type=media_type, data=image_data)
            elif part.source.type == ImageSourceType.letta and not part.source.data:
                pass
            resolved.append(part)
        elif isinstance(part, TextContent):
            resolved.append(part)
        elif isinstance(part, dict):
            if part.get("type") == "image" and part.get("source", {}).get("type") == "url":
                url = part["source"].get("url")
                if url:
                    image_data, media_type = await _resolve_url_to_base64(url)
                    resolved.append(
                        ImageContent(
                            source=Base64Image(
                                media_type=media_type,
                                data=image_data,
                                detail=part.get("source", {}).get("detail"),
                            )
                        )
                    )
                else:
                    resolved.append(part)
            elif part.get("type") == "text":
                resolved.append(TextContent(text=part.get("text", "")))
            else:
                resolved.append(part)
        else:
            resolved.append(part)

    return resolved