diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index bc8b014c..d4b989f9 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 import time
-from typing import List, Optional
+from typing import Any, List, Optional

 import openai
 from openai import AsyncOpenAI, AsyncStream, OpenAI
@@ -1101,6 +1101,11 @@ def fill_image_content_in_responses_input(openai_message_list: List[dict], pydan
             pm = user_msgs[user_idx]
             user_idx += 1

+            existing_content = item.get("content")
+            if _is_responses_style_content(existing_content):
+                rewritten.append(item)
+                continue
+
             # Only rewrite if the pydantic message actually contains multiple parts or images
             if not isinstance(pm.content, list) or (len(pm.content) == 1 and pm.content[0].type == MessageContentType.text):
                 rewritten.append(item)
@@ -1128,3 +1133,17 @@ def fill_image_content_in_responses_input(openai_message_list: List[dict], pydan
         rewritten.append(item)

     return rewritten
+
+
+def _is_responses_style_content(content: Optional[Any]) -> bool:
+    if not isinstance(content, list):
+        return False
+
+    allowed_types = {"input_text", "input_image"}
+    for part in content:
+        if not isinstance(part, dict):
+            return False
+        part_type = part.get("type")
+        if part_type not in allowed_types:
+            return False
+    return True
diff --git a/letta/schemas/message.py b/letta/schemas/message.py
index 1f9b64bf..4adc92e8 100644
--- a/letta/schemas/message.py
+++ b/letta/schemas/message.py
@@ -43,6 +43,7 @@ from letta.schemas.letta_message import (
 )
 from letta.schemas.letta_message_content import (
     ImageContent,
+    ImageSourceType,
     LettaMessageContentUnion,
     OmittedReasoningContent,
     ReasoningContent,
@@ -1309,13 +1310,12 @@ class Message(BaseMessage):
             )

         elif self.role == "user":
-            # TODO do we need to do a swap to placeholder text here for images?
+            assert self.content, vars(self)
             assert all([isinstance(c, TextContent) or isinstance(c, ImageContent) for c in self.content]), vars(self)

             user_dict = {
                 "role": self.role.value if hasattr(self.role, "value") else self.role,
-                # TODO support multi-modal
-                "content": self.content[0].text,
+                "content": self._build_responses_user_content(),
             }

             # Optional field, do not include if null or invalid
@@ -1397,6 +1397,53 @@ class Message(BaseMessage):

         return message_dicts

+    def _build_responses_user_content(self) -> List[dict]:
+        content_parts: List[dict] = []
+        for content in self.content or []:
+            if isinstance(content, TextContent):
+                content_parts.append({"type": "input_text", "text": content.text})
+            elif isinstance(content, ImageContent):
+                image_part = self._image_content_to_responses_part(content)
+                if image_part:
+                    content_parts.append(image_part)
+
+        if not content_parts:
+            content_parts.append({"type": "input_text", "text": ""})
+
+        return content_parts
+
+    @staticmethod
+    def _image_content_to_responses_part(image_content: ImageContent) -> Optional[dict]:
+        image_url = Message._image_source_to_data_url(image_content)
+        if not image_url:
+            return None
+
+        detail = getattr(image_content.source, "detail", None) or "auto"
+        return {"type": "input_image", "image_url": image_url, "detail": detail}
+
+    @staticmethod
+    def _image_source_to_data_url(image_content: ImageContent) -> Optional[str]:
+        source = image_content.source
+
+        if source.type == ImageSourceType.base64:
+            data = getattr(source, "data", None)
+            if not data:
+                return None
+            media_type = getattr(source, "media_type", None) or "image/png"
+            return f"data:{media_type};base64,{data}"
+
+        if source.type == ImageSourceType.url:
+            return getattr(source, "url", None)
+
+        if source.type == ImageSourceType.letta:
+            data = getattr(source, "data", None)
+            if not data:
+                return None
+            media_type = getattr(source, "media_type", None) or "image/png"
+            return f"data:{media_type};base64,{data}"
+
+        return None
+
     @staticmethod
     def to_openai_responses_dicts_from_list(
         messages: List[Message],
diff --git a/tests/test_message_serialization.py b/tests/test_message_serialization.py
new file mode 100644
index 00000000..c0124a25
--- /dev/null
+++ b/tests/test_message_serialization.py
@@ -0,0 +1,32 @@
+from letta.llm_api.openai_client import fill_image_content_in_responses_input
+from letta.schemas.enums import MessageRole
+from letta.schemas.letta_message_content import Base64Image, ImageContent, TextContent
+from letta.schemas.message import Message
+
+
+def _user_message_with_image_first(text: str) -> Message:
+    image = ImageContent(source=Base64Image(media_type="image/png", data="dGVzdA=="))
+    return Message(role=MessageRole.user, content=[image, TextContent(text=text)])
+
+
+def test_to_openai_responses_dicts_handles_image_first_content():
+    message = _user_message_with_image_first("hello world")
+    serialized = Message.to_openai_responses_dicts_from_list([message])
+    parts = serialized[0]["content"]
+    assert any(part["type"] == "input_text" and part["text"] == "hello world" for part in parts)
+    assert any(part["type"] == "input_image" for part in parts)
+
+
+def test_fill_image_content_in_responses_input_includes_image_parts():
+    message = _user_message_with_image_first("describe image")
+    serialized = Message.to_openai_responses_dicts_from_list([message])
+    rewritten = fill_image_content_in_responses_input(serialized, [message])
+    assert rewritten == serialized
+
+
+def test_to_openai_responses_dicts_handles_image_only_content():
+    image = ImageContent(source=Base64Image(media_type="image/png", data="dGVzdA=="))
+    message = Message(role=MessageRole.user, content=[image])
+    serialized = Message.to_openai_responses_dicts_from_list([message])
+    parts = serialized[0]["content"]
+    assert parts[0]["type"] == "input_image"
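
A minimal usage sketch of the serialization path exercised by the new tests, reusing the same Base64Image fixture; the payload shown in the comments is illustrative and assumes the image source carries no explicit detail, so it falls back to "auto".

from letta.schemas.enums import MessageRole
from letta.schemas.letta_message_content import Base64Image, ImageContent, TextContent
from letta.schemas.message import Message

# Mirror the test fixture: an image part followed by a text part in a user message.
message = Message(
    role=MessageRole.user,
    content=[
        ImageContent(source=Base64Image(media_type="image/png", data="dGVzdA==")),
        TextContent(text="describe image"),
    ],
)

serialized = Message.to_openai_responses_dicts_from_list([message])

# Illustrative shape of the user entry: base64 images become data URLs and the
# detail defaults to "auto" when the source does not specify one.
# serialized[0]["content"] ==
#   [{"type": "input_image", "image_url": "data:image/png;base64,dGVzdA==", "detail": "auto"},
#    {"type": "input_text", "text": "describe image"}]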