fix: bug which causes unrecoverable state if previous message was an image (#6486)
* trying out gpt-5.1-codex * add unit test for message content * try to support multimodal
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import openai
|
||||
from openai import AsyncOpenAI, AsyncStream, OpenAI
|
||||
@@ -1101,6 +1101,11 @@ def fill_image_content_in_responses_input(openai_message_list: List[dict], pydan
|
||||
pm = user_msgs[user_idx]
|
||||
user_idx += 1
|
||||
|
||||
existing_content = item.get("content")
|
||||
if _is_responses_style_content(existing_content):
|
||||
rewritten.append(item)
|
||||
continue
|
||||
|
||||
# Only rewrite if the pydantic message actually contains multiple parts or images
|
||||
if not isinstance(pm.content, list) or (len(pm.content) == 1 and pm.content[0].type == MessageContentType.text):
|
||||
rewritten.append(item)
|
||||
@@ -1128,3 +1133,17 @@ def fill_image_content_in_responses_input(openai_message_list: List[dict], pydan
|
||||
rewritten.append(item)
|
||||
|
||||
return rewritten
|
||||
|
||||
|
||||
def _is_responses_style_content(content: Optional[Any]) -> bool:
|
||||
if not isinstance(content, list):
|
||||
return False
|
||||
|
||||
allowed_types = {"input_text", "input_image"}
|
||||
for part in content:
|
||||
if not isinstance(part, dict):
|
||||
return False
|
||||
part_type = part.get("type")
|
||||
if part_type not in allowed_types:
|
||||
return False
|
||||
return True
|
||||
|
||||
@@ -43,6 +43,7 @@ from letta.schemas.letta_message import (
|
||||
)
|
||||
from letta.schemas.letta_message_content import (
|
||||
ImageContent,
|
||||
ImageSourceType,
|
||||
LettaMessageContentUnion,
|
||||
OmittedReasoningContent,
|
||||
ReasoningContent,
|
||||
@@ -1309,13 +1310,12 @@ class Message(BaseMessage):
|
||||
)
|
||||
|
||||
elif self.role == "user":
|
||||
# TODO do we need to do a swap to placeholder text here for images?
|
||||
assert self.content, vars(self)
|
||||
assert all([isinstance(c, TextContent) or isinstance(c, ImageContent) for c in self.content]), vars(self)
|
||||
|
||||
user_dict = {
|
||||
"role": self.role.value if hasattr(self.role, "value") else self.role,
|
||||
# TODO support multi-modal
|
||||
"content": self.content[0].text,
|
||||
"content": self._build_responses_user_content(),
|
||||
}
|
||||
|
||||
# Optional field, do not include if null or invalid
|
||||
@@ -1397,6 +1397,53 @@ class Message(BaseMessage):
|
||||
|
||||
return message_dicts
|
||||
|
||||
def _build_responses_user_content(self) -> List[dict]:
    """Serialize this message's content into Responses-API user content parts.

    Text parts become ``input_text`` dicts and image parts become
    ``input_image`` dicts (images that cannot be resolved to a URL are
    skipped). Falls back to a single empty ``input_text`` part so the
    caller never receives an empty content list.
    """
    parts: List[dict] = []
    for piece in self.content or []:
        if isinstance(piece, TextContent):
            parts.append({"type": "input_text", "text": piece.text})
        elif isinstance(piece, ImageContent):
            converted = self._image_content_to_responses_part(piece)
            if converted:
                parts.append(converted)

    # Guarantee at least one part, mirroring an empty user message.
    return parts or [{"type": "input_text", "text": ""}]
|
||||
|
||||
@staticmethod
def _image_content_to_responses_part(image_content: ImageContent) -> Optional[dict]:
    """Turn an ImageContent into a Responses-API ``input_image`` part.

    Returns None when the image source cannot be resolved to a URL or
    data URL.
    """
    url = Message._image_source_to_data_url(image_content)
    if not url:
        return None
    # "detail" is optional on the source; fall back to the API's "auto".
    detail = getattr(image_content.source, "detail", None) or "auto"
    return {"type": "input_image", "image_url": url, "detail": detail}
|
||||
|
||||
@staticmethod
def _image_source_to_data_url(image_content: ImageContent) -> Optional[str]:
    """Resolve an image source to a plain URL or a base64 data URL.

    URL sources pass through as-is; base64 and letta sources are embedded
    as ``data:<media_type>;base64,<data>`` URLs, defaulting to image/png
    when no media type is recorded. Returns None for unknown source types
    or sources with no payload.
    """
    source = image_content.source

    if source.type == ImageSourceType.url:
        return getattr(source, "url", None)

    # base64 and letta sources both carry a raw base64 payload.
    if source.type in (ImageSourceType.base64, ImageSourceType.letta):
        payload = getattr(source, "data", None)
        if not payload:
            return None
        media_type = getattr(source, "media_type", None) or "image/png"
        return f"data:{media_type};base64,{payload}"

    return None
|
||||
|
||||
@staticmethod
|
||||
def to_openai_responses_dicts_from_list(
|
||||
messages: List[Message],
|
||||
|
||||
32
tests/test_message_serialization.py
Normal file
32
tests/test_message_serialization.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from letta.llm_api.openai_client import fill_image_content_in_responses_input
|
||||
from letta.schemas.enums import MessageRole
|
||||
from letta.schemas.letta_message_content import Base64Image, ImageContent, TextContent
|
||||
from letta.schemas.message import Message
|
||||
|
||||
|
||||
def _user_message_with_image_first(text: str) -> Message:
    """Build a user Message whose content list puts an image part before the text part."""
    source = Base64Image(media_type="image/png", data="dGVzdA==")
    parts = [ImageContent(source=source), TextContent(text=text)]
    return Message(role=MessageRole.user, content=parts)
|
||||
|
||||
|
||||
def test_to_openai_responses_dicts_handles_image_first_content():
    """A user message whose first part is an image still serializes both parts."""
    msg = _user_message_with_image_first("hello world")
    parts = Message.to_openai_responses_dicts_from_list([msg])[0]["content"]
    present_types = {part["type"] for part in parts}
    assert "input_image" in present_types
    text_values = [part["text"] for part in parts if part["type"] == "input_text"]
    assert "hello world" in text_values
|
||||
|
||||
|
||||
def test_fill_image_content_in_responses_input_includes_image_parts():
    """Input that is already in Responses style passes through unchanged."""
    msg = _user_message_with_image_first("describe image")
    serialized = Message.to_openai_responses_dicts_from_list([msg])
    assert fill_image_content_in_responses_input(serialized, [msg]) == serialized
|
||||
|
||||
|
||||
def test_to_openai_responses_dicts_handles_image_only_content():
    """A user message containing only an image serializes to an input_image part first."""
    source = Base64Image(media_type="image/png", data="dGVzdA==")
    only_image = Message(role=MessageRole.user, content=[ImageContent(source=source)])
    content = Message.to_openai_responses_dicts_from_list([only_image])[0]["content"]
    first_part = content[0]
    assert first_part["type"] == "input_image"
|
||||
Reference in New Issue
Block a user