fix: bug which causes unrecoverable state if previous message was an image (#6486)

* trying out gpt-5.1-codex

* add unit test for message content

* try to support multimodal
This commit is contained in:
Kian Jones
2025-12-02 16:02:07 -08:00
committed by Caren Thomas
parent 5109ba1384
commit d6292b6eb6
3 changed files with 102 additions and 4 deletions

View File

@@ -1,7 +1,7 @@
import asyncio
import os
import time
from typing import List, Optional
from typing import Any, List, Optional
import openai
from openai import AsyncOpenAI, AsyncStream, OpenAI
@@ -1101,6 +1101,11 @@ def fill_image_content_in_responses_input(openai_message_list: List[dict], pydan
pm = user_msgs[user_idx]
user_idx += 1
existing_content = item.get("content")
if _is_responses_style_content(existing_content):
rewritten.append(item)
continue
# Only rewrite if the pydantic message actually contains multiple parts or images
if not isinstance(pm.content, list) or (len(pm.content) == 1 and pm.content[0].type == MessageContentType.text):
rewritten.append(item)
@@ -1128,3 +1133,17 @@ def fill_image_content_in_responses_input(openai_message_list: List[dict], pydan
rewritten.append(item)
return rewritten
def _is_responses_style_content(content: Optional[Any]) -> bool:
    """Report whether ``content`` is already a list of Responses-style parts.

    Args:
        content: The ``content`` value taken from a serialized message dict.

    Returns:
        True only when ``content`` is a list whose every element is a dict
        with a ``"type"`` of ``"input_text"`` or ``"input_image"``. An empty
        list therefore qualifies; any non-list value does not.
    """
    if not isinstance(content, list):
        return False
    recognised = {"input_text", "input_image"}
    # all() stops at the first non-dict or unrecognised part, like an early return.
    return all(isinstance(entry, dict) and entry.get("type") in recognised for entry in content)

View File

@@ -43,6 +43,7 @@ from letta.schemas.letta_message import (
)
from letta.schemas.letta_message_content import (
ImageContent,
ImageSourceType,
LettaMessageContentUnion,
OmittedReasoningContent,
ReasoningContent,
@@ -1309,13 +1310,12 @@ class Message(BaseMessage):
)
elif self.role == "user":
# TODO do we need to do a swap to placeholder text here for images?
assert self.content, vars(self)
assert all([isinstance(c, TextContent) or isinstance(c, ImageContent) for c in self.content]), vars(self)
user_dict = {
"role": self.role.value if hasattr(self.role, "value") else self.role,
# TODO support multi-modal
"content": self.content[0].text,
"content": self._build_responses_user_content(),
}
# Optional field, do not include if null or invalid
@@ -1397,6 +1397,53 @@ class Message(BaseMessage):
return message_dicts
def _build_responses_user_content(self) -> List[dict]:
    """Serialize this message's content into a list of Responses-style parts.

    Text items become ``{"type": "input_text", "text": ...}`` entries. Image
    items are converted via ``_image_content_to_responses_part`` and kept only
    when that conversion yields a truthy part. Items of any other type are
    ignored.

    Returns:
        The collected parts, or a single empty ``input_text`` part when
        nothing was collected, so the result is never an empty list.
    """
    parts: List[dict] = []
    for piece in self.content or []:
        if isinstance(piece, TextContent):
            parts.append({"type": "input_text", "text": piece.text})
            continue
        if not isinstance(piece, ImageContent):
            continue
        converted = self._image_content_to_responses_part(piece)
        if converted:
            parts.append(converted)
    # Fall back to one empty text part when no content could be serialized.
    return parts or [{"type": "input_text", "text": ""}]
@staticmethod
def _image_content_to_responses_part(image_content: ImageContent) -> Optional[dict]:
    """Convert one image content item into an ``input_image`` part.

    Args:
        image_content: The image content item to convert.

    Returns:
        A dict with ``type``, ``image_url`` and ``detail`` keys, or ``None``
        when no URL could be derived from the image source. ``detail`` is
        taken from the source's ``detail`` attribute and defaults to
        ``"auto"`` when that attribute is missing or falsy.
    """
    url = Message._image_source_to_data_url(image_content)
    if url:
        return {
            "type": "input_image",
            "image_url": url,
            "detail": getattr(image_content.source, "detail", None) or "auto",
        }
    return None
@staticmethod
def _image_source_to_data_url(image_content: ImageContent) -> Optional[str]:
    """Derive a URL string for the image carried by ``image_content``.

    Args:
        image_content: The image content item whose ``source`` is inspected.

    Returns:
        * For ``base64`` and ``letta`` sources: a ``data:<media_type>;base64,<data>``
          URL, with the media type defaulting to ``image/png``; ``None`` when
          the source has no ``data``.
        * For ``url`` sources: the source's ``url`` attribute (``None`` if absent).
        * For any other source type: ``None``.
    """
    src = image_content.source
    kind = src.type
    # base64 and letta sources are encoded identically, so they share one branch.
    if kind in (ImageSourceType.base64, ImageSourceType.letta):
        payload = getattr(src, "data", None)
        if not payload:
            return None
        mime = getattr(src, "media_type", None) or "image/png"
        return f"data:{mime};base64,{payload}"
    if kind == ImageSourceType.url:
        return getattr(src, "url", None)
    return None
@staticmethod
def to_openai_responses_dicts_from_list(
messages: List[Message],

View File

@@ -0,0 +1,32 @@
from letta.llm_api.openai_client import fill_image_content_in_responses_input
from letta.schemas.enums import MessageRole
from letta.schemas.letta_message_content import Base64Image, ImageContent, TextContent
from letta.schemas.message import Message
def _user_message_with_image_first(text: str) -> Message:
    """Build a user message whose content is a base64 image followed by ``text``."""
    png_source = Base64Image(media_type="image/png", data="dGVzdA==")
    parts = [ImageContent(source=png_source), TextContent(text=text)]
    return Message(role=MessageRole.user, content=parts)
def test_to_openai_responses_dicts_handles_image_first_content():
    """An image-first user message serializes with both its text and image parts."""
    msg = _user_message_with_image_first("hello world")
    first_content = Message.to_openai_responses_dicts_from_list([msg])[0]["content"]

    has_text = any(p["type"] == "input_text" and p["text"] == "hello world" for p in first_content)
    assert has_text

    has_image = any(p["type"] == "input_image" for p in first_content)
    assert has_image
def test_fill_image_content_in_responses_input_includes_image_parts():
    """Already-serialized multimodal input passes through the fill step unchanged."""
    msg = _user_message_with_image_first("describe image")
    payload = Message.to_openai_responses_dicts_from_list([msg])
    assert fill_image_content_in_responses_input(payload, [msg]) == payload
def test_to_openai_responses_dicts_handles_image_only_content():
    """A user message holding only an image serializes to an ``input_image`` first part."""
    only_image = ImageContent(source=Base64Image(media_type="image/png", data="dGVzdA=="))
    msg = Message(role=MessageRole.user, content=[only_image])
    first_part = Message.to_openai_responses_dicts_from_list([msg])[0]["content"][0]
    assert first_part["type"] == "input_image"