fix: pass images from tool returns to vision models
OpenAI Chat Completions requires tool message content to be a string, so images in tool returns were silently replaced with [Image omitted]. Now the text stays in the tool return and the images are injected as a user message immediately after it, so the model actually sees what the tool saw. to_openai_dict was also cleaned up — image handling now lives in to_openai_dicts_from_list, where it can inject the extra message.
This commit is contained in:
committed by
Ani - Annie Tunturi
parent
c79de81cea
commit
93337ce680
@@ -1442,11 +1442,12 @@ class Message(BaseMessage):
|
|||||||
tool_return = self.tool_returns[0]
|
tool_return = self.tool_returns[0]
|
||||||
if not tool_return.tool_call_id:
|
if not tool_return.tool_call_id:
|
||||||
raise TypeError("OpenAI API requires tool_call_id to be set.")
|
raise TypeError("OpenAI API requires tool_call_id to be set.")
|
||||||
# Convert to text first (replaces images with placeholders), then truncate
|
# Tool message content must be a string per OpenAI Chat Completions spec.
|
||||||
|
# Images are handled in to_openai_dicts_from_list via injected user messages.
|
||||||
func_response_text = tool_return_to_text(tool_return.func_response)
|
func_response_text = tool_return_to_text(tool_return.func_response)
|
||||||
func_response = truncate_tool_return(func_response_text, tool_return_truncation_chars)
|
openai_content = truncate_tool_return(func_response_text, tool_return_truncation_chars)
|
||||||
openai_message = {
|
openai_message = {
|
||||||
"content": func_response,
|
"content": openai_content,
|
||||||
"role": self.role,
|
"role": self.role,
|
||||||
"tool_call_id": tool_return.tool_call_id[:max_tool_id_length] if max_tool_id_length else tool_return.tool_call_id,
|
"tool_call_id": tool_return.tool_call_id[:max_tool_id_length] if max_tool_id_length else tool_return.tool_call_id,
|
||||||
}
|
}
|
||||||
@@ -1499,16 +1500,52 @@ class Message(BaseMessage):
|
|||||||
for tr in m.tool_returns:
|
for tr in m.tool_returns:
|
||||||
if not tr.tool_call_id:
|
if not tr.tool_call_id:
|
||||||
raise TypeError("ToolReturn came back without a tool_call_id.")
|
raise TypeError("ToolReturn came back without a tool_call_id.")
|
||||||
# Convert multi-modal to text (images → placeholders), then truncate
|
# OpenAI Chat Completions: tool message content must be a string.
|
||||||
func_response_text = tool_return_to_text(tr.func_response)
|
# Images can only go in user messages, so split: text in tool return,
|
||||||
func_response = truncate_tool_return(func_response_text, tool_return_truncation_chars)
|
# image in a follow-up user message.
|
||||||
|
func_response = tr.func_response
|
||||||
|
image_parts = []
|
||||||
|
if isinstance(func_response, list) and any(
|
||||||
|
isinstance(p, ImageContent) or (isinstance(p, dict) and p.get("type") == "image")
|
||||||
|
for p in func_response
|
||||||
|
):
|
||||||
|
# Extract text for the tool return, collect images for user message
|
||||||
|
text_pieces = []
|
||||||
|
for part in func_response:
|
||||||
|
if isinstance(part, TextContent):
|
||||||
|
text_pieces.append(part.text)
|
||||||
|
elif isinstance(part, ImageContent):
|
||||||
|
image_url = Message._image_source_to_data_url(part)
|
||||||
|
if image_url:
|
||||||
|
image_parts.append({"type": "image_url", "image_url": {"url": image_url}})
|
||||||
|
elif isinstance(part, dict):
|
||||||
|
if part.get("type") == "text":
|
||||||
|
text_pieces.append(part.get("text", ""))
|
||||||
|
elif part.get("type") == "image":
|
||||||
|
image_url = Message._image_dict_to_data_url(part)
|
||||||
|
if image_url:
|
||||||
|
image_parts.append({"type": "image_url", "image_url": {"url": image_url}})
|
||||||
|
else:
|
||||||
|
text_pieces.append(str(part))
|
||||||
|
openai_content = truncate_tool_return("\n".join(text_pieces), tool_return_truncation_chars)
|
||||||
|
else:
|
||||||
|
func_response_text = tool_return_to_text(func_response)
|
||||||
|
openai_content = truncate_tool_return(func_response_text, tool_return_truncation_chars)
|
||||||
result.append(
|
result.append(
|
||||||
{
|
{
|
||||||
"content": func_response,
|
"content": openai_content,
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"tool_call_id": tr.tool_call_id[:max_tool_id_length] if max_tool_id_length else tr.tool_call_id,
|
"tool_call_id": tr.tool_call_id[:max_tool_id_length] if max_tool_id_length else tr.tool_call_id,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
# Inject image as a user message right after the tool return
|
||||||
|
if image_parts:
|
||||||
|
result.append(
|
||||||
|
{
|
||||||
|
"content": [{"type": "text", "text": "[Tool returned image]"}] + image_parts,
|
||||||
|
"role": "user",
|
||||||
|
}
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
d = m.to_openai_dict(
|
d = m.to_openai_dict(
|
||||||
|
|||||||
Reference in New Issue
Block a user