From 93c15244ab0c600be3811ee12fbbc8a030e23ba8 Mon Sep 17 00:00:00 2001 From: Kevin Lin Date: Fri, 13 Jun 2025 14:54:37 -0700 Subject: [PATCH] feat: add reasoning models to `integration_test_send_message` (#2710) --- letta/agents/letta_agent.py | 4 + letta/llm_api/openai_client.py | 2 +- .../openai/chat_completion_response.py | 2 +- .../llm_model_configs/openai-o1-mini.json | 8 ++ .../configs/llm_model_configs/openai-o1.json | 8 ++ .../llm_model_configs/openai-o3-mini.json | 8 ++ .../configs/llm_model_configs/openai-o3.json | 8 ++ tests/integration_test_send_message.py | 91 ++++++++++++------- 8 files changed, 98 insertions(+), 33 deletions(-) create mode 100644 tests/configs/llm_model_configs/openai-o1-mini.json create mode 100644 tests/configs/llm_model_configs/openai-o1.json create mode 100644 tests/configs/llm_model_configs/openai-o3-mini.json create mode 100644 tests/configs/llm_model_configs/openai-o3.json diff --git a/letta/agents/letta_agent.py b/letta/agents/letta_agent.py index 94fbcad8..495278db 100644 --- a/letta/agents/letta_agent.py +++ b/letta/agents/letta_agent.py @@ -210,6 +210,8 @@ class LettaAgent(BaseAgent): signature=response.choices[0].message.reasoning_content_signature, ) ] + elif response.choices[0].message.omitted_reasoning_content: + reasoning = [OmittedReasoningContent()] elif response.choices[0].message.content: reasoning = [TextContent(text=response.choices[0].message.content)] # reasoning placed into content for legacy reasons else: @@ -356,6 +358,8 @@ class LettaAgent(BaseAgent): ] elif response.choices[0].message.content: reasoning = [TextContent(text=response.choices[0].message.content)] # reasoning placed into content for legacy reasons + elif response.choices[0].message.omitted_reasoning_content: + reasoning = [OmittedReasoningContent()] else: logger.info("No reasoning content found.") reasoning = None diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index f9dc3c34..1c36faa4 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -286,7 +286,7 @@ class OpenAIClient(LLMClientBase): # If we used a reasoning model, create a content part for the ommitted reasoning if is_openai_reasoning_model(llm_config.model): - chat_completion_response.choices[0].message.ommitted_reasoning_content = True + chat_completion_response.choices[0].message.omitted_reasoning_content = True return chat_completion_response diff --git a/letta/schemas/openai/chat_completion_response.py b/letta/schemas/openai/chat_completion_response.py index 44ef5cff..77f8b991 100644 --- a/letta/schemas/openai/chat_completion_response.py +++ b/letta/schemas/openai/chat_completion_response.py @@ -62,7 +62,7 @@ class Message(BaseModel): reasoning_content: Optional[str] = None # Used in newer reasoning APIs, e.g. DeepSeek reasoning_content_signature: Optional[str] = None # NOTE: for Anthropic redacted_reasoning_content: Optional[str] = None # NOTE: for Anthropic - ommitted_reasoning_content: bool = False # NOTE: for OpenAI o1/o3 + omitted_reasoning_content: bool = False # NOTE: for OpenAI o1/o3 class Choice(BaseModel): diff --git a/tests/configs/llm_model_configs/openai-o1-mini.json b/tests/configs/llm_model_configs/openai-o1-mini.json new file mode 100644 index 00000000..fbfa0c01 --- /dev/null +++ b/tests/configs/llm_model_configs/openai-o1-mini.json @@ -0,0 +1,8 @@ +{ + "context_window": 128000, + "model": "o1-mini", + "model_endpoint_type": "openai", + "model_endpoint": "https://api.openai.com/v1", + "model_wrapper": null, + "temperature": 1.0 +} diff --git a/tests/configs/llm_model_configs/openai-o1.json b/tests/configs/llm_model_configs/openai-o1.json new file mode 100644 index 00000000..b2336337 --- /dev/null +++ b/tests/configs/llm_model_configs/openai-o1.json @@ -0,0 +1,8 @@ +{ + "context_window": 128000, + "model": "o1", + "model_endpoint_type": "openai", + "model_endpoint": "https://api.openai.com/v1", + "model_wrapper": null, + "temperature": 1.0 +} diff --git a/tests/configs/llm_model_configs/openai-o3-mini.json b/tests/configs/llm_model_configs/openai-o3-mini.json new file mode 100644 index 00000000..c690aa83 --- /dev/null +++ b/tests/configs/llm_model_configs/openai-o3-mini.json @@ -0,0 +1,8 @@ +{ + "context_window": 128000, + "model": "o3-mini", + "model_endpoint_type": "openai", + "model_endpoint": "https://api.openai.com/v1", + "model_wrapper": null, + "temperature": 1.0 +} diff --git a/tests/configs/llm_model_configs/openai-o3.json b/tests/configs/llm_model_configs/openai-o3.json new file mode 100644 index 00000000..1edc2742 --- /dev/null +++ b/tests/configs/llm_model_configs/openai-o3.json @@ -0,0 +1,8 @@ +{ + "context_window": 128000, + "model": "o3", + "model_endpoint_type": "openai", + "model_endpoint": "https://api.openai.com/v1", + "model_wrapper": null, + "temperature": 1.0 +} diff --git a/tests/integration_test_send_message.py b/tests/integration_test_send_message.py index 90f57e7a..93aea777 100644 --- a/tests/integration_test_send_message.py +++ b/tests/integration_test_send_message.py @@ -15,6 +15,7 @@ from letta_client.core.api_error import ApiError from letta_client.types import ( AssistantMessage, Base64Image, + HiddenReasoningMessage, ImageContent, LettaUsageStatistics, ReasoningMessage, @@ -25,6 +26,7 @@ from letta_client.types import ( UserMessage, ) +from letta.llm_api.openai_client import is_openai_reasoning_model from letta.schemas.agent import AgentState from letta.schemas.llm_config import LLMConfig @@ -93,6 +95,11 @@ USER_MESSAGE_BASE64_IMAGE: List[MessageCreate] = [ ] all_configs = [ "openai-gpt-4o-mini.json", + "openai-o1.json", + "openai-o1-mini.json", + "openai-o3.json", + "openai-o3-mini.json", + # "azure-gpt-4o-mini.json", # TODO: Re-enable on new agent loop "azure-gpt-4o-mini.json", "claude-3-5-sonnet.json", "claude-3-7-sonnet.json", @@ -103,6 +110,8 @@ all_configs = [ "together-qwen-2.5-72b-instruct.json", "ollama.json", ] + + requested = os.getenv("LLM_CONFIG_FILE") filenames = [requested] if requested else all_configs TESTED_LLM_CONFIGS: List[LLMConfig] = [get_llm_config(fn) for fn in filenames] @@ -110,6 +119,7 @@ TESTED_LLM_CONFIGS: List[LLMConfig] = [get_llm_config(fn) for fn in filenames] def assert_greeting_with_assistant_message_response( messages: List[Any], + llm_config: LLMConfig, streaming: bool = False, token_streaming: bool = False, from_db: bool = False, @@ -128,7 +138,11 @@ def assert_greeting_with_assistant_message_response( index += 1 # Agent Step 1 - assert isinstance(messages[index], ReasoningMessage) + if is_openai_reasoning_model(llm_config.model): + assert isinstance(messages[index], HiddenReasoningMessage) + else: + assert isinstance(messages[index], ReasoningMessage) + assert messages[index].otid and messages[index].otid[-1] == "0" index += 1 @@ -148,6 +162,7 @@ def assert_greeting_with_assistant_message_response( def assert_greeting_without_assistant_message_response( messages: List[Any], + llm_config: LLMConfig, streaming: bool = False, token_streaming: bool = False, from_db: bool = False, @@ -166,7 +181,10 @@ def assert_greeting_without_assistant_message_response( index += 1 # Agent Step 1 - assert isinstance(messages[index], ReasoningMessage) + if is_openai_reasoning_model(llm_config.model): + assert isinstance(messages[index], HiddenReasoningMessage) + else: + assert isinstance(messages[index], ReasoningMessage) assert messages[index].otid and messages[index].otid[-1] == "0" index += 1 @@ -188,6 +206,7 @@ def assert_greeting_without_assistant_message_response( def assert_tool_call_response( messages: List[Any], + llm_config: LLMConfig, streaming: bool = False, from_db: bool = False, ) -> None: @@ -206,7 +225,10 @@ def assert_tool_call_response( index += 1 # Agent Step 1 - assert isinstance(messages[index], ReasoningMessage) + if is_openai_reasoning_model(llm_config.model): + assert isinstance(messages[index], HiddenReasoningMessage) + else: + assert isinstance(messages[index], ReasoningMessage) assert messages[index].otid and messages[index].otid[-1] == "0" index += 1 @@ -226,7 +248,10 @@ def assert_tool_call_response( index += 1 # Agent Step 3 - assert isinstance(messages[index], ReasoningMessage) + if is_openai_reasoning_model(llm_config.model): + assert isinstance(messages[index], HiddenReasoningMessage) + else: + assert isinstance(messages[index], ReasoningMessage) assert messages[index].otid and messages[index].otid[-1] == "0" index += 1 @@ -240,6 +265,7 @@ def assert_tool_call_response( def assert_image_input_response( messages: List[Any], + llm_config: LLMConfig, streaming: bool = False, token_streaming: bool = False, from_db: bool = False, @@ -258,7 +284,10 @@ def assert_image_input_response( index += 1 # Agent Step 1 - assert isinstance(messages[index], ReasoningMessage) + if is_openai_reasoning_model(llm_config.model): + assert isinstance(messages[index], HiddenReasoningMessage) + else: + assert isinstance(messages[index], ReasoningMessage) assert messages[index].otid and messages[index].otid[-1] == "0" index += 1 @@ -321,7 +350,7 @@ def server_url() -> str: """ Provides the URL for the Letta server. If LETTA_SERVER_URL is not set, starts the server in a background thread - and polls until it’s accepting connections. + and polls until it's accepting connections. """ def _run_server() -> None: @@ -439,9 +468,9 @@ def test_greeting_with_assistant_message( agent_id=agent_state.id, messages=USER_MESSAGE_FORCE_REPLY, ) - assert_greeting_with_assistant_message_response(response.messages) + assert_greeting_with_assistant_message_response(response.messages, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_greeting_with_assistant_message_response(messages_from_db, from_db=True) + assert_greeting_with_assistant_message_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -466,9 +495,9 @@ def test_greeting_without_assistant_message( messages=USER_MESSAGE_FORCE_REPLY, use_assistant_message=False, ) - assert_greeting_without_assistant_message_response(response.messages) + assert_greeting_without_assistant_message_response(response.messages, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id, use_assistant_message=False) - assert_greeting_without_assistant_message_response(messages_from_db, from_db=True) + assert_greeting_without_assistant_message_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -492,9 +521,9 @@ def test_tool_call( agent_id=agent_state.id, messages=USER_MESSAGE_ROLL_DICE, ) - assert_tool_call_response(response.messages) + assert_tool_call_response(response.messages, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_tool_call_response(messages_from_db, from_db=True) + assert_tool_call_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -518,9 +547,9 @@ def test_url_image_input( agent_id=agent_state.id, messages=USER_MESSAGE_URL_IMAGE, ) - assert_image_input_response(response.messages) + assert_image_input_response(response.messages, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_image_input_response(messages_from_db, from_db=True) + assert_image_input_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -544,9 +573,9 @@ def test_base64_image_input( agent_id=agent_state.id, messages=USER_MESSAGE_BASE64_IMAGE, ) - assert_image_input_response(response.messages) + assert_image_input_response(response.messages, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_image_input_response(messages_from_db, from_db=True) + assert_image_input_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -597,9 +626,9 @@ def test_step_streaming_greeting_with_assistant_message( messages=USER_MESSAGE_FORCE_REPLY, ) messages = accumulate_chunks(list(response)) - assert_greeting_with_assistant_message_response(messages, streaming=True) + assert_greeting_with_assistant_message_response(messages, streaming=True, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_greeting_with_assistant_message_response(messages_from_db, from_db=True) + assert_greeting_with_assistant_message_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -625,9 +654,9 @@ def test_step_streaming_greeting_without_assistant_message( use_assistant_message=False, ) messages = accumulate_chunks(list(response)) - assert_greeting_without_assistant_message_response(messages, streaming=True) + assert_greeting_without_assistant_message_response(messages, streaming=True, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id, use_assistant_message=False) - assert_greeting_without_assistant_message_response(messages_from_db, from_db=True) + assert_greeting_without_assistant_message_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -652,9 +681,9 @@ def test_step_streaming_tool_call( messages=USER_MESSAGE_ROLL_DICE, ) messages = accumulate_chunks(list(response)) - assert_tool_call_response(messages, streaming=True) + assert_tool_call_response(messages, streaming=True, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_tool_call_response(messages_from_db, from_db=True) + assert_tool_call_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -708,9 +737,9 @@ def test_token_streaming_greeting_with_assistant_message( stream_tokens=True, ) messages = accumulate_chunks(list(response)) - assert_greeting_with_assistant_message_response(messages, streaming=True, token_streaming=True) + assert_greeting_with_assistant_message_response(messages, streaming=True, token_streaming=True, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_greeting_with_assistant_message_response(messages_from_db, from_db=True) + assert_greeting_with_assistant_message_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -737,9 +766,9 @@ def test_token_streaming_greeting_without_assistant_message( stream_tokens=True, ) messages = accumulate_chunks(list(response)) - assert_greeting_without_assistant_message_response(messages, streaming=True, token_streaming=True) + assert_greeting_without_assistant_message_response(messages, streaming=True, token_streaming=True, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id, use_assistant_message=False) - assert_greeting_without_assistant_message_response(messages_from_db, from_db=True) + assert_greeting_without_assistant_message_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -765,9 +794,9 @@ def test_token_streaming_tool_call( stream_tokens=True, ) messages = accumulate_chunks(list(response)) - assert_tool_call_response(messages, streaming=True) + assert_tool_call_response(messages, streaming=True, llm_config=llm_config) messages_from_db = client.agents.messages.list(agent_id=agent_state.id, after=last_message[0].id) - assert_tool_call_response(messages_from_db, from_db=True) + assert_tool_call_response(messages_from_db, from_db=True, llm_config=llm_config) @pytest.mark.parametrize( @@ -866,11 +895,11 @@ def test_auto_summarize(disable_e2b_api_key: Any, client: Letta, llm_config: LLM ) philosophical_question = """ -You know, sometimes I wonder if the entire structure of our lives is built on a series of unexamined assumptions we just silently agreed to somewhere along the way—like how we all just decided that five days a week of work and two days of “rest” constitutes balance, or how 9-to-5 became the default rhythm of a meaningful life, or even how the idea of “success” got boiled down to job titles and property ownership and productivity metrics on a LinkedIn profile, when maybe none of that is actually what makes a life feel full, or grounded, or real. And then there’s the weird paradox of ambition, how we're taught to chase it like a finish line that keeps moving, constantly redefining itself right as you’re about to grasp it—because even when you get the job, or the degree, or the validation, there's always something next, something more, like a treadmill with invisible settings you didn’t realize were turned up all the way. +You know, sometimes I wonder if the entire structure of our lives is built on a series of unexamined assumptions we just silently agreed to somewhere along the way—like how we all just decided that five days a week of work and two days of "rest" constitutes balance, or how 9-to-5 became the default rhythm of a meaningful life, or even how the idea of "success" got boiled down to job titles and property ownership and productivity metrics on a LinkedIn profile, when maybe none of that is actually what makes a life feel full, or grounded, or real. And then there's the weird paradox of ambition, how we're taught to chase it like a finish line that keeps moving, constantly redefining itself right as you're about to grasp it—because even when you get the job, or the degree, or the validation, there's always something next, something more, like a treadmill with invisible settings you didn't realize were turned up all the way. -And have you noticed how we rarely stop to ask who set those definitions for us? Like was there ever a council that decided, yes, owning a home by thirty-five and retiring by sixty-five is the universal template for fulfillment? Or did it just accumulate like cultural sediment over generations, layered into us so deeply that questioning it feels uncomfortable, even dangerous? And isn’t it strange that we spend so much of our lives trying to optimize things—our workflows, our diets, our sleep, our morning routines—as though the point of life is to operate more efficiently rather than to experience it more richly? We build these intricate systems, these rulebooks for being a “high-functioning” human, but where in all of that is the space for feeling lost, for being soft, for wandering without a purpose just because it’s a sunny day and your heart is tugging you toward nowhere in particular? +And have you noticed how we rarely stop to ask who set those definitions for us? Like was there ever a council that decided, yes, owning a home by thirty-five and retiring by sixty-five is the universal template for fulfillment? Or did it just accumulate like cultural sediment over generations, layered into us so deeply that questioning it feels uncomfortable, even dangerous? And isn't it strange that we spend so much of our lives trying to optimize things—our workflows, our diets, our sleep, our morning routines—as though the point of life is to operate more efficiently rather than to experience it more richly? We build these intricate systems, these rulebooks for being a "high-functioning" human, but where in all of that is the space for feeling lost, for being soft, for wandering without a purpose just because it's a sunny day and your heart is tugging you toward nowhere in particular? -Sometimes I lie awake at night and wonder if all the noise we wrap around ourselves—notifications, updates, performance reviews, even our internal monologues—might be crowding out the questions we were meant to live into slowly, like how to love better, or how to forgive ourselves, or what the hell we’re even doing here in the first place. And when you strip it all down—no goals, no KPIs, no curated identity—what’s actually left of us? Are we just a sum of the roles we perform, or is there something quieter underneath that we've forgotten how to hear? +Sometimes I lie awake at night and wonder if all the noise we wrap around ourselves—notifications, updates, performance reviews, even our internal monologues—might be crowding out the questions we were meant to live into slowly, like how to love better, or how to forgive ourselves, or what the hell we're even doing here in the first place. And when you strip it all down—no goals, no KPIs, no curated identity—what's actually left of us? Are we just a sum of the roles we perform, or is there something quieter underneath that we've forgotten how to hear? And if there is something underneath all of it—something real, something worth listening to—then how do we begin to uncover it, gently, without rushing or reducing it to another task on our to-do list? """