# Upstream change notes (converted to comments so the module stays importable):
# * fix: handle Anthropic overloaded_error in streaming interfaces
# * fix: handle Unicode surrogates in OpenAI requests
#   Sanitize Unicode surrogate pairs before sending requests to OpenAI API.
#   Surrogate pairs (U+D800-U+DFFF) are UTF-16 encoding artifacts that cause
#   UnicodeEncodeError when encoding to UTF-8.
#   Fixes Datadog error: 'utf-8' codec can't encode character '\ud83c' in
#   position 326605: surrogates not allowed
# * fix: handle UnicodeEncodeError from lone Unicode surrogates in OpenAI requests
#   Improved sanitize_unicode_surrogates() to explicitly filter out lone surrogate
#   characters (U+D800 to U+DFFF), which are invalid in UTF-8. The previous
#   implementation used errors='ignore', which could still fail in edge cases.
#   The new approach directly checks Unicode code points and removes any surrogates
#   before data reaches httpx encoding. Also added sanitization to the
#   stream_async_responses() method, which was missing it.
#   Fixes: 'utf-8' codec can't encode character '\ud83c' in position X:
#   surrogates not allowed
"""
|
|
Integration tests for conversation history summarization.
|
|
|
|
These tests verify the complete summarization flow:
|
|
1. Creating a LettaAgentV2 instance
|
|
2. Fetching messages via message_manager.get_messages_by_ids_async
|
|
3. Calling agent_loop.summarize_conversation_history with force=True
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from typing import List, Literal
|
|
|
|
import pytest
|
|
|
|
from letta.agents.letta_agent_v2 import LettaAgentV2
|
|
from letta.agents.letta_agent_v3 import LettaAgentV3
|
|
from letta.config import LettaConfig
|
|
from letta.schemas.agent import CreateAgent, UpdateAgent
|
|
from letta.schemas.block import BlockUpdate, CreateBlock
|
|
from letta.schemas.embedding_config import EmbeddingConfig
|
|
from letta.schemas.enums import MessageRole
|
|
from letta.schemas.letta_message import EventMessage, LettaMessage, SummaryMessage
|
|
from letta.schemas.letta_message_content import TextContent, ToolCallContent, ToolReturnContent
|
|
from letta.schemas.llm_config import LLMConfig
|
|
from letta.schemas.message import Message as PydanticMessage, MessageCreate
|
|
from letta.schemas.run import Run as PydanticRun
|
|
from letta.server.server import SyncServer
|
|
from letta.services.run_manager import RunManager
|
|
from letta.services.summarizer.summarizer import simple_summary
|
|
from letta.settings import model_settings
|
|
|
|
# Constants
|
|
DEFAULT_EMBEDDING_CONFIG = EmbeddingConfig.default_config(provider="openai")
|
|
|
|
|
|
def get_llm_config(filename: str, llm_config_dir: str = "tests/configs/llm_model_configs") -> LLMConfig:
    """Build an LLMConfig from a JSON file located under *llm_config_dir*."""
    config_path = os.path.join(llm_config_dir, filename)
    with open(config_path, "r") as config_file:
        raw_config = json.load(config_file)
    return LLMConfig(**raw_config)
|
|
|
|
|
|
# Test configurations - using a subset of models for summarization tests
all_configs = [
    "openai-gpt-5-mini.json",
    # "claude-4-5-haiku.json",
    # "gemini-2.5-flash.json",
    # "gemini-2.5-flash-vertex.json",  # Requires Vertex AI credentials
    # "openai-gpt-4.1.json",
    # "openai-o1.json",
    # "openai-o3.json",
    # "openai-o4-mini.json",
    # "claude-4-sonnet.json",
    # "claude-3-7-sonnet.json",
    # "gemini-2.5-pro-vertex.json",
]

# Allow a single-config run by pointing LLM_CONFIG_FILE at one JSON filename;
# otherwise every entry in all_configs is loaded at import time.
requested = os.getenv("LLM_CONFIG_FILE")
filenames = [requested] if requested else all_configs
TESTED_LLM_CONFIGS: List[LLMConfig] = [get_llm_config(fn) for fn in filenames]
# Filter out deprecated Gemini 1.5 models
TESTED_LLM_CONFIGS = [
    cfg
    for cfg in TESTED_LLM_CONFIGS
    if not (cfg.model_endpoint_type in ["google_vertex", "google_ai"] and cfg.model.startswith("gemini-1.5"))
]
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Fixtures
|
|
# ======================================================================================================================
|
|
|
|
|
|
@pytest.fixture
async def server():
    """Yield a SyncServer initialized with the default org/user and base tools installed."""
    letta_config = LettaConfig.load()
    letta_config.save()

    sync_server = SyncServer(init_with_default_org_and_user=True)
    await sync_server.init_async()
    await sync_server.tool_manager.upsert_base_tools_async(actor=sync_server.default_user)

    yield sync_server
|
|
|
|
|
|
@pytest.fixture
async def default_organization(server: SyncServer):
    """Yield the server's default organization."""
    organization = await server.organization_manager.create_default_organization_async()
    yield organization
|
|
|
|
|
|
@pytest.fixture
async def default_user(server: SyncServer, default_organization):
    """Yield the default user belonging to the default organization."""
    created_user = await server.user_manager.create_default_actor_async(org_id=default_organization.id)
    yield created_user
|
|
|
|
|
|
@pytest.fixture
async def actor(default_user):
    """Expose the default user under the name the tests use for authorization."""
    return default_user
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Helper Functions
|
|
# ======================================================================================================================
|
|
|
|
|
|
def create_large_tool_return(size_chars: int = 50000) -> str:
    """Build a JSON tool-return payload whose items' serialized size is at least *size_chars* chars."""
    # One realistic-looking record; it is shallow-copied repeatedly to pad the payload.
    base_item = {
        "id": 12345,
        "name": "Sample Item",
        "description": "This is a sample item description that will be repeated many times to create a large payload",
        "metadata": {"created_at": "2025-01-01T00:00:00Z", "updated_at": "2025-01-01T00:00:00Z", "version": "1.0.0"},
        "tags": ["tag1", "tag2", "tag3", "tag4", "tag5"],
        "nested_data": {"level1": {"level2": {"level3": {"value": "deeply nested value"}}}},
    }

    per_item_chars = len(json.dumps(base_item))
    # Smallest copy count whose combined serialized size reaches the threshold
    # (ceil division; zero copies when size_chars <= 0).
    copies_needed = max(0, -(-size_chars // per_item_chars))
    items = [base_item.copy() for _ in range(copies_needed)]

    return json.dumps({"status": "success", "total_items": len(items), "items": items})
|
|
|
|
|
|
async def create_agent_with_messages(server: SyncServer, actor, llm_config: LLMConfig, messages: List[PydanticMessage]) -> tuple:
    """
    Create an agent seeded with *messages* and return (agent_state, in_context_messages).

    Mirrors the production code path: messages are persisted via the message manager,
    the agent's message_ids are updated, and the in-context messages are re-fetched by id.
    """
    # Model names may contain "." or "/", which are invalid in agent names.
    safe_name = f"test_agent_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_state = await server.agent_manager.create_agent_async(
        CreateAgent(
            name=safe_name,
            llm_config=llm_config,
            embedding_config=DEFAULT_EMBEDDING_CONFIG,
        ),
        actor=actor,
    )

    def _bind_to_agent(message: PydanticMessage) -> PydanticMessage:
        # Stamp each message with the new agent's id before persisting.
        payload = message.model_dump() if hasattr(message, "model_dump") else message.dict()
        payload["agent_id"] = agent_state.id
        return PydanticMessage(**payload)

    persisted = await server.message_manager.create_many_messages_async(
        [_bind_to_agent(message) for message in messages], actor=actor
    )

    # Point the agent's in-context buffer at the persisted messages.
    await server.agent_manager.update_message_ids_async(
        agent_id=agent_state.id, message_ids=[m.id for m in persisted], actor=actor
    )

    # Reload so agent_state.message_ids reflects the update.
    agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)

    # Fetch messages using the message manager (as in the actual code path).
    in_context_messages = await server.message_manager.get_messages_by_ids_async(
        message_ids=agent_state.message_ids, actor=actor
    )

    return agent_state, in_context_messages
|
|
|
|
|
|
async def run_summarization(server: SyncServer, agent_state, in_context_messages, actor, force=True):
    """
    Drive the summarization code path under test.

    Builds a LettaAgentV3 for the agent and compacts the supplied in-context
    messages, returning the (summary_message, messages, summary) triple that
    ``compact`` produces.

    NOTE(review): ``force`` is accepted for call-site compatibility but is not
    forwarded — ``compact`` is invoked without it. Confirm whether force-mode
    plumbing is still needed.
    """
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
    return await agent_loop.compact(messages=in_context_messages)
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Test Cases
|
|
# ======================================================================================================================
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_empty_message_buffer(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization when there are no messages in the buffer.
    Should handle gracefully - either return empty list or raise a clear error.
    """
    # Create agent with no messages (replace dots and slashes with underscores for valid names)
    agent_name = f"test_agent_empty_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_create = CreateAgent(
        name=agent_name,
        llm_config=llm_config,
        embedding_config=DEFAULT_EMBEDDING_CONFIG,
    )
    agent_state = await server.agent_manager.create_agent_async(agent_create, actor=actor)

    # Get messages (should be empty or only contain system messages)
    # NOTE(review): a freshly created agent may still carry default init messages — confirm.
    in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent_state.message_ids, actor=actor)

    # Run summarization - this may fail with empty buffer, which is acceptable behavior
    try:
        # `summary` (the summary message object) is unused here; only the compacted list is checked.
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)
        # If it succeeds, verify result
        assert isinstance(result, list)

        # When summarization runs, V3 ensures that in-context messages follow
        # the pattern:
        # 1. System prompt
        # 2. User summary message (system_alert JSON)
        # 3. Remaining messages (which may be empty for this test)

        # We should always keep the original system message at the front.
        assert len(result) >= 1
        assert result[0].role == MessageRole.system

        # If summarization did in fact add a summary message, we expect it to
        # be the second message with user role.
        if len(result) >= 2:
            assert result[1].role == MessageRole.user
    except ValueError as e:
        # It's acceptable for summarization to fail on empty buffer
        assert "No assistant message found" in str(e) or "empty" in str(e).lower()
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.skipif(
    not model_settings.anthropic_api_key,
    reason="Missing LETTA_ANTHROPIC_API_KEY (or equivalent settings) for Anthropic integration test",
)
async def test_simple_summary_anthropic_uses_streaming_and_returns_summary(actor, monkeypatch):
    """Regression test: Anthropic summarization must use streaming and return real text."""

    # If the summarizer ever falls back to a non-streaming Anthropic call, make it fail fast.
    from letta.llm_api.anthropic_client import AnthropicClient

    async def _nope_request_async(self, *args, **kwargs):
        # Patched in below: any non-streaming request immediately fails the test.
        raise AssertionError("Anthropic summarizer should not call request_async (must use streaming)")

    monkeypatch.setattr(AnthropicClient, "request_async", _nope_request_async)

    # Keep the prompt tiny so this is fast and cheap.
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="I'm planning a trip to Paris in April.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="Great—your priorities are museums and cafes, and you want to stay under $200/day.",
                )
            ],
        ),
    ]

    anthropic_config = get_llm_config("claude-4-5-haiku.json")

    summary = await simple_summary(messages=messages, llm_config=anthropic_config, actor=actor)

    assert isinstance(summary, str)
    assert len(summary) > 10
    # Sanity-check that the model is summarizing the right conversation.
    # Loose keyword matching: any one of these tokens is enough, since LLM output varies.
    assert any(token in summary.lower() for token in ["paris", "april", "museum", "cafe", "$200", "200"])
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_initialization_messages_only(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization when only initialization/system messages are in the buffer.
    Should handle gracefully and likely not summarize.
    """
    # Create messages - only system initialization messages
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant. Your name is Letta.")],
        ),
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="The current date and time is 2025-01-01 12:00:00 UTC.")],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Run summarization - force=True with system messages only may fail
    try:
        # `summary` is intentionally unused; only the compacted message list is asserted on.
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor, force=True)

        # Verify result
        assert isinstance(result, list)
        # System messages should typically be preserved
        assert len(result) >= 1
    except ValueError as e:
        # It's acceptable for summarization to fail on system-only messages
        assert "No assistant message found" in str(e)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_small_conversation(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with approximately 5 messages in the buffer.
    This represents a typical small conversation.
    """
    # Create a small conversation with ~5 messages (alternating user/assistant turns)
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Hello! Can you help me with a Python question?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Of course! I'd be happy to help you with Python. What would you like to know?")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="How do I read a file in Python?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="You can read a file in Python using the open() function. Here's an example:\n\n```python\nwith open('file.txt', 'r') as f:\n content = f.read()\n print(content)\n```",
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Thank you! That's very helpful.")],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Run summarization with force=True
    # Note: force=True with clear=True can be very aggressive and may fail on small message sets
    try:
        # `summary` is intentionally unused; the assertions target the compacted list.
        summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor, force=True)

        # Verify result
        assert isinstance(result, list)
        # With force=True, some summarization should occur
        # The result might be shorter than the original if summarization happened
        assert len(result) >= 1

        # Verify that the result contains valid messages
        for msg in result:
            assert hasattr(msg, "role")
            assert hasattr(msg, "content")
    except ValueError as e:
        # With force=True + clear=True, aggressive summarization might fail on small message sets
        # This is acceptable behavior
        assert "No assistant message found" in str(e)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_large_tool_calls(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with large tool calls and returns (~50k character tool returns).
    This tests the system's ability to handle and summarize very large context windows.
    """
    # Create a large tool return
    large_return = create_large_tool_return(50000)

    # Create messages with large tool calls and returns
    # (user request -> assistant tool call -> oversized tool return -> follow-up turns)
    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Please fetch all the data from the database.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="I'll fetch the data for you."),
                ToolCallContent(
                    type="tool_call",
                    id="call_1",
                    name="fetch_database_records",
                    input={"query": "SELECT * FROM records"},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_1",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_1",
                    content=large_return,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="I've successfully fetched all the records from the database. There are thousands of items in the result set.",
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Great! Can you summarize what you found?")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(
                    type="text",
                    text="Based on the data I retrieved, there are numerous records containing various items with descriptions, metadata, and nested data structures. Each record includes timestamps and version information.",
                )
            ],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Verify that we actually have large messages
    total_content_size = sum(len(str(content)) for msg in in_context_messages for content in msg.content)
    assert total_content_size > 40000, f"Expected large messages, got {total_content_size} chars"

    # Run summarization
    summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)

    # Verify result
    assert isinstance(result, list)
    assert len(result) >= 1

    # Verify that summarization reduced the context size
    result_content_size = sum(len(str(content)) for msg in result for content in msg.content)

    # The summarized result should be smaller than the original
    # (unless summarization was skipped for some reason)
    # NOTE(review): size reduction is only printed, not asserted — intentional leniency?
    print(f"Original size: {total_content_size} chars, Summarized size: {result_content_size} chars")

    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_multiple_large_tool_calls(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test summarization with multiple large tool calls in sequence.
    This stress-tests the summarization with multiple large context items.
    """
    # Create multiple large tool returns (two ~25k payloads instead of one 50k payload)
    large_return_1 = create_large_tool_return(25000)
    large_return_2 = create_large_tool_return(25000)

    messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Fetch user data.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="Fetching users..."),
                ToolCallContent(
                    type="tool_call",
                    id="call_1",
                    name="fetch_users",
                    input={"limit": 10000},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_1",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_1",
                    content=large_return_1,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Retrieved user data. Now fetching product data.")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[
                TextContent(type="text", text="Fetching products..."),
                ToolCallContent(
                    type="tool_call",
                    id="call_2",
                    name="fetch_products",
                    input={"category": "all"},
                ),
            ],
        ),
        PydanticMessage(
            role=MessageRole.tool,
            tool_call_id="call_2",
            content=[
                ToolReturnContent(
                    type="tool_return",
                    tool_call_id="call_2",
                    content=large_return_2,
                    is_error=False,
                )
            ],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="I've successfully fetched both user and product data.")],
        ),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Verify that we have large messages
    total_content_size = sum(len(str(content)) for msg in in_context_messages for content in msg.content)
    assert total_content_size > 40000, f"Expected large messages, got {total_content_size} chars"

    # Run summarization
    # `summary` is intentionally unused; assertions target the compacted list.
    summary, result, _ = await run_summarization(server, agent_state, in_context_messages, actor)

    # Verify result
    assert isinstance(result, list)
    assert len(result) >= 1

    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")

    print(f"Summarized {len(in_context_messages)} messages with {total_content_size} chars to {len(result)} messages")
|
|
|
|
|
|
# @pytest.mark.asyncio
|
|
# @pytest.mark.parametrize(
|
|
# "llm_config",
|
|
# TESTED_LLM_CONFIGS,
|
|
# ids=[c.model for c in TESTED_LLM_CONFIGS],
|
|
# )
|
|
# async def test_summarize_truncates_large_tool_return(server: SyncServer, actor, llm_config: LLMConfig):
|
|
# """
|
|
# Test that summarization properly truncates very large tool returns.
|
|
# This ensures that oversized tool returns don't consume excessive context.
|
|
# """
|
|
# # Create an extremely large tool return (100k chars)
|
|
# large_return = create_large_tool_return(100000)
|
|
# original_size = len(large_return)
|
|
#
|
|
# # Create messages with a large tool return
|
|
# messages = [
|
|
# PydanticMessage(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text="Please run the database query.")],
|
|
# ),
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[
|
|
# TextContent(type="text", text="Running query..."),
|
|
# ToolCallContent(
|
|
# type="tool_call",
|
|
# id="call_1",
|
|
# name="run_query",
|
|
# input={"query": "SELECT * FROM large_table"},
|
|
# ),
|
|
# ],
|
|
# ),
|
|
# PydanticMessage(
|
|
# role=MessageRole.tool,
|
|
# tool_call_id="call_1",
|
|
# content=[
|
|
# ToolReturnContent(
|
|
# type="tool_return",
|
|
# tool_call_id="call_1",
|
|
# content=large_return,
|
|
# is_error=False,
|
|
# )
|
|
# ],
|
|
# ),
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[TextContent(type="text", text="Query completed successfully with many results.")],
|
|
# ),
|
|
# ]
|
|
#
|
|
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
#
|
|
# # Verify the original tool return is indeed large
|
|
# assert original_size > 90000, f"Expected tool return >90k chars, got {original_size}"
|
|
#
|
|
# # Run summarization
|
|
# summary, result = await run_summarization(server, agent_state, in_context_messages, actor)
|
|
#
|
|
# # Verify result
|
|
# assert isinstance(result, list)
|
|
# assert len(result) >= 1
|
|
#
|
|
# # Find tool return messages in the result and verify truncation occurred
|
|
# tool_returns_found = False
|
|
# for msg in result:
|
|
# if msg.role == MessageRole.tool:
|
|
# for content in msg.content:
|
|
# if isinstance(content, ToolReturnContent):
|
|
# tool_returns_found = True
|
|
# result_size = len(content.content)
|
|
# # Verify that the tool return has been truncated
|
|
# assert result_size < original_size, (
|
|
# f"Expected tool return to be truncated from {original_size} chars, but got {result_size} chars"
|
|
# )
|
|
# print(f"Tool return successfully truncated from {original_size} to {result_size} chars")
|
|
#
|
|
# # If we didn't find any tool returns in the result, that's also acceptable
|
|
# # (they may have been completely removed during aggressive summarization)
|
|
# if not tool_returns_found:
|
|
# print("Tool returns were completely removed during summarization")
|
|
#
|
|
|
|
# ======================================================================================================================
# CompactionSettings Mode Tests - Using LettaAgentV3
# ======================================================================================================================

# NOTE(review): `patch` does not appear to be used in this chunk — verify against
# the rest of the file before removing this mid-file import.
from unittest.mock import patch

from letta.services.summarizer.summarizer_config import CompactionSettings

# Test both summarizer modes: "all" summarizes entire history, "sliding_window" keeps recent messages
SUMMARIZER_CONFIG_MODES: list[Literal["all", "sliding_window"]] = ["all", "sliding_window"]
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("mode", SUMMARIZER_CONFIG_MODES, ids=SUMMARIZER_CONFIG_MODES)
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_summarize_with_mode(server: SyncServer, actor, llm_config: LLMConfig, mode: Literal["all", "sliding_window"]):
    """
    Test summarization with different CompactionSettings modes using LettaAgentV3.

    This test verifies that both summarization modes work correctly:
    - "all": Summarizes the entire conversation history into a single summary
    - "sliding_window": Keeps recent messages and summarizes older ones
    """
    # Create a conversation with enough messages to trigger summarization
    # (one system prompt + 10 user/assistant pairs)
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Create new messages that would be added during this step
    new_letta_messages = [
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="This is a new user message during this step.")],
            agent_id=agent_state.id,
        )
    ]
    # Persist the new messages
    # NOTE(review): new_letta_messages is persisted but never referenced again below — confirm intent.
    new_letta_messages = await server.message_manager.create_many_messages_async(new_letta_messages, actor=actor)

    # Override compaction settings directly on the agent state
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode=mode)

    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    summary, result, summary_text = await agent_loop.compact(messages=in_context_messages)

    assert isinstance(result, list)

    # Verify that the result contains valid messages
    for msg in result:
        assert hasattr(msg, "role")
        assert hasattr(msg, "content")

    # Verify the summary text (third return value) is a non-empty string.
    # This is used by the agent loop to construct a SummaryMessage for clients.
    assert isinstance(summary_text, str), f"Expected summary_text to be a string, got {type(summary_text)}"
    assert len(summary_text) > 0, "Expected non-empty summary text"

    print()
    print(f"RESULTS {mode} ======")
    for msg in result:
        print(f"MSG: {msg}")
    print(f"SUMMARY TEXT: {summary_text[:200]}...")

    print()

    if mode == "all":
        # For "all" mode, V3 keeps:
        # 1. System prompt
        # 2. A single user summary message (system_alert JSON)
        # and no remaining historical messages.
        assert len(result) == 2, f"Expected 2 messages for 'all' mode (system + summary), got {len(result)}"
        assert result[0].role == MessageRole.system
        assert result[1].role == MessageRole.user
    else:
        # For "sliding_window" mode, result should include:
        # 1. System prompt
        # 2. User summary message
        # 3+. Recent user/assistant messages inside the window.
        assert len(result) > 2, f"Expected >2 messages for 'sliding_window' mode, got {len(result)}"
        assert result[0].role == MessageRole.system
        assert result[1].role == MessageRole.user
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_returns_valid_summary_message_and_event_message(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that compact() return values can be used to construct valid SummaryMessage and EventMessage objects.

    This validates the contract that _step() relies on: compact() returns
    (summary_message_obj, compacted_messages, summary_text) where summary_text
    is used to build a SummaryMessage and the metadata is used for an EventMessage.
    """
    import uuid

    from letta.helpers.datetime_helpers import get_utc_time

    # Create a conversation with enough messages to summarize
    # (one system prompt + 10 user/assistant pairs)
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for i in range(10):
        messages.append(
            PydanticMessage(
                role=MessageRole.user,
                content=[TextContent(type="text", text=f"User message {i}: Test message {i}.")],
            )
        )
        messages.append(
            PydanticMessage(
                role=MessageRole.assistant,
                content=[TextContent(type="text", text=f"Assistant response {i}: Acknowledged message {i}.")],
            )
        )

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")

    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(messages=in_context_messages)

    # Verify we can construct a valid SummaryMessage from compact() return values
    summary_msg = SummaryMessage(
        id=summary_message_obj.id,
        date=summary_message_obj.created_at,
        summary=summary_text,
        otid=PydanticMessage.generate_otid_from_id(summary_message_obj.id, 0),
        step_id=None,
        run_id=None,
    )
    assert summary_msg.message_type == "summary_message"
    assert isinstance(summary_msg.summary, str)
    assert len(summary_msg.summary) > 0
    assert summary_msg.id == summary_message_obj.id

    # Verify we can construct a valid EventMessage for compaction
    # (event_data values mirror what _step() would record after a context check)
    event_msg = EventMessage(
        id=str(uuid.uuid4()),
        date=get_utc_time(),
        event_type="compaction",
        event_data={
            "trigger": "post_step_context_check",
            "context_token_estimate": 1000,
            "context_window": agent_state.llm_config.context_window,
        },
        run_id=None,
        step_id=None,
    )
    assert event_msg.message_type == "event_message"
    assert event_msg.event_type == "compaction"
    assert "trigger" in event_msg.event_data
    assert "context_window" in event_msg.event_data
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_compact_with_use_summary_role_creates_summary_message_role(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that compact() with use_summary_role=True creates a message with role=MessageRole.summary.

    This validates that manual compaction endpoints (which pass use_summary_role=True)
    will store summary messages with the dedicated 'summary' role instead of the legacy 'user' role.
    """

    def _text_message(role: MessageRole, text: str) -> PydanticMessage:
        # Small helper to keep the conversation construction readable.
        return PydanticMessage(role=role, content=[TextContent(type="text", text=text)])

    # Build a conversation large enough to be worth compacting.
    messages = [_text_message(MessageRole.system, "You are a helpful assistant.")]
    for i in range(10):
        messages.extend(
            [
                _text_message(MessageRole.user, f"User message {i}: Test message {i}."),
                _text_message(MessageRole.assistant, f"Assistant response {i}: Acknowledged message {i}."),
            ]
        )

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)

    # Point compaction at the same model handle the agent uses, in "all" mode.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")

    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Call compact with use_summary_role=True (as the REST endpoints now do).
    summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(
        messages=in_context_messages,
        use_summary_role=True,
    )

    # The stored summary message must carry the dedicated 'summary' role, not 'user'.
    assert summary_message_obj.role == MessageRole.summary, (
        f"Expected summary message to have role=summary when use_summary_role=True, got {summary_message_obj.role}"
    )

    # Post-compaction context is exactly: system prompt followed by the summary.
    assert len(compacted_messages) == 2, f"Expected 2 messages (system + summary), got {len(compacted_messages)}"
    assert compacted_messages[0].role == MessageRole.system
    assert compacted_messages[1].role == MessageRole.summary

    # The raw summary text should be a non-empty string.
    assert isinstance(summary_text, str)
    assert len(summary_text) > 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_v3_compact_uses_compaction_settings_model_and_model_settings(server: SyncServer, actor):
    """Integration test: LettaAgentV3.compact uses the LLMConfig implied by CompactionSettings.

    We set a different summarizer model handle + model_settings and verify that
    the LLMConfig passed into simple_summary reflects both the handle and
    the model_settings overrides.
    """

    from letta.agents.letta_agent_v3 import LettaAgentV3
    from letta.schemas.model import OpenAIModelSettings, OpenAIReasoning
    from letta.services.summarizer import summarizer_all

    base_llm_config = LLMConfig.default_config("gpt-4o-mini")

    # Minimal three-message conversation: system prompt plus one exchange.
    conversation = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        ),
        PydanticMessage(
            role=MessageRole.user,
            content=[TextContent(type="text", text="Hello")],
        ),
        PydanticMessage(
            role=MessageRole.assistant,
            content=[TextContent(type="text", text="Hi there")],
        ),
    ]

    # Create agent + messages via helper to get a real AgentState.
    agent_state, in_context_messages = await create_agent_with_messages(
        server=server,
        actor=actor,
        llm_config=base_llm_config,
        messages=conversation,
    )

    # Configure a summarizer model different from the agent's own model, with
    # explicit model_settings overrides we can assert on afterwards.
    summarizer_handle = "openai/gpt-5-mini"
    agent_state.compaction_settings = CompactionSettings(
        model=summarizer_handle,
        model_settings=OpenAIModelSettings(
            max_output_tokens=4321,
            temperature=0.05,
            reasoning=OpenAIReasoning(reasoning_effort="high"),
            response_format=None,
        ),
        prompt="You are a summarizer.",
        prompt_acknowledgement=True,
        clip_chars=2000,
        mode="all",
        sliding_window_percentage=0.3,
    )

    captured: dict = {}

    async def fake_simple_summary(messages, llm_config, actor, include_ack=True, prompt=None, **kwargs):  # type: ignore[override]
        # Record the llm_config handed to the summarizer instead of calling a real LLM.
        captured["value"] = llm_config
        return "summary text"

    # Patch simple_summary so we don't hit the real LLM and can inspect llm_config.
    with patch.object(summarizer_all, "simple_summary", new=fake_simple_summary):
        agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
        summary_msg, compacted, _ = await agent_loop.compact(messages=in_context_messages)

    assert summary_msg is not None
    assert "value" in captured
    summarizer_llm_config = captured["value"]

    # The agent's own llm_config is untouched by compaction settings.
    assert agent_state.llm_config.model == "gpt-4o-mini"

    # The summarizer llm_config reflects both the handle and the model_settings overrides.
    assert summarizer_llm_config.handle == summarizer_handle
    assert summarizer_llm_config.model == "gpt-5-mini"
    assert summarizer_llm_config.max_tokens == 4321
    assert summarizer_llm_config.temperature == 0.05
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("llm_config", TESTED_LLM_CONFIGS, ids=[c.model for c in TESTED_LLM_CONFIGS])
async def test_v3_summarize_hard_eviction_when_still_over_threshold(
    server: SyncServer,
    actor,
    llm_config: LLMConfig,
    caplog,
):
    """Regression test: the V3 summarizer performs a hard eviction when
    summarization fails to bring the context size below the proactive
    summarization threshold.

    Reproduces the edge case that previously caused summarization loops:

    1. A large pre-summarization token count triggers summarization.
    2. Even after summarization, the (mocked) post-summarization token count
       is still above the trigger threshold.
    3. LettaAgentV3 must then:
       - Log an error about summarization failing to reduce context size.
       - Evict all prior messages, keeping only the system message plus a
         single synthetic user summary message (system_alert).
       - Update `context_token_estimate` to the token count of the minimal
         context so future steps don't keep re-triggering summarization based
         on a stale, oversized value.
    """

    def _msg(role: MessageRole, text: str) -> PydanticMessage:
        # Helper for building plain-text conversation messages.
        return PydanticMessage(role=role, content=[TextContent(type="text", text=text)])

    # A small but non-trivial conversation with an explicit system message, so
    # after hard eviction we expect exactly that system message plus a single
    # user summary message to remain.
    conversation = [
        _msg(MessageRole.system, "You are a helpful assistant."),
        _msg(MessageRole.user, "User message 0: hello"),
        _msg(MessageRole.assistant, "Assistant response 0: hi there"),
    ]

    agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, conversation)

    print("ORIGINAL IN-CONTEXT MESSAGES ======")
    for msg in in_context_messages:
        print(f"MSG: {msg}")

    # Create the V3 agent loop.
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # The summarizer mode is irrelevant here; we only need compact() to run
    # and then hit the branch where the *post*-summarization token count is
    # still above the proactive threshold. Simulate that by patching the
    # count_tokens_with_tools helper: the first call (post-summary) reports an
    # extremely large count, the second call (after hard eviction) a small one.
    with patch("letta.services.summarizer.compact.count_tokens_with_tools") as mock_count_tokens:
        # Size the fake counts relative to this model's context window so the
        # hard-eviction path always triggers regardless of model.
        context_limit = llm_config.context_window or 100_000
        huge_tokens = context_limit * 10  # safely above any reasonable trigger
        mock_count_tokens.side_effect = [huge_tokens, 10]

        caplog.set_level("ERROR")

        summary, result, summary_text = await agent_loop.compact(
            messages=in_context_messages,
            trigger_threshold=context_limit,
        )

        # Exactly two token-count calls: one for the summarized context, one
        # for the hard-evicted minimal context.
        assert mock_count_tokens.call_count == 2

    print("COMPACTED RESULT ======")
    for msg in result:
        print(f"MSG: {msg}")

    # Hard eviction keeps only:
    # 1. The system prompt
    # 2. The synthetic user summary message.
    assert isinstance(result, list)
    assert len(result) == 2, f"Expected system + summary after hard eviction, got {len(result)} messages"
    assert result[0].role == MessageRole.system
    assert result[1].role == MessageRole.user

    # The summary text is returned so the agent loop can build a SummaryMessage.
    assert isinstance(summary_text, str), f"Expected summary_text to be a string, got {type(summary_text)}"
    assert len(summary_text) > 0, "Expected non-empty summary text after hard eviction"
|
|
|
|
|
|
# ======================================================================================================================
|
|
# Sliding Window Summarizer Unit Tests
|
|
# ======================================================================================================================
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_sliding_window_cutoff_index_does_not_exceed_message_count(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test that the sliding window summarizer correctly calculates cutoff indices.

    This test verifies the fix for a bug where the cutoff percentage was treated as
    a whole number (10) instead of a decimal (0.10), causing:
        message_cutoff_index = round(10 * 65) = 650
    when there were only 65 messages, resulting in an empty range loop and the error:
        "No assistant message found from indices 650 to 65"

    The fix changed:
        - max(..., 10) -> max(..., 0.10)
        - += 10 -> += 0.10
        - >= 100 -> >= 1.0

    This test uses the real token counter (via create_token_counter) to verify
    the sliding window logic works with actual token counting.
    """
    from letta.services.summarizer.summarizer_config import CompactionSettings
    from letta.services.summarizer.summarizer_sliding_window import summarize_via_sliding_window

    def _msg(role: MessageRole, text: str) -> PydanticMessage:
        # Helper for constructing single-text-block messages.
        return PydanticMessage(role=role, content=[TextContent(type="text", text=text)])

    # Real summarizer config via the default factory, with
    # sliding_window_percentage overridden to 0.3 for this test.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    summarizer_config = CompactionSettings(model=handle)
    summarizer_config.sliding_window_percentage = 0.3

    # 65 messages total (mirrors the failing case from the bug report):
    # a system message followed by 32 alternating user/assistant pairs.
    messages = [_msg(MessageRole.system, "You are a helpful assistant.")]
    for i in range(32):
        messages.extend(
            [
                _msg(MessageRole.user, f"User message {i}"),
                _msg(MessageRole.assistant, f"Assistant response {i}"),
            ]
        )

    assert len(messages) == 65, f"Expected 65 messages, got {len(messages)}"

    # This should NOT raise "No assistant message found from indices 650 to 65".
    # With the fix, message_count_cutoff_percent starts at max(0.7, 0.10) = 0.7,
    # so message_cutoff_index = round(0.7 * 65) = 46, which is valid.
    try:
        summary, remaining_messages = await summarize_via_sliding_window(
            actor=actor,
            llm_config=llm_config,
            agent_llm_config=llm_config,  # case where agent and summarizer have same config
            summarizer_config=summarizer_config,
            in_context_messages=messages,
        )

        # A real summary came back from the LLM.
        assert summary is not None
        assert len(summary) > 0

        # The retained window is a proper, non-empty subset of the input.
        assert len(remaining_messages) < len(messages)
        assert len(remaining_messages) > 0

        print(f"Successfully summarized {len(messages)} messages to {len(remaining_messages)} remaining")
        print(f"Summary: {summary[:200]}..." if len(summary) > 200 else f"Summary: {summary}")
        print(f"Using {llm_config.model_endpoint_type} token counter for model {llm_config.model}")

    except ValueError as e:
        if "No assistant message found from indices" in str(e):
            # Pull the offending indices out of the error message for a
            # targeted failure report.
            import re

            match = re.search(r"from indices (\d+) to (\d+)", str(e))
            if match:
                start_idx, end_idx = int(match.group(1)), int(match.group(2))
                pytest.fail(
                    f"Bug detected: cutoff index ({start_idx}) exceeds message count ({end_idx}). "
                    f"This indicates the percentage calculation bug where 10 was used instead of 0.10. "
                    f"Error: {e}"
                )
        raise
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_large_system_prompt_summarization(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Test edge case of large system prompt / memory blocks.

    This test verifies that summarization handles the case where the system prompt
    and memory blocks are very large, potentially consuming most of the context window.
    The summarizer should gracefully handle this scenario without errors.
    """
    from letta.errors import SystemPromptTokenExceededError

    # Shrink the context window so the oversized prompt is guaranteed to overflow.
    llm_config.context_window = 10000

    # Build an agent whose system prompt + memory block dwarf the context window.
    agent_name = f"test_agent_large_system_prompt_{llm_config.model}".replace(".", "_").replace("/", "_")
    agent_state = await server.agent_manager.create_agent_async(
        CreateAgent(
            name=agent_name,
            llm_config=llm_config,
            embedding_config=DEFAULT_EMBEDDING_CONFIG,
            system="SYSTEM PROMPT " * 10000,  # Large system prompt
            memory_blocks=[
                CreateBlock(
                    label="human",
                    limit=200000,
                    value="NAME " * 10000,  # Large memory block
                )
            ],
        ),
        actor=actor,
    )

    # Create a run for the agent using RunManager.
    run = await RunManager().create_run(pydantic_run=PydanticRun(agent_id=agent_state.id), actor=actor)

    # Create the agent loop using LettaAgentV3.
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Message the agent.
    input_message = MessageCreate(role=MessageRole.user, content="Hello")

    # Stepping must fail: the system prompt alone exceeds the context budget.
    with pytest.raises(SystemPromptTokenExceededError):
        await agent_loop.step(
            input_messages=[input_message],
            run_id=run.id,
            max_steps=3,
        )

    # Repair the agent: replace the system prompt with a short one...
    await server.agent_manager.update_agent_async(
        agent_id=agent_state.id,
        agent_update=UpdateAgent(system="You are a helpful assistant."),
        actor=actor,
    )

    # ...and shrink the memory block likewise.
    await server.agent_manager.modify_block_by_label_async(
        agent_id=agent_state.id,
        block_label="human",
        block_update=BlockUpdate(value="The user's name is Alice."),
        actor=actor,
    )

    # Reload agent state after repairs.
    agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
    print("REPAIRED AGENT STATE ======")
    print(agent_state.system)
    print(agent_state.blocks)

    # Fresh run + agent loop against the repaired state.
    run = await RunManager().create_run(pydantic_run=PydanticRun(agent_id=agent_state.id), actor=actor)
    agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)

    # Now the agent should respond without context window errors.
    response = await agent_loop.step(
        input_messages=[input_message],
        run_id=run.id,
        max_steps=3,
    )

    # Verify we got a valid response after repair.
    assert response is not None
    assert response.messages is not None
    print(f"Agent successfully responded after repair with {len(response.messages)} messages")
|
|
|
|
|
|
# @pytest.mark.asyncio
|
|
# async def test_context_window_overflow_triggers_summarization_in_streaming(server: SyncServer, actor):
|
|
# """
|
|
# Test that a ContextWindowExceededError during a streaming LLM request
|
|
# properly triggers the summarizer and compacts the in-context messages.
|
|
#
|
|
# This test simulates:
|
|
# 1. An LLM streaming request that fails with ContextWindowExceededError
|
|
# 2. The summarizer being invoked to reduce context size
|
|
# 3. Verification that messages are compacted and summary message exists
|
|
#
|
|
# Note: This test only runs with OpenAI since it uses OpenAI-specific error handling.
|
|
# """
|
|
# import uuid
|
|
# from unittest.mock import patch
|
|
#
|
|
# import openai
|
|
#
|
|
# from letta.schemas.message import MessageCreate
|
|
# from letta.schemas.run import Run
|
|
# from letta.services.run_manager import RunManager
|
|
#
|
|
# # Use OpenAI config for this test (since we're using OpenAI-specific error handling)
|
|
# llm_config = get_llm_config("openai-gpt-4o-mini.json")
|
|
#
|
|
# # Create test messages - enough to have something to summarize
|
|
# messages = []
|
|
# for i in range(15):
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text=f"User message {i}: This is test message number {i}.")],
|
|
# )
|
|
# )
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[TextContent(type="text", text=f"Assistant response {i}: I acknowledge message {i}.")],
|
|
# )
|
|
# )
|
|
#
|
|
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
# original_message_count = len(agent_state.message_ids)
|
|
#
|
|
# # Create an input message to trigger the agent
|
|
# input_message = MessageCreate(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text="Hello, please respond.")],
|
|
# )
|
|
#
|
|
# # Create a proper run record in the database
|
|
# run_manager = RunManager()
|
|
# test_run_id = f"run-{uuid.uuid4()}"
|
|
# test_run = Run(
|
|
# id=test_run_id,
|
|
# agent_id=agent_state.id,
|
|
# )
|
|
# await run_manager.create_run(test_run, actor)
|
|
#
|
|
# # Create the agent loop using LettaAgentV3
|
|
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
|
|
#
|
|
# # Track how many times stream_async is called
|
|
# call_count = 0
|
|
#
|
|
# # Store original stream_async method
|
|
# original_stream_async = agent_loop.llm_client.stream_async
|
|
#
|
|
# async def mock_stream_async_with_error(request_data, llm_config):
|
|
# nonlocal call_count
|
|
# call_count += 1
|
|
# if call_count == 1:
|
|
# # First call raises OpenAI BadRequestError with context_length_exceeded error code
|
|
# # This will be properly converted to ContextWindowExceededError by handle_llm_error
|
|
# from unittest.mock import MagicMock
|
|
#
|
|
# import httpx
|
|
#
|
|
# # Create a mock response with the required structure
|
|
# mock_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
|
|
# mock_response = httpx.Response(
|
|
# status_code=400,
|
|
# request=mock_request,
|
|
# json={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
#
|
|
# raise openai.BadRequestError(
|
|
# message="This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# response=mock_response,
|
|
# body={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
# # Subsequent calls use the real implementation
|
|
# return await original_stream_async(request_data, llm_config)
|
|
#
|
|
# # Patch the llm_client's stream_async to raise ContextWindowExceededError on first call
|
|
# with patch.object(agent_loop.llm_client, "stream_async", side_effect=mock_stream_async_with_error):
|
|
# # Execute a streaming step
|
|
# try:
|
|
# result_chunks = []
|
|
# async for chunk in agent_loop.stream(
|
|
# input_messages=[input_message],
|
|
# max_steps=1,
|
|
# stream_tokens=True,
|
|
# run_id=test_run_id,
|
|
# ):
|
|
# result_chunks.append(chunk)
|
|
# except Exception as e:
|
|
# # Some errors might happen due to real LLM calls after retry
|
|
# print(f"Exception during stream: {e}")
|
|
#
|
|
# # Reload agent state to get updated message_ids after summarization
|
|
# updated_agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
|
|
# updated_message_count = len(updated_agent_state.message_ids)
|
|
#
|
|
# # Fetch the updated in-context messages
|
|
# updated_in_context_messages = await server.message_manager.get_messages_by_ids_async(
|
|
# message_ids=updated_agent_state.message_ids, actor=actor
|
|
# )
|
|
#
|
|
# # Convert to LettaMessage format for easier content inspection
|
|
# letta_messages = PydanticMessage.to_letta_messages_from_list(updated_in_context_messages)
|
|
#
|
|
# # Verify a summary message exists with the correct format
|
|
# # The summary message has content with type="system_alert" and message containing:
|
|
# # "prior messages ... have been hidden" and "summary of the previous"
|
|
# import json
|
|
#
|
|
# summary_message_found = False
|
|
# summary_message_text = None
|
|
# for msg in letta_messages:
|
|
# # Not all message types have a content attribute (e.g., ReasoningMessage)
|
|
# if not hasattr(msg, "content"):
|
|
# continue
|
|
#
|
|
# content = msg.content
|
|
# # Content can be a string (JSON) or an object with type/message fields
|
|
# if isinstance(content, str):
|
|
# # Try to parse as JSON
|
|
# try:
|
|
# parsed = json.loads(content)
|
|
# if isinstance(parsed, dict) and parsed.get("type") == "system_alert":
|
|
# text_to_check = parsed.get("message", "").lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = parsed.get("message")
|
|
# break
|
|
# except (json.JSONDecodeError, TypeError):
|
|
# pass
|
|
# # Check if content has system_alert type with the summary message (object form)
|
|
# elif hasattr(content, "type") and content.type == "system_alert":
|
|
# if hasattr(content, "message") and content.message:
|
|
# text_to_check = content.message.lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = content.message
|
|
# break
|
|
#
|
|
# assert summary_message_found, (
|
|
# "A summary message should exist in the in-context messages after summarization. "
|
|
# "Expected format containing 'prior messages...hidden' and 'summary of the previous'"
|
|
# )
|
|
#
|
|
# # Verify we attempted multiple invocations (the failing one + retry after summarization)
|
|
# assert call_count >= 2, f"Expected at least 2 LLM invocations (initial + retry), got {call_count}"
|
|
#
|
|
# # The original messages should have been compacted - the updated count should be less than
|
|
# # original + the new messages added (input + assistant response + tool results)
|
|
# # Since summarization should have removed most of the original 30 messages
|
|
# print("Test passed: Summary message found in context")
|
|
# print(f"Original message count: {original_message_count}, Updated: {updated_message_count}")
|
|
# print(f"Summary message: {summary_message_text[:200] if summary_message_text else 'N/A'}...")
|
|
# print(f"Total LLM invocations: {call_count}")
|
|
#
|
|
#
|
|
# @pytest.mark.asyncio
|
|
# async def test_context_window_overflow_triggers_summarization_in_blocking(server: SyncServer, actor):
|
|
# """
|
|
# Test that a ContextWindowExceededError during a blocking (non-streaming) LLM request
|
|
# properly triggers the summarizer and compacts the in-context messages.
|
|
#
|
|
# This test is similar to the streaming test but uses the blocking step() method.
|
|
#
|
|
# Note: This test only runs with OpenAI since it uses OpenAI-specific error handling.
|
|
# """
|
|
# import uuid
|
|
# from unittest.mock import patch
|
|
#
|
|
# import openai
|
|
#
|
|
# from letta.schemas.message import MessageCreate
|
|
# from letta.schemas.run import Run
|
|
# from letta.services.run_manager import RunManager
|
|
#
|
|
# # Use OpenAI config for this test (since we're using OpenAI-specific error handling)
|
|
# llm_config = get_llm_config("openai-gpt-4o-mini.json")
|
|
#
|
|
# # Create test messages
|
|
# messages = []
|
|
# for i in range(15):
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text=f"User message {i}: This is test message number {i}.")],
|
|
# )
|
|
# )
|
|
# messages.append(
|
|
# PydanticMessage(
|
|
# role=MessageRole.assistant,
|
|
# content=[TextContent(type="text", text=f"Assistant response {i}: I acknowledge message {i}.")],
|
|
# )
|
|
# )
|
|
#
|
|
# agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
# original_message_count = len(agent_state.message_ids)
|
|
#
|
|
# # Create an input message to trigger the agent
|
|
# input_message = MessageCreate(
|
|
# role=MessageRole.user,
|
|
# content=[TextContent(type="text", text="Hello, please respond.")],
|
|
# )
|
|
#
|
|
# # Create a proper run record in the database
|
|
# run_manager = RunManager()
|
|
# test_run_id = f"run-{uuid.uuid4()}"
|
|
# test_run = Run(
|
|
# id=test_run_id,
|
|
# agent_id=agent_state.id,
|
|
# )
|
|
# await run_manager.create_run(test_run, actor)
|
|
#
|
|
# # Create the agent loop using LettaAgentV3
|
|
# agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
|
|
#
|
|
# # Track how many times request_async is called
|
|
# call_count = 0
|
|
#
|
|
# # Store original request_async method
|
|
# original_request_async = agent_loop.llm_client.request_async
|
|
#
|
|
# async def mock_request_async_with_error(request_data, llm_config):
|
|
# nonlocal call_count
|
|
# call_count += 1
|
|
# if call_count == 1:
|
|
# # First call raises OpenAI BadRequestError with context_length_exceeded error code
|
|
# # This will be properly converted to ContextWindowExceededError by handle_llm_error
|
|
# import httpx
|
|
#
|
|
# # Create a mock response with the required structure
|
|
# mock_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
|
|
# mock_response = httpx.Response(
|
|
# status_code=400,
|
|
# request=mock_request,
|
|
# json={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
#
|
|
# raise openai.BadRequestError(
|
|
# message="This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# response=mock_response,
|
|
# body={
|
|
# "error": {
|
|
# "message": "This model's maximum context length is 8000 tokens. However, your messages resulted in 12000 tokens.",
|
|
# "type": "invalid_request_error",
|
|
# "code": "context_length_exceeded",
|
|
# }
|
|
# },
|
|
# )
|
|
# # Subsequent calls use the real implementation
|
|
# return await original_request_async(request_data, llm_config)
|
|
#
|
|
# # Patch the llm_client's request_async to raise ContextWindowExceededError on first call
|
|
# with patch.object(agent_loop.llm_client, "request_async", side_effect=mock_request_async_with_error):
|
|
# # Execute a blocking step
|
|
# try:
|
|
# result = await agent_loop.step(
|
|
# input_messages=[input_message],
|
|
# max_steps=1,
|
|
# run_id=test_run_id,
|
|
# )
|
|
# except Exception as e:
|
|
# # Some errors might happen due to real LLM calls after retry
|
|
# print(f"Exception during step: {e}")
|
|
#
|
|
# # Reload agent state to get updated message_ids after summarization
|
|
# updated_agent_state = await server.agent_manager.get_agent_by_id_async(agent_id=agent_state.id, actor=actor)
|
|
# updated_message_count = len(updated_agent_state.message_ids)
|
|
#
|
|
# # Fetch the updated in-context messages
|
|
# updated_in_context_messages = await server.message_manager.get_messages_by_ids_async(
|
|
# message_ids=updated_agent_state.message_ids, actor=actor
|
|
# )
|
|
#
|
|
# # Convert to LettaMessage format for easier content inspection
|
|
# letta_messages = PydanticMessage.to_letta_messages_from_list(updated_in_context_messages)
|
|
#
|
|
# # Verify a summary message exists with the correct format
|
|
# # The summary message has content with type="system_alert" and message containing:
|
|
# # "prior messages ... have been hidden" and "summary of the previous"
|
|
# import json
|
|
#
|
|
# summary_message_found = False
|
|
# summary_message_text = None
|
|
# for msg in letta_messages:
|
|
# # Not all message types have a content attribute (e.g., ReasoningMessage)
|
|
# if not hasattr(msg, "content"):
|
|
# continue
|
|
#
|
|
# content = msg.content
|
|
# # Content can be a string (JSON) or an object with type/message fields
|
|
# if isinstance(content, str):
|
|
# # Try to parse as JSON
|
|
# try:
|
|
# parsed = json.loads(content)
|
|
# if isinstance(parsed, dict) and parsed.get("type") == "system_alert":
|
|
# text_to_check = parsed.get("message", "").lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = parsed.get("message")
|
|
# break
|
|
# except (json.JSONDecodeError, TypeError):
|
|
# pass
|
|
# # Check if content has system_alert type with the summary message (object form)
|
|
# elif hasattr(content, "type") and content.type == "system_alert":
|
|
# if hasattr(content, "message") and content.message:
|
|
# text_to_check = content.message.lower()
|
|
# if "prior messages" in text_to_check and "hidden" in text_to_check and "summary of the previous" in text_to_check:
|
|
# summary_message_found = True
|
|
# summary_message_text = content.message
|
|
# break
|
|
#
|
|
# assert summary_message_found, (
|
|
# "A summary message should exist in the in-context messages after summarization. "
|
|
# "Expected format containing 'prior messages...hidden' and 'summary of the previous'"
|
|
# )
|
|
#
|
|
# # Verify we attempted multiple invocations (the failing one + retry after summarization)
|
|
# assert call_count >= 2, f"Expected at least 2 LLM invocations (initial + retry), got {call_count}"
|
|
#
|
|
# # The original messages should have been compacted - the updated count should be less than
|
|
# # original + the new messages added (input + assistant response + tool results)
|
|
# print("Test passed: Summary message found in context (blocking mode)")
|
|
# print(f"Original message count: {original_message_count}, Updated: {updated_message_count}")
|
|
# print(f"Summary message: {summary_message_text[:200] if summary_message_text else 'N/A'}...")
|
|
# print(f"Total LLM invocations: {call_count}")
|
|
#
|
|
#
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "llm_config",
    TESTED_LLM_CONFIGS,
    ids=[c.model for c in TESTED_LLM_CONFIGS],
)
async def test_summarize_all(server: SyncServer, actor, llm_config: LLMConfig):
    """
    Exercise summarize_all end-to-end with real LLM calls.

    Verifies that the 'all' summarization mode collapses the entire
    conversation into a single summary string, leaving exactly one
    in-context message behind.
    """
    from letta.services.summarizer.summarizer_all import summarize_all
    from letta.services.summarizer.summarizer_config import CompactionSettings

    # Build a summarizer config targeting the "all" mode.
    handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
    summarizer_config = CompactionSettings(model=handle)
    summarizer_config.mode = "all"

    # Seed a simple conversation: one system prompt plus 10 user/assistant pairs.
    messages = [
        PydanticMessage(
            role=MessageRole.system,
            content=[TextContent(type="text", text="You are a helpful assistant.")],
        )
    ]
    for idx in range(10):
        messages.extend(
            [
                PydanticMessage(
                    role=MessageRole.user,
                    content=[TextContent(type="text", text=f"User message {idx}: What is {idx} + {idx}?")],
                ),
                PydanticMessage(
                    role=MessageRole.assistant,
                    content=[TextContent(type="text", text=f"Assistant response {idx}: {idx} + {idx} = {idx * 2}.")],
                ),
            ]
        )

    assert len(messages) == 21, f"Expected 21 messages, got {len(messages)}"

    # Run the summarizer against the real model.
    summary, new_in_context_messages = await summarize_all(
        actor=actor,
        llm_config=llm_config,
        summarizer_config=summarizer_config,
        in_context_messages=messages,
    )

    # Exactly one message remains, and the summary text is non-empty and bounded.
    assert len(new_in_context_messages) == 1
    assert summary is not None
    assert len(summary) > 0
    assert len(summary) <= 2000

    print(f"Successfully summarized {len(messages)} messages using 'all' mode")
    if len(summary) > 200:
        print(f"Summary: {summary[:200]}...")
    else:
        print(f"Summary: {summary}")
    print(f"Using {llm_config.model_endpoint_type} for model {llm_config.model}")
|
|
|
|
|
|
# =============================================================================
|
|
# CompactionStats tests
|
|
# =============================================================================
|
|
|
|
|
|
def test_compaction_stats_embedding_in_packed_json():
    """Verify package_summarize_message_no_counts embeds compaction_stats in the packed JSON."""
    from letta.system import package_summarize_message_no_counts

    stats = {
        "trigger": "post_step_context_check",
        "context_tokens_before": 50000,
        "context_tokens_after": 15000,
        "context_window": 128000,
        "messages_count_before": 45,
        "messages_count_after": 12,
    }

    packed = package_summarize_message_no_counts(
        summary="Test summary content",
        timezone="UTC",
        compaction_stats=stats,
    )

    # Decode the packed string back into a dict.
    payload = json.loads(packed)

    # Envelope structure checks.
    assert "type" in payload
    assert payload["type"] == "system_alert"
    assert "message" in payload
    assert "Test summary content" in payload["message"]
    assert "compaction_stats" in payload

    # Every stat we supplied should round-trip unchanged.
    embedded = payload["compaction_stats"]
    for key, expected in stats.items():
        assert embedded[key] == expected
|
|
|
|
|
|
def test_compaction_stats_embedding_without_stats():
    """Packed JSON omits the compaction_stats key entirely when none are supplied."""
    from letta.system import package_summarize_message_no_counts

    packed = package_summarize_message_no_counts(
        summary="Test summary content",
        timezone="UTC",
        compaction_stats=None,
    )

    payload = json.loads(packed)

    # Envelope is intact, but no stats key was injected.
    assert "type" in payload
    assert "message" in payload
    assert "compaction_stats" not in payload
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json():
    """CompactionStats can be recovered from a packed JSON string."""
    from letta.schemas.letta_message import CompactionStats, extract_compaction_stats_from_packed_json

    raw_stats = {
        "trigger": "context_window_exceeded",
        "context_tokens_before": 100000,
        "context_tokens_after": 30000,
        "context_window": 128000,
        "messages_count_before": 50,
        "messages_count_after": 15,
    }
    packed_json = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "time": "2024-01-15T10:00:00",
            "compaction_stats": raw_stats,
        }
    )

    stats = extract_compaction_stats_from_packed_json(packed_json)

    assert stats is not None
    assert isinstance(stats, CompactionStats)
    # Each embedded field should surface as the matching attribute.
    for field_name, expected in raw_stats.items():
        assert getattr(stats, field_name) == expected
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json_without_stats():
    """Extraction yields None for legacy payloads lacking compaction_stats (backward compatibility)."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    # Old-style payload: no compaction_stats key at all.
    legacy_payload = {
        "type": "system_alert",
        "message": "Test summary",
        "time": "2024-01-15T10:00:00",
    }

    stats = extract_compaction_stats_from_packed_json(json.dumps(legacy_payload))

    assert stats is None
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json_invalid_json():
    """Extraction tolerates malformed or empty input strings, returning None."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    for bad_input in ("not valid json", ""):
        assert extract_compaction_stats_from_packed_json(bad_input) is None
|
|
|
|
|
|
def test_extract_compaction_stats_from_packed_json_invalid_stats():
    """Extraction returns None (not an exception) when embedded stats fail validation."""
    from letta.schemas.letta_message import extract_compaction_stats_from_packed_json

    # Stats block missing required fields:
    # context_window, messages_count_before, messages_count_after.
    packed_json = json.dumps(
        {
            "type": "system_alert",
            "message": "Test summary",
            "compaction_stats": {
                "trigger": "test",
            },
        }
    )

    stats = extract_compaction_stats_from_packed_json(packed_json)

    # Validation failure should surface as None.
    assert stats is None
|
|
|
|
|
|
def test_extract_compaction_stats_from_message():
    """CompactionStats can be recovered from a Message object's packed content."""
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message
    from letta.schemas.letta_message import CompactionStats

    payload = {
        "type": "system_alert",
        "message": "Test summary",
        "time": "2024-01-15T10:00:00",
        "compaction_stats": {
            "trigger": "post_step_context_check",
            "context_tokens_before": 50000,
            "context_tokens_after": 15000,
            "context_window": 128000,
            "messages_count_before": 45,
            "messages_count_after": 12,
        },
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(payload))],
    )

    stats = extract_compaction_stats_from_message(message)

    assert stats is not None
    assert isinstance(stats, CompactionStats)
    # Spot-check representative fields from the embedded stats.
    assert stats.trigger == "post_step_context_check"
    assert stats.context_tokens_before == 50000
    assert stats.messages_count_after == 12
|
|
|
|
|
|
def test_extract_compaction_stats_from_message_without_stats():
    """Message extraction yields None for legacy content carrying no stats."""
    from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message

    legacy_payload = {
        "type": "system_alert",
        "message": "Old format summary",
        "time": "2024-01-15T10:00:00",
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(legacy_payload))],
    )

    assert extract_compaction_stats_from_message(message) is None
|
|
|
|
|
|
def test_message_to_summary_message_with_stats():
    """Message._convert_summary_message surfaces embedded compaction_stats."""
    from letta.schemas.letta_message import CompactionStats

    payload = {
        "type": "system_alert",
        "message": "Summary of conversation",
        "time": "2024-01-15T10:00:00",
        "compaction_stats": {
            "trigger": "context_window_exceeded",
            "context_tokens_before": 80000,
            "context_tokens_after": 25000,
            "context_window": 128000,
            "messages_count_before": 60,
            "messages_count_after": 20,
        },
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(payload))],
    )

    # Convert to SummaryMessage (as_user_message=False).
    summary_msg = message._convert_summary_message(as_user_message=False)

    assert summary_msg.message_type == "summary_message"
    stats = summary_msg.compaction_stats
    assert stats is not None
    assert isinstance(stats, CompactionStats)
    assert stats.trigger == "context_window_exceeded"
    assert stats.context_tokens_before == 80000
|
|
|
|
|
|
def test_message_to_summary_message_backward_compatible():
    """Legacy summary messages without compaction_stats still convert cleanly."""
    legacy_payload = {
        "type": "system_alert",
        "message": "Old format summary without stats",
        "time": "2024-01-15T10:00:00",
    }
    message = PydanticMessage(
        role=MessageRole.summary,
        content=[TextContent(type="text", text=json.dumps(legacy_payload))],
    )

    summary_msg = message._convert_summary_message(as_user_message=False)

    assert summary_msg.message_type == "summary_message"
    # Old messages carry no stats at all.
    assert summary_msg.compaction_stats is None
    assert "Old format summary" in summary_msg.summary
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.parametrize(
|
|
"llm_config",
|
|
TESTED_LLM_CONFIGS,
|
|
ids=[c.model for c in TESTED_LLM_CONFIGS],
|
|
)
|
|
async def test_compact_with_stats_params_embeds_stats(server: SyncServer, actor, llm_config: LLMConfig):
|
|
"""
|
|
Integration test: compact() with trigger/context_tokens_before/messages_count_before
|
|
embeds compaction_stats in the packed message content.
|
|
"""
|
|
from letta.agents.letta_agent_v3 import extract_compaction_stats_from_message
|
|
|
|
# Create a conversation with enough messages to summarize
|
|
messages = [
|
|
PydanticMessage(
|
|
role=MessageRole.system,
|
|
content=[TextContent(type="text", text="You are a helpful assistant.")],
|
|
)
|
|
]
|
|
for i in range(10):
|
|
messages.append(
|
|
PydanticMessage(
|
|
role=MessageRole.user,
|
|
content=[TextContent(type="text", text=f"User message {i}")],
|
|
)
|
|
)
|
|
messages.append(
|
|
PydanticMessage(
|
|
role=MessageRole.assistant,
|
|
content=[TextContent(type="text", text=f"Response {i}")],
|
|
)
|
|
)
|
|
|
|
agent_state, in_context_messages = await create_agent_with_messages(server, actor, llm_config, messages)
|
|
|
|
handle = llm_config.handle or f"{llm_config.model_endpoint_type}/{llm_config.model}"
|
|
agent_state.compaction_settings = CompactionSettings(model=handle, mode="all")
|
|
|
|
agent_loop = LettaAgentV3(agent_state=agent_state, actor=actor)
|
|
|
|
# Call compact with stats params
|
|
summary_message_obj, compacted_messages, summary_text = await agent_loop.compact(
|
|
messages=in_context_messages,
|
|
use_summary_role=True,
|
|
trigger="post_step_context_check",
|
|
context_tokens_before=50000,
|
|
messages_count_before=len(in_context_messages),
|
|
)
|
|
|
|
# Extract stats from the message
|
|
stats = extract_compaction_stats_from_message(summary_message_obj)
|
|
|
|
assert stats is not None, "CompactionStats should be embedded in the message"
|
|
assert stats.trigger == "post_step_context_check"
|
|
assert stats.context_tokens_before == 50000
|
|
assert stats.messages_count_before == len(in_context_messages)
|
|
assert stats.context_tokens_after is not None # Should be set by compact()
|
|
assert stats.messages_count_after == len(compacted_messages) # final_messages already includes summary
|
|
assert stats.context_window == llm_config.context_window
|