chore: bump 0.16.6 (#3211)

cthomas authored 2026-03-03 19:13:07 -08:00, committed by GitHub
84 changed files with 2540 additions and 407 deletions

View File

@@ -260,6 +260,7 @@ model:
base_url: https://generativelanguage.googleapis.com/
force_minimum_thinking_budget: false
max_retries: 5
+timeout_seconds: 600.0
# Google Vertex (-> GOOGLE_CLOUD_*)
# google_cloud:

File diff suppressed because it is too large

View File

@@ -0,0 +1,220 @@
import * as fs from 'fs';
import * as path from 'path';
import { omit } from 'lodash';
import { execSync } from 'child_process';
import { merge, isErrorResult } from 'openapi-merge';
import type { Swagger } from 'atlassian-openapi';
import { RESTRICTED_ROUTE_BASE_PATHS } from '@letta-cloud/sdk-core';
const lettaWebOpenAPIPath = path.join(
__dirname,
'..',
'..',
'..',
'web',
'autogenerated',
'letta-web-openapi.json',
);
const lettaAgentsAPIPath = path.join(
__dirname,
'..',
'..',
'letta',
'server',
'openapi_letta.json',
);
const lettaWebOpenAPI = JSON.parse(
fs.readFileSync(lettaWebOpenAPIPath, 'utf8'),
) as Swagger.SwaggerV3;
const lettaAgentsAPI = JSON.parse(
fs.readFileSync(lettaAgentsAPIPath, 'utf8'),
) as Swagger.SwaggerV3;
// removes any routes that are restricted
lettaAgentsAPI.paths = Object.fromEntries(
Object.entries(lettaAgentsAPI.paths).filter(([path]) =>
RESTRICTED_ROUTE_BASE_PATHS.every(
(restrictedPath) => !path.startsWith(restrictedPath),
),
),
);
const lettaAgentsAPIWithNoEndslash = Object.keys(lettaAgentsAPI.paths).reduce(
(acc, path) => {
const pathWithoutSlash = path.endsWith('/')
? path.slice(0, path.length - 1)
: path;
acc[pathWithoutSlash] = lettaAgentsAPI.paths[path];
return acc;
},
{} as Swagger.SwaggerV3['paths'],
);
// remove duplicate paths, delete from letta-web-openapi if it exists in sdk-core
// some paths will have an extra / at the end, so we need to remove that as well
lettaWebOpenAPI.paths = Object.fromEntries(
Object.entries(lettaWebOpenAPI.paths).filter(([path]) => {
const pathWithoutSlash = path.endsWith('/')
? path.slice(0, path.length - 1)
: path;
return !lettaAgentsAPIWithNoEndslash[pathWithoutSlash];
}),
);
const agentStatePathsToOverride: Array<[string, string]> = [
['/v1/templates/{project}/{template_version}/agents', '201'],
['/v1/agents/search', '200'],
];
for (const [path, responseCode] of agentStatePathsToOverride) {
if (lettaWebOpenAPI.paths[path]?.post?.responses?.[responseCode]) {
// Get direct reference to the schema object
const responseSchema =
lettaWebOpenAPI.paths[path].post.responses[responseCode];
const contentSchema = responseSchema.content['application/json'].schema;
// Replace the entire agents array schema with the reference
if (contentSchema.properties?.agents) {
contentSchema.properties.agents = {
type: 'array',
items: {
$ref: '#/components/schemas/AgentState',
},
};
}
}
}
// go through the paths and strip internal headers (user_id, User-Agent, X-Project-Id, X-Letta-Source, X-Stainless-Package-Version, X-Experimental-*, X-Billing-*) from the parameters
for (const path of Object.keys(lettaAgentsAPI.paths)) {
for (const method of Object.keys(lettaAgentsAPI.paths[path])) {
// @ts-expect-error - path item values are not typed as operation objects here
if (lettaAgentsAPI.paths[path][method]?.parameters) {
// @ts-expect-error - same loose typing as above
lettaAgentsAPI.paths[path][method].parameters = lettaAgentsAPI.paths[
path
][method].parameters.filter(
(param: Record<string, string>) =>
param.in !== 'header' ||
(
param.name !== 'user_id' &&
param.name !== 'User-Agent' &&
param.name !== 'X-Project-Id' &&
param.name !== 'X-Letta-Source' &&
param.name !== 'X-Stainless-Package-Version' &&
!param.name.startsWith('X-Experimental') &&
!param.name.startsWith('X-Billing')
),
);
}
}
}
const result = merge([
{
oas: lettaAgentsAPI,
},
{
oas: lettaWebOpenAPI,
},
]);
if (isErrorResult(result)) {
console.error(`${result.message} (${result.type})`);
process.exit(1);
}
result.output.openapi = '3.1.0';
result.output.info = {
title: 'Letta API',
version: '1.0.0',
};
result.output.servers = [
{
url: 'https://app.letta.com',
description: 'Letta Cloud',
},
{
url: 'http://localhost:8283',
description: 'Self-hosted',
},
];
result.output.components = {
...result.output.components,
securitySchemes: {
bearerAuth: {
type: 'http',
scheme: 'bearer',
},
},
};
result.output.security = [
...(result.output.security || []),
{
bearerAuth: [],
},
];
// omit all instances of "user_id" from the openapi.json file
function deepOmitPreserveArrays(obj: unknown, key: string): unknown {
if (Array.isArray(obj)) {
return obj.map((item) => deepOmitPreserveArrays(item, key));
}
if (typeof obj !== 'object' || obj === null) {
return obj;
}
if (key in obj) {
return omit(obj, key);
}
return Object.fromEntries(
Object.entries(obj).map(([k, v]) => [k, deepOmitPreserveArrays(v, key)]),
);
}
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'user_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'actor_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'organization_id',
);
fs.writeFileSync(
path.join(__dirname, '..', 'openapi.json'),
JSON.stringify(result.output, null, 2),
);
function formatOpenAPIJson() {
const openApiPath = path.join(__dirname, '..', 'openapi.json');
try {
execSync(`npx prettier --write "${openApiPath}"`, { stdio: 'inherit' });
console.log('Successfully formatted openapi.json with Prettier');
} catch (error) {
console.error('Error formatting openapi.json:', error);
process.exit(1);
}
}
formatOpenAPIJson();

View File

@@ -5,7 +5,7 @@ try:
__version__ = version("letta")
except PackageNotFoundError:
# Fallback for development installations
-__version__ = "0.16.5"
+__version__ = "0.16.6"
if os.environ.get("LETTA_VERSION"):
__version__ = os.environ["LETTA_VERSION"]

View File

@@ -7,6 +7,7 @@ from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.telemetry_manager import TelemetryManager
@@ -31,6 +32,7 @@ class LettaLLMAdapter(ABC):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
+billing_context: BillingContext | None = None,
) -> None:
self.llm_client: LLMClientBase = llm_client
self.llm_config: LLMConfig = llm_config
@@ -40,6 +42,7 @@ class LettaLLMAdapter(ABC):
self.run_id: str | None = run_id
self.org_id: str | None = org_id
self.user_id: str | None = user_id
+self.billing_context: BillingContext | None = billing_context
self.message_id: str | None = None
self.request_data: dict | None = None
self.response_data: dict | None = None

View File

@@ -10,7 +10,7 @@ from letta.otel.tracing import log_attributes, safe_json_dumps, trace_method
from letta.schemas.enums import LLMCallType, ProviderType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.llm_config import LLMConfig
-from letta.schemas.provider_trace import ProviderTrace
+from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.user import User
from letta.settings import settings
from letta.utils import safe_create_task
@@ -36,6 +36,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
+billing_context: "BillingContext | None" = None,
) -> None:
super().__init__(
llm_client,
@@ -46,6 +47,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id=run_id,
org_id=org_id,
user_id=user_id,
+billing_context=billing_context,
)
self.interface: OpenAIStreamingInterface | AnthropicStreamingInterface | None = None

View File

@@ -51,6 +51,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
+billing_context=self.billing_context,
)
try:
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)

View File

@@ -278,6 +278,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
+billing_context=self.billing_context,
),
),
label="create_provider_trace",

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, MessageUpdate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -51,7 +52,11 @@ class BaseAgent(ABC):
@abstractmethod
async def step(
-self, input_messages: List[MessageCreate], max_steps: int = DEFAULT_MAX_STEPS, run_id: Optional[str] = None
+self,
+input_messages: List[MessageCreate],
+max_steps: int = DEFAULT_MAX_STEPS,
+run_id: Optional[str] = None,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Main execution loop for the agent.

View File

@@ -12,6 +12,7 @@ from letta.schemas.user import User
if TYPE_CHECKING:
from letta.schemas.letta_request import ClientToolSchema
+from letta.schemas.provider_trace import BillingContext
class BaseAgentV2(ABC):
@@ -52,6 +53,7 @@ class BaseAgentV2(ABC):
request_start_timestamp_ns: int | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -76,6 +78,7 @@ class BaseAgentV2(ABC):
conversation_id: str | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[LettaMessage | LegacyLettaMessage | MessageStreamStatus, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -192,44 +192,15 @@ async def _prepare_in_context_messages_no_persist_async(
# Otherwise, include the full list of messages from the conversation
current_in_context_messages = await message_manager.get_messages_by_ids_async(message_ids=message_ids, actor=actor)
else:
-# No messages in conversation yet - compile a new system message for this conversation
-# Each conversation gets its own system message (captures memory state at conversation start)
-from letta.prompts.prompt_generator import PromptGenerator
-from letta.services.passage_manager import PassageManager
-num_messages = await message_manager.size_async(actor=actor, agent_id=agent_state.id)
-passage_manager = PassageManager()
-num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_state.id)
-system_message_str = await PromptGenerator.compile_system_message_async(
-system_prompt=agent_state.system,
-in_context_memory=agent_state.memory,
-in_context_memory_last_edit=get_utc_time(),
-timezone=agent_state.timezone,
-user_defined_variables=None,
-append_icm_if_missing=True,
-previous_message_count=num_messages,
-archival_memory_size=num_archival_memories,
-sources=agent_state.sources,
-max_files_open=agent_state.max_files_open,
-)
-system_message = Message.dict_to_message(
-agent_id=agent_state.id,
-model=agent_state.llm_config.model,
-openai_message_dict={"role": "system", "content": system_message_str},
-)
-# Persist the new system message
-persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
-system_message = persisted_messages[0]
-# Add it to the conversation tracking
-await conversation_manager.add_messages_to_conversation(
-conversation_id=conversation_id,
-agent_id=agent_state.id,
-message_ids=[system_message.id],
-actor=actor,
-starting_position=0,
-)
+# No messages in conversation yet (fallback) - compile a new system message
+# Normally this is handled at conversation creation time, but this covers
+# edge cases where a conversation exists without a system message.
+system_message = await conversation_manager.compile_and_save_system_message_for_conversation(
+conversation_id=conversation_id,
+agent_id=agent_state.id,
+actor=actor,
+agent_state=agent_state,
+message_manager=message_manager,
+)
current_in_context_messages = [system_message]

View File

@@ -48,6 +48,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -179,6 +180,7 @@ class LettaAgent(BaseAgent):
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
dry_run: bool = False,
+billing_context: "BillingContext | None" = None,
) -> Union[LettaResponse, dict]:
# TODO (cliandy): pass in run_id and use at send_message endpoints for all step functions
agent_state = await self.agent_manager.get_agent_by_id_async(

View File

@@ -44,6 +44,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import Step, StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool import Tool
@@ -185,6 +186,7 @@ class LettaAgentV2(BaseAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -290,6 +292,7 @@ class LettaAgentV2(BaseAgentV2):
conversation_id: str | None = None, # Not used in V2, but accepted for API compatibility
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -21,7 +21,7 @@ from letta.agents.helpers import (
)
from letta.agents.letta_agent_v2 import LettaAgentV2
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
-from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
+from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -45,6 +45,7 @@ from letta.schemas.letta_response import LettaResponse, TurnTokenData
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, ToolReturn
from letta.schemas.openai.chat_completion_response import ChoiceLogprobs, ToolCall, ToolCallDenial, UsageStatistics
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -149,6 +150,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -232,6 +234,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
credit_task = None
@@ -362,6 +365,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.
@@ -419,6 +423,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
elif use_sglang_native:
# Use SGLang native adapter for multi-turn RL training
@@ -431,6 +436,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
# Reset turns tracking for this step
self.turns = []
@@ -444,6 +450,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
try:
@@ -764,7 +771,12 @@ class LettaAgentV3(LettaAgentV2):
]
else:
# Old behavior: UserMessage with packed JSON
-return list(Message.to_letta_messages(summary_message))
+messages = list(Message.to_letta_messages(summary_message))
+# Set otid on returned messages (summary Message doesn't have otid set at creation)
+for i, msg in enumerate(messages):
+if not msg.otid:
+msg.otid = Message.generate_otid_from_id(summary_message.id, i)
+return messages
@trace_method
async def _step(
@@ -990,6 +1002,9 @@ class LettaAgentV3(LettaAgentV2):
except ValueError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
raise e
+except LLMEmptyResponseError as e:
+self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
+raise e
except LLMError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
raise e

View File

@@ -134,7 +134,7 @@ def _flatten_model_settings(d: dict, env_vars: dict[str, str]) -> None:
api_base: yyy -> OPENAI_API_BASE
anthropic:
api_key: zzz -> ANTHROPIC_API_KEY
-global_max_context_window_limit: 32000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
+global_max_context_window_limit: 128000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
"""
for key, value in d.items():
if isinstance(value, dict):

View File

@@ -74,7 +74,7 @@ DEFAULT_MAX_STEPS = 50
# context window size
MIN_CONTEXT_WINDOW = 4096
-DEFAULT_CONTEXT_WINDOW = 32000
+DEFAULT_CONTEXT_WINDOW = 128000
# Summarization trigger threshold (multiplier of context_window limit)
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
@@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = {
"deepseek-reasoner": 64000, "deepseek-reasoner": 64000,
# glm (Z.AI) # glm (Z.AI)
"glm-4.5": 128000, "glm-4.5": 128000,
"glm-4.6": 200000, "glm-4.6": 180000,
"glm-4.7": 200000, "glm-4.7": 180000,
"glm-5": 200000, "glm-5": 180000,
"glm-5-code": 200000, "glm-5-code": 180000,
## OpenAI models: https://platform.openai.com/docs/models/overview ## OpenAI models: https://platform.openai.com/docs/models/overview
# gpt-5 # gpt-5
"gpt-5": 272000, "gpt-5": 272000,
@@ -278,6 +278,8 @@ LLM_MAX_CONTEXT_WINDOW = {
"gpt-5.2-pro": 272000, "gpt-5.2-pro": 272000,
"gpt-5.2-pro-2025-12-11": 272000, "gpt-5.2-pro-2025-12-11": 272000,
"gpt-5.2-codex": 272000, "gpt-5.2-codex": 272000,
# gpt-5.3
"gpt-5.3-codex": 272000,
# reasoners # reasoners
"o1": 200000, "o1": 200000,
# "o1-pro": 200000, # responses API only # "o1-pro": 200000, # responses API only
@@ -419,7 +421,7 @@ MAX_ERROR_MESSAGE_CHAR_LIMIT = 1000
# Default memory limits
CORE_MEMORY_PERSONA_CHAR_LIMIT: int = 20000
CORE_MEMORY_HUMAN_CHAR_LIMIT: int = 20000
-CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 20000
+CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 100000
# Function return limits
FUNCTION_RETURN_CHAR_LIMIT = 50000 # ~300 words

View File

@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
while processing the request."""
+class LLMEmptyResponseError(LLMServerError):
+"""Error when LLM returns an empty response (no content and no tool calls).
+This is a subclass of LLMServerError to maintain retry behavior, but allows
+specific handling for empty response cases which may benefit from request
+modification before retry.
+"""
class LLMTimeoutError(LLMError):
"""Error when LLM request times out"""

View File

@@ -13,6 +13,7 @@ from letta.schemas.letta_message import MessageType
from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.message import Message, MessageCreate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -69,6 +70,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
use_assistant_message: bool = True,
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
run_ids = []
@@ -100,6 +102,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
run_id=run_id,
use_assistant_message=use_assistant_message,
include_return_message_types=include_return_message_types,
+billing_context=billing_context,
)
# Get last response messages

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -62,6 +64,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
)
await self.run_sleeptime_agents()
@@ -81,6 +84,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
include_return_message_types: list[MessageType] | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -99,6 +103,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -14,6 +14,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -63,6 +65,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
)
run_ids = await self.run_sleeptime_agents()
@@ -82,6 +85,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -101,6 +105,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -30,6 +30,7 @@ from anthropic.types.beta import (
)
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
+from letta.errors import LLMEmptyResponseError
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.log import get_logger
from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
self.inner_thoughts_complete = False
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
+# Track whether any content was produced (text or tool calls)
+# Used to detect empty responses from models like Opus 4.6
+self.has_content = False
# Buffer to handle partial XML tags across chunks
self.partial_tag_buffer = ""
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
+self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
+self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
# message_delta event are *cumulative*." So we assign, not accumulate.
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
-# Don't do anything here! We don't want to stop the stream.
-pass
+# Check if any content was produced during the stream
+# Empty responses (no text and no tool calls) should raise an error
+if not self.has_content:
+raise LLMEmptyResponseError(
+message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+)
elif isinstance(event, BetaRawContentBlockStopEvent):
# If we're exiting a tool use block and there are still buffered messages,
# we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
+self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
+self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
-# Don't do anything here! We don't want to stop the stream.
-pass
+# Check if any content was produced during the stream
+# Empty responses (no text and no tool calls) should raise an error
+if not self.has_content:
+raise LLMEmptyResponseError(
+message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+)
elif isinstance(event, BetaRawContentBlockStopEvent):
self.anthropic_mode = None

View File

@@ -19,6 +19,8 @@ from letta.errors import (
LLMAuthenticationError,
LLMBadRequestError,
LLMConnectionError,
+LLMEmptyResponseError,
+LLMError,
LLMInsufficientCreditsError,
LLMNotFoundError,
LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
@trace_method
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+# Pass through errors that are already LLMError instances unchanged
+# This preserves specific error types like LLMEmptyResponseError
+if isinstance(e, LLMError):
+return e
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
response.stop_reason,
json.dumps(response_data),
)
-raise LLMServerError(
+raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={

View File

@@ -9,7 +9,7 @@ from letta.llm_api.google_constants import GOOGLE_MODEL_FOR_API_KEY_CHECK
from letta.llm_api.google_vertex_client import GoogleVertexClient
from letta.log import get_logger
from letta.schemas.llm_config import LLMConfig
-from letta.settings import model_settings, settings
+from letta.settings import model_settings
logger = get_logger(__name__)
@@ -18,7 +18,7 @@ class GoogleAIClient(GoogleVertexClient):
provider_label = "Google AI"
def _get_client(self, llm_config: Optional[LLMConfig] = None):
-timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
+timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = self.get_byok_overrides(llm_config)
@@ -30,7 +30,7 @@ class GoogleAIClient(GoogleVertexClient):
)
async def _get_client_async(self, llm_config: Optional[LLMConfig] = None):
-timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
+timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = await self.get_byok_overrides_async(llm_config)

View File

@@ -14,7 +14,7 @@ from letta.schemas.enums import AgentType, LLMCallType, ProviderCategory
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
-from letta.schemas.provider_trace import ProviderTrace
+from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.usage import LettaUsageStatistics
from letta.services.telemetry_manager import TelemetryManager
from letta.settings import settings
@@ -48,6 +48,7 @@ class LLMClientBase:
self._telemetry_user_id: Optional[str] = None
self._telemetry_compaction_settings: Optional[Dict] = None
self._telemetry_llm_config: Optional[Dict] = None
+self._telemetry_billing_context: Optional[BillingContext] = None
def set_telemetry_context(
self,
@@ -62,6 +63,7 @@ class LLMClientBase:
compaction_settings: Optional[Dict] = None,
llm_config: Optional[Dict] = None,
actor: Optional["User"] = None,
+billing_context: Optional[BillingContext] = None,
) -> None:
"""Set telemetry context for provider trace logging."""
if actor is not None:
@@ -76,6 +78,7 @@ class LLMClientBase:
self._telemetry_user_id = user_id
self._telemetry_compaction_settings = compaction_settings
self._telemetry_llm_config = llm_config
+self._telemetry_billing_context = billing_context
def extract_usage_statistics(self, response_data: Optional[dict], llm_config: LLMConfig) -> LettaUsageStatistics:
"""Provider-specific usage parsing hook (override in subclasses). Returns LettaUsageStatistics."""
@@ -125,6 +128,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
+billing_context=self._telemetry_billing_context,
),
)
except Exception as e:
@@ -186,6 +190,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
+billing_context=self._telemetry_billing_context,
),
)
except Exception as e:

View File

@@ -88,7 +88,7 @@ def supports_none_reasoning_effort(model: str) -> bool:
Currently, GPT-5.1 and GPT-5.2 models support the 'none' reasoning effort level.
"""
-return model.startswith("gpt-5.1") or model.startswith("gpt-5.2")
+return model.startswith("gpt-5.1") or model.startswith("gpt-5.2") or model.startswith("gpt-5.3")
def is_openai_5_model(model: str) -> bool:
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
input=openai_messages_list,
tools=responses_tools,
tool_choice=tool_choice,
-max_output_tokens=llm_config.max_tokens,
temperature=llm_config.temperature if supports_temperature_param(model) else None,
parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
)
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
# Handle text configuration (verbosity and response format)
text_config_kwargs = {}
+# Only set max_output_tokens if explicitly configured
+if llm_config.max_tokens is not None:
+data.max_output_tokens = llm_config.max_tokens
# Add verbosity control for GPT-5 models
if supports_verbosity_control(model) and llm_config.verbosity:
text_config_kwargs["verbosity"] = llm_config.verbosity
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
)
request_data = data.model_dump(exclude_unset=True)
-# print("responses request data", request_data)
return request_data
@trace_method
@@ -639,6 +641,14 @@ class OpenAIClient(LLMClientBase):
tool.function.strict = False
request_data = data.model_dump(exclude_unset=True)
+# Fireworks uses strict validation (additionalProperties: false) and rejects
+# reasoning fields that are not in their schema.
+is_fireworks = llm_config.model_endpoint and "fireworks.ai" in llm_config.model_endpoint
+if is_fireworks and "messages" in request_data:
+for message in request_data["messages"]:
+for field in ("reasoning_content_signature", "redacted_reasoning_content", "omitted_reasoning_content"):
+message.pop(field, None)
# If Ollama
# if llm_config.handle.startswith("ollama/") and llm_config.enable_reasoner:
# Sadly, reasoning via the OpenAI proxy on Ollama only works for Harmony/gpt-oss
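For context on the max_output_tokens change above: the request object is dumped with exclude_unset=True, so passing max_tokens=None through the constructor would presumably still mark the field as set and emit a null value, while assigning it only when configured keeps it out of the payload entirely. A small sketch of that Pydantic behavior; the Req model here is illustrative, not the actual request class:

from typing import Optional
from pydantic import BaseModel

class Req(BaseModel):
    model: str
    max_output_tokens: Optional[int] = None

explicit = Req(model="gpt-5.3-codex", max_output_tokens=None)
print(explicit.model_dump(exclude_unset=True))  # {'model': 'gpt-5.3-codex', 'max_output_tokens': None}

lazy = Req(model="gpt-5.3-codex")
configured = None  # e.g. llm_config.max_tokens was never set
if configured is not None:
    lazy.max_output_tokens = configured
print(lazy.model_dump(exclude_unset=True))  # {'model': 'gpt-5.3-codex'}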

View File

@@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient):
}
}
+# Z.ai's API uses max_tokens, not max_completion_tokens.
+# If max_completion_tokens is sent, Z.ai ignores it and falls back to its
+# default of 65536, silently truncating input to ~137K of the 200K context window.
+if "max_completion_tokens" in data:
+data["max_tokens"] = data.pop("max_completion_tokens")
# Sanitize empty text content — ZAI rejects empty text blocks
if "messages" in data:
for msg in data["messages"]:
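A tiny self-contained illustration of the payload rewrite above; the model name and token count are arbitrary examples, not taken from this change:

# Rename the OpenAI-style field so Z.ai does not ignore it and silently
# fall back to its 65536-token default.
data = {"model": "glm-4.6", "max_completion_tokens": 98304, "messages": []}
if "max_completion_tokens" in data:
    data["max_tokens"] = data.pop("max_completion_tokens")
assert data == {"model": "glm-4.6", "max_tokens": 98304, "messages": []}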

View File

@@ -17295,6 +17295,58 @@
"supports_tool_choice": true, "supports_tool_choice": true,
"supports_vision": true "supports_vision": true
}, },
"gpt-5.3-chat-latest": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"max_tokens": 16384,
"mode": "chat",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/chat/completions", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5.3-codex": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": false,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5-mini": { "gpt-5-mini": {
"cache_read_input_token_cost": 2.5e-8, "cache_read_input_token_cost": 2.5e-8,
"cache_read_input_token_cost_flex": 1.25e-8, "cache_read_input_token_cost_flex": 1.25e-8,

View File

@@ -44,7 +44,7 @@ class Conversation(SqlalchemyBase, OrganizationMixin):
"ConversationMessage", "ConversationMessage",
back_populates="conversation", back_populates="conversation",
cascade="all, delete-orphan", cascade="all, delete-orphan",
lazy="selectin", lazy="raise",
) )
isolated_blocks: Mapped[List["Block"]] = relationship( isolated_blocks: Mapped[List["Block"]] = relationship(
"Block", "Block",

View File

@@ -69,5 +69,5 @@ class ConversationMessage(SqlalchemyBase, OrganizationMixin):
)
message: Mapped["Message"] = relationship(
"Message",
-lazy="selectin",
+lazy="raise",
)
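Switching these relationships from lazy="selectin" to lazy="raise" means SQLAlchemy raises InvalidRequestError on any implicit lazy access, so call sites have to request the load explicitly per query. A minimal sketch of what that looks like, assuming SQLAlchemy 2.x async usage; Conversation is the mapped class from the file above, and the relationship attribute name messages and the session variable are assumptions, not taken from the diff:

from sqlalchemy import select
from sqlalchemy.orm import selectinload

async def get_conversation_with_messages(session, conversation_id: str):
    # With lazy="raise", touching conversation.messages without an eager-load
    # option raises, so the load is made explicit in the query.
    stmt = (
        select(Conversation)
        .where(Conversation.id == conversation_id)
        .options(selectinload(Conversation.messages))
    )
    result = await session.execute(stmt)
    return result.scalar_one()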

View File

@@ -88,8 +88,7 @@ class LettaRequest(BaseModel):
)
top_logprobs: Optional[int] = Field(
default=None,
-description="Number of most likely tokens to return at each position (0-20). "
-"Requires return_logprobs=True.",
+description="Number of most likely tokens to return at each position (0-20). Requires return_logprobs=True.",
)
return_token_ids: bool = Field(
default=False,
@@ -155,6 +154,10 @@ class LettaStreamingRequest(LettaRequest):
class ConversationMessageRequest(LettaRequest):
"""Request for sending messages to a conversation. Streams by default."""
+agent_id: Optional[str] = Field(
+default=None,
+description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
+)
streaming: bool = Field(
default=True,
description="If True (default), returns a streaming response (Server-Sent Events). If False, returns a complete JSON response.",
@@ -194,6 +197,10 @@ class CreateBatch(BaseModel):
class RetrieveStreamRequest(BaseModel):
+agent_id: Optional[str] = Field(
+default=None,
+description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
+)
starting_after: int = Field(
0, description="Sequence id to use as a cursor for pagination. Response will start streaming after this chunk sequence id"
)

View File

@@ -1,3 +1,4 @@
+import re
from typing import TYPE_CHECKING, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
# Set max_tokens defaults based on model (only if not explicitly provided)
if "max_tokens" not in values:
-if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
+if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
+values["max_tokens"] = 128000
+elif model.startswith("gpt-5"):
values["max_tokens"] = 16384
elif model == "gpt-4.1":
values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
context_window=272000,
reasoning_effort="none", # Default to "none" for GPT-5.2
verbosity="medium",
-max_tokens=16384,
+max_tokens=128000,
)
elif model_name == "letta":
return cls(
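The new default logic above reads as: GPT-5.2/5.3 models that are not "-chat" variants default to 128k output tokens, while everything else keeps the previous defaults. A standalone mirror of that branch for a few example handles; the function name and the final fallback value are illustrative, not from this file:

import re

def default_max_tokens(model: str) -> int:
    if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
        return 128000
    if model.startswith("gpt-5"):
        return 16384
    if model == "gpt-4.1":
        return 8192
    return 4096  # illustrative fallback only

assert default_max_tokens("gpt-5.3-codex") == 128000
assert default_max_tokens("gpt-5.2-chat-latest") == 16384
assert default_max_tokens("gpt-5-mini") == 16384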

View File

@@ -95,6 +95,11 @@ class LLMTrace(LettaBase):
response_json: str = Field(..., description="Full response payload as JSON string")
llm_config_json: str = Field(default="", description="LLM config as JSON string")
+# Billing context
+billing_plan_type: Optional[str] = Field(default=None, description="Subscription tier (e.g., 'basic', 'standard', 'max', 'enterprise')")
+billing_cost_source: Optional[str] = Field(default=None, description="Cost source: 'quota' or 'credits'")
+billing_customer_id: Optional[str] = Field(default=None, description="Customer ID for cross-referencing billing records")
# Timestamp
created_at: datetime = Field(default_factory=get_utc_time, description="When the trace was created")
@@ -128,6 +133,9 @@ class LLMTrace(LettaBase):
self.request_json, self.request_json,
self.response_json, self.response_json,
self.llm_config_json, self.llm_config_json,
self.billing_plan_type or "",
self.billing_cost_source or "",
self.billing_customer_id or "",
self.created_at, self.created_at,
) )
@@ -162,5 +170,8 @@ class LLMTrace(LettaBase):
"request_json", "request_json",
"response_json", "response_json",
"llm_config_json", "llm_config_json",
"billing_plan_type",
"billing_cost_source",
"billing_customer_id",
"created_at", "created_at",
] ]

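A rough sketch of how the three new billing columns could be filled from request-level billing data (the glue function below is illustrative, not part of the diff; empty strings mirror the "or ''" fallback used in the column tuple above):

from typing import Optional, Tuple

def billing_columns(
    plan_type: Optional[str], cost_source: Optional[str], customer_id: Optional[str]
) -> Tuple[str, str, str]:
    # Missing values collapse to empty strings, matching the fallback above
    return (plan_type or "", cost_source or "", customer_id or "")

print(billing_columns("standard", "credits", None))  # ('standard', 'credits', '')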

@@ -226,8 +226,6 @@ class Memory(BaseModel, validate_assignment=True):
front_lines = [] front_lines = []
if block.description: if block.description:
front_lines.append(f"description: {block.description}") front_lines.append(f"description: {block.description}")
if block.limit is not None:
front_lines.append(f"limit: {block.limit}")
if getattr(block, "read_only", False): if getattr(block, "read_only", False):
front_lines.append("read_only: true") front_lines.append("read_only: true")
@@ -291,7 +289,40 @@ class Memory(BaseModel, validate_assignment=True):
s.write("\n\n<memory_filesystem>\n") s.write("\n\n<memory_filesystem>\n")
def _render_tree(node: dict, prefix: str = ""): def _render_tree(node: dict, prefix: str = "", in_system: bool = False, path_parts: tuple[str, ...] = ()):
# Render skills/ as concise top-level entries only, using both
# current (`skills/<name>`) and legacy (`skills/<name>/SKILL`) labels.
if path_parts == ("skills",):
skill_entries: list[tuple[str, str]] = []
for name, val in node.items():
if name == LEAF_KEY:
continue
block = None
if isinstance(val, dict):
legacy_skill_block = val.get("SKILL")
if legacy_skill_block is not None and not isinstance(legacy_skill_block, dict):
block = legacy_skill_block
elif LEAF_KEY in val and not isinstance(val[LEAF_KEY], dict):
block = val[LEAF_KEY]
else:
block = val
if block is None:
continue
desc = getattr(block, "description", None)
desc_line = (desc or "").strip().split("\n")[0].strip()
skill_entries.append((name, desc_line))
skill_entries.sort(key=lambda e: e[0])
for i, (name, desc_line) in enumerate(skill_entries):
is_last = i == len(skill_entries) - 1
connector = "└── " if is_last else "├── "
desc_suffix = f" ({desc_line})" if desc_line else ""
s.write(f"{prefix}{connector}{name}{desc_suffix}\n")
return
# Sort: directories first, then files. If a node is both a directory and a # Sort: directories first, then files. If a node is both a directory and a
# leaf (LEAF_KEY present), show both <name>/ and <name>.md. # leaf (LEAF_KEY present), show both <name>/ and <name>.md.
dirs = [] dirs = []
@@ -316,9 +347,24 @@ class Memory(BaseModel, validate_assignment=True):
if is_dir: if is_dir:
s.write(f"{prefix}{connector}{name}/\n") s.write(f"{prefix}{connector}{name}/\n")
extension = " " if is_last else "" extension = " " if is_last else ""
_render_tree(node[name], prefix + extension) _render_tree(
node[name],
prefix + extension,
in_system=in_system or name == "system",
path_parts=(*path_parts, name),
)
else: else:
s.write(f"{prefix}{connector}{name}.md\n") # For files outside system/, append the block description
desc_suffix = ""
if not in_system:
val = node[name]
block = val[LEAF_KEY] if isinstance(val, dict) else val
desc = getattr(block, "description", None)
if desc:
desc_line = desc.strip().split("\n")[0].strip()
if desc_line:
desc_suffix = f" ({desc_line})"
s.write(f"{prefix}{connector}{name}.md{desc_suffix}\n")
_render_tree(tree) _render_tree(tree)
s.write("</memory_filesystem>") s.write("</memory_filesystem>")
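A minimal, self-contained sketch of the connector logic the skills/ branch uses above (entry names and descriptions are invented; the real renderer walks the Memory block tree):

entries = [
    ("web_search", "Search the web and summarize results"),
    ("calendar", "Read and update the user's calendar"),
]
entries.sort(key=lambda e: e[0])
for i, (name, desc) in enumerate(entries):
    connector = "└── " if i == len(entries) - 1 else "├── "
    suffix = f" ({desc})" if desc else ""
    print(f"{connector}{name}{suffix}")
# ├── calendar (Read and update the user's calendar)
# └── web_search (Search the web and summarize results)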


@@ -282,10 +282,10 @@ class AnthropicModelSettings(ModelSettings):
description="Soft control for how verbose model output should be, used for GPT-5 models.", description="Soft control for how verbose model output should be, used for GPT-5 models.",
) )
# Opus 4.5 effort parameter # Effort parameter for Opus 4.5, Opus 4.6, and Sonnet 4.6
effort: Optional[Literal["low", "medium", "high"]] = Field( effort: Optional[Literal["low", "medium", "high", "max"]] = Field(
None, None,
description="Effort level for Opus 4.5 model (controls token conservation). Not setting this gives similar performance to 'high'.", description="Effort level for supported Anthropic models (controls token spending). 'max' is only available on Opus 4.6. Not setting this gives similar performance to 'high'.",
) )
# Anthropic supports strict mode for tool calling - defaults to False # Anthropic supports strict mode for tool calling - defaults to False


@@ -3,13 +3,21 @@ from __future__ import annotations
from datetime import datetime from datetime import datetime
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from pydantic import Field from pydantic import BaseModel, Field
from letta.helpers.datetime_helpers import get_utc_time from letta.helpers.datetime_helpers import get_utc_time
from letta.schemas.enums import PrimitiveType from letta.schemas.enums import PrimitiveType
from letta.schemas.letta_base import OrmMetadataBase from letta.schemas.letta_base import OrmMetadataBase
class BillingContext(BaseModel):
"""Billing context for LLM request cost tracking."""
plan_type: Optional[str] = Field(None, description="Subscription tier")
cost_source: Optional[str] = Field(None, description="Cost source: 'quota' or 'credits'")
customer_id: Optional[str] = Field(None, description="Customer ID for billing records")
class BaseProviderTrace(OrmMetadataBase): class BaseProviderTrace(OrmMetadataBase):
__id_prefix__ = PrimitiveType.PROVIDER_TRACE.value __id_prefix__ = PrimitiveType.PROVIDER_TRACE.value
@@ -53,6 +61,8 @@ class ProviderTrace(BaseProviderTrace):
compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)") compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)")
llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)") llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)")
billing_context: Optional[BillingContext] = Field(None, description="Billing context from request headers")
created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.") created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")
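Since BillingContext is a plain Pydantic model with all-optional fields, it can be constructed from whatever billing data is present (values below are invented):

from letta.schemas.provider_trace import BillingContext

ctx = BillingContext(plan_type="standard", cost_source="credits", customer_id="cus_123")
print(ctx.model_dump())
# {'plan_type': 'standard', 'cost_source': 'credits', 'customer_id': 'cus_123'}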


@@ -14,7 +14,7 @@ from letta.schemas.providers.base import Provider
logger = get_logger(__name__) logger = get_logger(__name__)
ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"} ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"}
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro", "chat"} DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro"}
DEFAULT_EMBEDDING_BATCH_SIZE = 1024 DEFAULT_EMBEDDING_BATCH_SIZE = 1024
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
except Exception as e: except Exception as e:
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR) raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
@staticmethod
def _openai_default_max_output_tokens(model_name: str) -> int:
"""Return a sensible max-output-tokens default for OpenAI models.
gpt-5.2* / gpt-5.3* support 128k output tokens, except the
`-chat` variants which are capped at 16k.
"""
import re
if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
return 128000
return 16384
def get_default_max_output_tokens(self, model_name: str) -> int: def get_default_max_output_tokens(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models (sync fallback).""" """Get the default max output tokens for OpenAI models (sync fallback)."""
# Simple default for openai return self._openai_default_max_output_tokens(model_name)
return 16384
async def get_default_max_output_tokens_async(self, model_name: str) -> int: async def get_default_max_output_tokens_async(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models. """Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
if max_output is not None: if max_output is not None:
return max_output return max_output
# Simple default for openai return self._openai_default_max_output_tokens(model_name)
return 16384
async def _get_models_async(self) -> list[dict]: async def _get_models_async(self) -> list[dict]:
from letta.llm_api.openai import openai_get_model_list_async from letta.llm_api.openai import openai_get_model_list_async
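A quick sanity check of the shared helper (the import path matches the one used elsewhere in this diff; the helper is a @staticmethod, so no provider instance is needed):

from letta.schemas.providers.openai import OpenAIProvider

assert OpenAIProvider._openai_default_max_output_tokens("gpt-5.3") == 128000
assert OpenAIProvider._openai_default_max_output_tokens("gpt-5.2-chat") == 16384
assert OpenAIProvider._openai_default_max_output_tokens("gpt-4o") == 16384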


@@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider
# Z.ai model context windows # Z.ai model context windows
# Reference: https://docs.z.ai/ # Reference: https://docs.z.ai/
# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k) counts against that --> 180k
MODEL_CONTEXT_WINDOWS = { MODEL_CONTEXT_WINDOWS = {
"glm-4.5": 128000, "glm-4.5": 128000,
"glm-4.6": 200000, "glm-4.6": 180000,
"glm-4.7": 200000, "glm-4.7": 180000,
"glm-5": 200000, "glm-5": 180000,
"glm-5-code": 200000, "glm-5-code": 180000,
} }


@@ -3,7 +3,7 @@ import uuid
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import AsyncGenerator from typing import AsyncGenerator
from sqlalchemy import NullPool, text from sqlalchemy import NullPool
from sqlalchemy.ext.asyncio import ( from sqlalchemy.ext.asyncio import (
AsyncEngine, AsyncEngine,
AsyncSession, AsyncSession,
@@ -88,10 +88,6 @@ class DatabaseRegistry:
try: try:
async with async_session_factory() as session: async with async_session_factory() as session:
try: try:
result = await session.execute(text("SELECT pg_backend_pid(), current_setting('statement_timeout')"))
pid, timeout = result.one()
logger.warning(f"[stmt_timeout_debug] pid={pid} statement_timeout={timeout}")
await session.rollback()
yield session yield session
await session.commit() await session.commit()
except asyncio.CancelledError: except asyncio.CancelledError:


@@ -6,6 +6,7 @@ from pydantic import BaseModel
from letta.errors import LettaInvalidArgumentError from letta.errors import LettaInvalidArgumentError
from letta.otel.tracing import tracer from letta.otel.tracing import tracer
from letta.schemas.enums import PrimitiveType from letta.schemas.enums import PrimitiveType
from letta.schemas.provider_trace import BillingContext
from letta.validators import PRIMITIVE_ID_PATTERNS from letta.validators import PRIMITIVE_ID_PATTERNS
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -30,18 +31,24 @@ class HeaderParams(BaseModel):
letta_source: Optional[str] = None letta_source: Optional[str] = None
sdk_version: Optional[str] = None sdk_version: Optional[str] = None
experimental_params: Optional[ExperimentalParams] = None experimental_params: Optional[ExperimentalParams] = None
billing_context: Optional[BillingContext] = None
def get_headers( def get_headers(
actor_id: Optional[str] = Header(None, alias="user_id"), actor_id: Optional[str] = Header(None, alias="user_id"),
user_agent: Optional[str] = Header(None, alias="User-Agent"), user_agent: Optional[str] = Header(None, alias="User-Agent"),
project_id: Optional[str] = Header(None, alias="X-Project-Id"), project_id: Optional[str] = Header(None, alias="X-Project-Id"),
letta_source: Optional[str] = Header(None, alias="X-Letta-Source"), letta_source: Optional[str] = Header(None, alias="X-Letta-Source", include_in_schema=False),
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version"), sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version", include_in_schema=False),
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async"), message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async", include_in_schema=False),
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent"), letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent", include_in_schema=False),
letta_v1_agent_message_async: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent-Message-Async"), letta_v1_agent_message_async: Optional[str] = Header(
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox"), None, alias="X-Experimental-Letta-V1-Agent-Message-Async", include_in_schema=False
),
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox", include_in_schema=False),
billing_plan_type: Optional[str] = Header(None, alias="X-Billing-Plan-Type", include_in_schema=False),
billing_cost_source: Optional[str] = Header(None, alias="X-Billing-Cost-Source", include_in_schema=False),
billing_customer_id: Optional[str] = Header(None, alias="X-Billing-Customer-Id", include_in_schema=False),
) -> HeaderParams: ) -> HeaderParams:
"""Dependency injection function to extract common headers from requests.""" """Dependency injection function to extract common headers from requests."""
with tracer.start_as_current_span("dependency.get_headers"): with tracer.start_as_current_span("dependency.get_headers"):
@@ -63,6 +70,13 @@ def get_headers(
letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None, letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None,
modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None, modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None,
), ),
billing_context=BillingContext(
plan_type=billing_plan_type,
cost_source=billing_cost_source,
customer_id=billing_customer_id,
)
if any([billing_plan_type, billing_cost_source, billing_customer_id])
else None,
) )
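A sketch of a caller supplying the new billing headers, which are hidden from the OpenAPI schema but still parsed by get_headers. The base URL, port, and endpoint path below are assumptions for illustration:

import httpx

resp = httpx.post(
    "http://localhost:8283/v1/agents/agent-123/messages",  # endpoint/port assumed
    json={"messages": [{"role": "user", "content": "hi"}]},
    headers={
        "X-Billing-Plan-Type": "standard",
        "X-Billing-Cost-Source": "quota",
        "X-Billing-Customer-Id": "cus_123",
    },
)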


@@ -49,6 +49,7 @@ from letta.schemas.memory import (
) )
from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult
from letta.schemas.passage import Passage from letta.schemas.passage import Passage
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.source import Source from letta.schemas.source import Source
from letta.schemas.tool import Tool from letta.schemas.tool import Tool
@@ -156,7 +157,7 @@ async def list_agents(
order: Literal["asc", "desc"] = Query( order: Literal["asc", "desc"] = Query(
"desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first" "desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first"
), ),
order_by: Literal["created_at", "last_run_completion"] = Query("created_at", description="Field to sort by"), order_by: Literal["created_at", "updated_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
ascending: bool = Query( ascending: bool = Query(
False, False,
description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)", description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)",
@@ -1697,6 +1698,7 @@ async def send_message(
actor=actor, actor=actor,
request=request, request=request,
run_type="send_message", run_type="send_message",
billing_context=headers.billing_context,
) )
return result return result
@@ -1767,6 +1769,7 @@ async def send_message(
include_return_message_types=request.include_return_message_types, include_return_message_types=request.include_return_message_types,
client_tools=request.client_tools, client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
) )
run_status = result.stop_reason.stop_reason.run_status run_status = result.stop_reason.stop_reason.run_status
return result return result
@@ -1845,6 +1848,7 @@ async def send_message_streaming(
actor=actor, actor=actor,
request=request, request=request,
run_type="send_message_streaming", run_type="send_message_streaming",
billing_context=headers.billing_context,
) )
return result return result
@@ -1868,6 +1872,13 @@ async def cancel_message(
""" """
# TODO: WHY DOES THIS CANCEL A LIST OF RUNS? # TODO: WHY DOES THIS CANCEL A LIST OF RUNS?
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for agent=%s by actor=%s (org=%s), explicit_run_ids=%s",
agent_id,
actor.id,
actor.organization_id,
request.run_ids if request else None,
)
if not settings.track_agent_run: if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled") raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
run_ids = request.run_ids if request else None run_ids = request.run_ids if request else None
@@ -2036,6 +2047,7 @@ async def _process_message_background(
include_return_message_types: list[MessageType] | None = None, include_return_message_types: list[MessageType] | None = None,
override_model: str | None = None, override_model: str | None = None,
include_compaction_messages: bool = False, include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> None: ) -> None:
"""Background task to process the message and update run status.""" """Background task to process the message and update run status."""
request_start_timestamp_ns = get_utc_timestamp_ns() request_start_timestamp_ns = get_utc_timestamp_ns()
@@ -2067,6 +2079,7 @@ async def _process_message_background(
request_start_timestamp_ns=request_start_timestamp_ns, request_start_timestamp_ns=request_start_timestamp_ns,
include_return_message_types=include_return_message_types, include_return_message_types=include_return_message_types,
include_compaction_messages=include_compaction_messages, include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
) )
runs_manager = RunManager() runs_manager = RunManager()
from letta.schemas.enums import RunStatus from letta.schemas.enums import RunStatus
@@ -2235,6 +2248,7 @@ async def send_message_async(
include_return_message_types=request.include_return_message_types, include_return_message_types=request.include_return_message_types,
override_model=request.override_model, override_model=request.override_model,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
), ),
label=f"process_message_background_{run.id}", label=f"process_message_background_{run.id}",
) )
@@ -2419,7 +2433,11 @@ async def summarize_messages(
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode # If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode # Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode: if (
"mode" in changed_fields
and "prompt" not in changed_fields
and agent.compaction_settings.mode != request.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode) compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
@@ -2439,7 +2457,7 @@ async def summarize_messages(
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.") logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException( raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).", detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
) )
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages) await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
return CompactionResponse( return CompactionResponse(

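The list_agents change adds "updated_at" as an order_by value; a minimal client-side sketch (base URL and port assumed, response shape not shown):

import httpx

resp = httpx.get(
    "http://localhost:8283/v1/agents",  # base URL/port assumed
    params={"order_by": "updated_at", "order": "desc", "limit": 10},
)
print(resp.json())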

@@ -1,5 +1,6 @@
from datetime import timedelta from datetime import timedelta
from typing import Annotated, List, Literal, Optional from typing import Annotated, List, Literal, Optional
from uuid import uuid4
from fastapi import APIRouter, Body, Depends, HTTPException, Query, status from fastapi import APIRouter, Body, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@@ -18,6 +19,7 @@ from letta.schemas.job import LettaRequestConfig
from letta.schemas.letta_message import LettaMessageUnion from letta.schemas.letta_message import LettaMessageUnion
from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest
from letta.schemas.letta_response import LettaResponse from letta.schemas.letta_response import LettaResponse
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun from letta.schemas.run import Run as PydanticRun
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator
@@ -32,7 +34,7 @@ from letta.services.run_manager import RunManager
from letta.services.streaming_service import StreamingService from letta.services.streaming_service import StreamingService
from letta.services.summarizer.summarizer_config import CompactionSettings from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.settings import settings from letta.settings import settings
from letta.validators import ConversationId from letta.validators import ConversationId, ConversationIdOrDefault
router = APIRouter(prefix="/conversations", tags=["conversations"]) router = APIRouter(prefix="/conversations", tags=["conversations"])
@@ -148,7 +150,8 @@ ConversationMessagesResponse = Annotated[
operation_id="list_conversation_messages", operation_id="list_conversation_messages",
) )
async def list_conversation_messages( async def list_conversation_messages(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
before: Optional[str] = Query( before: Optional[str] = Query(
@@ -172,8 +175,36 @@ async def list_conversation_messages(
Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all
messages in the conversation, with support for cursor-based pagination. messages in the conversation, with support for cursor-based pagination.
**Agent-direct mode**: Pass conversation_id="default" with agent_id parameter
to list messages from the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
return await server.get_agent_recall_async(
agent_id=resolved_agent_id,
after=after,
before=before,
limit=limit,
group_id=group_id,
conversation_id=None, # Default conversation (no isolation)
reverse=(order == "desc"),
return_message_object=False,
include_err=include_err,
actor=actor,
)
return await conversation_manager.list_conversation_messages( return await conversation_manager.list_conversation_messages(
conversation_id=conversation_id, conversation_id=conversation_id,
actor=actor, actor=actor,
@@ -186,6 +217,108 @@ async def list_conversation_messages(
) )
async def _send_agent_direct_message(
agent_id: str,
request: ConversationMessageRequest,
server: SyncServer,
actor,
billing_context: "BillingContext | None" = None,
) -> StreamingResponse | LettaResponse:
"""
Handle agent-direct messaging with locking but without conversation features.
This is used when the conversation_id in the URL is actually an agent ID,
providing a unified endpoint while maintaining agent-level locking.
"""
redis_client = await get_redis_client()
# Streaming mode (default)
if request.streaming:
streaming_request = LettaStreamingRequest(
messages=request.messages,
streaming=True,
stream_tokens=request.stream_tokens,
include_pings=request.include_pings,
background=request.background,
max_steps=request.max_steps,
use_assistant_message=request.use_assistant_message,
assistant_message_tool_name=request.assistant_message_tool_name,
assistant_message_tool_kwarg=request.assistant_message_tool_kwarg,
include_return_message_types=request.include_return_message_types,
override_model=request.override_model,
client_tools=request.client_tools,
)
streaming_service = StreamingService(server)
run, result = await streaming_service.create_agent_stream(
agent_id=agent_id,
actor=actor,
request=streaming_request,
run_type="send_message",
conversation_id=None,
should_lock=True,
billing_context=billing_context,
)
return result
# Non-streaming mode with locking
agent = await server.agent_manager.get_agent_by_id_async(
agent_id,
actor,
include_relationships=["memory", "multi_agent_group", "sources", "tool_exec_environment_variables", "tools", "tags"],
)
# Handle model override if specified in the request
if request.override_model:
override_llm_config = await server.get_llm_config_from_handle_async(
actor=actor,
handle=request.override_model,
)
agent = agent.model_copy(update={"llm_config": override_llm_config})
# Acquire lock using agent_id as lock key
if not isinstance(redis_client, NoopAsyncRedisClient):
await redis_client.acquire_conversation_lock(
conversation_id=agent_id,
token=str(uuid4()),
)
try:
# Create a run for execution tracking
run = None
if settings.track_agent_run:
runs_manager = RunManager()
run = await runs_manager.create_run(
pydantic_run=PydanticRun(
agent_id=agent_id,
background=False,
metadata={
"run_type": "send_message",
},
request_config=LettaRequestConfig.from_letta_request(request),
),
actor=actor,
)
# Set run_id in Redis for cancellation support
await redis_client.set(f"{REDIS_RUN_ID_PREFIX}:{agent_id}", run.id if run else None)
agent_loop = AgentLoop.load(agent_state=agent, actor=actor)
return await agent_loop.step(
request.messages,
max_steps=request.max_steps,
run_id=run.id if run else None,
use_assistant_message=request.use_assistant_message,
include_return_message_types=request.include_return_message_types,
client_tools=request.client_tools,
conversation_id=None,
include_compaction_messages=request.include_compaction_messages,
billing_context=billing_context,
)
finally:
# Release lock
await redis_client.release_conversation_lock(agent_id)
@router.post( @router.post(
"/{conversation_id}/messages", "/{conversation_id}/messages",
response_model=LettaResponse, response_model=LettaResponse,
@@ -201,7 +334,7 @@ async def list_conversation_messages(
}, },
) )
async def send_conversation_message( async def send_conversation_message(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
request: ConversationMessageRequest = Body(...), request: ConversationMessageRequest = Body(...),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
@@ -212,12 +345,36 @@ async def send_conversation_message(
This endpoint sends a message to an existing conversation. This endpoint sends a message to an existing conversation.
By default (streaming=true), returns a streaming response (Server-Sent Events). By default (streaming=true), returns a streaming response (Server-Sent Events).
Set streaming=false to get a complete JSON response. Set streaming=false to get a complete JSON response.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to send messages to the agent's default conversation with locking.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
if not request.messages or len(request.messages) == 0: if not request.messages or len(request.messages) == 0:
raise HTTPException(status_code=422, detail="Messages must not be empty") raise HTTPException(status_code=422, detail="Messages must not be empty")
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: use agent ID, enable locking, skip conversation features
return await _send_agent_direct_message(
agent_id=resolved_agent_id,
request=request,
server=server,
actor=actor,
billing_context=headers.billing_context,
)
# Normal conversation mode
conversation = await conversation_manager.get_conversation_by_id( conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id, conversation_id=conversation_id,
actor=actor, actor=actor,
@@ -247,6 +404,7 @@ async def send_conversation_message(
request=streaming_request, request=streaming_request,
run_type="send_conversation_message", run_type="send_conversation_message",
conversation_id=conversation_id, conversation_id=conversation_id,
billing_context=headers.billing_context,
) )
return result return result
@@ -265,6 +423,10 @@ async def send_conversation_message(
) )
if conversation.model_settings is not None: if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params() update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params) conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config}) agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -305,6 +467,7 @@ async def send_conversation_message(
client_tools=request.client_tools, client_tools=request.client_tools,
conversation_id=conversation_id, conversation_id=conversation_id,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
) )
@@ -341,7 +504,7 @@ async def send_conversation_message(
}, },
) )
async def retrieve_conversation_stream( async def retrieve_conversation_stream(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
request: RetrieveStreamRequest = Body(None), request: RetrieveStreamRequest = Body(None),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
@@ -351,18 +514,42 @@ async def retrieve_conversation_stream(
This endpoint allows you to reconnect to an active background stream This endpoint allows you to reconnect to an active background stream
for a conversation, enabling recovery from network interruptions. for a conversation, enabling recovery from network interruptions.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to retrieve the stream for the agent's most recent active run.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
runs_manager = RunManager() runs_manager = RunManager()
# Find the most recent active run for this conversation
active_runs = await runs_manager.list_runs(
    actor=actor,
    conversation_id=conversation_id,
    statuses=[RunStatus.created, RunStatus.running],
    limit=1,
    ascending=False,
)

# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
    resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
    resolved_agent_id = conversation_id

# Find the most recent active run
if resolved_agent_id:
    # Agent-direct mode: find runs by agent_id
    active_runs = await runs_manager.list_runs(
        actor=actor,
        agent_id=resolved_agent_id,
        statuses=[RunStatus.created, RunStatus.running],
        limit=1,
        ascending=False,
    )
else:
    # Normal mode: find runs by conversation_id
    active_runs = await runs_manager.list_runs(
        actor=actor,
        conversation_id=conversation_id,
        statuses=[RunStatus.created, RunStatus.running],
        limit=1,
        ascending=False,
    )
if not active_runs: if not active_runs:
raise LettaInvalidArgumentError("No active runs found for this conversation.") raise LettaInvalidArgumentError("No active runs found for this conversation.")
@@ -417,7 +604,8 @@ async def retrieve_conversation_stream(
@router.post("/{conversation_id}/cancel", operation_id="cancel_conversation") @router.post("/{conversation_id}/cancel", operation_id="cancel_conversation")
async def cancel_conversation( async def cancel_conversation(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
) -> dict: ) -> dict:
@@ -425,26 +613,58 @@ async def cancel_conversation(
Cancel runs associated with a conversation. Cancel runs associated with a conversation.
Note: To cancel active runs, Redis is required. Note: To cancel active runs, Redis is required.
**Agent-direct mode**: Pass conversation_id="default" with agent_id query parameter
to cancel runs for the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for conversation=%s by actor=%s (org=%s)",
conversation_id,
actor.id,
actor.organization_id,
)
if not settings.track_agent_run: if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled") raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
# Verify conversation exists and get agent_id
conversation = await conversation_manager.get_conversation_by_id(
    conversation_id=conversation_id,
    actor=actor,
)

# Find active runs for this conversation
runs = await server.run_manager.list_runs(
    actor=actor,
    statuses=[RunStatus.created, RunStatus.running],
    ascending=False,
    conversation_id=conversation_id,
    limit=100,
)

# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
    resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
    resolved_agent_id = conversation_id

if resolved_agent_id:
    # Agent-direct mode: use agent_id directly, skip conversation lookup
    # Find active runs for this agent (default conversation has conversation_id=None)
    runs = await server.run_manager.list_runs(
        actor=actor,
        agent_id=resolved_agent_id,
        statuses=[RunStatus.created, RunStatus.running],
        ascending=False,
        limit=100,
    )
else:
    # Verify conversation exists and get agent_id
    conversation = await conversation_manager.get_conversation_by_id(
        conversation_id=conversation_id,
        actor=actor,
    )
    agent_id = conversation.agent_id

    # Find active runs for this conversation
    runs = await server.run_manager.list_runs(
        actor=actor,
        statuses=[RunStatus.created, RunStatus.running],
        ascending=False,
        conversation_id=conversation_id,
        limit=100,
    )
run_ids = [run.id for run in runs] run_ids = [run.id for run in runs]
if not run_ids: if not run_ids:
@@ -461,7 +681,7 @@ async def cancel_conversation(
except Exception as e: except Exception as e:
logger.error(f"Failed to cancel Lettuce run {run_id}: {e}") logger.error(f"Failed to cancel Lettuce run {run_id}: {e}")
await server.run_manager.cancel_run(actor=actor, agent_id=conversation.agent_id, run_id=run_id) await server.run_manager.cancel_run(actor=actor, agent_id=agent_id, run_id=run_id)
except Exception as e: except Exception as e:
results[run_id] = "failed" results[run_id] = "failed"
logger.error(f"Failed to cancel run {run_id}: {str(e)}") logger.error(f"Failed to cancel run {run_id}: {str(e)}")
@@ -473,6 +693,10 @@ async def cancel_conversation(
class CompactionRequest(BaseModel): class CompactionRequest(BaseModel):
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
compaction_settings: Optional[CompactionSettings] = Field( compaction_settings: Optional[CompactionSettings] = Field(
default=None, default=None,
description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.", description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.",
@@ -487,7 +711,7 @@ class CompactionResponse(BaseModel):
@router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation") @router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation")
async def compact_conversation( async def compact_conversation(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
request: Optional[CompactionRequest] = Body(default=None), request: Optional[CompactionRequest] = Body(default=None),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
@@ -497,23 +721,45 @@ async def compact_conversation(
This endpoint summarizes the in-context messages for a specific conversation, This endpoint summarizes the in-context messages for a specific conversation,
reducing the message count while preserving important context. reducing the message count while preserving important context.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to compact the agent's default conversation messages.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Get the conversation to find the agent_id
conversation = await conversation_manager.get_conversation_by_id(
    conversation_id=conversation_id,
    actor=actor,
)

# Get the agent state
agent = await server.agent_manager.get_agent_by_id_async(conversation.agent_id, actor, include_relationships=["multi_agent_group"])

# Get in-context messages for this conversation
in_context_messages = await conversation_manager.get_messages_for_conversation(
    conversation_id=conversation_id,
    actor=actor,
)

# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
    resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
    resolved_agent_id = conversation_id

if resolved_agent_id:
    # Agent-direct mode: compact agent's default conversation
    agent = await server.agent_manager.get_agent_by_id_async(resolved_agent_id, actor, include_relationships=["multi_agent_group"])
    in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
    agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
else:
    # Get the conversation to find the agent_id
    conversation = await conversation_manager.get_conversation_by_id(
        conversation_id=conversation_id,
        actor=actor,
    )

    # Get the agent state
    agent = await server.agent_manager.get_agent_by_id_async(conversation.agent_id, actor, include_relationships=["multi_agent_group"])

    # Get in-context messages for this conversation
    in_context_messages = await conversation_manager.get_messages_for_conversation(
        conversation_id=conversation_id,
        actor=actor,
    )

    # Create agent loop with conversation context
    agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
if not in_context_messages: if not in_context_messages:
raise HTTPException( raise HTTPException(
@@ -521,10 +767,27 @@ async def compact_conversation(
detail="No in-context messages found for this conversation.", detail="No in-context messages found for this conversation.",
) )
# Create agent loop with conversation context
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)

compaction_settings = request.compaction_settings if request else None

# Merge request compaction_settings with agent's settings (request overrides agent)
if agent.compaction_settings and request and request.compaction_settings:
    # Start with agent's settings, override with new values from request
    # Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
    compaction_settings = agent.compaction_settings.copy()  # do not mutate original agent compaction settings
    changed_fields = request.compaction_settings.model_fields_set
    for field in changed_fields:
        setattr(compaction_settings, field, getattr(request.compaction_settings, field))

    # If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
    # Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
    if (
        "mode" in changed_fields
        and "prompt" not in changed_fields
        and agent.compaction_settings.mode != request.compaction_settings.mode
    ):
        from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode

        compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
else:
    compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
num_messages_before = len(in_context_messages) num_messages_before = len(in_context_messages)
# Run compaction # Run compaction
@@ -537,13 +800,11 @@ async def compact_conversation(
# Validate compaction reduced messages # Validate compaction reduced messages
if num_messages_before <= num_messages_after: if num_messages_before <= num_messages_after:
logger.warning(
    f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
)
# raise HTTPException(
#     status_code=status.HTTP_400_BAD_REQUEST,
#     detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
# )

logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
    status_code=status.HTTP_400_BAD_REQUEST,
    detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
)
# Checkpoint the messages (this will update the conversation_messages table) # Checkpoint the messages (this will update the conversation_messages table)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages) await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
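A sketch of the new agent-direct mode described above: the URL uses conversation_id="default" and the agent is identified in the request body (agent id, base URL, port, and the /v1 prefix are assumptions for illustration):

import httpx

resp = httpx.post(
    "http://localhost:8283/v1/conversations/default/messages",  # base URL/port assumed
    json={
        "agent_id": "agent-123",  # invented id; required for agent-direct mode
        "streaming": False,       # request a complete JSON response instead of SSE
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.json())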


@@ -29,11 +29,23 @@ from starlette.background import BackgroundTask
from letta.log import get_logger from letta.log import get_logger
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
logger = get_logger(__name__) logger = get_logger(__name__)
_background_tasks: set[asyncio.Task] = set() _background_tasks: set[asyncio.Task] = set()
def _is_syncable_block_markdown_path(path: str) -> bool:
"""Return whether a markdown path should be mirrored into block cache.
Special-case skills so only skill definitions are mirrored:
- sync `skills/{skill_name}/SKILL.md` as label `skills/{skill_name}`
- ignore all other markdown under `skills/`
"""
return memory_block_label_from_markdown_path(path) is not None
router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False) router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False)
# Global storage for the server instance (set during app startup) # Global storage for the server instance (set during app startup)
@@ -100,7 +112,7 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
expected_labels = set() expected_labels = set()
from letta.services.memory_repo.block_markdown import parse_block_markdown from letta.services.memory_repo.block_markdown import parse_block_markdown
md_file_paths = sorted([file_path for file_path in files if file_path.endswith(".md")]) md_file_paths = sorted([file_path for file_path in files if _is_syncable_block_markdown_path(file_path)])
nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]] nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]]
logger.info( logger.info(
"Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s", "Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s",
@@ -113,10 +125,12 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
synced = 0 synced = 0
for file_path, content in files.items(): for file_path, content in files.items():
if not file_path.endswith(".md"): if not _is_syncable_block_markdown_path(file_path):
continue continue
label = file_path[:-3] label = memory_block_label_from_markdown_path(file_path)
if label is None:
continue
expected_labels.add(label) expected_labels.add(label)
# Parse frontmatter to extract metadata alongside value # Parse frontmatter to extract metadata alongside value
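Expected behaviour of the new markdown-path filter (illustrative mapping only; the exact rules live in memory_block_label_from_markdown_path):

# "persona.md"                  -> synced as block label "persona"
# "skills/web_search/SKILL.md"  -> synced as block label "skills/web_search"
# "skills/web_search/notes.md"  -> not synced (only SKILL.md under skills/ is mirrored)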


@@ -364,6 +364,8 @@ def create_approval_request_message_from_llm_response(
) )
if pre_computed_assistant_message_id: if pre_computed_assistant_message_id:
approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id) approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id)
# Set otid to match streaming interface pattern (index -1 returns id unchanged)
approval_message.otid = Message.generate_otid_from_id(approval_message.id, -1)
messages.append(approval_message) messages.append(approval_message)
return messages return messages


@@ -562,6 +562,10 @@ class SyncServer(object):
# update with model_settings # update with model_settings
if request.model_settings is not None: if request.model_settings is not None:
update_llm_config_params = request.model_settings._to_legacy_config_params() update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.
if "max_output_tokens" not in request.model_settings.model_fields_set:
update_llm_config_params.pop("max_tokens", None)
request.llm_config = request.llm_config.model_copy(update=update_llm_config_params) request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)
# Copy parallel_tool_calls from request to llm_config if provided # Copy parallel_tool_calls from request to llm_config if provided
@@ -675,6 +679,12 @@ class SyncServer(object):
# Get the current agent's llm_config if not already set # Get the current agent's llm_config if not already set
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor) agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config = agent.llm_config.model_copy() request.llm_config = agent.llm_config.model_copy()
else:
# TODO: Refactor update_agent to accept partial llm_config so we
# don't need to fetch the full agent just to preserve max_tokens.
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config.max_tokens = agent.llm_config.max_tokens
update_llm_config_params = request.model_settings._to_legacy_config_params() update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller # Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request. # didn't explicitly provide max_output_tokens in the request.
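The guard above relies on Pydantic's model_fields_set, which only contains fields the caller explicitly passed; a standalone toy model (not the real ModelSettings schema) illustrates the behaviour:

from typing import Optional
from pydantic import BaseModel

class ToyModelSettings(BaseModel):
    max_output_tokens: int = 4096
    temperature: Optional[float] = None

s = ToyModelSettings(temperature=0.2)
print(s.max_output_tokens)                        # 4096 (the Pydantic default)
print("max_output_tokens" in s.model_fields_set)  # False -> default value, so don't clobber the agent's own max_tokens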


@@ -24,8 +24,7 @@ from letta.constants import (
INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES, INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES,
RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE, RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE,
) )
from letta.errors import LettaError
from letta.errors import LettaAgentNotFoundError, LettaError, LettaInvalidArgumentError
from letta.helpers import ToolRulesSolver from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time from letta.helpers.datetime_helpers import get_utc_time
from letta.log import get_logger from letta.log import get_logger
@@ -789,6 +788,25 @@ class AgentManager:
agent.agent_type, agent.agent_type,
) )
# Upsert compaction_settings: merge incoming partial update with existing settings
if agent_update.compaction_settings is not None:
# If mode changed, update the prompt to the default for the new mode
changed_fields = agent_update.compaction_settings.model_fields_set
if (
agent.compaction_settings is not None
and "mode" in changed_fields
and agent_update.compaction_settings.mode != agent.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
agent_update.compaction_settings.prompt = get_default_prompt_for_mode(agent_update.compaction_settings.mode)
# Fill in unchanged fields from existing settings
if agent.compaction_settings is not None:
for field in agent.compaction_settings.model_fields:
if field not in changed_fields:
setattr(agent_update.compaction_settings, field, getattr(agent.compaction_settings, field))
scalar_updates = { scalar_updates = {
"name": agent_update.name, "name": agent_update.name,
"system": agent_update.system, "system": agent_update.system,

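The compaction_settings upsert above follows a common partial-update pattern: keep fields the caller did not touch, overwrite the ones they did. A toy model (not the real CompactionSettings) shows the mechanics:

from pydantic import BaseModel

class ToySettings(BaseModel):
    mode: str = "sliding_window"
    prompt: str = "default prompt"

existing = ToySettings(mode="sliding_window", prompt="custom prompt")
update = ToySettings(mode="all")  # caller only set "mode"
for field in existing.model_fields:
    if field not in update.model_fields_set:
        setattr(update, field, getattr(existing, field))
print(update)  # mode='all' prompt='custom prompt'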

@@ -7,6 +7,7 @@ if TYPE_CHECKING:
from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select
from letta.errors import LettaInvalidArgumentError from letta.errors import LettaInvalidArgumentError
from letta.helpers.datetime_helpers import get_utc_time
from letta.orm.agent import Agent as AgentModel from letta.orm.agent import Agent as AgentModel
from letta.orm.block import Block as BlockModel from letta.orm.block import Block as BlockModel
from letta.orm.blocks_conversations import BlocksConversations from letta.orm.blocks_conversations import BlocksConversations
@@ -29,6 +30,21 @@ from letta.utils import enforce_types
class ConversationManager: class ConversationManager:
"""Manager class to handle business logic related to Conversations.""" """Manager class to handle business logic related to Conversations."""
@staticmethod
def _serialize_model_settings(model_settings) -> Optional[dict]:
"""Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
Uses model_dump() to preserve all fields (including the provider_type discriminator),
but removes max_output_tokens when it wasn't explicitly provided by the caller so we
don't persist the Pydantic default (4096) and later overwrite the agent's own value.
"""
if model_settings is None:
return None
data = model_settings.model_dump()
if "max_output_tokens" not in model_settings.model_fields_set:
data.pop("max_output_tokens", None)
return data
@enforce_types @enforce_types
@trace_method @trace_method
async def create_conversation( async def create_conversation(
@@ -56,7 +72,7 @@ class ConversationManager:
summary=conversation_create.summary, summary=conversation_create.summary,
organization_id=actor.organization_id, organization_id=actor.organization_id,
model=conversation_create.model, model=conversation_create.model,
model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None, model_settings=self._serialize_model_settings(conversation_create.model_settings),
) )
await conversation.create_async(session, actor=actor) await conversation.create_async(session, actor=actor)
@@ -73,7 +89,101 @@ class ConversationManager:
pydantic_conversation = conversation.to_pydantic() pydantic_conversation = conversation.to_pydantic()
pydantic_conversation.isolated_block_ids = isolated_block_ids pydantic_conversation.isolated_block_ids = isolated_block_ids
return pydantic_conversation
# Compile and persist the initial system message for this conversation
# This ensures the conversation captures the latest memory block state at creation time
await self.compile_and_save_system_message_for_conversation(
conversation_id=pydantic_conversation.id,
agent_id=agent_id,
actor=actor,
)
return pydantic_conversation
@trace_method
async def compile_and_save_system_message_for_conversation(
self,
conversation_id: str,
agent_id: str,
actor: PydanticUser,
agent_state: Optional["AgentState"] = None,
message_manager: Optional[object] = None,
) -> PydanticMessage:
"""Compile and persist the initial system message for a conversation.
This recompiles the system prompt with the latest memory block values
and metadata, ensuring the conversation starts with an up-to-date
system message.
This is the single source of truth for creating a conversation's system
message — used both at conversation creation time and as a fallback
when a conversation has no messages yet.
Args:
conversation_id: The conversation to add the system message to
agent_id: The agent this conversation belongs to
actor: The user performing the action
agent_state: Optional pre-loaded agent state (avoids redundant DB load)
message_manager: Optional pre-loaded MessageManager instance
Returns:
The persisted system message
"""
# Lazy imports to avoid circular dependencies
from letta.prompts.prompt_generator import PromptGenerator
from letta.services.message_manager import MessageManager
from letta.services.passage_manager import PassageManager
if message_manager is None:
message_manager = MessageManager()
if agent_state is None:
from letta.services.agent_manager import AgentManager
agent_state = await AgentManager().get_agent_by_id_async(
agent_id=agent_id,
include_relationships=["memory", "sources"],
actor=actor,
)
passage_manager = PassageManager()
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_id)
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_id)
# Compile the system message with current memory state
system_message_str = await PromptGenerator.compile_system_message_async(
system_prompt=agent_state.system,
in_context_memory=agent_state.memory,
in_context_memory_last_edit=get_utc_time(),
timezone=agent_state.timezone,
user_defined_variables=None,
append_icm_if_missing=True,
previous_message_count=num_messages,
archival_memory_size=num_archival_memories,
sources=agent_state.sources,
max_files_open=agent_state.max_files_open,
)
system_message = PydanticMessage.dict_to_message(
agent_id=agent_id,
model=agent_state.llm_config.model,
openai_message_dict={"role": "system", "content": system_message_str},
)
# Persist the new system message
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
system_message = persisted_messages[0]
# Add it to the conversation tracking at position 0
await self.add_messages_to_conversation(
conversation_id=conversation_id,
agent_id=agent_id,
message_ids=[system_message.id],
actor=actor,
starting_position=0,
)
return system_message
@enforce_types @enforce_types
@trace_method @trace_method
@@ -133,22 +243,15 @@ class ConversationManager:
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
# Subquery to get the latest completed_at for each conversation # Subquery to get the latest completed_at for each conversation
latest_run_subquery = ( latest_run_subquery = (
select( select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
RunModel.conversation_id,
func.max(RunModel.completed_at).label("last_run_completion")
)
.where(RunModel.conversation_id.isnot(None)) .where(RunModel.conversation_id.isnot(None))
.group_by(RunModel.conversation_id) .group_by(RunModel.conversation_id)
.subquery() .subquery()
) )
# Join conversations with the subquery # Join conversations with the subquery
stmt = ( stmt = select(ConversationModel).outerjoin(
select(ConversationModel) latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
.outerjoin(
latest_run_subquery,
ConversationModel.id == latest_run_subquery.c.conversation_id
)
) )
sort_column = latest_run_subquery.c.last_run_completion sort_column = latest_run_subquery.c.last_run_completion
sort_nulls_last = True sort_nulls_last = True
@@ -170,10 +273,12 @@ class ConversationManager:
# Add summary search filter if provided # Add summary search filter if provided
if summary_search: if summary_search:
conditions.extend([ conditions.extend(
ConversationModel.summary.isnot(None), [
ConversationModel.summary.contains(summary_search), ConversationModel.summary.isnot(None),
]) ConversationModel.summary.contains(summary_search),
]
)
stmt = stmt.where(and_(*conditions)) stmt = stmt.where(and_(*conditions))
@@ -182,10 +287,7 @@ class ConversationManager:
# Get the sort value for the cursor conversation # Get the sort value for the cursor conversation
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
cursor_query = ( cursor_query = (
select( select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
ConversationModel.id,
func.max(RunModel.completed_at).label("last_run_completion")
)
.outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id) .outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
.where(ConversationModel.id == after) .where(ConversationModel.id == after)
.group_by(ConversationModel.id) .group_by(ConversationModel.id)
@@ -198,16 +300,11 @@ class ConversationManager:
# Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID # Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
if ascending: if ascending:
stmt = stmt.where( stmt = stmt.where(
or_( or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
and_(sort_column.is_(None), ConversationModel.id > after_id),
sort_column.isnot(None)
)
) )
else: else:
# If descending, get NULLs with smaller ID # If descending, get NULLs with smaller ID
stmt = stmt.where( stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
and_(sort_column.is_(None), ConversationModel.id < after_id)
)
else: else:
# Cursor is at non-NULL # Cursor is at non-NULL
if ascending: if ascending:
@@ -217,8 +314,8 @@ class ConversationManager:
sort_column.isnot(None), sort_column.isnot(None),
or_( or_(
sort_column > after_sort_value, sort_column > after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id > after_id) and_(sort_column == after_sort_value, ConversationModel.id > after_id),
) ),
) )
) )
else: else:
@@ -227,7 +324,7 @@ class ConversationManager:
or_( or_(
sort_column.is_(None), sort_column.is_(None),
sort_column < after_sort_value, sort_column < after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id < after_id) and_(sort_column == after_sort_value, ConversationModel.id < after_id),
) )
) )
else: else:
@@ -277,7 +374,11 @@ class ConversationManager:
for key, value in update_data.items(): for key, value in update_data.items():
# model_settings needs to be serialized to dict for the JSON column # model_settings needs to be serialized to dict for the JSON column
if key == "model_settings" and value is not None: if key == "model_settings" and value is not None:
setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value) setattr(
conversation,
key,
self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
)
else: else:
setattr(conversation, key, value) setattr(conversation, key, value)

View File

@@ -604,6 +604,9 @@ def _apply_pagination(
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else: else:
sort_column = AgentModel.created_at sort_column = AgentModel.created_at
sort_nulls_last = False sort_nulls_last = False
@@ -637,6 +640,9 @@ async def _apply_pagination_async(
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else: else:
sort_column = AgentModel.created_at sort_column = AgentModel.created_at
sort_nulls_last = False sort_nulls_last = False
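For context, a hedged sketch of how a chosen sort column and the sort_nulls_last flag are typically combined into an ORDER BY clause with SQLAlchemy (simplified, not copied from this file):

from sqlalchemy import asc, desc, nulls_last

def build_order_by(sort_column, ascending: bool, sort_nulls_last: bool):
    # Pick the direction first, then optionally push NULL sort values to the end of the results.
    clause = asc(sort_column) if ascending else desc(sort_column)
    return nulls_last(clause) if sort_nulls_last else clause

# e.g. stmt = stmt.order_by(build_order_by(AgentModel.updated_at, ascending=False, sort_nulls_last=False))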

View File

@@ -73,7 +73,6 @@ class LLMTraceWriter:
def __init__(self): def __init__(self):
self._client = None self._client = None
self._shutdown = False self._shutdown = False
self._write_lock = asyncio.Lock() # Serialize writes - clickhouse_connect isn't thread-safe
# Check if ClickHouse is configured - if not, writing is disabled # Check if ClickHouse is configured - if not, writing is disabled
self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password) self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password)
@@ -82,11 +81,7 @@ class LLMTraceWriter:
atexit.register(self._sync_shutdown) atexit.register(self._sync_shutdown)
def _get_client(self): def _get_client(self):
"""Initialize ClickHouse client on first use (lazy loading). """Initialize ClickHouse client on first use (lazy loading)."""
Configures async_insert with wait_for_async_insert=1 for reliable
server-side batching with acknowledgment.
"""
if self._client is not None: if self._client is not None:
return self._client return self._client
@@ -108,8 +103,10 @@ class LLMTraceWriter:
settings={ settings={
# Enable server-side batching # Enable server-side batching
"async_insert": 1, "async_insert": 1,
# Wait for acknowledgment (reliable) # Don't wait for server-side flush acknowledgment — fire and forget.
"wait_for_async_insert": 1, # Waiting (value=1) caused each insert to hold an asyncio.Lock for ~1s,
# creating unbounded task queues that saturated the event loop under load.
"wait_for_async_insert": 0,
# Flush after 1 second if batch not full # Flush after 1 second if batch not full
"async_insert_busy_timeout_ms": 1000, "async_insert_busy_timeout_ms": 1000,
}, },
@@ -148,15 +145,15 @@ class LLMTraceWriter:
row = trace.to_clickhouse_row() row = trace.to_clickhouse_row()
columns = LLMTrace.clickhouse_columns() columns = LLMTrace.clickhouse_columns()
# Serialize writes - clickhouse_connect client isn't thread-safe # Run synchronous insert in thread pool. clickhouse-connect supports
async with self._write_lock: # multithreaded use via a thread-safe connection pool:
# Run synchronous insert in thread pool # https://clickhouse.com/docs/integrations/language-clients/python/advanced-usage#multithreaded-multiprocess-and-asyncevent-driven-use-cases
await asyncio.to_thread( await asyncio.to_thread(
client.insert, client.insert,
"llm_traces", "llm_traces",
[row], [row],
column_names=columns, column_names=columns,
) )
return # Success return # Success
except Exception as e: except Exception as e:
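A minimal sketch of the write path described above, assuming the clickhouse-connect client (host and credential wiring simplified); illustrative only, not the exact LLMTraceWriter code:

import asyncio
import clickhouse_connect  # assumed dependency

def make_trace_client(host: str, password: str):
    return clickhouse_connect.get_client(
        host=host,
        password=password,
        settings={
            "async_insert": 1,                     # let the server batch rows
            "wait_for_async_insert": 0,            # fire and forget: don't block on the server-side flush
            "async_insert_busy_timeout_ms": 1000,  # flush after 1s if the batch isn't full
        },
    )

async def write_trace(client, row: list, columns: list) -> None:
    # clickhouse-connect supports multithreaded use via its connection pool,
    # so a plain thread-pool hop suffices and no asyncio.Lock is needed.
    await asyncio.to_thread(client.insert, "llm_traces", [row], column_names=columns)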

View File

@@ -3,11 +3,11 @@
File format: File format:
--- ---
description: "Who I am and how I approach work" description: "Who I am and how I approach work"
limit: 20000
--- ---
My name is Memo. I'm a stateful coding assistant... My name is Memo. I'm a stateful coding assistant...
- Frontmatter fields are only rendered when they differ from defaults. - Frontmatter fields are only rendered when they differ from defaults.
- ``limit`` is intentionally excluded from frontmatter (deprecated for git-based memory).
- Files without frontmatter are treated as value-only (backward compat). - Files without frontmatter are treated as value-only (backward compat).
""" """
@@ -37,12 +37,12 @@ def serialize_block(
This is used for initial file creation. For updates to existing files, This is used for initial file creation. For updates to existing files,
prefer `merge_frontmatter_with_body` to preserve user formatting. prefer `merge_frontmatter_with_body` to preserve user formatting.
""" """
# description and limit are always included in frontmatter. # description is always included in frontmatter.
# read_only and metadata are only included when non-default. # read_only and metadata are only included when non-default.
# limit is intentionally excluded (deprecated for git-based memory).
front: Dict[str, Any] = {} front: Dict[str, Any] = {}
front["description"] = description front["description"] = description
front["limit"] = limit if limit is not None else _get_field_default("limit")
if read_only != _get_field_default("read_only"): if read_only != _get_field_default("read_only"):
front["read_only"] = read_only front["read_only"] = read_only
@@ -111,7 +111,6 @@ def merge_frontmatter_with_body(
# Desired values # Desired values
desired_description = description desired_description = description
desired_limit = limit if limit is not None else _get_field_default("limit")
desired_read_only = read_only desired_read_only = read_only
desired_metadata = metadata if metadata is not None else _get_field_default("metadata") desired_metadata = metadata if metadata is not None else _get_field_default("metadata")
@@ -122,8 +121,9 @@ def merge_frontmatter_with_body(
parsed["description"] = desired_description parsed["description"] = desired_description
changed = True changed = True
if "limit" not in parsed or parsed.get("limit") != desired_limit: # Remove limit from frontmatter if it exists (deprecated for git-base memory)
parsed["limit"] = desired_limit if "limit" in parsed:
del parsed["limit"]
changed = True changed = True
if desired_read_only != _get_field_default("read_only"): if desired_read_only != _get_field_default("read_only"):
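To make the resulting file shape concrete, a hedged sketch of roughly what a serialized block is expected to look like after this change (PyYAML used for illustration, defaults simplified):

import yaml  # PyYAML, illustration only

def sketch_serialize_block(value: str, description: str, read_only: bool = False) -> str:
    front = {"description": description}
    if read_only:  # only rendered when it differs from the default
        front["read_only"] = True
    # No "limit" key: it is deprecated for git-based memory and never written to frontmatter.
    return f"---\n{yaml.safe_dump(front, sort_keys=False)}---\n{value}\n"

print(sketch_serialize_block("My name is Memo. I'm a stateful coding assistant...", "Who I am and how I approach work"))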

View File

@@ -21,6 +21,7 @@ from letta.schemas.memory_repo import MemoryCommit
from letta.schemas.user import User as PydanticUser from letta.schemas.user import User as PydanticUser
from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block
from letta.services.memory_repo.git_operations import GitOperations from letta.services.memory_repo.git_operations import GitOperations
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
from letta.services.memory_repo.storage.local import LocalStorageBackend from letta.services.memory_repo.storage.local import LocalStorageBackend
from letta.utils import enforce_types from letta.utils import enforce_types
@@ -133,26 +134,29 @@ class MemfsClient:
except FileNotFoundError: except FileNotFoundError:
return [] return []
# Convert block files to PydanticBlock (metadata is in frontmatter) # Convert block files to PydanticBlock (metadata is in frontmatter).
# skills/{skill_name}/SKILL.md is mapped to block label skills/{skill_name};
# other files under skills/ are intentionally ignored.
blocks = [] blocks = []
for file_path, content in files.items(): for file_path, content in files.items():
if file_path.endswith(".md"): label = memory_block_label_from_markdown_path(file_path)
label = file_path[:-3] if label is None:
continue
parsed = parse_block_markdown(content) parsed = parse_block_markdown(content)
synthetic_uuid = uuid.UUID(hashlib.md5(f"{agent_id}:{label}".encode()).hexdigest()) synthetic_uuid = uuid.UUID(hashlib.md5(f"{agent_id}:{label}".encode()).hexdigest())
blocks.append( blocks.append(
PydanticBlock( PydanticBlock(
id=f"block-{synthetic_uuid}", id=f"block-{synthetic_uuid}",
label=label, label=label,
value=parsed["value"], value=parsed["value"],
description=parsed.get("description"), description=parsed.get("description"),
limit=parsed.get("limit", CORE_MEMORY_BLOCK_CHAR_LIMIT), limit=parsed.get("limit", CORE_MEMORY_BLOCK_CHAR_LIMIT),
read_only=parsed.get("read_only", False), read_only=parsed.get("read_only", False),
metadata=parsed.get("metadata", {}), metadata=parsed.get("metadata", {}),
)
) )
)
return blocks return blocks
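One property worth noting in the block-construction loop above: the synthetic IDs are deterministic, so re-reading the repo yields stable block IDs. A small self-contained illustration mirroring the hashing shown above:

import hashlib
import uuid

def synthetic_block_id(agent_id: str, label: str) -> str:
    # The same agent/label pair always maps to the same synthetic UUID.
    return f"block-{uuid.UUID(hashlib.md5(f'{agent_id}:{label}'.encode()).hexdigest())}"

assert synthetic_block_id("agent-1", "persona") == synthetic_block_id("agent-1", "persona")
assert synthetic_block_id("agent-1", "persona") != synthetic_block_id("agent-2", "persona")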

View File

@@ -0,0 +1,29 @@
"""Helpers for mapping memory-repo markdown paths to block labels.
Special handling for skills:
- sync `skills/{skill_name}/SKILL.md` as block label `skills/{skill_name}`
- ignore all other markdown files under `skills/`
"""
from __future__ import annotations
def memory_block_label_from_markdown_path(path: str) -> str | None:
"""Return block label for a syncable markdown path, else None.
Rules:
- Non-`.md` files are ignored.
- `skills/{skill_name}/SKILL.md` -> `skills/{skill_name}`
- Other `skills/**` markdown files are ignored.
- All other markdown files map to `path[:-3]`.
"""
if not path.endswith(".md"):
return None
if path.startswith("skills/"):
parts = path.split("/")
if len(parts) == 3 and parts[0] == "skills" and parts[1] and parts[2] == "SKILL.md":
return f"skills/{parts[1]}"
return None
return path[:-3]
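The rules above are easiest to read off a few concrete calls; the expected results follow directly from the function body:

assert memory_block_label_from_markdown_path("persona.md") == "persona"
assert memory_block_label_from_markdown_path("skills/search/SKILL.md") == "skills/search"
assert memory_block_label_from_markdown_path("skills/search/notes.md") is None  # non-SKILL.md under skills/
assert memory_block_label_from_markdown_path("skills/README.md") is None        # wrong depth under skills/
assert memory_block_label_from_markdown_path("image.png") is None               # not a markdown file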

View File

@@ -141,6 +141,9 @@ class ClickhouseProviderTraceBackend(ProviderTraceBackendClient):
request_json=request_json_str, request_json=request_json_str,
response_json=response_json_str, response_json=response_json_str,
llm_config_json=llm_config_json_str, llm_config_json=llm_config_json_str,
billing_plan_type=provider_trace.billing_context.plan_type if provider_trace.billing_context else None,
billing_cost_source=provider_trace.billing_context.cost_source if provider_trace.billing_context else None,
billing_customer_id=provider_trace.billing_context.customer_id if provider_trace.billing_context else None,
) )
def _extract_usage(self, response_json: dict, provider: str) -> dict: def _extract_usage(self, response_json: dict, provider: str) -> dict:

View File

@@ -29,7 +29,7 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
) -> ProviderTrace: ) -> ProviderTrace:
"""Write full provider trace to provider_traces table.""" """Write full provider trace to provider_traces table."""
async with db_registry.async_session() as session: async with db_registry.async_session() as session:
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump()) provider_trace_model = ProviderTraceModel(**provider_trace.model_dump(exclude={"billing_context"}))
provider_trace_model.organization_id = actor.organization_id provider_trace_model.organization_id = actor.organization_id
if provider_trace.request_json: if provider_trace.request_json:

View File

@@ -638,7 +638,13 @@ class RunManager:
raise NoResultFound(f"Run with id {run_id} not found") raise NoResultFound(f"Run with id {run_id} not found")
agent_id = run.agent_id agent_id = run.agent_id
logger.debug(f"Cancelling run {run_id} for agent {agent_id}") logger.info(
"[Interrupt] Processing cancellation for run=%s, agent=%s, current_status=%s, current_stop_reason=%s",
run_id,
agent_id,
run.status if run else "unknown",
run.stop_reason if run else "unknown",
)
# Cancellation should be idempotent: if a run is already terminated, treat this as a no-op. # Cancellation should be idempotent: if a run is already terminated, treat this as a no-op.
# This commonly happens when a run finishes between client request and server handling. # This commonly happens when a run finishes between client request and server handling.

View File

@@ -15,6 +15,7 @@ from letta.errors import (
LettaInvalidArgumentError, LettaInvalidArgumentError,
LettaServiceUnavailableError, LettaServiceUnavailableError,
LLMAuthenticationError, LLMAuthenticationError,
LLMEmptyResponseError,
LLMError, LLMError,
LLMRateLimitError, LLMRateLimitError,
LLMTimeoutError, LLMTimeoutError,
@@ -33,6 +34,7 @@ from letta.schemas.letta_request import ClientToolSchema, LettaStreamingRequest
from letta.schemas.letta_response import LettaResponse from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import MessageCreate from letta.schemas.message import MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.usage import LettaUsageStatistics from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User from letta.schemas.user import User
@@ -76,6 +78,8 @@ class StreamingService:
request: LettaStreamingRequest, request: LettaStreamingRequest,
run_type: str = "streaming", run_type: str = "streaming",
conversation_id: Optional[str] = None, conversation_id: Optional[str] = None,
should_lock: bool = False,
billing_context: "BillingContext | None" = None,
) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]: ) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]:
""" """
Create a streaming response for an agent. Create a streaming response for an agent.
@@ -86,6 +90,7 @@ class StreamingService:
request: The LettaStreamingRequest containing all request parameters request: The LettaStreamingRequest containing all request parameters
run_type: Type of run for tracking run_type: Type of run for tracking
conversation_id: Optional conversation ID for conversation-scoped messaging conversation_id: Optional conversation ID for conversation-scoped messaging
should_lock: If True and conversation_id is None, use agent_id as lock key
Returns: Returns:
Tuple of (run object or None, streaming response) Tuple of (run object or None, streaming response)
@@ -116,6 +121,10 @@ class StreamingService:
) )
if conversation.model_settings is not None: if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params() update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params) conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config}) agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -130,12 +139,15 @@ class StreamingService:
model_compatible_token_streaming = self._is_token_streaming_compatible(agent) model_compatible_token_streaming = self._is_token_streaming_compatible(agent)
# Attempt to acquire conversation lock if conversation_id is provided # Determine lock key: use conversation_id if provided, else agent_id if should_lock
# This prevents concurrent message processing for the same conversation lock_key = conversation_id if conversation_id else (agent_id if should_lock else None)
# Attempt to acquire lock if lock_key is set
# This prevents concurrent message processing for the same conversation/agent
# Skip locking if Redis is not available (graceful degradation) # Skip locking if Redis is not available (graceful degradation)
if conversation_id and not isinstance(redis_client, NoopAsyncRedisClient): if lock_key and not isinstance(redis_client, NoopAsyncRedisClient):
await redis_client.acquire_conversation_lock( await redis_client.acquire_conversation_lock(
conversation_id=conversation_id, conversation_id=lock_key,
token=str(uuid4()), token=str(uuid4()),
) )
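The lock-key selection above reduces to a small decision table. A sketch in pure logic, mirroring the one-liner in the diff:

def resolve_lock_key(conversation_id, agent_id, should_lock):
    # conversation_id set             -> lock the conversation
    # no conversation_id, should_lock -> lock the agent (agent-direct mode)
    # neither                         -> no lock (Redis locking is skipped entirely)
    return conversation_id if conversation_id else (agent_id if should_lock else None)

assert resolve_lock_key("conversation-123", "agent-456", False) == "conversation-123"
assert resolve_lock_key(None, "agent-456", True) == "agent-456"
assert resolve_lock_key(None, "agent-456", False) is None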
@@ -163,8 +175,10 @@ class StreamingService:
include_return_message_types=request.include_return_message_types, include_return_message_types=request.include_return_message_types,
actor=actor, actor=actor,
conversation_id=conversation_id, conversation_id=conversation_id,
lock_key=lock_key, # For lock release (may differ from conversation_id)
client_tools=request.client_tools, client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=billing_context,
) )
# handle background streaming if requested # handle background streaming if requested
@@ -195,7 +209,7 @@ class StreamingService:
run_id=run.id, run_id=run.id,
run_manager=self.server.run_manager, run_manager=self.server.run_manager,
actor=actor, actor=actor,
conversation_id=conversation_id, conversation_id=lock_key, # Use lock_key for lock release
), ),
label=f"background_stream_processor_{run.id}", label=f"background_stream_processor_{run.id}",
) )
@@ -251,7 +265,7 @@ class StreamingService:
if settings.track_agent_run and run and run_status: if settings.track_agent_run and run and run_status:
await self.server.run_manager.update_run_by_id_async( await self.server.run_manager.update_run_by_id_async(
run_id=run.id, run_id=run.id,
conversation_id=conversation_id, conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, metadata=run_update_metadata), update=RunUpdate(status=run_status, metadata=run_update_metadata),
actor=actor, actor=actor,
) )
@@ -326,8 +340,10 @@ class StreamingService:
include_return_message_types: Optional[list[MessageType]], include_return_message_types: Optional[list[MessageType]],
actor: User, actor: User,
conversation_id: Optional[str] = None, conversation_id: Optional[str] = None,
lock_key: Optional[str] = None,
client_tools: Optional[list[ClientToolSchema]] = None, client_tools: Optional[list[ClientToolSchema]] = None,
include_compaction_messages: bool = False, include_compaction_messages: bool = False,
billing_context: BillingContext | None = None,
) -> AsyncIterator: ) -> AsyncIterator:
""" """
Create a stream with unified error handling. Create a stream with unified error handling.
@@ -356,6 +372,7 @@ class StreamingService:
conversation_id=conversation_id, conversation_id=conversation_id,
client_tools=client_tools, client_tools=client_tools,
include_compaction_messages=include_compaction_messages, include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
) )
async for chunk in stream: async for chunk in stream:
@@ -442,6 +459,21 @@ class StreamingService:
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n" yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream # Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"
except LLMEmptyResponseError as e:
run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response)
error_message = LettaErrorMessage(
run_id=run_id,
error_type="llm_empty_response",
message="LLM returned an empty response.",
detail=str(e),
)
error_data = {"error": error_message.model_dump()}
logger.warning(f"Run {run_id} stopped with LLM empty response: {e}, error_data: {error_message.model_dump()}")
yield f"data: {stop_reason.model_dump_json()}\n\n"
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
except LLMError as e: except LLMError as e:
run_status = RunStatus.failed run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error) stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error)
@@ -491,7 +523,7 @@ class StreamingService:
stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value
await self.runs_manager.update_run_by_id_async( await self.runs_manager.update_run_by_id_async(
run_id=run_id, run_id=run_id,
conversation_id=conversation_id, conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data), update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data),
actor=actor, actor=actor,
) )

View File

@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
# them just like server.create_agent_async does for agents. # them just like server.create_agent_async does for agents.
if summarizer_config.model_settings is not None: if summarizer_config.model_settings is not None:
update_params = summarizer_config.model_settings._to_legacy_config_params() update_params = summarizer_config.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
return base.model_copy(update=update_params) return base.model_copy(update=update_params)
return base return base
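A short sketch of why the pop matters before model_copy: Pydantic's model_copy(update=...) overwrites every key present in the update dict, so an un-requested default would clobber the base config (FakeLLMConfig is a hypothetical stand-in):

from pydantic import BaseModel

class FakeLLMConfig(BaseModel):  # hypothetical stand-in for the real LLMConfig
    model: str = "some-model"
    max_tokens: int = 8192

base = FakeLLMConfig(max_tokens=8192)
update_params = {"max_tokens": 4096}   # a Pydantic default the caller never asked for
update_params.pop("max_tokens", None)  # dropped when max_output_tokens wasn't explicitly set
assert base.model_copy(update=update_params).max_tokens == 8192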

View File

@@ -196,7 +196,7 @@ async def self_summarize_sliding_window(
return message.tool_calls is not None and len(message.tool_calls) > 0 return message.tool_calls is not None and len(message.tool_calls) > 0
return False return False
post_summarization_buffer = [system_prompt] post_summarization_buffer = []
while approx_token_count >= goal_tokens and eviction_percentage < 1.0: while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
# more eviction percentage # more eviction percentage
eviction_percentage += 0.10 eviction_percentage += 0.10
@@ -217,8 +217,8 @@ async def self_summarize_sliding_window(
# update token count # update token count
logger.info(f"Attempting to compact messages to index {assistant_message_index} messages") logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]] post_summarization_buffer = list(messages[assistant_message_index:])
approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer) approx_token_count = await count_tokens(actor, agent_llm_config, [system_prompt, *post_summarization_buffer])
logger.info( logger.info(
f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}" f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
) )
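A simplified sketch of the accounting after this change: the system prompt counts against the token budget but is never stored in the post-summarization buffer (the real code cuts at an assistant-message boundary; a plain index is used here for brevity):

async def shrink_to_goal(messages, system_prompt, goal_tokens, count_tokens):
    eviction_percentage, buffer = 0.0, list(messages)
    approx = await count_tokens([system_prompt, *buffer])
    while approx >= goal_tokens and eviction_percentage < 1.0:
        eviction_percentage += 0.10
        cut = int(len(messages) * eviction_percentage)  # simplified cut point
        buffer = list(messages[cut:])
        approx = await count_tokens([system_prompt, *buffer])
    return buffer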

View File

@@ -11,7 +11,7 @@ from letta.settings import summarizer_settings
def get_default_summarizer_model(provider_type: ProviderType) -> str | None: def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
"""Get default model for summarization for given provider type.""" """Get default model for summarization for given provider type."""
summarizer_defaults = { summarizer_defaults = {
ProviderType.anthropic: "anthropic/claude-haiku-4-5-20251001", ProviderType.anthropic: "anthropic/claude-haiku-4-5",
ProviderType.openai: "openai/gpt-5-mini", ProviderType.openai: "openai/gpt-5-mini",
ProviderType.google_ai: "google_ai/gemini-2.5-flash", ProviderType.google_ai: "google_ai/gemini-2.5-flash",
} }

View File

@@ -114,7 +114,7 @@ class SummarizerSettings(BaseSettings):
class ModelSettings(BaseSettings): class ModelSettings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="ignore") model_config = SettingsConfigDict(env_file=".env", extra="ignore")
global_max_context_window_limit: int = 32000 global_max_context_window_limit: int = 128000
inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.") inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.")
@@ -204,6 +204,7 @@ class ModelSettings(BaseSettings):
gemini_base_url: str = "https://generativelanguage.googleapis.com/" gemini_base_url: str = "https://generativelanguage.googleapis.com/"
gemini_force_minimum_thinking_budget: bool = False gemini_force_minimum_thinking_budget: bool = False
gemini_max_retries: int = 5 gemini_max_retries: int = 5
gemini_timeout_seconds: float = 600.0
# google vertex # google vertex
google_cloud_project: Optional[str] = None google_cloud_project: Optional[str] = None

View File

@@ -45,30 +45,36 @@ PATH_VALIDATORS = {primitive_type.value: _create_path_validator_factory(primitiv
def _create_conversation_id_or_default_path_validator_factory(): def _create_conversation_id_or_default_path_validator_factory():
"""Conversation IDs accept the usual primitive format or the special value 'default'.""" """Conversation IDs with support for 'default' and agent IDs (backwards compatibility)."""
primitive = PrimitiveType.CONVERSATION.value conversation_primitive = PrimitiveType.CONVERSATION.value
prefix_pattern = PRIMITIVE_ID_PATTERNS[primitive].pattern agent_primitive = PrimitiveType.AGENT.value
# Make the full regex accept either the primitive ID format or 'default'. conversation_pattern = PRIMITIVE_ID_PATTERNS[conversation_primitive].pattern
# `prefix_pattern` already contains the ^...$ anchors. agent_pattern = PRIMITIVE_ID_PATTERNS[agent_primitive].pattern
conversation_or_default_pattern = f"^(default|{prefix_pattern[1:-1]})$" # Make the full regex accept: conversation ID, agent ID, or 'default'.
# Patterns already contain ^...$ anchors, so strip them for the alternation.
conversation_or_agent_or_default_pattern = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"
def factory(): def factory():
return Path( return Path(
description=(f"The conversation identifier. Either the special value 'default' or an ID in the format '{primitive}-<uuid4>'"), description=(
pattern=conversation_or_default_pattern, f"The conversation identifier. Can be a conversation ID ('{conversation_primitive}-<uuid4>'), "
examples=["default", f"{primitive}-123e4567-e89b-42d3-8456-426614174000"], f"'default' for agent-direct mode (with agent_id parameter), "
f"or an agent ID ('{agent_primitive}-<uuid4>') for backwards compatibility (deprecated)."
),
pattern=conversation_or_agent_or_default_pattern,
examples=[
"default",
f"{conversation_primitive}-123e4567-e89b-42d3-8456-426614174000",
f"{agent_primitive}-123e4567-e89b-42d3-8456-426614174000",
],
min_length=1, min_length=1,
max_length=len(primitive) + 1 + 36, max_length=max(len(conversation_primitive), len(agent_primitive)) + 1 + 36,
) )
return factory return factory
# Override conversation ID path validation to also allow the special value 'default'.
PATH_VALIDATORS[PrimitiveType.CONVERSATION.value] = _create_conversation_id_or_default_path_validator_factory()
# Type aliases for common ID types # Type aliases for common ID types
# These can be used directly in route handler signatures for cleaner code # These can be used directly in route handler signatures for cleaner code
AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()] AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()]
@@ -89,6 +95,10 @@ StepId = Annotated[str, PATH_VALIDATORS[PrimitiveType.STEP.value]()]
IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()] IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()]
ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()] ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()]
# Conversation ID with support for 'default' and agent IDs (for agent-direct mode endpoints)
# Backwards compatible - agent-* will be deprecated in favor of conversation_id='default' + agent_id param
ConversationIdOrDefault = Annotated[str, _create_conversation_id_or_default_path_validator_factory()()]
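For a concrete sense of the combined pattern, a hedged sketch (the 'conversation'/'agent' prefix literals and the UUID sub-pattern are assumptions for illustration; the real patterns come from PRIMITIVE_ID_PATTERNS):

import re

UUID4 = r"[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}"
conversation_pattern = rf"^conversation-{UUID4}$"
agent_pattern = rf"^agent-{UUID4}$"
# Strip the ^...$ anchors before alternating, exactly as the factory above does.
combined = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"

assert re.match(combined, "default")
assert re.match(combined, "conversation-123e4567-e89b-42d3-8456-426614174000")
assert re.match(combined, "agent-123e4567-e89b-42d3-8456-426614174000")
assert not re.match(combined, "run-123e4567-e89b-42d3-8456-426614174000")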
# Infrastructure types # Infrastructure types
McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()] McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()]
McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()] McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()]

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "letta" name = "letta"
version = "0.16.5" version = "0.16.6"
description = "Create LLM agents with long-term memory and custom tools" description = "Create LLM agents with long-term memory and custom tools"
authors = [ authors = [
{name = "Letta Team", email = "contact@letta.com"}, {name = "Letta Team", email = "contact@letta.com"},

View File

@@ -2,6 +2,12 @@ import anthropic
import httpx import httpx
import openai import openai
import pytest import pytest
from anthropic.types.beta import (
BetaMessage,
BetaRawMessageStartEvent,
BetaRawMessageStopEvent,
BetaUsage,
)
from google.genai import errors as google_errors from google.genai import errors as google_errors
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
ContextWindowExceededError, ContextWindowExceededError,
LLMBadRequestError, LLMBadRequestError,
LLMConnectionError, LLMConnectionError,
LLMEmptyResponseError,
LLMInsufficientCreditsError, LLMInsufficientCreditsError,
LLMServerError, LLMServerError,
) )
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
result = client.handle_llm_error(error) result = client.handle_llm_error(error)
assert isinstance(result, LLMBadRequestError) assert isinstance(result, LLMBadRequestError)
assert not isinstance(result, LLMInsufficientCreditsError) assert not isinstance(result, LLMInsufficientCreditsError)
@pytest.mark.asyncio
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
"""LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
This tests the case where Opus 4.6 returns a response with:
- BetaRawMessageStartEvent (with usage tokens)
- BetaRawMessageStopEvent (end_turn)
- NO content blocks in between
This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
"""
class FakeAsyncStream:
"""Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
def __init__(self):
self.events = [
# Message start with some usage info
BetaRawMessageStartEvent(
type="message_start",
message=BetaMessage(
id="msg_test_empty",
type="message",
role="assistant",
content=[], # Empty content
model="claude-opus-4-6",
stop_reason="end_turn",
stop_sequence=None,
usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
),
),
# Message stop immediately after start - no content blocks
BetaRawMessageStopEvent(type="message_stop"),
]
self.index = 0
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc, tb):
return None
def __aiter__(self):
return self
async def __anext__(self):
if self.index >= len(self.events):
raise StopAsyncIteration
event = self.events[self.index]
self.index += 1
return event
async def fake_stream_async(self, request_data: dict, llm_config):
return FakeAsyncStream()
monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
llm_client = AnthropicClient()
llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
with pytest.raises(LLMEmptyResponseError):
async for _ in gen:
pass

View File

@@ -0,0 +1,8 @@
{
"context_window": 32000,
"model": "gpt-5.3-codex",
"model_endpoint_type": "openai",
"model_endpoint": "https://api.openai.com/v1",
"model_wrapper": null,
"reasoning_effort": "low"
}

View File

@@ -141,7 +141,7 @@ async def create_test_agent(name, actor, test_id: Optional[str] = None, model="a
model="claude-3-7-sonnet-latest", model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic", model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1", model_endpoint="https://api.anthropic.com/v1",
context_window=32000, context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest", handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True, put_inner_thoughts_in_kwargs=True,
max_tokens=4096, max_tokens=4096,
@@ -193,7 +193,7 @@ async def create_test_batch_item(server, batch_id, agent_id, default_user):
model="claude-3-7-sonnet-latest", model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic", model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1", model_endpoint="https://api.anthropic.com/v1",
context_window=32000, context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest", handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True, put_inner_thoughts_in_kwargs=True,
max_tokens=4096, max_tokens=4096,

View File

@@ -62,12 +62,14 @@ class TestConversationsSDK:
# Create a conversation # Create a conversation
created = client.conversations.create(agent_id=agent.id) created = client.conversations.create(agent_id=agent.id)
# Retrieve it (should have empty in_context_message_ids initially) # Retrieve it (should have system message from creation)
retrieved = client.conversations.retrieve(conversation_id=created.id) retrieved = client.conversations.retrieve(conversation_id=created.id)
assert retrieved.id == created.id assert retrieved.id == created.id
assert retrieved.agent_id == created.agent_id assert retrieved.agent_id == created.agent_id
assert retrieved.in_context_message_ids == [] # Conversation should have 1 system message immediately after creation
assert len(retrieved.in_context_message_ids) == 1
assert retrieved.in_context_message_ids[0].startswith("message-")
# Send a message to the conversation # Send a message to the conversation
list( list(
@@ -566,6 +568,289 @@ class TestConversationsSDK:
# Should not contain the cursor message # Should not contain the cursor message
assert first_message_id not in [m.id for m in messages_after] assert first_message_id not in [m.id for m in messages_after]
def test_agent_direct_messaging_via_conversations_endpoint(self, client: Letta, agent):
"""Test sending messages using agent ID as conversation_id (agent-direct mode).
This allows clients to use a unified endpoint pattern without managing conversation IDs.
"""
# Send a message using the agent ID directly as conversation_id
# This should route to agent-direct mode with locking
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Using agent ID instead of conversation ID
messages=[{"role": "user", "content": "Hello via agent-direct mode!"}],
)
)
# Verify we got a response
assert len(messages) > 0, "Should receive response messages"
# Verify we got an assistant message in the response
assistant_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_agent_direct_messaging_with_locking(self, client: Letta, agent):
"""Test that agent-direct mode properly acquires and releases locks.
Sequential requests should both succeed if locks are properly released.
"""
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
# Send first message via agent-direct mode
messages1 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "First message"}],
)
)
assert len(messages1) > 0, "First message should succeed"
# Send second message - should succeed if lock was released
messages2 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Second message"}],
)
)
assert len(messages2) > 0, "Second message should succeed after lock released"
def test_agent_direct_concurrent_requests_blocked(self, client: Letta, agent):
"""Test that concurrent requests to agent-direct mode are properly serialized.
One request should succeed and one should get a 409 CONVERSATION_BUSY error.
"""
import concurrent.futures
from letta_client import ConflictError
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
results = {"success": 0, "conflict": 0, "other_error": 0}
def send_message(msg: str):
try:
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Agent-direct mode
messages=[{"role": "user", "content": msg}],
)
)
return ("success", messages)
except ConflictError:
return ("conflict", None)
except Exception as e:
return ("other_error", str(e))
# Fire off two messages concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
future1 = executor.submit(send_message, "Concurrent message 1")
future2 = executor.submit(send_message, "Concurrent message 2")
result1 = future1.result()
result2 = future2.result()
# Count results
for result_type, _ in [result1, result2]:
results[result_type] += 1
# One should succeed and one should get conflict
assert results["success"] == 1, f"Expected 1 success, got {results['success']}"
assert results["conflict"] == 1, f"Expected 1 conflict, got {results['conflict']}"
assert results["other_error"] == 0, f"Unexpected errors: {results['other_error']}"
# Now send another message - should succeed since lock is released
messages = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Message after concurrent requests"}],
)
)
assert len(messages) > 0, "Should be able to send message after concurrent requests complete"
def test_agent_direct_list_messages(self, client: Letta, agent):
"""Test listing messages using agent ID as conversation_id."""
# First send a message via agent-direct mode
list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Test message for listing"}],
)
)
# List messages using agent ID
messages_page = client.conversations.messages.list(conversation_id=agent.id)
messages = list(messages_page)
# Should have messages (at least system + user + assistant)
assert len(messages) >= 3, f"Expected at least 3 messages, got {len(messages)}"
# Verify we can find our test message
user_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "user_message"]
assert any("Test message for listing" in str(m.content) for m in user_messages), "Should find our test message"
def test_agent_direct_cancel(self, client: Letta, agent):
"""Test canceling runs using agent ID as conversation_id."""
from letta.settings import settings
# Skip if run tracking is disabled
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# Start a background request that we can cancel
try:
# Send a message in background mode
stream = client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Background message to cancel"}],
background=True,
)
# Consume a bit of the stream to ensure it started
next(iter(stream), None)
# Cancel using agent ID
result = client.conversations.cancel(conversation_id=agent.id)
# Should return results (may be empty if run already completed)
assert isinstance(result, dict), "Cancel should return a dict of results"
except Exception as e:
# If no active runs, that's okay - the run may have completed quickly
if "No active runs" not in str(e):
raise
def test_backwards_compatibility_old_pattern(self, client: Letta, agent, server_url: str):
"""Test that the old pattern (agent_id as conversation_id) still works for backwards compatibility."""
# OLD PATTERN: conversation_id=agent.id (should still work)
# Use raw HTTP requests since SDK might not be up to date
# Test 1: Send message using old pattern
response = requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Testing old pattern still works"}],
"streaming": False,
},
)
assert response.status_code == 200, f"Old pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Test 2: List messages using old pattern
response = requests.get(f"{server_url}/v1/conversations/{agent.id}/messages")
assert response.status_code == 200, f"Old pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
# Verify our message is there
user_messages = [m for m in data if m.get("message_type") == "user_message"]
assert any("Testing old pattern still works" in str(m.get("content", "")) for m in user_messages), "Should find our test message"
def test_new_pattern_send_message(self, client: Letta, agent, server_url: str):
"""Test sending messages using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/messages",
json={
"agent_id": agent.id,
"messages": [{"role": "user", "content": "Testing new pattern send message"}],
"streaming": False,
},
)
assert response.status_code == 200, f"New pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Verify we got an assistant message
assistant_messages = [m for m in data["messages"] if m.get("message_type") == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_new_pattern_list_messages(self, client: Letta, agent, server_url: str):
"""Test listing messages using the new pattern: conversation_id='default' + agent_id query param."""
# First send a message to populate the conversation
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Setup message for list test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.get(
f"{server_url}/v1/conversations/default/messages",
params={"agent_id": agent.id},
)
assert response.status_code == 200, f"New pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
def test_new_pattern_cancel(self, client: Letta, agent, server_url: str):
"""Test canceling runs using the new pattern: conversation_id='default' + agent_id query param."""
from letta.settings import settings
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.post(
f"{server_url}/v1/conversations/default/cancel",
params={"agent_id": agent.id},
)
# Returns 200 with results if runs exist, or 409 if no active runs
assert response.status_code in [200, 409], f"New pattern should work for cancel: {response.text}"
if response.status_code == 200:
data = response.json()
assert isinstance(data, dict), "Cancel should return a dict"
def test_new_pattern_compact(self, client: Letta, agent, server_url: str):
"""Test compacting conversation using the new pattern: conversation_id='default' + agent_id in body."""
# Send many messages to have enough for compaction
for i in range(10):
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": f"Message {i} for compaction test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/compact",
json={"agent_id": agent.id},
)
# May return 200 (success) or 400 (not enough messages to compact)
assert response.status_code in [200, 400], f"New pattern should accept agent_id parameter: {response.text}"
if response.status_code == 200:
data = response.json()
assert "summary" in data, "Response should contain summary"
assert "num_messages_before" in data, "Response should contain num_messages_before"
assert "num_messages_after" in data, "Response should contain num_messages_after"
def test_new_pattern_stream_retrieve(self, client: Letta, agent, server_url: str):
"""Test retrieving stream using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
# Note: This will likely return 400 if no active run exists, which is expected
response = requests.post(
f"{server_url}/v1/conversations/default/stream",
json={"agent_id": agent.id},
)
# Both 200 (if a run exists) and 400 (no active run) are acceptable
assert response.status_code in [200, 400], f"Stream retrieve should accept new pattern: {response.text}"
class TestConversationDelete: class TestConversationDelete:
"""Tests for the conversation delete endpoint.""" """Tests for the conversation delete endpoint."""
@@ -834,3 +1119,130 @@ class TestConversationCompact:
) )
assert response.status_code == 404 assert response.status_code == 404
class TestConversationSystemMessageRecompilation:
    """Tests that verify the system message is recompiled with latest memory state on new conversation creation."""

    def test_new_conversation_recompiles_system_message_with_updated_memory(self, client: Letta, server_url: str):
        """Test the full workflow:
        1. Agent is created
        2. Send message to agent (through a conversation)
        3. Modify the memory block -> check system message is NOT updated with the modified value
        4. Create a new conversation
        5. Check new conversation system message DOES have the modified value
        """
        unique_marker = f"UNIQUE_MARKER_{uuid.uuid4().hex[:8]}"

        # Step 1: Create an agent with known memory blocks
        agent = client.agents.create(
            name=f"test_sys_msg_recompile_{uuid.uuid4().hex[:8]}",
            model="openai/gpt-4o-mini",
            embedding="openai/text-embedding-3-small",
            memory_blocks=[
                {"label": "human", "value": "The user is a test user."},
                {"label": "persona", "value": "You are a helpful assistant."},
            ],
        )
        try:
            # Step 2: Create a conversation and send a message to it
            conv1 = client.conversations.create(agent_id=agent.id)
            list(
                client.conversations.messages.create(
                    conversation_id=conv1.id,
                    messages=[{"role": "user", "content": "Hello, just a quick test."}],
                )
            )

            # Verify the conversation has messages including a system message
            conv1_messages = client.conversations.messages.list(
                conversation_id=conv1.id,
                order="asc",
            )
            assert len(conv1_messages) >= 3  # system + user + assistant
            assert conv1_messages[0].message_type == "system_message"

            # Get the original system message content
            original_system_content = conv1_messages[0].content
            assert unique_marker not in original_system_content, "Marker should not be in original system message"

            # Step 3: Modify the memory block with a unique marker
            client.agents.blocks.update(
                agent_id=agent.id,
                block_label="human",
                value=f"The user is a test user. {unique_marker}",
            )

            # Verify the block was actually updated
            updated_block = client.agents.blocks.retrieve(agent_id=agent.id, block_label="human")
            assert unique_marker in updated_block.value

            # Check that the OLD conversation's system message is NOT updated
            conv1_messages_after_update = client.conversations.messages.list(
                conversation_id=conv1.id,
                order="asc",
            )
            old_system_content = conv1_messages_after_update[0].content
            assert unique_marker not in old_system_content, "Old conversation system message should NOT contain the updated memory value"

            # Step 4: Create a new conversation
            conv2 = client.conversations.create(agent_id=agent.id)

            # Step 5: Check the new conversation's system message has the updated value
            # The system message should be compiled at creation time with the latest memory
            conv2_retrieved = client.conversations.retrieve(conversation_id=conv2.id)
            assert len(conv2_retrieved.in_context_message_ids) == 1, (
                f"New conversation should have exactly 1 system message, got {len(conv2_retrieved.in_context_message_ids)}"
            )

            conv2_messages = client.conversations.messages.list(
                conversation_id=conv2.id,
                order="asc",
            )
            assert len(conv2_messages) >= 1
            assert conv2_messages[0].message_type == "system_message"
            new_system_content = conv2_messages[0].content
            assert unique_marker in new_system_content, (
                f"New conversation system message should contain the updated memory value '{unique_marker}', "
                f"but system message content did not include it"
            )
        finally:
            client.agents.delete(agent_id=agent.id)

    def test_conversation_creation_initializes_system_message(self, client: Letta, server_url: str):
        """Test that creating a conversation immediately initializes it with a system message."""
        agent = client.agents.create(
            name=f"test_conv_init_{uuid.uuid4().hex[:8]}",
            model="openai/gpt-4o-mini",
            embedding="openai/text-embedding-3-small",
            memory_blocks=[
                {"label": "human", "value": "Test user for system message init."},
                {"label": "persona", "value": "You are a helpful assistant."},
            ],
        )
        try:
            # Create a conversation (without sending any messages)
            conversation = client.conversations.create(agent_id=agent.id)

            # Verify the conversation has a system message immediately
            retrieved = client.conversations.retrieve(conversation_id=conversation.id)
            assert len(retrieved.in_context_message_ids) == 1, (
                f"Expected 1 system message after conversation creation, got {len(retrieved.in_context_message_ids)}"
            )

            # Verify the system message content contains memory block values
            messages = client.conversations.messages.list(
                conversation_id=conversation.id,
                order="asc",
            )
            assert len(messages) == 1
            assert messages[0].message_type == "system_message"
            assert "Test user for system message init." in messages[0].content
        finally:
            client.agents.delete(agent_id=agent.id)
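
The behavior these tests pin down can be condensed into the following sketch, using the same client calls as above (illustrative only; names, values, and cleanup are simplified from the tests, not an additional test):

# Sketch of the recompilation behavior exercised by the tests above.
agent = client.agents.create(
    model="openai/gpt-4o-mini",
    embedding="openai/text-embedding-3-small",
    memory_blocks=[{"label": "human", "value": "v1"}],
)
conv1 = client.conversations.create(agent_id=agent.id)  # system message compiled with "v1"
client.agents.blocks.update(agent_id=agent.id, block_label="human", value="v2")
conv2 = client.conversations.create(agent_id=agent.id)  # system message compiled with "v2"
# conv1's existing system message is left untouched; only conv2 reflects the updated block.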

View File

@@ -93,7 +93,7 @@ def agent_obj(client: Letta) -> AgentState:
tool_ids=[send_message_to_agent_tool.id], tool_ids=[send_message_to_agent_tool.id],
model="openai/gpt-4o", model="openai/gpt-4o",
embedding="openai/text-embedding-3-small", embedding="openai/text-embedding-3-small",
context_window_limit=32000, context_window_limit=128000,
) )
yield agent_state_instance yield agent_state_instance
@@ -107,7 +107,7 @@ def other_agent_obj(client: Letta) -> AgentState:
include_multi_agent_tools=False, include_multi_agent_tools=False,
model="openai/gpt-4o", model="openai/gpt-4o",
embedding="openai/text-embedding-3-small", embedding="openai/text-embedding-3-small",
context_window_limit=32000, context_window_limit=128000,
) )
yield agent_state_instance yield agent_state_instance

View File

@@ -366,6 +366,8 @@ async def test_compaction_settings_model_uses_separate_llm_config_for_summarizat
async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user): async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user):
"""When no compaction_settings provided for Anthropic agent, default haiku model should be set.""" """When no compaction_settings provided for Anthropic agent, default haiku model should be set."""
from letta.schemas.agent import CreateAgent from letta.schemas.agent import CreateAgent
from letta.schemas.enums import ProviderType
from letta.services.summarizer.summarizer_config import get_default_summarizer_model
await server.init_async(init_with_default_org_and_user=True) await server.init_async(init_with_default_org_and_user=True)
@@ -384,7 +386,7 @@ async def test_create_agent_sets_default_compaction_model_anthropic(server: Sync
# Should have default haiku model set # Should have default haiku model set
assert agent.compaction_settings is not None assert agent.compaction_settings is not None
assert agent.compaction_settings.model == "anthropic/claude-haiku-4-5-20251001" assert agent.compaction_settings.model == get_default_summarizer_model(ProviderType.anthropic)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -808,6 +810,79 @@ async def test_update_agent_compaction_settings(server: SyncServer, comprehensiv
assert updated_agent.compaction_settings.prompt_acknowledgement == False assert updated_agent.compaction_settings.prompt_acknowledgement == False
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings(server: SyncServer, comprehensive_test_agent_fixture, default_user):
    """Test that an agent's compaction_settings can be upserted."""
    from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode

    agent, _ = comprehensive_test_agent_fixture

    # Create new compaction settings
    original_compaction_settings = agent.compaction_settings.model_copy()
    new_compaction_settings = CompactionSettings(
        mode="all",
        prompt_acknowledgement=True,
        clip_chars=3000,
    )

    # Update agent with compaction settings
    update_agent_request = UpdateAgent(
        compaction_settings=new_compaction_settings,
    )
    updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)

    # Verify compaction settings were updated correctly
    assert updated_agent.compaction_settings is not None
    assert updated_agent.compaction_settings.model == original_compaction_settings.model
    assert updated_agent.compaction_settings.model_settings == original_compaction_settings.model_settings
    assert updated_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
    assert updated_agent.compaction_settings.mode == "all"
    assert updated_agent.compaction_settings.clip_chars == 3000
    assert updated_agent.compaction_settings.prompt == get_default_prompt_for_mode("all")
    assert updated_agent.compaction_settings.prompt_acknowledgement == True


@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings_same_mode(server: SyncServer, comprehensive_test_agent_fixture, default_user):
    """Test that if the mode stays the same without a prompt passed in, the prompt is not updated."""
    agent, _ = comprehensive_test_agent_fixture

    update_agent_request = UpdateAgent(
        compaction_settings=CompactionSettings(mode="sliding_window", prompt="This is a fake prompt."),
    )
    updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)
    assert updated_agent.compaction_settings is not None
    assert updated_agent.compaction_settings.prompt == "This is a fake prompt."

    # Create new compaction settings
    original_compaction_settings = updated_agent.compaction_settings.model_copy()
    new_compaction_settings = CompactionSettings(
        mode="sliding_window",
        model="openai/gpt-4o-mini",
    )

    # Update agent with compaction settings
    update_agent_request = UpdateAgent(
        compaction_settings=new_compaction_settings,
    )
    final_agent = await server.agent_manager.update_agent_async(updated_agent.id, update_agent_request, actor=default_user)

    # Verify compaction settings were updated correctly
    assert final_agent.compaction_settings is not None
    assert final_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
    assert final_agent.compaction_settings.prompt == original_compaction_settings.prompt
    assert final_agent.compaction_settings.clip_chars == original_compaction_settings.clip_chars
    assert final_agent.compaction_settings.prompt_acknowledgement == original_compaction_settings.prompt_acknowledgement
    assert final_agent.compaction_settings.mode == "sliding_window"
    assert final_agent.compaction_settings.model == "openai/gpt-4o-mini"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block): async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block):
"""Test that file-related defaults are set based on the model's context window size""" """Test that file-related defaults are set based on the model's context window size"""

View File

@@ -562,7 +562,9 @@ async def test_update_block(server: SyncServer, default_user):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_update_block_limit(server: SyncServer, default_user): async def test_update_block_limit(server: SyncServer, default_user):
block_manager = BlockManager() block_manager = BlockManager()
block = await block_manager.create_or_update_block_async(PydanticBlock(label="persona", value="Original Content"), actor=default_user) block = await block_manager.create_or_update_block_async(
PydanticBlock(label="persona", value="Original Content", limit=20000), actor=default_user
)
limit = len("Updated Content") * 2000 limit = len("Updated Content") * 2000
update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description") update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description")

View File

@@ -355,8 +355,9 @@ async def test_add_messages_to_conversation(
actor=default_user, actor=default_user,
) )
assert len(message_ids) == 1 # create_conversation auto-creates a system message at position 0
assert message_ids[0] == hello_world_message_fixture.id assert len(message_ids) == 2
assert hello_world_message_fixture.id in message_ids
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -385,8 +386,9 @@ async def test_get_messages_for_conversation(
actor=default_user, actor=default_user,
) )
assert len(messages) == 1 # create_conversation auto-creates a system message at position 0
assert messages[0].id == hello_world_message_fixture.id assert len(messages) == 2
assert any(m.id == hello_world_message_fixture.id for m in messages)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -430,7 +432,10 @@ async def test_message_ordering_in_conversation(conversation_manager, server: Sy
actor=default_user, actor=default_user,
) )
assert retrieved_ids == [m.id for m in messages] # create_conversation auto-creates a system message at position 0,
# so the user messages start at index 1
assert len(retrieved_ids) == len(messages) + 1
assert retrieved_ids[1:] == [m.id for m in messages]
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -489,7 +494,7 @@ async def test_update_in_context_messages(conversation_manager, server: SyncServ
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user): async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user):
"""Test getting message IDs from an empty conversation.""" """Test getting message IDs from a newly created conversation (has auto-created system message)."""
# Create a conversation # Create a conversation
conversation = await conversation_manager.create_conversation( conversation = await conversation_manager.create_conversation(
agent_id=sarah_agent.id, agent_id=sarah_agent.id,
@@ -497,13 +502,14 @@ async def test_empty_conversation_message_ids(conversation_manager, server: Sync
actor=default_user, actor=default_user,
) )
# Get message IDs (should be empty) # create_conversation auto-creates a system message at position 0,
# so a newly created conversation has exactly one message
message_ids = await conversation_manager.get_message_ids_for_conversation( message_ids = await conversation_manager.get_message_ids_for_conversation(
conversation_id=conversation.id, conversation_id=conversation.id,
actor=default_user, actor=default_user,
) )
assert message_ids == [] assert len(message_ids) == 1
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -551,9 +557,11 @@ async def test_list_conversation_messages(conversation_manager, server: SyncServ
actor=default_user, actor=default_user,
) )
assert len(letta_messages) == 2 # create_conversation auto-creates a system message, so we get 3 total
assert len(letta_messages) == 3
# Check message types # Check message types
message_types = [m.message_type for m in letta_messages] message_types = [m.message_type for m in letta_messages]
assert "system_message" in message_types
assert "user_message" in message_types assert "user_message" in message_types
assert "assistant_message" in message_types assert "assistant_message" in message_types
@@ -902,9 +910,12 @@ async def test_list_conversation_messages_ascending_order(conversation_manager,
reverse=False, reverse=False,
) )
# First message should be "Message 0" (oldest) # create_conversation auto-creates a system message at position 0,
assert len(letta_messages) == 3 # so we get 4 messages total (system + 3 user messages)
assert "Message 0" in letta_messages[0].content assert len(letta_messages) == 4
# First message is the auto-created system message; "Message 0" is second
assert letta_messages[0].message_type == "system_message"
assert "Message 0" in letta_messages[1].content
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -949,8 +960,9 @@ async def test_list_conversation_messages_descending_order(conversation_manager,
reverse=True, reverse=True,
) )
# First message should be "Message 2" (newest) # create_conversation auto-creates a system message, so 4 total
assert len(letta_messages) == 3 # First message should be "Message 2" (newest) in descending order
assert len(letta_messages) == 4
assert "Message 2" in letta_messages[0].content assert "Message 2" in letta_messages[0].content
@@ -1081,7 +1093,8 @@ async def test_list_conversation_messages_no_group_id_returns_all(conversation_m
actor=default_user, actor=default_user,
) )
assert len(all_messages) == 3 # create_conversation auto-creates a system message, so 4 total
assert len(all_messages) == 4
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -1137,8 +1150,8 @@ async def test_list_conversation_messages_order_with_pagination(conversation_man
# The first messages should be different # The first messages should be different
assert page_asc[0].content != page_desc[0].content assert page_asc[0].content != page_desc[0].content
# In ascending, first should be "Message 0" # In ascending, first is the auto-created system message, second is "Message 0"
assert "Message 0" in page_asc[0].content assert page_asc[0].message_type == "system_message"
# In descending, first should be "Message 4" # In descending, first should be "Message 4"
assert "Message 4" in page_desc[0].content assert "Message 4" in page_desc[0].content

View File

@@ -579,8 +579,11 @@ async def test_server_startup_syncs_base_providers(default_user, default_organiz
yield item yield item
# Mock the Anthropic AsyncAnthropic client # Mock the Anthropic AsyncAnthropic client
# NOTE: list() must be a regular (non-async) method that returns an async iterable,
# because the real Anthropic SDK's models.list() returns an AsyncPage (which has __aiter__)
# directly, and the code uses `async for model in client.models.list()`.
class MockAnthropicModels: class MockAnthropicModels:
async def list(self): def list(self):
return MockAnthropicAsyncPage(mock_anthropic_models["data"]) return MockAnthropicAsyncPage(mock_anthropic_models["data"])
class MockAsyncAnthropic: class MockAsyncAnthropic:
@@ -877,8 +880,10 @@ async def test_server_startup_handles_api_errors_gracefully(default_user, defaul
for item in self._items: for item in self._items:
yield item yield item
# NOTE: The real SDK's models.list() is a regular (non-async) method that
# returns an AsyncPaginator (which is async-iterable).
class MockAnthropicModels: class MockAnthropicModels:
async def list(self): def list(self):
return MockAnthropicAsyncPage(mock_anthropic_data) return MockAnthropicAsyncPage(mock_anthropic_data)
class MockAsyncAnthropic: class MockAsyncAnthropic:
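
The mocking pattern described in the notes above, as a self-contained sketch (class names mirror the test mocks; the model ids and the demo driver are placeholders, not part of the test suite):

import asyncio


class MockAnthropicAsyncPage:
    """Async-iterable page, standing in for the SDK's AsyncPage."""

    def __init__(self, items):
        self._items = items

    async def __aiter__(self):
        for item in self._items:
            yield item


class MockAnthropicModels:
    # Regular (non-async) method that returns an async iterable, so callers can do
    # `async for model in client.models.list()` without awaiting list() itself.
    def list(self):
        return MockAnthropicAsyncPage([{"id": "claude-placeholder-1"}, {"id": "claude-placeholder-2"}])


async def _demo():
    models = MockAnthropicModels()
    async for model in models.list():
        print(model["id"])


asyncio.run(_demo())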

View File

@@ -0,0 +1,11 @@
{
  "handle": "openai/gpt-5.3-chat-latest",
  "model_settings": {
    "provider_type": "openai",
    "max_output_tokens": 4096,
    "parallel_tool_calls": false,
    "reasoning": {
      "reasoning_effort": "minimal"
    }
  }
}
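
A hypothetical use of this handle, mirroring the client.agents.create calls in the tests above (the agent name and memory block are made up for illustration, and a connected `client` fixture is assumed):

# Hypothetical: creating an agent against the new handle defined in this fixture.
agent = client.agents.create(
    name="gpt53_chat_smoke_test",  # illustrative name, not from the fixture
    model="openai/gpt-5.3-chat-latest",
    embedding="openai/text-embedding-3-small",
    memory_blocks=[{"label": "human", "value": "The user is a test user."}],
)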

View File

@@ -1,11 +1,11 @@
from conftest import create_test_module from conftest import create_test_module
from letta_client import UnprocessableEntityError from letta_client import UnprocessableEntityError
from letta.constants import CORE_MEMORY_HUMAN_CHAR_LIMIT, CORE_MEMORY_PERSONA_CHAR_LIMIT from letta.constants import CORE_MEMORY_BLOCK_CHAR_LIMIT
BLOCKS_CREATE_PARAMS = [ BLOCKS_CREATE_PARAMS = [
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_HUMAN_CHAR_LIMIT}, None), ("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_PERSONA_CHAR_LIMIT}, None), ("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
] ]
BLOCKS_UPDATE_PARAMS = [ BLOCKS_UPDATE_PARAMS = [

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -44,7 +44,7 @@
"provider_name": null, "provider_name": null,
"provider_category": null, "provider_category": null,
"model_wrapper": null, "model_wrapper": null,
"context_window": 32000, "context_window": 128000,
"put_inner_thoughts_in_kwargs": false, "put_inner_thoughts_in_kwargs": false,
"handle": "anthropic/claude-3.5-sonnet", "handle": "anthropic/claude-3.5-sonnet",
"temperature": 1.0, "temperature": 1.0,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -56,7 +56,7 @@
"provider_name": "openai", "provider_name": "openai",
"provider_category": "base", "provider_category": "base",
"model_wrapper": null, "model_wrapper": null,
"context_window": 32000, "context_window": 128000,
"put_inner_thoughts_in_kwargs": true, "put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4o-mini", "handle": "openai/gpt-4o-mini",
"temperature": 1.0, "temperature": 1.0,

View File

@@ -55,7 +55,7 @@
"provider_name": "openai", "provider_name": "openai",
"provider_category": "base", "provider_category": "base",
"model_wrapper": null, "model_wrapper": null,
"context_window": 32000, "context_window": 128000,
"put_inner_thoughts_in_kwargs": true, "put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4.1-mini", "handle": "openai/gpt-4.1-mini",
"temperature": 1.0, "temperature": 1.0,

View File

@@ -16,7 +16,7 @@ def llm_config():
model="claude-3-7-sonnet-20250219", model="claude-3-7-sonnet-20250219",
model_endpoint_type="anthropic", model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1", model_endpoint="https://api.anthropic.com/v1",
context_window=32000, context_window=128000,
handle="anthropic/claude-sonnet-4-20250514", handle="anthropic/claude-sonnet-4-20250514",
put_inner_thoughts_in_kwargs=False, put_inner_thoughts_in_kwargs=False,
max_tokens=4096, max_tokens=4096,

View File

@@ -52,8 +52,17 @@ class TestLogContextMiddleware:
async def get_files(self, agent_id, org_id, ref): async def get_files(self, agent_id, org_id, ref):
assert ref == "HEAD" assert ref == "HEAD"
return { return {
"system/human.md": "---\ndescription: human\nlimit: 20000\n---\nname: sarah", "system/human.md": "---\ndescription: human\n---\nname: sarah",
"system/persona.md": "---\ndescription: persona\nlimit: 20000\n---\nbe helpful", "system/persona.md": "---\ndescription: persona\n---\nbe helpful",
"skills/research-helper/SKILL.md": (
"---\n"
"name: research-helper\n"
"description: Search the web and summarize findings.\n"
"---\n"
"# Research Helper\n\n"
"Use this skill to do deep web research and summarize results.\n"
),
"skills/research-helper/references/details.md": "---\ndescription: nested\n---\nShould not be synced",
} }
class DummyMemoryRepoManager: class DummyMemoryRepoManager:
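
For readability, the skills/research-helper/SKILL.md fixture in the get_files stub above expands to a file like this (escaped newlines rendered):

---
name: research-helper
description: Search the web and summarize findings.
---
# Research Helper

Use this skill to do deep web research and summarize results.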
@@ -95,6 +104,12 @@ class TestLogContextMiddleware:
labels = {call["label"] for call in synced_calls} labels = {call["label"] for call in synced_calls}
assert "system/human" in labels assert "system/human" in labels
assert "system/persona" in labels assert "system/persona" in labels
assert "skills/research-helper" in labels
assert "skills/research-helper/references/details" not in labels
by_label = {call["label"]: call for call in synced_calls}
assert by_label["skills/research-helper"]["description"] == "Search the web and summarize findings."
assert by_label["skills/research-helper"]["value"].startswith("# Research Helper")
def test_extracts_actor_id_from_headers(self, client): def test_extracts_actor_id_from_headers(self, client):
response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"}) response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"})

View File

@@ -25,9 +25,9 @@ def test_chat_memory_init_and_utils(chat_memory: Memory):
def test_memory_limit_validation(chat_memory: Memory): def test_memory_limit_validation(chat_memory: Memory):
with pytest.raises(ValueError): with pytest.raises(ValueError):
ChatMemory(persona="x " * 50000, human="y " * 50000) ChatMemory(persona="x " * 60000, human="y " * 60000)
with pytest.raises(ValueError): with pytest.raises(ValueError):
chat_memory.get_block("persona").value = "x " * 50000 chat_memory.get_block("persona").value = "x " * 60000
def test_get_block_not_found(chat_memory: Memory): def test_get_block_not_found(chat_memory: Memory):
@@ -253,3 +253,104 @@ def test_compile_git_memory_filesystem_handles_leaf_directory_collisions():
assert "system/" in out assert "system/" in out
assert "system.md" in out assert "system.md" in out
assert "human.md" in out assert "human.md" in out
def test_compile_git_memory_filesystem_renders_descriptions_for_non_system_files():
    """Files outside system/ should render their description in the filesystem tree.
    e.g. `reference/api.md (Contains API specifications)`
    System files should NOT render descriptions in the tree.
    """
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100, description="The human block"),
            Block(label="system/persona", value="persona data", limit=100, description="The persona block"),
            Block(label="reference/api", value="api specs", limit=100, description="Contains API specifications"),
            Block(label="notes", value="my notes", limit=100, description="Personal notes and reminders"),
        ],
    )
    out = m.compile()

    # Filesystem tree should exist
    assert "<memory_filesystem>" in out
    # Non-system files should have descriptions rendered
    assert "api.md (Contains API specifications)" in out
    assert "notes.md (Personal notes and reminders)" in out
    # System files should NOT have descriptions in the tree
    assert "human.md (The human block)" not in out
    assert "persona.md (The persona block)" not in out
    # But they should still be in the tree (without description)
    assert "human.md" in out
    assert "persona.md" in out


def test_compile_git_memory_filesystem_no_description_when_empty():
    """Files outside system/ with no description should render without parentheses."""
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100),
            Block(label="notes", value="my notes", limit=100),
            Block(label="reference/api", value="api specs", limit=100, description="API docs"),
        ],
    )
    out = m.compile()

    # notes.md has no description, so no parentheses
    assert "notes.md\n" in out
    # reference/api.md has a description
    assert "api.md (API docs)" in out


def test_compile_git_memory_filesystem_condenses_skills_to_top_level_entries():
    """skills/ should render as top-level skill entries with description.
    We intentionally avoid showing nested files under skills/ in the system
    prompt tree to keep context concise.
    """
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100),
            Block(
                label="skills/searching-messages",
                value="# searching messages",
                limit=100,
                description="Search past messages to recall context.",
            ),
            Block(
                label="skills/creating-skills",
                value="# creating skills",
                limit=100,
                description="Guide for creating effective skills.",
            ),
            Block(
                label="skills/creating-skills/references/workflows",
                value="nested docs",
                limit=100,
                description="Nested workflow docs (should not appear)",
            ),
        ],
    )
    out = m.compile()

    # Condensed top-level skill entries with descriptions.
    assert "searching-messages (Search past messages to recall context.)" in out
    assert "creating-skills (Guide for creating effective skills.)" in out
    # Do not show .md suffixes or nested skill docs in tree.
    assert "searching-messages.md" not in out
    assert "creating-skills.md" not in out
    assert "references/workflows" not in out

View File

@@ -24,6 +24,9 @@ def test_get_headers_user_id_allows_none():
letta_v1_agent=None, letta_v1_agent=None,
letta_v1_agent_message_async=None, letta_v1_agent_message_async=None,
modal_sandbox=None, modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
) )
assert isinstance(headers, HeaderParams) assert isinstance(headers, HeaderParams)
@@ -40,6 +43,9 @@ def test_get_headers_user_id_rejects_invalid_format():
letta_v1_agent=None, letta_v1_agent=None,
letta_v1_agent_message_async=None, letta_v1_agent_message_async=None,
modal_sandbox=None, modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
) )
@@ -54,6 +60,9 @@ def test_get_headers_user_id_accepts_valid_format():
letta_v1_agent=None, letta_v1_agent=None,
letta_v1_agent_message_async=None, letta_v1_agent_message_async=None,
modal_sandbox=None, modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
) )
assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000" assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000"

uv.lock generated
View File

@@ -2510,7 +2510,7 @@ wheels = [
[[package]] [[package]]
name = "letta" name = "letta"
version = "0.16.5" version = "0.16.6"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "aiofiles" }, { name = "aiofiles" },