chore: bump 0.16.6 (#3211)

This commit is contained in:
cthomas
2026-03-03 19:13:07 -08:00
committed by GitHub
84 changed files with 2540 additions and 407 deletions

View File

@@ -260,6 +260,7 @@ model:
base_url: https://generativelanguage.googleapis.com/
force_minimum_thinking_budget: false
max_retries: 5
timeout_seconds: 600.0
# Google Vertex (-> GOOGLE_CLOUD_*)
# google_cloud:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,220 @@
import * as fs from 'fs';
import * as path from 'path';
import { omit } from 'lodash';
import { execSync } from 'child_process';
import { merge, isErrorResult } from 'openapi-merge';
import type { Swagger } from 'atlassian-openapi';
import { RESTRICTED_ROUTE_BASE_PATHS } from '@letta-cloud/sdk-core';
// Locations of the two OpenAPI documents that get merged below.
const lettaWebOpenAPIPath = path.join(
  __dirname, '..', '..', '..', 'web', 'autogenerated', 'letta-web-openapi.json',
);
const lettaAgentsAPIPath = path.join(
  __dirname, '..', '..', 'letta', 'server', 'openapi_letta.json',
);

// Parse both specs up front; a read/parse failure here should crash the
// script loudly rather than produce a partial merge.
const readSpec = (specPath: string): Swagger.SwaggerV3 =>
  JSON.parse(fs.readFileSync(specPath, 'utf8')) as Swagger.SwaggerV3;

const lettaWebOpenAPI = readSpec(lettaWebOpenAPIPath);
const lettaAgentsAPI = readSpec(lettaAgentsAPIPath);
// Drop any agents-spec routes whose base path is restricted from the public API.
const isRestrictedRoute = (routePath: string): boolean =>
  RESTRICTED_ROUTE_BASE_PATHS.some((base) => routePath.startsWith(base));
lettaAgentsAPI.paths = Object.fromEntries(
  Object.entries(lettaAgentsAPI.paths).filter(
    ([routePath]) => !isRestrictedRoute(routePath),
  ),
);

// Normalize trailing slashes so the two specs can be compared path-by-path.
const stripTrailingSlash = (routePath: string): string =>
  routePath.endsWith('/') ? routePath.slice(0, -1) : routePath;

// Index of the agents-spec paths keyed by their slash-normalized form.
const lettaAgentsAPIWithNoEndslash = Object.keys(lettaAgentsAPI.paths).reduce(
  (acc, routePath) => {
    acc[stripTrailingSlash(routePath)] = lettaAgentsAPI.paths[routePath];
    return acc;
  },
  {} as Swagger.SwaggerV3['paths'],
);

// De-duplicate: any path already defined in the agents spec wins, so drop the
// letta-web copy (treating "/foo" and "/foo/" as the same route).
lettaWebOpenAPI.paths = Object.fromEntries(
  Object.entries(lettaWebOpenAPI.paths).filter(
    ([routePath]) => !lettaAgentsAPIWithNoEndslash[stripTrailingSlash(routePath)],
  ),
);
// Some letta-web endpoints return agent objects whose inline schema drifts
// from the canonical AgentState component; point their `agents` arrays at the
// shared $ref instead. Entries are [route path, success status code].
const agentStatePathsToOverride: Array<[string, string]> = [
  ['/v1/templates/{project}/{template_version}/agents', '201'],
  ['/v1/agents/search', '200'],
];
// NOTE: loop variable renamed from `path` — the original shadowed the
// node:path module import inside this loop.
for (const [overridePath, responseCode] of agentStatePathsToOverride) {
  const responseSchema =
    lettaWebOpenAPI.paths[overridePath]?.post?.responses?.[responseCode];
  // Guard every level: a response entry may legitimately have no JSON body,
  // and the original unguarded `.content['application/json'].schema` access
  // would throw in that case.
  const contentSchema = responseSchema?.content?.['application/json']?.schema;
  if (contentSchema?.properties?.agents) {
    // Replace the entire agents array schema with the component reference.
    contentSchema.properties.agents = {
      type: 'array',
      items: {
        $ref: '#/components/schemas/AgentState',
      },
    };
  }
}
// Strip internal headers (user identity, project/source routing, SDK telemetry,
// experimental and billing flags) from every operation so they never appear in
// the published spec.
const STRIPPED_HEADER_NAMES = new Set([
  'user_id',
  'User-Agent',
  'X-Project-Id',
  'X-Letta-Source',
  'X-Stainless-Package-Version',
]);
const STRIPPED_HEADER_PREFIXES = ['X-Experimental', 'X-Billing'];

// A parameter is dropped iff it is a header AND matches the blocklist.
const isStrippedHeader = (param: Record<string, string>): boolean =>
  param.in === 'header' &&
  (STRIPPED_HEADER_NAMES.has(param.name) ||
    STRIPPED_HEADER_PREFIXES.some((prefix) => param.name.startsWith(prefix)));

for (const routePath of Object.keys(lettaAgentsAPI.paths)) {
  // Cast once instead of sprinkling @ts-expect-error: the Swagger path-item
  // type does not support string indexing by method name.
  const pathItem = lettaAgentsAPI.paths[routePath] as Record<
    string,
    { parameters?: Array<Record<string, string>> }
  >;
  for (const method of Object.keys(pathItem)) {
    const operation = pathItem[method];
    if (operation?.parameters) {
      operation.parameters = operation.parameters.filter(
        (param) => !isStrippedHeader(param),
      );
    }
  }
}
// Combine the two specs. Order matters: the agents spec comes first, so its
// definitions take precedence on any residual conflict.
const result = merge([{ oas: lettaAgentsAPI }, { oas: lettaWebOpenAPI }]);
if (isErrorResult(result)) {
  console.error(`${result.message} (${result.type})`);
  process.exit(1);
}

// Normalize the merged document's top-level metadata.
result.output.openapi = '3.1.0';
result.output.info = { title: 'Letta API', version: '1.0.0' };
result.output.servers = [
  { url: 'https://app.letta.com', description: 'Letta Cloud' },
  { url: 'http://localhost:8283', description: 'Self-hosted' },
];

// Every endpoint is authenticated with a bearer token.
result.output.components = {
  ...result.output.components,
  securitySchemes: {
    bearerAuth: { type: 'http', scheme: 'bearer' },
  },
};
result.output.security = [...(result.output.security ?? []), { bearerAuth: [] }];
/**
 * Recursively remove every property named `key` from a JSON-like value,
 * preserving arrays (a naive deep-omit turns arrays into plain objects).
 *
 * Fix: the original returned `omit(obj, key)` as soon as the key existed at
 * a level and never recursed into the remaining values, so occurrences of the
 * key nested under sibling properties survived. We now drop the key at every
 * level AND keep walking the tree, matching the "omit all instances" intent.
 * (Also removes the lodash `omit` dependency from this function.)
 */
function deepOmitPreserveArrays(obj: unknown, key: string): unknown {
  if (Array.isArray(obj)) {
    return obj.map((item) => deepOmitPreserveArrays(item, key));
  }
  if (typeof obj !== 'object' || obj === null) {
    // Primitives (string, number, boolean, null, undefined) pass through.
    return obj;
  }
  return Object.fromEntries(
    Object.entries(obj)
      .filter(([k]) => k !== key)
      .map(([k, v]) => [k, deepOmitPreserveArrays(v, key)]),
  );
}
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'user_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'actor_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'organization_id',
);
fs.writeFileSync(
path.join(__dirname, '..', 'openapi.json'),
JSON.stringify(result.output, null, 2),
);
/**
 * Run Prettier over the generated openapi.json so the committed output stays
 * stable across regenerations. A formatting failure is treated as fatal so
 * CI catches it.
 */
function formatOpenAPIJson() {
  const target = path.join(__dirname, '..', 'openapi.json');
  try {
    execSync(`npx prettier --write "${target}"`, { stdio: 'inherit' });
    console.log('Successfully formatted openapi.json with Prettier');
  } catch (formatError) {
    console.error('Error formatting openapi.json:', formatError);
    process.exit(1);
  }
}
formatOpenAPIJson();

View File

@@ -5,7 +5,7 @@ try:
__version__ = version("letta")
except PackageNotFoundError:
# Fallback for development installations
__version__ = "0.16.5"
__version__ = "0.16.6"
if os.environ.get("LETTA_VERSION"):
__version__ = os.environ["LETTA_VERSION"]

View File

@@ -7,6 +7,7 @@ from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.telemetry_manager import TelemetryManager
@@ -31,6 +32,7 @@ class LettaLLMAdapter(ABC):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
billing_context: BillingContext | None = None,
) -> None:
self.llm_client: LLMClientBase = llm_client
self.llm_config: LLMConfig = llm_config
@@ -40,6 +42,7 @@ class LettaLLMAdapter(ABC):
self.run_id: str | None = run_id
self.org_id: str | None = org_id
self.user_id: str | None = user_id
self.billing_context: BillingContext | None = billing_context
self.message_id: str | None = None
self.request_data: dict | None = None
self.response_data: dict | None = None

View File

@@ -10,7 +10,7 @@ from letta.otel.tracing import log_attributes, safe_json_dumps, trace_method
from letta.schemas.enums import LLMCallType, ProviderType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.llm_config import LLMConfig
from letta.schemas.provider_trace import ProviderTrace
from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.user import User
from letta.settings import settings
from letta.utils import safe_create_task
@@ -36,6 +36,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
billing_context: "BillingContext | None" = None,
) -> None:
super().__init__(
llm_client,
@@ -46,6 +47,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id=run_id,
org_id=org_id,
user_id=user_id,
billing_context=billing_context,
)
self.interface: OpenAIStreamingInterface | AnthropicStreamingInterface | None = None

View File

@@ -51,6 +51,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
billing_context=self.billing_context,
)
try:
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)

View File

@@ -278,6 +278,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
billing_context=self.billing_context,
),
),
label="create_provider_trace",

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, MessageUpdate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -51,7 +52,11 @@ class BaseAgent(ABC):
@abstractmethod
async def step(
self, input_messages: List[MessageCreate], max_steps: int = DEFAULT_MAX_STEPS, run_id: Optional[str] = None
self,
input_messages: List[MessageCreate],
max_steps: int = DEFAULT_MAX_STEPS,
run_id: Optional[str] = None,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Main execution loop for the agent.

View File

@@ -12,6 +12,7 @@ from letta.schemas.user import User
if TYPE_CHECKING:
from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.provider_trace import BillingContext
class BaseAgentV2(ABC):
@@ -52,6 +53,7 @@ class BaseAgentV2(ABC):
request_start_timestamp_ns: int | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -76,6 +78,7 @@ class BaseAgentV2(ABC):
conversation_id: str | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[LettaMessage | LegacyLettaMessage | MessageStreamStatus, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -192,44 +192,15 @@ async def _prepare_in_context_messages_no_persist_async(
# Otherwise, include the full list of messages from the conversation
current_in_context_messages = await message_manager.get_messages_by_ids_async(message_ids=message_ids, actor=actor)
else:
# No messages in conversation yet - compile a new system message for this conversation
# Each conversation gets its own system message (captures memory state at conversation start)
from letta.prompts.prompt_generator import PromptGenerator
from letta.services.passage_manager import PassageManager
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_state.id)
passage_manager = PassageManager()
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_state.id)
system_message_str = await PromptGenerator.compile_system_message_async(
system_prompt=agent_state.system,
in_context_memory=agent_state.memory,
in_context_memory_last_edit=get_utc_time(),
timezone=agent_state.timezone,
user_defined_variables=None,
append_icm_if_missing=True,
previous_message_count=num_messages,
archival_memory_size=num_archival_memories,
sources=agent_state.sources,
max_files_open=agent_state.max_files_open,
)
system_message = Message.dict_to_message(
agent_id=agent_state.id,
model=agent_state.llm_config.model,
openai_message_dict={"role": "system", "content": system_message_str},
)
# Persist the new system message
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
system_message = persisted_messages[0]
# Add it to the conversation tracking
await conversation_manager.add_messages_to_conversation(
# No messages in conversation yet (fallback) - compile a new system message
# Normally this is handled at conversation creation time, but this covers
# edge cases where a conversation exists without a system message.
system_message = await conversation_manager.compile_and_save_system_message_for_conversation(
conversation_id=conversation_id,
agent_id=agent_state.id,
message_ids=[system_message.id],
actor=actor,
starting_position=0,
agent_state=agent_state,
message_manager=message_manager,
)
current_in_context_messages = [system_message]

View File

@@ -48,6 +48,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -179,6 +180,7 @@ class LettaAgent(BaseAgent):
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
dry_run: bool = False,
billing_context: "BillingContext | None" = None,
) -> Union[LettaResponse, dict]:
# TODO (cliandy): pass in run_id and use at send_message endpoints for all step functions
agent_state = await self.agent_manager.get_agent_by_id_async(

View File

@@ -44,6 +44,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import Step, StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool import Tool
@@ -185,6 +186,7 @@ class LettaAgentV2(BaseAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -290,6 +292,7 @@ class LettaAgentV2(BaseAgentV2):
conversation_id: str | None = None, # Not used in V2, but accepted for API compatibility
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -21,7 +21,7 @@ from letta.agents.helpers import (
)
from letta.agents.letta_agent_v2 import LettaAgentV2
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -45,6 +45,7 @@ from letta.schemas.letta_response import LettaResponse, TurnTokenData
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, ToolReturn
from letta.schemas.openai.chat_completion_response import ChoiceLogprobs, ToolCall, ToolCallDenial, UsageStatistics
from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -149,6 +150,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -232,6 +234,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
credit_task = None
@@ -362,6 +365,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.
@@ -419,6 +423,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
elif use_sglang_native:
# Use SGLang native adapter for multi-turn RL training
@@ -431,6 +436,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
# Reset turns tracking for this step
self.turns = []
@@ -444,6 +450,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
try:
@@ -764,7 +771,12 @@ class LettaAgentV3(LettaAgentV2):
]
else:
# Old behavior: UserMessage with packed JSON
return list(Message.to_letta_messages(summary_message))
messages = list(Message.to_letta_messages(summary_message))
# Set otid on returned messages (summary Message doesn't have otid set at creation)
for i, msg in enumerate(messages):
if not msg.otid:
msg.otid = Message.generate_otid_from_id(summary_message.id, i)
return messages
@trace_method
async def _step(
@@ -990,6 +1002,9 @@ class LettaAgentV3(LettaAgentV2):
except ValueError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
raise e
except LLMEmptyResponseError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
raise e
except LLMError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
raise e

View File

@@ -134,7 +134,7 @@ def _flatten_model_settings(d: dict, env_vars: dict[str, str]) -> None:
api_base: yyy -> OPENAI_API_BASE
anthropic:
api_key: zzz -> ANTHROPIC_API_KEY
global_max_context_window_limit: 32000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
global_max_context_window_limit: 128000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
"""
for key, value in d.items():
if isinstance(value, dict):

View File

@@ -74,7 +74,7 @@ DEFAULT_MAX_STEPS = 50
# context window size
MIN_CONTEXT_WINDOW = 4096
DEFAULT_CONTEXT_WINDOW = 32000
DEFAULT_CONTEXT_WINDOW = 128000
# Summarization trigger threshold (multiplier of context_window limit)
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
@@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = {
"deepseek-reasoner": 64000,
# glm (Z.AI)
"glm-4.5": 128000,
"glm-4.6": 200000,
"glm-4.7": 200000,
"glm-5": 200000,
"glm-5-code": 200000,
"glm-4.6": 180000,
"glm-4.7": 180000,
"glm-5": 180000,
"glm-5-code": 180000,
## OpenAI models: https://platform.openai.com/docs/models/overview
# gpt-5
"gpt-5": 272000,
@@ -278,6 +278,8 @@ LLM_MAX_CONTEXT_WINDOW = {
"gpt-5.2-pro": 272000,
"gpt-5.2-pro-2025-12-11": 272000,
"gpt-5.2-codex": 272000,
# gpt-5.3
"gpt-5.3-codex": 272000,
# reasoners
"o1": 200000,
# "o1-pro": 200000, # responses API only
@@ -419,7 +421,7 @@ MAX_ERROR_MESSAGE_CHAR_LIMIT = 1000
# Default memory limits
CORE_MEMORY_PERSONA_CHAR_LIMIT: int = 20000
CORE_MEMORY_HUMAN_CHAR_LIMIT: int = 20000
CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 20000
CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 100000
# Function return limits
FUNCTION_RETURN_CHAR_LIMIT = 50000 # ~300 words

View File

@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
while processing the request."""
class LLMEmptyResponseError(LLMServerError):
"""Error when LLM returns an empty response (no content and no tool calls).
This is a subclass of LLMServerError to maintain retry behavior, but allows
specific handling for empty response cases which may benefit from request
modification before retry.
"""
class LLMTimeoutError(LLMError):
"""Error when LLM request times out"""

View File

@@ -13,6 +13,7 @@ from letta.schemas.letta_message import MessageType
from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.message import Message, MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -69,6 +70,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
use_assistant_message: bool = True,
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
run_ids = []
@@ -100,6 +102,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
run_id=run_id,
use_assistant_message=use_assistant_message,
include_return_message_types=include_return_message_types,
billing_context=billing_context,
)
# Get last response messages

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -62,6 +64,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
await self.run_sleeptime_agents()
@@ -81,6 +84,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
include_return_message_types: list[MessageType] | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -99,6 +103,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -14,6 +14,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -63,6 +65,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
run_ids = await self.run_sleeptime_agents()
@@ -82,6 +85,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -101,6 +105,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -30,6 +30,7 @@ from anthropic.types.beta import (
)
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
from letta.errors import LLMEmptyResponseError
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.log import get_logger
from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
self.inner_thoughts_complete = False
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
# Track whether any content was produced (text or tool calls)
# Used to detect empty responses from models like Opus 4.6
self.has_content = False
# Buffer to handle partial XML tags across chunks
self.partial_tag_buffer = ""
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
# message_delta event are *cumulative*." So we assign, not accumulate.
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
# Don't do anything here! We don't want to stop the stream.
pass
# Check if any content was produced during the stream
# Empty responses (no text and no tool calls) should raise an error
if not self.has_content:
raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
)
elif isinstance(event, BetaRawContentBlockStopEvent):
# If we're exiting a tool use block and there are still buffered messages,
# we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
# Don't do anything here! We don't want to stop the stream.
pass
# Check if any content was produced during the stream
# Empty responses (no text and no tool calls) should raise an error
if not self.has_content:
raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
)
elif isinstance(event, BetaRawContentBlockStopEvent):
self.anthropic_mode = None

View File

@@ -19,6 +19,8 @@ from letta.errors import (
LLMAuthenticationError,
LLMBadRequestError,
LLMConnectionError,
LLMEmptyResponseError,
LLMError,
LLMInsufficientCreditsError,
LLMNotFoundError,
LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
@trace_method
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
# Pass through errors that are already LLMError instances unchanged
# This preserves specific error types like LLMEmptyResponseError
if isinstance(e, LLMError):
return e
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
response.stop_reason,
json.dumps(response_data),
)
raise LLMServerError(
raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={

View File

@@ -9,7 +9,7 @@ from letta.llm_api.google_constants import GOOGLE_MODEL_FOR_API_KEY_CHECK
from letta.llm_api.google_vertex_client import GoogleVertexClient
from letta.log import get_logger
from letta.schemas.llm_config import LLMConfig
from letta.settings import model_settings, settings
from letta.settings import model_settings
logger = get_logger(__name__)
@@ -18,7 +18,7 @@ class GoogleAIClient(GoogleVertexClient):
provider_label = "Google AI"
def _get_client(self, llm_config: Optional[LLMConfig] = None):
timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = self.get_byok_overrides(llm_config)
@@ -30,7 +30,7 @@ class GoogleAIClient(GoogleVertexClient):
)
async def _get_client_async(self, llm_config: Optional[LLMConfig] = None):
timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = await self.get_byok_overrides_async(llm_config)

View File

@@ -14,7 +14,7 @@ from letta.schemas.enums import AgentType, LLMCallType, ProviderCategory
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
from letta.schemas.provider_trace import ProviderTrace
from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.usage import LettaUsageStatistics
from letta.services.telemetry_manager import TelemetryManager
from letta.settings import settings
@@ -48,6 +48,7 @@ class LLMClientBase:
self._telemetry_user_id: Optional[str] = None
self._telemetry_compaction_settings: Optional[Dict] = None
self._telemetry_llm_config: Optional[Dict] = None
self._telemetry_billing_context: Optional[BillingContext] = None
def set_telemetry_context(
self,
@@ -62,6 +63,7 @@ class LLMClientBase:
compaction_settings: Optional[Dict] = None,
llm_config: Optional[Dict] = None,
actor: Optional["User"] = None,
billing_context: Optional[BillingContext] = None,
) -> None:
"""Set telemetry context for provider trace logging."""
if actor is not None:
@@ -76,6 +78,7 @@ class LLMClientBase:
self._telemetry_user_id = user_id
self._telemetry_compaction_settings = compaction_settings
self._telemetry_llm_config = llm_config
self._telemetry_billing_context = billing_context
def extract_usage_statistics(self, response_data: Optional[dict], llm_config: LLMConfig) -> LettaUsageStatistics:
"""Provider-specific usage parsing hook (override in subclasses). Returns LettaUsageStatistics."""
@@ -125,6 +128,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
billing_context=self._telemetry_billing_context,
),
)
except Exception as e:
@@ -186,6 +190,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
billing_context=self._telemetry_billing_context,
),
)
except Exception as e:

View File

@@ -88,7 +88,7 @@ def supports_none_reasoning_effort(model: str) -> bool:
Currently, GPT-5.1 and GPT-5.2 models support the 'none' reasoning effort level.
"""
return model.startswith("gpt-5.1") or model.startswith("gpt-5.2")
return model.startswith("gpt-5.1") or model.startswith("gpt-5.2") or model.startswith("gpt-5.3")
def is_openai_5_model(model: str) -> bool:
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
input=openai_messages_list,
tools=responses_tools,
tool_choice=tool_choice,
max_output_tokens=llm_config.max_tokens,
temperature=llm_config.temperature if supports_temperature_param(model) else None,
parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
)
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
# Handle text configuration (verbosity and response format)
text_config_kwargs = {}
# Only set max_output_tokens if explicitly configured
if llm_config.max_tokens is not None:
data.max_output_tokens = llm_config.max_tokens
# Add verbosity control for GPT-5 models
if supports_verbosity_control(model) and llm_config.verbosity:
text_config_kwargs["verbosity"] = llm_config.verbosity
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
)
request_data = data.model_dump(exclude_unset=True)
# print("responses request data", request_data)
return request_data
@trace_method
@@ -639,6 +641,14 @@ class OpenAIClient(LLMClientBase):
tool.function.strict = False
request_data = data.model_dump(exclude_unset=True)
# Fireworks uses strict validation (additionalProperties: false) and rejects
# reasoning fields that are not in their schema.
is_fireworks = llm_config.model_endpoint and "fireworks.ai" in llm_config.model_endpoint
if is_fireworks and "messages" in request_data:
for message in request_data["messages"]:
for field in ("reasoning_content_signature", "redacted_reasoning_content", "omitted_reasoning_content"):
message.pop(field, None)
# If Ollama
# if llm_config.handle.startswith("ollama/") and llm_config.enable_reasoner:
# Sadly, reasoning via the OpenAI proxy on Ollama only works for Harmony/gpt-oss

View File

@@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient):
}
}
# Z.ai's API uses max_tokens, not max_completion_tokens.
# If max_completion_tokens is sent, Z.ai ignores it and falls back to its
# default of 65536, silently truncating input to ~137K of the 200K context window.
if "max_completion_tokens" in data:
data["max_tokens"] = data.pop("max_completion_tokens")
# Sanitize empty text content — ZAI rejects empty text blocks
if "messages" in data:
for msg in data["messages"]:

View File

@@ -17295,6 +17295,58 @@
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5.3-chat-latest": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"max_tokens": 16384,
"mode": "chat",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/chat/completions", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5.3-codex": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": false,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5-mini": {
"cache_read_input_token_cost": 2.5e-8,
"cache_read_input_token_cost_flex": 1.25e-8,

View File

@@ -44,7 +44,7 @@ class Conversation(SqlalchemyBase, OrganizationMixin):
"ConversationMessage",
back_populates="conversation",
cascade="all, delete-orphan",
lazy="selectin",
lazy="raise",
)
isolated_blocks: Mapped[List["Block"]] = relationship(
"Block",

View File

@@ -69,5 +69,5 @@ class ConversationMessage(SqlalchemyBase, OrganizationMixin):
)
message: Mapped["Message"] = relationship(
"Message",
lazy="selectin",
lazy="raise",
)

View File

@@ -88,8 +88,7 @@ class LettaRequest(BaseModel):
)
top_logprobs: Optional[int] = Field(
default=None,
description="Number of most likely tokens to return at each position (0-20). "
"Requires return_logprobs=True.",
description="Number of most likely tokens to return at each position (0-20). Requires return_logprobs=True.",
)
return_token_ids: bool = Field(
default=False,
@@ -155,6 +154,10 @@ class LettaStreamingRequest(LettaRequest):
class ConversationMessageRequest(LettaRequest):
"""Request for sending messages to a conversation. Streams by default."""
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
streaming: bool = Field(
default=True,
description="If True (default), returns a streaming response (Server-Sent Events). If False, returns a complete JSON response.",
@@ -194,6 +197,10 @@ class CreateBatch(BaseModel):
class RetrieveStreamRequest(BaseModel):
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
starting_after: int = Field(
0, description="Sequence id to use as a cursor for pagination. Response will start streaming after this chunk sequence id"
)

View File

@@ -1,3 +1,4 @@
import re
from typing import TYPE_CHECKING, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
# Set max_tokens defaults based on model (only if not explicitly provided)
if "max_tokens" not in values:
if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
values["max_tokens"] = 128000
elif model.startswith("gpt-5"):
values["max_tokens"] = 16384
elif model == "gpt-4.1":
values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
context_window=272000,
reasoning_effort="none", # Default to "none" for GPT-5.2
verbosity="medium",
max_tokens=16384,
max_tokens=128000,
)
elif model_name == "letta":
return cls(

View File

@@ -95,6 +95,11 @@ class LLMTrace(LettaBase):
response_json: str = Field(..., description="Full response payload as JSON string")
llm_config_json: str = Field(default="", description="LLM config as JSON string")
# Billing context
billing_plan_type: Optional[str] = Field(default=None, description="Subscription tier (e.g., 'basic', 'standard', 'max', 'enterprise')")
billing_cost_source: Optional[str] = Field(default=None, description="Cost source: 'quota' or 'credits'")
billing_customer_id: Optional[str] = Field(default=None, description="Customer ID for cross-referencing billing records")
# Timestamp
created_at: datetime = Field(default_factory=get_utc_time, description="When the trace was created")
@@ -128,6 +133,9 @@ class LLMTrace(LettaBase):
self.request_json,
self.response_json,
self.llm_config_json,
self.billing_plan_type or "",
self.billing_cost_source or "",
self.billing_customer_id or "",
self.created_at,
)
@@ -162,5 +170,8 @@ class LLMTrace(LettaBase):
"request_json",
"response_json",
"llm_config_json",
"billing_plan_type",
"billing_cost_source",
"billing_customer_id",
"created_at",
]

View File

@@ -226,8 +226,6 @@ class Memory(BaseModel, validate_assignment=True):
front_lines = []
if block.description:
front_lines.append(f"description: {block.description}")
if block.limit is not None:
front_lines.append(f"limit: {block.limit}")
if getattr(block, "read_only", False):
front_lines.append("read_only: true")
@@ -291,7 +289,40 @@ class Memory(BaseModel, validate_assignment=True):
s.write("\n\n<memory_filesystem>\n")
def _render_tree(node: dict, prefix: str = ""):
def _render_tree(node: dict, prefix: str = "", in_system: bool = False, path_parts: tuple[str, ...] = ()):
# Render skills/ as concise top-level entries only, using both
# current (`skills/<name>`) and legacy (`skills/<name>/SKILL`) labels.
if path_parts == ("skills",):
skill_entries: list[tuple[str, str]] = []
for name, val in node.items():
if name == LEAF_KEY:
continue
block = None
if isinstance(val, dict):
legacy_skill_block = val.get("SKILL")
if legacy_skill_block is not None and not isinstance(legacy_skill_block, dict):
block = legacy_skill_block
elif LEAF_KEY in val and not isinstance(val[LEAF_KEY], dict):
block = val[LEAF_KEY]
else:
block = val
if block is None:
continue
desc = getattr(block, "description", None)
desc_line = (desc or "").strip().split("\n")[0].strip()
skill_entries.append((name, desc_line))
skill_entries.sort(key=lambda e: e[0])
for i, (name, desc_line) in enumerate(skill_entries):
is_last = i == len(skill_entries) - 1
connector = "└── " if is_last else "├── "
desc_suffix = f" ({desc_line})" if desc_line else ""
s.write(f"{prefix}{connector}{name}{desc_suffix}\n")
return
# Sort: directories first, then files. If a node is both a directory and a
# leaf (LEAF_KEY present), show both <name>/ and <name>.md.
dirs = []
@@ -316,9 +347,24 @@ class Memory(BaseModel, validate_assignment=True):
if is_dir:
s.write(f"{prefix}{connector}{name}/\n")
extension = " " if is_last else ""
_render_tree(node[name], prefix + extension)
_render_tree(
node[name],
prefix + extension,
in_system=in_system or name == "system",
path_parts=(*path_parts, name),
)
else:
s.write(f"{prefix}{connector}{name}.md\n")
# For files outside system/, append the block description
desc_suffix = ""
if not in_system:
val = node[name]
block = val[LEAF_KEY] if isinstance(val, dict) else val
desc = getattr(block, "description", None)
if desc:
desc_line = desc.strip().split("\n")[0].strip()
if desc_line:
desc_suffix = f" ({desc_line})"
s.write(f"{prefix}{connector}{name}.md{desc_suffix}\n")
_render_tree(tree)
s.write("</memory_filesystem>")

View File

@@ -282,10 +282,10 @@ class AnthropicModelSettings(ModelSettings):
description="Soft control for how verbose model output should be, used for GPT-5 models.",
)
# Opus 4.5 effort parameter
effort: Optional[Literal["low", "medium", "high"]] = Field(
# Effort parameter for Opus 4.5, Opus 4.6, and Sonnet 4.6
effort: Optional[Literal["low", "medium", "high", "max"]] = Field(
None,
description="Effort level for Opus 4.5 model (controls token conservation). Not setting this gives similar performance to 'high'.",
description="Effort level for supported Anthropic models (controls token spending). 'max' is only available on Opus 4.6. Not setting this gives similar performance to 'high'.",
)
# Anthropic supports strict mode for tool calling - defaults to False

View File

@@ -3,13 +3,21 @@ from __future__ import annotations
from datetime import datetime
from typing import Any, Dict, Optional
from pydantic import Field
from pydantic import BaseModel, Field
from letta.helpers.datetime_helpers import get_utc_time
from letta.schemas.enums import PrimitiveType
from letta.schemas.letta_base import OrmMetadataBase
class BillingContext(BaseModel):
    """Billing context for LLM request cost tracking.

    Built from the optional ``X-Billing-*`` request headers and threaded through
    message-send paths so provider traces can attribute LLM cost to a customer
    and plan. All fields are optional; the object is only constructed when at
    least one billing header is present.
    """

    # Subscription tier name (e.g. "basic", "enterprise") — presumably mirrors
    # LLMTrace.billing_plan_type; confirm against the trace schema.
    plan_type: Optional[str] = Field(None, description="Subscription tier")
    # Which pool the cost draws from; expected values are 'quota' or 'credits'.
    cost_source: Optional[str] = Field(None, description="Cost source: 'quota' or 'credits'")
    # Billing-system customer identifier for cross-referencing billing records.
    customer_id: Optional[str] = Field(None, description="Customer ID for billing records")
class BaseProviderTrace(OrmMetadataBase):
__id_prefix__ = PrimitiveType.PROVIDER_TRACE.value
@@ -53,6 +61,8 @@ class ProviderTrace(BaseProviderTrace):
compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)")
llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)")
billing_context: Optional[BillingContext] = Field(None, description="Billing context from request headers")
created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")

View File

@@ -14,7 +14,7 @@ from letta.schemas.providers.base import Provider
logger = get_logger(__name__)
ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"}
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro", "chat"}
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro"}
DEFAULT_EMBEDDING_BATCH_SIZE = 1024
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
except Exception as e:
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
@staticmethod
def _openai_default_max_output_tokens(model_name: str) -> int:
"""Return a sensible max-output-tokens default for OpenAI models.
gpt-5.2* / gpt-5.3* support 128k output tokens, except the
`-chat` variants which are capped at 16k.
"""
import re
if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
return 128000
return 16384
def get_default_max_output_tokens(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models (sync fallback)."""
# Simple default for openai
return 16384
return self._openai_default_max_output_tokens(model_name)
async def get_default_max_output_tokens_async(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
if max_output is not None:
return max_output
# Simple default for openai
return 16384
return self._openai_default_max_output_tokens(model_name)
async def _get_models_async(self) -> list[dict]:
from letta.llm_api.openai import openai_get_model_list_async

View File

@@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider
# Z.ai model context windows
# Reference: https://docs.z.ai/
# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k) counts against that --> 180k
MODEL_CONTEXT_WINDOWS = {
"glm-4.5": 128000,
"glm-4.6": 200000,
"glm-4.7": 200000,
"glm-5": 200000,
"glm-5-code": 200000,
"glm-4.6": 180000,
"glm-4.7": 180000,
"glm-5": 180000,
"glm-5-code": 180000,
}

View File

@@ -3,7 +3,7 @@ import uuid
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from sqlalchemy import NullPool, text
from sqlalchemy import NullPool
from sqlalchemy.ext.asyncio import (
AsyncEngine,
AsyncSession,
@@ -88,10 +88,6 @@ class DatabaseRegistry:
try:
async with async_session_factory() as session:
try:
result = await session.execute(text("SELECT pg_backend_pid(), current_setting('statement_timeout')"))
pid, timeout = result.one()
logger.warning(f"[stmt_timeout_debug] pid={pid} statement_timeout={timeout}")
await session.rollback()
yield session
await session.commit()
except asyncio.CancelledError:

View File

@@ -6,6 +6,7 @@ from pydantic import BaseModel
from letta.errors import LettaInvalidArgumentError
from letta.otel.tracing import tracer
from letta.schemas.enums import PrimitiveType
from letta.schemas.provider_trace import BillingContext
from letta.validators import PRIMITIVE_ID_PATTERNS
if TYPE_CHECKING:
@@ -30,18 +31,24 @@ class HeaderParams(BaseModel):
letta_source: Optional[str] = None
sdk_version: Optional[str] = None
experimental_params: Optional[ExperimentalParams] = None
billing_context: Optional[BillingContext] = None
def get_headers(
actor_id: Optional[str] = Header(None, alias="user_id"),
user_agent: Optional[str] = Header(None, alias="User-Agent"),
project_id: Optional[str] = Header(None, alias="X-Project-Id"),
letta_source: Optional[str] = Header(None, alias="X-Letta-Source"),
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version"),
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async"),
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent"),
letta_v1_agent_message_async: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent-Message-Async"),
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox"),
letta_source: Optional[str] = Header(None, alias="X-Letta-Source", include_in_schema=False),
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version", include_in_schema=False),
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async", include_in_schema=False),
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent", include_in_schema=False),
letta_v1_agent_message_async: Optional[str] = Header(
None, alias="X-Experimental-Letta-V1-Agent-Message-Async", include_in_schema=False
),
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox", include_in_schema=False),
billing_plan_type: Optional[str] = Header(None, alias="X-Billing-Plan-Type", include_in_schema=False),
billing_cost_source: Optional[str] = Header(None, alias="X-Billing-Cost-Source", include_in_schema=False),
billing_customer_id: Optional[str] = Header(None, alias="X-Billing-Customer-Id", include_in_schema=False),
) -> HeaderParams:
"""Dependency injection function to extract common headers from requests."""
with tracer.start_as_current_span("dependency.get_headers"):
@@ -63,6 +70,13 @@ def get_headers(
letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None,
modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None,
),
billing_context=BillingContext(
plan_type=billing_plan_type,
cost_source=billing_cost_source,
customer_id=billing_customer_id,
)
if any([billing_plan_type, billing_cost_source, billing_customer_id])
else None,
)

View File

@@ -49,6 +49,7 @@ from letta.schemas.memory import (
)
from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult
from letta.schemas.passage import Passage
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.source import Source
from letta.schemas.tool import Tool
@@ -156,7 +157,7 @@ async def list_agents(
order: Literal["asc", "desc"] = Query(
"desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first"
),
order_by: Literal["created_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
order_by: Literal["created_at", "updated_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
ascending: bool = Query(
False,
description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)",
@@ -1697,6 +1698,7 @@ async def send_message(
actor=actor,
request=request,
run_type="send_message",
billing_context=headers.billing_context,
)
return result
@@ -1767,6 +1769,7 @@ async def send_message(
include_return_message_types=request.include_return_message_types,
client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
)
run_status = result.stop_reason.stop_reason.run_status
return result
@@ -1845,6 +1848,7 @@ async def send_message_streaming(
actor=actor,
request=request,
run_type="send_message_streaming",
billing_context=headers.billing_context,
)
return result
@@ -1868,6 +1872,13 @@ async def cancel_message(
"""
# TODO: WHY DOES THIS CANCEL A LIST OF RUNS?
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for agent=%s by actor=%s (org=%s), explicit_run_ids=%s",
agent_id,
actor.id,
actor.organization_id,
request.run_ids if request else None,
)
if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
run_ids = request.run_ids if request else None
@@ -2036,6 +2047,7 @@ async def _process_message_background(
include_return_message_types: list[MessageType] | None = None,
override_model: str | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> None:
"""Background task to process the message and update run status."""
request_start_timestamp_ns = get_utc_timestamp_ns()
@@ -2067,6 +2079,7 @@ async def _process_message_background(
request_start_timestamp_ns=request_start_timestamp_ns,
include_return_message_types=include_return_message_types,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
runs_manager = RunManager()
from letta.schemas.enums import RunStatus
@@ -2235,6 +2248,7 @@ async def send_message_async(
include_return_message_types=request.include_return_message_types,
override_model=request.override_model,
include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
),
label=f"process_message_background_{run.id}",
)
@@ -2419,7 +2433,11 @@ async def summarize_messages(
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode:
if (
"mode" in changed_fields
and "prompt" not in changed_fields
and agent.compaction_settings.mode != request.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
@@ -2439,7 +2457,7 @@ async def summarize_messages(
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
return CompactionResponse(

View File

@@ -1,5 +1,6 @@
from datetime import timedelta
from typing import Annotated, List, Literal, Optional
from uuid import uuid4
from fastapi import APIRouter, Body, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field
@@ -18,6 +19,7 @@ from letta.schemas.job import LettaRequestConfig
from letta.schemas.letta_message import LettaMessageUnion
from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest
from letta.schemas.letta_response import LettaResponse
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator
@@ -32,7 +34,7 @@ from letta.services.run_manager import RunManager
from letta.services.streaming_service import StreamingService
from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.settings import settings
from letta.validators import ConversationId
from letta.validators import ConversationId, ConversationIdOrDefault
router = APIRouter(prefix="/conversations", tags=["conversations"])
@@ -148,7 +150,8 @@ ConversationMessagesResponse = Annotated[
operation_id="list_conversation_messages",
)
async def list_conversation_messages(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
before: Optional[str] = Query(
@@ -172,8 +175,36 @@ async def list_conversation_messages(
Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all
messages in the conversation, with support for cursor-based pagination.
**Agent-direct mode**: Pass conversation_id="default" with agent_id parameter
to list messages from the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
return await server.get_agent_recall_async(
agent_id=resolved_agent_id,
after=after,
before=before,
limit=limit,
group_id=group_id,
conversation_id=None, # Default conversation (no isolation)
reverse=(order == "desc"),
return_message_object=False,
include_err=include_err,
actor=actor,
)
return await conversation_manager.list_conversation_messages(
conversation_id=conversation_id,
actor=actor,
@@ -186,6 +217,108 @@ async def list_conversation_messages(
)
async def _send_agent_direct_message(
    agent_id: str,
    request: ConversationMessageRequest,
    server: SyncServer,
    actor,
    billing_context: "BillingContext | None" = None,
) -> StreamingResponse | LettaResponse:
    """
    Handle agent-direct messaging with locking but without conversation features.

    This is used when the conversation_id in the URL is actually an agent ID,
    providing a unified endpoint while maintaining agent-level locking.

    Streaming requests (the default) are delegated to StreamingService, which
    handles its own locking (should_lock=True). Non-streaming requests acquire
    a Redis conversation lock keyed by agent_id, optionally create a tracking
    run, and execute the agent loop inline, releasing the lock in a finally
    block.
    """
    redis_client = await get_redis_client()

    # Streaming mode (default): translate the conversation-style request into a
    # LettaStreamingRequest and hand it to the streaming service.
    if request.streaming:
        streaming_request = LettaStreamingRequest(
            messages=request.messages,
            streaming=True,
            stream_tokens=request.stream_tokens,
            include_pings=request.include_pings,
            background=request.background,
            max_steps=request.max_steps,
            use_assistant_message=request.use_assistant_message,
            assistant_message_tool_name=request.assistant_message_tool_name,
            assistant_message_tool_kwarg=request.assistant_message_tool_kwarg,
            include_return_message_types=request.include_return_message_types,
            override_model=request.override_model,
            client_tools=request.client_tools,
        )

        streaming_service = StreamingService(server)
        # conversation_id=None targets the agent's default conversation;
        # should_lock=True preserves agent-level serialization of requests.
        run, result = await streaming_service.create_agent_stream(
            agent_id=agent_id,
            actor=actor,
            request=streaming_request,
            run_type="send_message",
            conversation_id=None,
            should_lock=True,
            billing_context=billing_context,
        )
        return result

    # Non-streaming mode with locking
    agent = await server.agent_manager.get_agent_by_id_async(
        agent_id,
        actor,
        include_relationships=["memory", "multi_agent_group", "sources", "tool_exec_environment_variables", "tools", "tags"],
    )

    # Handle model override if specified in the request (copy-on-write: the
    # stored agent state is not mutated).
    if request.override_model:
        override_llm_config = await server.get_llm_config_from_handle_async(
            actor=actor,
            handle=request.override_model,
        )
        agent = agent.model_copy(update={"llm_config": override_llm_config})

    # Acquire lock using agent_id as lock key. Skipped for the no-op Redis
    # client; NOTE(review): the release in the finally block below runs
    # unconditionally — presumably a no-op on NoopAsyncRedisClient, confirm.
    if not isinstance(redis_client, NoopAsyncRedisClient):
        await redis_client.acquire_conversation_lock(
            conversation_id=agent_id,
            token=str(uuid4()),
        )

    try:
        # Create a run for execution tracking (only when run tracking is
        # enabled in settings; otherwise run stays None).
        run = None
        if settings.track_agent_run:
            runs_manager = RunManager()
            run = await runs_manager.create_run(
                pydantic_run=PydanticRun(
                    agent_id=agent_id,
                    background=False,
                    metadata={
                        "run_type": "send_message",
                    },
                    request_config=LettaRequestConfig.from_letta_request(request),
                ),
                actor=actor,
            )

        # Set run_id in Redis for cancellation support
        await redis_client.set(f"{REDIS_RUN_ID_PREFIX}:{agent_id}", run.id if run else None)

        # Execute the agent loop synchronously; conversation_id=None selects
        # the agent's default conversation (no conversation isolation).
        agent_loop = AgentLoop.load(agent_state=agent, actor=actor)
        return await agent_loop.step(
            request.messages,
            max_steps=request.max_steps,
            run_id=run.id if run else None,
            use_assistant_message=request.use_assistant_message,
            include_return_message_types=request.include_return_message_types,
            client_tools=request.client_tools,
            conversation_id=None,
            include_compaction_messages=request.include_compaction_messages,
            billing_context=billing_context,
        )
    finally:
        # Release lock
        await redis_client.release_conversation_lock(agent_id)
@router.post(
"/{conversation_id}/messages",
response_model=LettaResponse,
@@ -201,7 +334,7 @@ async def list_conversation_messages(
},
)
async def send_conversation_message(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
request: ConversationMessageRequest = Body(...),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
@@ -212,12 +345,36 @@ async def send_conversation_message(
This endpoint sends a message to an existing conversation.
By default (streaming=true), returns a streaming response (Server-Sent Events).
Set streaming=false to get a complete JSON response.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to send messages to the agent's default conversation with locking.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
if not request.messages or len(request.messages) == 0:
raise HTTPException(status_code=422, detail="Messages must not be empty")
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: use agent ID, enable locking, skip conversation features
return await _send_agent_direct_message(
agent_id=resolved_agent_id,
request=request,
server=server,
actor=actor,
billing_context=headers.billing_context,
)
# Normal conversation mode
conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id,
actor=actor,
@@ -247,6 +404,7 @@ async def send_conversation_message(
request=streaming_request,
run_type="send_conversation_message",
conversation_id=conversation_id,
billing_context=headers.billing_context,
)
return result
@@ -265,6 +423,10 @@ async def send_conversation_message(
)
if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -305,6 +467,7 @@ async def send_conversation_message(
client_tools=request.client_tools,
conversation_id=conversation_id,
include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
)
@@ -341,7 +504,7 @@ async def send_conversation_message(
},
)
async def retrieve_conversation_stream(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
request: RetrieveStreamRequest = Body(None),
headers: HeaderParams = Depends(get_headers),
server: SyncServer = Depends(get_letta_server),
@@ -351,11 +514,35 @@ async def retrieve_conversation_stream(
This endpoint allows you to reconnect to an active background stream
for a conversation, enabling recovery from network interruptions.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to retrieve the stream for the agent's most recent active run.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
runs_manager = RunManager()
# Find the most recent active run for this conversation
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
# Find the most recent active run
if resolved_agent_id:
# Agent-direct mode: find runs by agent_id
active_runs = await runs_manager.list_runs(
actor=actor,
agent_id=resolved_agent_id,
statuses=[RunStatus.created, RunStatus.running],
limit=1,
ascending=False,
)
else:
# Normal mode: find runs by conversation_id
active_runs = await runs_manager.list_runs(
actor=actor,
conversation_id=conversation_id,
@@ -417,7 +604,8 @@ async def retrieve_conversation_stream(
@router.post("/{conversation_id}/cancel", operation_id="cancel_conversation")
async def cancel_conversation(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
) -> dict:
@@ -425,17 +613,48 @@ async def cancel_conversation(
Cancel runs associated with a conversation.
Note: To cancel active runs, Redis is required.
**Agent-direct mode**: Pass conversation_id="default" with agent_id query parameter
to cancel runs for the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for conversation=%s by actor=%s (org=%s)",
conversation_id,
actor.id,
actor.organization_id,
)
if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: use agent_id directly, skip conversation lookup
# Find active runs for this agent (default conversation has conversation_id=None)
runs = await server.run_manager.list_runs(
actor=actor,
agent_id=resolved_agent_id,
statuses=[RunStatus.created, RunStatus.running],
ascending=False,
limit=100,
)
else:
# Verify conversation exists and get agent_id
conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id,
actor=actor,
)
agent_id = conversation.agent_id
# Find active runs for this conversation
runs = await server.run_manager.list_runs(
@@ -445,6 +664,7 @@ async def cancel_conversation(
conversation_id=conversation_id,
limit=100,
)
run_ids = [run.id for run in runs]
if not run_ids:
@@ -461,7 +681,7 @@ async def cancel_conversation(
except Exception as e:
logger.error(f"Failed to cancel Lettuce run {run_id}: {e}")
await server.run_manager.cancel_run(actor=actor, agent_id=conversation.agent_id, run_id=run_id)
await server.run_manager.cancel_run(actor=actor, agent_id=agent_id, run_id=run_id)
except Exception as e:
results[run_id] = "failed"
logger.error(f"Failed to cancel run {run_id}: {str(e)}")
@@ -473,6 +693,10 @@ async def cancel_conversation(
class CompactionRequest(BaseModel):
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
compaction_settings: Optional[CompactionSettings] = Field(
default=None,
description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.",
@@ -487,7 +711,7 @@ class CompactionResponse(BaseModel):
@router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation")
async def compact_conversation(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
request: Optional[CompactionRequest] = Body(default=None),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
@@ -497,9 +721,28 @@ async def compact_conversation(
This endpoint summarizes the in-context messages for a specific conversation,
reducing the message count while preserving important context.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to compact the agent's default conversation messages.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: compact agent's default conversation
agent = await server.agent_manager.get_agent_by_id_async(resolved_agent_id, actor, include_relationships=["multi_agent_group"])
in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
else:
# Get the conversation to find the agent_id
conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id,
@@ -515,16 +758,36 @@ async def compact_conversation(
actor=actor,
)
# Create agent loop with conversation context
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
if not in_context_messages:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No in-context messages found for this conversation.",
)
# Create agent loop with conversation context
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
# Merge request compaction_settings with agent's settings (request overrides agent)
if agent.compaction_settings and request and request.compaction_settings:
# Start with agent's settings, override with new values from request
# Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
compaction_settings = agent.compaction_settings.copy() # do not mutate original agent compaction settings
changed_fields = request.compaction_settings.model_fields_set
for field in changed_fields:
setattr(compaction_settings, field, getattr(request.compaction_settings, field))
compaction_settings = request.compaction_settings if request else None
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if (
"mode" in changed_fields
and "prompt" not in changed_fields
and agent.compaction_settings.mode != request.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
else:
compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
num_messages_before = len(in_context_messages)
# Run compaction
@@ -537,13 +800,11 @@ async def compact_conversation(
# Validate compaction reduced messages
if num_messages_before <= num_messages_after:
logger.warning(
f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
)
# raise HTTPException(
# status_code=status.HTTP_400_BAD_REQUEST,
# detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
# )
# Checkpoint the messages (this will update the conversation_messages table)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)

View File

@@ -29,11 +29,23 @@ from starlette.background import BackgroundTask
from letta.log import get_logger
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
logger = get_logger(__name__)
_background_tasks: set[asyncio.Task] = set()
def _is_syncable_block_markdown_path(path: str) -> bool:
"""Return whether a markdown path should be mirrored into block cache.
Special-case skills so only skill definitions are mirrored:
- sync `skills/{skill_name}/SKILL.md` as label `skills/{skill_name}`
- ignore all other markdown under `skills/`
"""
return memory_block_label_from_markdown_path(path) is not None
router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False)
# Global storage for the server instance (set during app startup)
@@ -100,7 +112,7 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
expected_labels = set()
from letta.services.memory_repo.block_markdown import parse_block_markdown
md_file_paths = sorted([file_path for file_path in files if file_path.endswith(".md")])
md_file_paths = sorted([file_path for file_path in files if _is_syncable_block_markdown_path(file_path)])
nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]]
logger.info(
"Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s",
@@ -113,10 +125,12 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
synced = 0
for file_path, content in files.items():
if not file_path.endswith(".md"):
if not _is_syncable_block_markdown_path(file_path):
continue
label = file_path[:-3]
label = memory_block_label_from_markdown_path(file_path)
if label is None:
continue
expected_labels.add(label)
# Parse frontmatter to extract metadata alongside value

View File

@@ -364,6 +364,8 @@ def create_approval_request_message_from_llm_response(
)
if pre_computed_assistant_message_id:
approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id)
# Set otid to match streaming interface pattern (index -1 returns id unchanged)
approval_message.otid = Message.generate_otid_from_id(approval_message.id, -1)
messages.append(approval_message)
return messages

View File

@@ -562,6 +562,10 @@ class SyncServer(object):
# update with model_settings
if request.model_settings is not None:
update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.
if "max_output_tokens" not in request.model_settings.model_fields_set:
update_llm_config_params.pop("max_tokens", None)
request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)
# Copy parallel_tool_calls from request to llm_config if provided
@@ -675,6 +679,12 @@ class SyncServer(object):
# Get the current agent's llm_config if not already set
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config = agent.llm_config.model_copy()
else:
# TODO: Refactor update_agent to accept partial llm_config so we
# don't need to fetch the full agent just to preserve max_tokens.
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config.max_tokens = agent.llm_config.max_tokens
update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.

View File

@@ -24,8 +24,7 @@ from letta.constants import (
INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES,
RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE,
)
from letta.errors import LettaAgentNotFoundError, LettaError, LettaInvalidArgumentError
from letta.errors import LettaError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time
from letta.log import get_logger
@@ -789,6 +788,25 @@ class AgentManager:
agent.agent_type,
)
# Upsert compaction_settings: merge incoming partial update with existing settings
if agent_update.compaction_settings is not None:
# If mode changed, update the prompt to the default for the new mode
changed_fields = agent_update.compaction_settings.model_fields_set
if (
agent.compaction_settings is not None
and "mode" in changed_fields
and agent_update.compaction_settings.mode != agent.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
agent_update.compaction_settings.prompt = get_default_prompt_for_mode(agent_update.compaction_settings.mode)
# Fill in unchanged fields from existing settings
if agent.compaction_settings is not None:
for field in agent.compaction_settings.model_fields:
if field not in changed_fields:
setattr(agent_update.compaction_settings, field, getattr(agent.compaction_settings, field))
scalar_updates = {
"name": agent_update.name,
"system": agent_update.system,

View File

@@ -7,6 +7,7 @@ if TYPE_CHECKING:
from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select
from letta.errors import LettaInvalidArgumentError
from letta.helpers.datetime_helpers import get_utc_time
from letta.orm.agent import Agent as AgentModel
from letta.orm.block import Block as BlockModel
from letta.orm.blocks_conversations import BlocksConversations
@@ -29,6 +30,21 @@ from letta.utils import enforce_types
class ConversationManager:
"""Manager class to handle business logic related to Conversations."""
@staticmethod
def _serialize_model_settings(model_settings) -> Optional[dict]:
"""Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
Uses model_dump() to preserve all fields (including the provider_type discriminator),
but removes max_output_tokens when it wasn't explicitly provided by the caller so we
don't persist the Pydantic default (4096) and later overwrite the agent's own value.
"""
if model_settings is None:
return None
data = model_settings.model_dump()
if "max_output_tokens" not in model_settings.model_fields_set:
data.pop("max_output_tokens", None)
return data
@enforce_types
@trace_method
async def create_conversation(
@@ -56,7 +72,7 @@ class ConversationManager:
summary=conversation_create.summary,
organization_id=actor.organization_id,
model=conversation_create.model,
model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None,
model_settings=self._serialize_model_settings(conversation_create.model_settings),
)
await conversation.create_async(session, actor=actor)
@@ -73,8 +89,102 @@ class ConversationManager:
pydantic_conversation = conversation.to_pydantic()
pydantic_conversation.isolated_block_ids = isolated_block_ids
# Compile and persist the initial system message for this conversation
# This ensures the conversation captures the latest memory block state at creation time
await self.compile_and_save_system_message_for_conversation(
conversation_id=pydantic_conversation.id,
agent_id=agent_id,
actor=actor,
)
return pydantic_conversation
@trace_method
async def compile_and_save_system_message_for_conversation(
self,
conversation_id: str,
agent_id: str,
actor: PydanticUser,
agent_state: Optional["AgentState"] = None,
message_manager: Optional[object] = None,
) -> PydanticMessage:
"""Compile and persist the initial system message for a conversation.
This recompiles the system prompt with the latest memory block values
and metadata, ensuring the conversation starts with an up-to-date
system message.
This is the single source of truth for creating a conversation's system
message — used both at conversation creation time and as a fallback
when a conversation has no messages yet.
Args:
conversation_id: The conversation to add the system message to
agent_id: The agent this conversation belongs to
actor: The user performing the action
agent_state: Optional pre-loaded agent state (avoids redundant DB load)
message_manager: Optional pre-loaded MessageManager instance
Returns:
The persisted system message
"""
# Lazy imports to avoid circular dependencies
from letta.prompts.prompt_generator import PromptGenerator
from letta.services.message_manager import MessageManager
from letta.services.passage_manager import PassageManager
if message_manager is None:
message_manager = MessageManager()
if agent_state is None:
from letta.services.agent_manager import AgentManager
agent_state = await AgentManager().get_agent_by_id_async(
agent_id=agent_id,
include_relationships=["memory", "sources"],
actor=actor,
)
passage_manager = PassageManager()
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_id)
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_id)
# Compile the system message with current memory state
system_message_str = await PromptGenerator.compile_system_message_async(
system_prompt=agent_state.system,
in_context_memory=agent_state.memory,
in_context_memory_last_edit=get_utc_time(),
timezone=agent_state.timezone,
user_defined_variables=None,
append_icm_if_missing=True,
previous_message_count=num_messages,
archival_memory_size=num_archival_memories,
sources=agent_state.sources,
max_files_open=agent_state.max_files_open,
)
system_message = PydanticMessage.dict_to_message(
agent_id=agent_id,
model=agent_state.llm_config.model,
openai_message_dict={"role": "system", "content": system_message_str},
)
# Persist the new system message
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
system_message = persisted_messages[0]
# Add it to the conversation tracking at position 0
await self.add_messages_to_conversation(
conversation_id=conversation_id,
agent_id=agent_id,
message_ids=[system_message.id],
actor=actor,
starting_position=0,
)
return system_message
@enforce_types
@trace_method
async def get_conversation_by_id(
@@ -133,22 +243,15 @@ class ConversationManager:
if sort_by == "last_run_completion":
# Subquery to get the latest completed_at for each conversation
latest_run_subquery = (
select(
RunModel.conversation_id,
func.max(RunModel.completed_at).label("last_run_completion")
)
select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
.where(RunModel.conversation_id.isnot(None))
.group_by(RunModel.conversation_id)
.subquery()
)
# Join conversations with the subquery
stmt = (
select(ConversationModel)
.outerjoin(
latest_run_subquery,
ConversationModel.id == latest_run_subquery.c.conversation_id
)
stmt = select(ConversationModel).outerjoin(
latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
)
sort_column = latest_run_subquery.c.last_run_completion
sort_nulls_last = True
@@ -170,10 +273,12 @@ class ConversationManager:
# Add summary search filter if provided
if summary_search:
conditions.extend([
conditions.extend(
[
ConversationModel.summary.isnot(None),
ConversationModel.summary.contains(summary_search),
])
]
)
stmt = stmt.where(and_(*conditions))
@@ -182,10 +287,7 @@ class ConversationManager:
# Get the sort value for the cursor conversation
if sort_by == "last_run_completion":
cursor_query = (
select(
ConversationModel.id,
func.max(RunModel.completed_at).label("last_run_completion")
)
select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
.outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
.where(ConversationModel.id == after)
.group_by(ConversationModel.id)
@@ -198,16 +300,11 @@ class ConversationManager:
# Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
if ascending:
stmt = stmt.where(
or_(
and_(sort_column.is_(None), ConversationModel.id > after_id),
sort_column.isnot(None)
)
or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
)
else:
# If descending, get NULLs with smaller ID
stmt = stmt.where(
and_(sort_column.is_(None), ConversationModel.id < after_id)
)
stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
else:
# Cursor is at non-NULL
if ascending:
@@ -217,8 +314,8 @@ class ConversationManager:
sort_column.isnot(None),
or_(
sort_column > after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id > after_id)
)
and_(sort_column == after_sort_value, ConversationModel.id > after_id),
),
)
)
else:
@@ -227,7 +324,7 @@ class ConversationManager:
or_(
sort_column.is_(None),
sort_column < after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id < after_id)
and_(sort_column == after_sort_value, ConversationModel.id < after_id),
)
)
else:
@@ -277,7 +374,11 @@ class ConversationManager:
for key, value in update_data.items():
# model_settings needs to be serialized to dict for the JSON column
if key == "model_settings" and value is not None:
setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value)
setattr(
conversation,
key,
self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
)
else:
setattr(conversation, key, value)

View File

@@ -604,6 +604,9 @@ def _apply_pagination(
if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else:
sort_column = AgentModel.created_at
sort_nulls_last = False
@@ -637,6 +640,9 @@ async def _apply_pagination_async(
if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else:
sort_column = AgentModel.created_at
sort_nulls_last = False

View File

@@ -73,7 +73,6 @@ class LLMTraceWriter:
def __init__(self):
self._client = None
self._shutdown = False
self._write_lock = asyncio.Lock() # Serialize writes - clickhouse_connect isn't thread-safe
# Check if ClickHouse is configured - if not, writing is disabled
self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password)
@@ -82,11 +81,7 @@ class LLMTraceWriter:
atexit.register(self._sync_shutdown)
def _get_client(self):
"""Initialize ClickHouse client on first use (lazy loading).
Configures async_insert with wait_for_async_insert=1 for reliable
server-side batching with acknowledgment.
"""
"""Initialize ClickHouse client on first use (lazy loading)."""
if self._client is not None:
return self._client
@@ -108,8 +103,10 @@ class LLMTraceWriter:
settings={
# Enable server-side batching
"async_insert": 1,
# Wait for acknowledgment (reliable)
"wait_for_async_insert": 1,
# Don't wait for server-side flush acknowledgment — fire and forget.
# Waiting (value=1) caused each insert to hold an asyncio.Lock for ~1s,
# creating unbounded task queues that saturated the event loop under load.
"wait_for_async_insert": 0,
# Flush after 1 second if batch not full
"async_insert_busy_timeout_ms": 1000,
},
@@ -148,9 +145,9 @@ class LLMTraceWriter:
row = trace.to_clickhouse_row()
columns = LLMTrace.clickhouse_columns()
# Serialize writes - clickhouse_connect client isn't thread-safe
async with self._write_lock:
# Run synchronous insert in thread pool
# Run synchronous insert in thread pool. clickhouse-connect supports
# multithreaded use via a thread-safe connection pool:
# https://clickhouse.com/docs/integrations/language-clients/python/advanced-usage#multithreaded-multiprocess-and-asyncevent-driven-use-cases
await asyncio.to_thread(
client.insert,
"llm_traces",

View File

@@ -3,11 +3,11 @@
File format:
---
description: "Who I am and how I approach work"
limit: 20000
---
My name is Memo. I'm a stateful coding assistant...
- Frontmatter fields are only rendered when they differ from defaults.
- ``limit`` is intentionally excluded from frontmatter (deprecated for git-base memory).
- Files without frontmatter are treated as value-only (backward compat).
"""
@@ -37,12 +37,12 @@ def serialize_block(
This is used for initial file creation. For updates to existing files,
prefer `merge_frontmatter_with_body` to preserve user formatting.
"""
# description and limit are always included in frontmatter.
# description is always included in frontmatter.
# read_only and metadata are only included when non-default.
# limit is intentionally excluded (deprecated for git-base memory).
front: Dict[str, Any] = {}
front["description"] = description
front["limit"] = limit if limit is not None else _get_field_default("limit")
if read_only != _get_field_default("read_only"):
front["read_only"] = read_only
@@ -111,7 +111,6 @@ def merge_frontmatter_with_body(
# Desired values
desired_description = description
desired_limit = limit if limit is not None else _get_field_default("limit")
desired_read_only = read_only
desired_metadata = metadata if metadata is not None else _get_field_default("metadata")
@@ -122,8 +121,9 @@ def merge_frontmatter_with_body(
parsed["description"] = desired_description
changed = True
if "limit" not in parsed or parsed.get("limit") != desired_limit:
parsed["limit"] = desired_limit
# Remove limit from frontmatter if it exists (deprecated for git-base memory)
if "limit" in parsed:
del parsed["limit"]
changed = True
if desired_read_only != _get_field_default("read_only"):

View File

@@ -21,6 +21,7 @@ from letta.schemas.memory_repo import MemoryCommit
from letta.schemas.user import User as PydanticUser
from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block
from letta.services.memory_repo.git_operations import GitOperations
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
from letta.services.memory_repo.storage.local import LocalStorageBackend
from letta.utils import enforce_types
@@ -133,11 +134,14 @@ class MemfsClient:
except FileNotFoundError:
return []
# Convert block files to PydanticBlock (metadata is in frontmatter)
# Convert block files to PydanticBlock (metadata is in frontmatter).
# skills/{skill_name}/SKILL.md is mapped to block label skills/{skill_name};
# other files under skills/ are intentionally ignored.
blocks = []
for file_path, content in files.items():
if file_path.endswith(".md"):
label = file_path[:-3]
label = memory_block_label_from_markdown_path(file_path)
if label is None:
continue
parsed = parse_block_markdown(content)

View File

@@ -0,0 +1,29 @@
"""Helpers for mapping memory-repo markdown paths to block labels.
Special handling for skills:
- sync `skills/{skill_name}/SKILL.md` as block label `skills/{skill_name}`
- ignore all other markdown files under `skills/`
"""
from __future__ import annotations
def memory_block_label_from_markdown_path(path: str) -> str | None:
"""Return block label for a syncable markdown path, else None.
Rules:
- Non-`.md` files are ignored.
- `skills/{skill_name}/SKILL.md` -> `skills/{skill_name}`
- Other `skills/**` markdown files are ignored.
- All other markdown files map to `path[:-3]`.
"""
if not path.endswith(".md"):
return None
if path.startswith("skills/"):
parts = path.split("/")
if len(parts) == 3 and parts[0] == "skills" and parts[1] and parts[2] == "SKILL.md":
return f"skills/{parts[1]}"
return None
return path[:-3]

View File

@@ -141,6 +141,9 @@ class ClickhouseProviderTraceBackend(ProviderTraceBackendClient):
request_json=request_json_str,
response_json=response_json_str,
llm_config_json=llm_config_json_str,
billing_plan_type=provider_trace.billing_context.plan_type if provider_trace.billing_context else None,
billing_cost_source=provider_trace.billing_context.cost_source if provider_trace.billing_context else None,
billing_customer_id=provider_trace.billing_context.customer_id if provider_trace.billing_context else None,
)
def _extract_usage(self, response_json: dict, provider: str) -> dict:

View File

@@ -29,7 +29,7 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
) -> ProviderTrace:
"""Write full provider trace to provider_traces table."""
async with db_registry.async_session() as session:
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump())
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump(exclude={"billing_context"}))
provider_trace_model.organization_id = actor.organization_id
if provider_trace.request_json:

View File

@@ -638,7 +638,13 @@ class RunManager:
raise NoResultFound(f"Run with id {run_id} not found")
agent_id = run.agent_id
logger.debug(f"Cancelling run {run_id} for agent {agent_id}")
logger.info(
"[Interrupt] Processing cancellation for run=%s, agent=%s, current_status=%s, current_stop_reason=%s",
run_id,
agent_id,
run.status if run else "unknown",
run.stop_reason if run else "unknown",
)
# Cancellation should be idempotent: if a run is already terminated, treat this as a no-op.
# This commonly happens when a run finishes between client request and server handling.

View File

@@ -15,6 +15,7 @@ from letta.errors import (
LettaInvalidArgumentError,
LettaServiceUnavailableError,
LLMAuthenticationError,
LLMEmptyResponseError,
LLMError,
LLMRateLimitError,
LLMTimeoutError,
@@ -33,6 +34,7 @@ from letta.schemas.letta_request import ClientToolSchema, LettaStreamingRequest
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
@@ -76,6 +78,8 @@ class StreamingService:
request: LettaStreamingRequest,
run_type: str = "streaming",
conversation_id: Optional[str] = None,
should_lock: bool = False,
billing_context: "BillingContext | None" = None,
) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]:
"""
Create a streaming response for an agent.
@@ -86,6 +90,7 @@ class StreamingService:
request: The LettaStreamingRequest containing all request parameters
run_type: Type of run for tracking
conversation_id: Optional conversation ID for conversation-scoped messaging
should_lock: If True and conversation_id is None, use agent_id as lock key
Returns:
Tuple of (run object or None, streaming response)
@@ -116,6 +121,10 @@ class StreamingService:
)
if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -130,12 +139,15 @@ class StreamingService:
model_compatible_token_streaming = self._is_token_streaming_compatible(agent)
# Attempt to acquire conversation lock if conversation_id is provided
# This prevents concurrent message processing for the same conversation
# Determine lock key: use conversation_id if provided, else agent_id if should_lock
lock_key = conversation_id if conversation_id else (agent_id if should_lock else None)
# Attempt to acquire lock if lock_key is set
# This prevents concurrent message processing for the same conversation/agent
# Skip locking if Redis is not available (graceful degradation)
if conversation_id and not isinstance(redis_client, NoopAsyncRedisClient):
if lock_key and not isinstance(redis_client, NoopAsyncRedisClient):
await redis_client.acquire_conversation_lock(
conversation_id=conversation_id,
conversation_id=lock_key,
token=str(uuid4()),
)
@@ -163,8 +175,10 @@ class StreamingService:
include_return_message_types=request.include_return_message_types,
actor=actor,
conversation_id=conversation_id,
lock_key=lock_key, # For lock release (may differ from conversation_id)
client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages,
billing_context=billing_context,
)
# handle background streaming if requested
@@ -195,7 +209,7 @@ class StreamingService:
run_id=run.id,
run_manager=self.server.run_manager,
actor=actor,
conversation_id=conversation_id,
conversation_id=lock_key, # Use lock_key for lock release
),
label=f"background_stream_processor_{run.id}",
)
@@ -251,7 +265,7 @@ class StreamingService:
if settings.track_agent_run and run and run_status:
await self.server.run_manager.update_run_by_id_async(
run_id=run.id,
conversation_id=conversation_id,
conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, metadata=run_update_metadata),
actor=actor,
)
@@ -326,8 +340,10 @@ class StreamingService:
include_return_message_types: Optional[list[MessageType]],
actor: User,
conversation_id: Optional[str] = None,
lock_key: Optional[str] = None,
client_tools: Optional[list[ClientToolSchema]] = None,
include_compaction_messages: bool = False,
billing_context: BillingContext | None = None,
) -> AsyncIterator:
"""
Create a stream with unified error handling.
@@ -356,6 +372,7 @@ class StreamingService:
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
async for chunk in stream:
@@ -442,6 +459,21 @@ class StreamingService:
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
except LLMEmptyResponseError as e:
run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response)
error_message = LettaErrorMessage(
run_id=run_id,
error_type="llm_empty_response",
message="LLM returned an empty response.",
detail=str(e),
)
error_data = {"error": error_message.model_dump()}
logger.warning(f"Run {run_id} stopped with LLM empty response: {e}, error_data: {error_message.model_dump()}")
yield f"data: {stop_reason.model_dump_json()}\n\n"
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
except LLMError as e:
run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error)
@@ -491,7 +523,7 @@ class StreamingService:
stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value
await self.runs_manager.update_run_by_id_async(
run_id=run_id,
conversation_id=conversation_id,
conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data),
actor=actor,
)

View File

@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
# them just like server.create_agent_async does for agents.
if summarizer_config.model_settings is not None:
update_params = summarizer_config.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
return base.model_copy(update=update_params)
return base

View File

@@ -196,7 +196,7 @@ async def self_summarize_sliding_window(
return message.tool_calls is not None and len(message.tool_calls) > 0
return False
post_summarization_buffer = [system_prompt]
post_summarization_buffer = []
while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
# more eviction percentage
eviction_percentage += 0.10
@@ -217,8 +217,8 @@ async def self_summarize_sliding_window(
# update token count
logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]]
approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer)
post_summarization_buffer = list(messages[assistant_message_index:])
approx_token_count = await count_tokens(actor, agent_llm_config, [system_prompt, *post_summarization_buffer])
logger.info(
f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
)

View File

@@ -11,7 +11,7 @@ from letta.settings import summarizer_settings
def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
"""Get default model for summarization for given provider type."""
summarizer_defaults = {
ProviderType.anthropic: "anthropic/claude-haiku-4-5-20251001",
ProviderType.anthropic: "anthropic/claude-haiku-4-5",
ProviderType.openai: "openai/gpt-5-mini",
ProviderType.google_ai: "google_ai/gemini-2.5-flash",
}

View File

@@ -114,7 +114,7 @@ class SummarizerSettings(BaseSettings):
class ModelSettings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="ignore")
global_max_context_window_limit: int = 32000
global_max_context_window_limit: int = 128000
inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.")
@@ -204,6 +204,7 @@ class ModelSettings(BaseSettings):
gemini_base_url: str = "https://generativelanguage.googleapis.com/"
gemini_force_minimum_thinking_budget: bool = False
gemini_max_retries: int = 5
gemini_timeout_seconds: float = 600.0
# google vertex
google_cloud_project: Optional[str] = None

View File

@@ -45,30 +45,36 @@ PATH_VALIDATORS = {primitive_type.value: _create_path_validator_factory(primitiv
def _create_conversation_id_or_default_path_validator_factory():
"""Conversation IDs accept the usual primitive format or the special value 'default'."""
"""Conversation IDs with support for 'default' and agent IDs (backwards compatibility)."""
primitive = PrimitiveType.CONVERSATION.value
prefix_pattern = PRIMITIVE_ID_PATTERNS[primitive].pattern
# Make the full regex accept either the primitive ID format or 'default'.
# `prefix_pattern` already contains the ^...$ anchors.
conversation_or_default_pattern = f"^(default|{prefix_pattern[1:-1]})$"
conversation_primitive = PrimitiveType.CONVERSATION.value
agent_primitive = PrimitiveType.AGENT.value
conversation_pattern = PRIMITIVE_ID_PATTERNS[conversation_primitive].pattern
agent_pattern = PRIMITIVE_ID_PATTERNS[agent_primitive].pattern
# Make the full regex accept: conversation ID, agent ID, or 'default'.
# Patterns already contain ^...$ anchors, so strip them for the alternation.
conversation_or_agent_or_default_pattern = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"
def factory():
return Path(
description=(f"The conversation identifier. Either the special value 'default' or an ID in the format '{primitive}-<uuid4>'"),
pattern=conversation_or_default_pattern,
examples=["default", f"{primitive}-123e4567-e89b-42d3-8456-426614174000"],
description=(
f"The conversation identifier. Can be a conversation ID ('{conversation_primitive}-<uuid4>'), "
f"'default' for agent-direct mode (with agent_id parameter), "
f"or an agent ID ('{agent_primitive}-<uuid4>') for backwards compatibility (deprecated)."
),
pattern=conversation_or_agent_or_default_pattern,
examples=[
"default",
f"{conversation_primitive}-123e4567-e89b-42d3-8456-426614174000",
f"{agent_primitive}-123e4567-e89b-42d3-8456-426614174000",
],
min_length=1,
max_length=len(primitive) + 1 + 36,
max_length=max(len(conversation_primitive), len(agent_primitive)) + 1 + 36,
)
return factory
# Override conversation ID path validation to also allow the special value 'default'.
PATH_VALIDATORS[PrimitiveType.CONVERSATION.value] = _create_conversation_id_or_default_path_validator_factory()
# Type aliases for common ID types
# These can be used directly in route handler signatures for cleaner code
AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()]
@@ -89,6 +95,10 @@ StepId = Annotated[str, PATH_VALIDATORS[PrimitiveType.STEP.value]()]
IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()]
ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()]
# Conversation ID with support for 'default' and agent IDs (for agent-direct mode endpoints)
# Backwards compatible - agent-* will be deprecated in favor of conversation_id='default' + agent_id param
ConversationIdOrDefault = Annotated[str, _create_conversation_id_or_default_path_validator_factory()()]
# Infrastructure types
McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()]
McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()]

View File

@@ -1,6 +1,6 @@
[project]
name = "letta"
version = "0.16.5"
version = "0.16.6"
description = "Create LLM agents with long-term memory and custom tools"
authors = [
{name = "Letta Team", email = "contact@letta.com"},

View File

@@ -2,6 +2,12 @@ import anthropic
import httpx
import openai
import pytest
from anthropic.types.beta import (
BetaMessage,
BetaRawMessageStartEvent,
BetaRawMessageStopEvent,
BetaUsage,
)
from google.genai import errors as google_errors
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
ContextWindowExceededError,
LLMBadRequestError,
LLMConnectionError,
LLMEmptyResponseError,
LLMInsufficientCreditsError,
LLMServerError,
)
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
result = client.handle_llm_error(error)
assert isinstance(result, LLMBadRequestError)
assert not isinstance(result, LLMInsufficientCreditsError)
@pytest.mark.asyncio
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
"""LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
This tests the case where Opus 4.6 returns a response with:
- BetaRawMessageStartEvent (with usage tokens)
- BetaRawMessageStopEvent (end_turn)
- NO content blocks in between
This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
"""
class FakeAsyncStream:
"""Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
def __init__(self):
self.events = [
# Message start with some usage info
BetaRawMessageStartEvent(
type="message_start",
message=BetaMessage(
id="msg_test_empty",
type="message",
role="assistant",
content=[], # Empty content
model="claude-opus-4-6",
stop_reason="end_turn",
stop_sequence=None,
usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
),
),
# Message stop immediately after start - no content blocks
BetaRawMessageStopEvent(type="message_stop"),
]
self.index = 0
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc, tb):
return None
def __aiter__(self):
return self
async def __anext__(self):
if self.index >= len(self.events):
raise StopAsyncIteration
event = self.events[self.index]
self.index += 1
return event
async def fake_stream_async(self, request_data: dict, llm_config):
return FakeAsyncStream()
monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
llm_client = AnthropicClient()
llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
with pytest.raises(LLMEmptyResponseError):
async for _ in gen:
pass

View File

@@ -0,0 +1,8 @@
{
"context_window": 32000,
"model": "gpt-5.3-codex",
"model_endpoint_type": "openai",
"model_endpoint": "https://api.openai.com/v1",
"model_wrapper": null,
"reasoning_effort": "low"
}

View File

@@ -141,7 +141,7 @@ async def create_test_agent(name, actor, test_id: Optional[str] = None, model="a
model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1",
context_window=32000,
context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True,
max_tokens=4096,
@@ -193,7 +193,7 @@ async def create_test_batch_item(server, batch_id, agent_id, default_user):
model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1",
context_window=32000,
context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True,
max_tokens=4096,

View File

@@ -62,12 +62,14 @@ class TestConversationsSDK:
# Create a conversation
created = client.conversations.create(agent_id=agent.id)
# Retrieve it (should have empty in_context_message_ids initially)
# Retrieve it (should have system message from creation)
retrieved = client.conversations.retrieve(conversation_id=created.id)
assert retrieved.id == created.id
assert retrieved.agent_id == created.agent_id
assert retrieved.in_context_message_ids == []
# Conversation should have 1 system message immediately after creation
assert len(retrieved.in_context_message_ids) == 1
assert retrieved.in_context_message_ids[0].startswith("message-")
# Send a message to the conversation
list(
@@ -566,6 +568,289 @@ class TestConversationsSDK:
# Should not contain the cursor message
assert first_message_id not in [m.id for m in messages_after]
def test_agent_direct_messaging_via_conversations_endpoint(self, client: Letta, agent):
"""Test sending messages using agent ID as conversation_id (agent-direct mode).
This allows clients to use a unified endpoint pattern without managing conversation IDs.
"""
# Send a message using the agent ID directly as conversation_id
# This should route to agent-direct mode with locking
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Using agent ID instead of conversation ID
messages=[{"role": "user", "content": "Hello via agent-direct mode!"}],
)
)
# Verify we got a response
assert len(messages) > 0, "Should receive response messages"
# Verify we got an assistant message in the response
assistant_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_agent_direct_messaging_with_locking(self, client: Letta, agent):
"""Test that agent-direct mode properly acquires and releases locks.
Sequential requests should both succeed if locks are properly released.
"""
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
# Send first message via agent-direct mode
messages1 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "First message"}],
)
)
assert len(messages1) > 0, "First message should succeed"
# Send second message - should succeed if lock was released
messages2 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Second message"}],
)
)
assert len(messages2) > 0, "Second message should succeed after lock released"
def test_agent_direct_concurrent_requests_blocked(self, client: Letta, agent):
"""Test that concurrent requests to agent-direct mode are properly serialized.
One request should succeed and one should get a 409 CONVERSATION_BUSY error.
"""
import concurrent.futures
from letta_client import ConflictError
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
results = {"success": 0, "conflict": 0, "other_error": 0}
def send_message(msg: str):
try:
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Agent-direct mode
messages=[{"role": "user", "content": msg}],
)
)
return ("success", messages)
except ConflictError:
return ("conflict", None)
except Exception as e:
return ("other_error", str(e))
# Fire off two messages concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
future1 = executor.submit(send_message, "Concurrent message 1")
future2 = executor.submit(send_message, "Concurrent message 2")
result1 = future1.result()
result2 = future2.result()
# Count results
for result_type, _ in [result1, result2]:
results[result_type] += 1
# One should succeed and one should get conflict
assert results["success"] == 1, f"Expected 1 success, got {results['success']}"
assert results["conflict"] == 1, f"Expected 1 conflict, got {results['conflict']}"
assert results["other_error"] == 0, f"Unexpected errors: {results['other_error']}"
# Now send another message - should succeed since lock is released
messages = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Message after concurrent requests"}],
)
)
assert len(messages) > 0, "Should be able to send message after concurrent requests complete"
def test_agent_direct_list_messages(self, client: Letta, agent):
"""Test listing messages using agent ID as conversation_id."""
# First send a message via agent-direct mode
list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Test message for listing"}],
)
)
# List messages using agent ID
messages_page = client.conversations.messages.list(conversation_id=agent.id)
messages = list(messages_page)
# Should have messages (at least system + user + assistant)
assert len(messages) >= 3, f"Expected at least 3 messages, got {len(messages)}"
# Verify we can find our test message
user_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "user_message"]
assert any("Test message for listing" in str(m.content) for m in user_messages), "Should find our test message"
def test_agent_direct_cancel(self, client: Letta, agent):
"""Test canceling runs using agent ID as conversation_id."""
from letta.settings import settings
# Skip if run tracking is disabled
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# Start a background request that we can cancel
try:
# Send a message in background mode
stream = client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Background message to cancel"}],
background=True,
)
# Consume a bit of the stream to ensure it started
next(iter(stream), None)
# Cancel using agent ID
result = client.conversations.cancel(conversation_id=agent.id)
# Should return results (may be empty if run already completed)
assert isinstance(result, dict), "Cancel should return a dict of results"
except Exception as e:
# If no active runs, that's okay - the run may have completed quickly
if "No active runs" not in str(e):
raise
def test_backwards_compatibility_old_pattern(self, client: Letta, agent, server_url: str):
"""Test that the old pattern (agent_id as conversation_id) still works for backwards compatibility."""
# OLD PATTERN: conversation_id=agent.id (should still work)
# Use raw HTTP requests since SDK might not be up to date
# Test 1: Send message using old pattern
response = requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Testing old pattern still works"}],
"streaming": False,
},
)
assert response.status_code == 200, f"Old pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Test 2: List messages using old pattern
response = requests.get(f"{server_url}/v1/conversations/{agent.id}/messages")
assert response.status_code == 200, f"Old pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
# Verify our message is there
user_messages = [m for m in data if m.get("message_type") == "user_message"]
assert any("Testing old pattern still works" in str(m.get("content", "")) for m in user_messages), "Should find our test message"
def test_new_pattern_send_message(self, client: Letta, agent, server_url: str):
"""Test sending messages using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/messages",
json={
"agent_id": agent.id,
"messages": [{"role": "user", "content": "Testing new pattern send message"}],
"streaming": False,
},
)
assert response.status_code == 200, f"New pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Verify we got an assistant message
assistant_messages = [m for m in data["messages"] if m.get("message_type") == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_new_pattern_list_messages(self, client: Letta, agent, server_url: str):
"""Test listing messages using the new pattern: conversation_id='default' + agent_id query param."""
# First send a message to populate the conversation
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Setup message for list test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.get(
f"{server_url}/v1/conversations/default/messages",
params={"agent_id": agent.id},
)
assert response.status_code == 200, f"New pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
def test_new_pattern_cancel(self, client: Letta, agent, server_url: str):
"""Test canceling runs using the new pattern: conversation_id='default' + agent_id query param."""
from letta.settings import settings
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.post(
f"{server_url}/v1/conversations/default/cancel",
params={"agent_id": agent.id},
)
# Returns 200 with results if runs exist, or 409 if no active runs
assert response.status_code in [200, 409], f"New pattern should work for cancel: {response.text}"
if response.status_code == 200:
data = response.json()
assert isinstance(data, dict), "Cancel should return a dict"
def test_new_pattern_compact(self, client: Letta, agent, server_url: str):
"""Test compacting conversation using the new pattern: conversation_id='default' + agent_id in body."""
# Send many messages to have enough for compaction
for i in range(10):
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": f"Message {i} for compaction test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/compact",
json={"agent_id": agent.id},
)
# May return 200 (success) or 400 (not enough messages to compact)
assert response.status_code in [200, 400], f"New pattern should accept agent_id parameter: {response.text}"
if response.status_code == 200:
data = response.json()
assert "summary" in data, "Response should contain summary"
assert "num_messages_before" in data, "Response should contain num_messages_before"
assert "num_messages_after" in data, "Response should contain num_messages_after"
def test_new_pattern_stream_retrieve(self, client: Letta, agent, server_url: str):
"""Test retrieving stream using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
# Note: This will likely return 400 if no active run exists, which is expected
response = requests.post(
f"{server_url}/v1/conversations/default/stream",
json={"agent_id": agent.id},
)
# Either 200 (if run exists) or 400 (no active run) are both acceptable
assert response.status_code in [200, 400], f"Stream retrieve should accept new pattern: {response.text}"
class TestConversationDelete:
"""Tests for the conversation delete endpoint."""
@@ -834,3 +1119,130 @@ class TestConversationCompact:
)
assert response.status_code == 404
class TestConversationSystemMessageRecompilation:
"""Tests that verify the system message is recompiled with latest memory state on new conversation creation."""
def test_new_conversation_recompiles_system_message_with_updated_memory(self, client: Letta, server_url: str):
"""Test the full workflow:
1. Agent is created
2. Send message to agent (through a conversation)
3. Modify the memory block -> check system message is NOT updated with the modified value
4. Create a new conversation
5. Check new conversation system message DOES have the modified value
"""
unique_marker = f"UNIQUE_MARKER_{uuid.uuid4().hex[:8]}"
# Step 1: Create an agent with known memory blocks
agent = client.agents.create(
name=f"test_sys_msg_recompile_{uuid.uuid4().hex[:8]}",
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{"label": "human", "value": "The user is a test user."},
{"label": "persona", "value": "You are a helpful assistant."},
],
)
try:
# Step 2: Create a conversation and send a message to it
conv1 = client.conversations.create(agent_id=agent.id)
list(
client.conversations.messages.create(
conversation_id=conv1.id,
messages=[{"role": "user", "content": "Hello, just a quick test."}],
)
)
# Verify the conversation has messages including a system message
conv1_messages = client.conversations.messages.list(
conversation_id=conv1.id,
order="asc",
)
assert len(conv1_messages) >= 3 # system + user + assistant
assert conv1_messages[0].message_type == "system_message"
# Get the original system message content
original_system_content = conv1_messages[0].content
assert unique_marker not in original_system_content, "Marker should not be in original system message"
# Step 3: Modify the memory block with a unique marker
client.agents.blocks.update(
agent_id=agent.id,
block_label="human",
value=f"The user is a test user. {unique_marker}",
)
# Verify the block was actually updated
updated_block = client.agents.blocks.retrieve(agent_id=agent.id, block_label="human")
assert unique_marker in updated_block.value
# Check that the OLD conversation's system message is NOT updated
conv1_messages_after_update = client.conversations.messages.list(
conversation_id=conv1.id,
order="asc",
)
old_system_content = conv1_messages_after_update[0].content
assert unique_marker not in old_system_content, "Old conversation system message should NOT contain the updated memory value"
# Step 4: Create a new conversation
conv2 = client.conversations.create(agent_id=agent.id)
# Step 5: Check the new conversation's system message has the updated value
# The system message should be compiled at creation time with the latest memory
conv2_retrieved = client.conversations.retrieve(conversation_id=conv2.id)
assert len(conv2_retrieved.in_context_message_ids) == 1, (
f"New conversation should have exactly 1 system message, got {len(conv2_retrieved.in_context_message_ids)}"
)
conv2_messages = client.conversations.messages.list(
conversation_id=conv2.id,
order="asc",
)
assert len(conv2_messages) >= 1
assert conv2_messages[0].message_type == "system_message"
new_system_content = conv2_messages[0].content
assert unique_marker in new_system_content, (
f"New conversation system message should contain the updated memory value '{unique_marker}', "
f"but system message content did not include it"
)
finally:
client.agents.delete(agent_id=agent.id)
def test_conversation_creation_initializes_system_message(self, client: Letta, server_url: str):
"""Test that creating a conversation immediately initializes it with a system message."""
agent = client.agents.create(
name=f"test_conv_init_{uuid.uuid4().hex[:8]}",
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{"label": "human", "value": "Test user for system message init."},
{"label": "persona", "value": "You are a helpful assistant."},
],
)
try:
# Create a conversation (without sending any messages)
conversation = client.conversations.create(agent_id=agent.id)
# Verify the conversation has a system message immediately
retrieved = client.conversations.retrieve(conversation_id=conversation.id)
assert len(retrieved.in_context_message_ids) == 1, (
f"Expected 1 system message after conversation creation, got {len(retrieved.in_context_message_ids)}"
)
# Verify the system message content contains memory block values
messages = client.conversations.messages.list(
conversation_id=conversation.id,
order="asc",
)
assert len(messages) == 1
assert messages[0].message_type == "system_message"
assert "Test user for system message init." in messages[0].content
finally:
client.agents.delete(agent_id=agent.id)

View File

@@ -93,7 +93,7 @@ def agent_obj(client: Letta) -> AgentState:
tool_ids=[send_message_to_agent_tool.id],
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
context_window_limit=32000,
context_window_limit=128000,
)
yield agent_state_instance
@@ -107,7 +107,7 @@ def other_agent_obj(client: Letta) -> AgentState:
include_multi_agent_tools=False,
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
context_window_limit=32000,
context_window_limit=128000,
)
yield agent_state_instance

View File

@@ -366,6 +366,8 @@ async def test_compaction_settings_model_uses_separate_llm_config_for_summarizat
async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user):
"""When no compaction_settings provided for Anthropic agent, default haiku model should be set."""
from letta.schemas.agent import CreateAgent
from letta.schemas.enums import ProviderType
from letta.services.summarizer.summarizer_config import get_default_summarizer_model
await server.init_async(init_with_default_org_and_user=True)
@@ -384,7 +386,7 @@ async def test_create_agent_sets_default_compaction_model_anthropic(server: Sync
# Should have default haiku model set
assert agent.compaction_settings is not None
assert agent.compaction_settings.model == "anthropic/claude-haiku-4-5-20251001"
assert agent.compaction_settings.model == get_default_summarizer_model(ProviderType.anthropic)
@pytest.mark.asyncio
@@ -808,6 +810,79 @@ async def test_update_agent_compaction_settings(server: SyncServer, comprehensiv
assert updated_agent.compaction_settings.prompt_acknowledgement == False
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings(server: SyncServer, comprehensive_test_agent_fixture, default_user):
"""Test that an agent's compaction_settings can be upserted."""
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
agent, _ = comprehensive_test_agent_fixture
# Create new compaction settings
original_compaction_settings = agent.compaction_settings.model_copy()
new_compaction_settings = CompactionSettings(
mode="all",
prompt_acknowledgement=True,
clip_chars=3000,
)
# Update agent with compaction settings
update_agent_request = UpdateAgent(
compaction_settings=new_compaction_settings,
)
updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)
# Verify compaction settings were updated correctly
assert updated_agent.compaction_settings is not None
assert updated_agent.compaction_settings.model == original_compaction_settings.model
assert updated_agent.compaction_settings.model_settings == original_compaction_settings.model_settings
assert updated_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
assert updated_agent.compaction_settings.mode == "all"
assert updated_agent.compaction_settings.clip_chars == 3000
assert updated_agent.compaction_settings.prompt == get_default_prompt_for_mode("all")
assert updated_agent.compaction_settings.prompt_acknowledgement == True
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings_same_mode(server: SyncServer, comprehensive_test_agent_fixture, default_user):
"""Test that if the mode stays the same without a prompt passed in, the prompt is not updated."""
agent, _ = comprehensive_test_agent_fixture
update_agent_request = UpdateAgent(
compaction_settings=CompactionSettings(mode="sliding_window", prompt="This is a fake prompt."),
)
updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)
assert updated_agent.compaction_settings is not None
assert updated_agent.compaction_settings.prompt == "This is a fake prompt."
# Create new compaction settings
original_compaction_settings = updated_agent.compaction_settings.model_copy()
new_compaction_settings = CompactionSettings(
mode="sliding_window",
model="openai/gpt-4o-mini",
)
# Update agent with compaction settings
update_agent_request = UpdateAgent(
compaction_settings=new_compaction_settings,
)
final_agent = await server.agent_manager.update_agent_async(updated_agent.id, update_agent_request, actor=default_user)
# Verify compaction settings were updated correctly
assert final_agent.compaction_settings is not None
assert final_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
assert final_agent.compaction_settings.prompt == original_compaction_settings.prompt
assert final_agent.compaction_settings.clip_chars == original_compaction_settings.clip_chars
assert final_agent.compaction_settings.prompt_acknowledgement == original_compaction_settings.prompt_acknowledgement
assert final_agent.compaction_settings.mode == "sliding_window"
assert final_agent.compaction_settings.model == "openai/gpt-4o-mini"
@pytest.mark.asyncio
async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block):
"""Test that file-related defaults are set based on the model's context window size"""

View File

@@ -562,7 +562,9 @@ async def test_update_block(server: SyncServer, default_user):
@pytest.mark.asyncio
async def test_update_block_limit(server: SyncServer, default_user):
block_manager = BlockManager()
block = await block_manager.create_or_update_block_async(PydanticBlock(label="persona", value="Original Content"), actor=default_user)
block = await block_manager.create_or_update_block_async(
PydanticBlock(label="persona", value="Original Content", limit=20000), actor=default_user
)
limit = len("Updated Content") * 2000
update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description")

View File

@@ -355,8 +355,9 @@ async def test_add_messages_to_conversation(
actor=default_user,
)
assert len(message_ids) == 1
assert message_ids[0] == hello_world_message_fixture.id
# create_conversation auto-creates a system message at position 0
assert len(message_ids) == 2
assert hello_world_message_fixture.id in message_ids
@pytest.mark.asyncio
@@ -385,8 +386,9 @@ async def test_get_messages_for_conversation(
actor=default_user,
)
assert len(messages) == 1
assert messages[0].id == hello_world_message_fixture.id
# create_conversation auto-creates a system message at position 0
assert len(messages) == 2
assert any(m.id == hello_world_message_fixture.id for m in messages)
@pytest.mark.asyncio
@@ -430,7 +432,10 @@ async def test_message_ordering_in_conversation(conversation_manager, server: Sy
actor=default_user,
)
assert retrieved_ids == [m.id for m in messages]
# create_conversation auto-creates a system message at position 0,
# so the user messages start at index 1
assert len(retrieved_ids) == len(messages) + 1
assert retrieved_ids[1:] == [m.id for m in messages]
@pytest.mark.asyncio
@@ -489,7 +494,7 @@ async def test_update_in_context_messages(conversation_manager, server: SyncServ
@pytest.mark.asyncio
async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user):
"""Test getting message IDs from an empty conversation."""
"""Test getting message IDs from a newly created conversation (has auto-created system message)."""
# Create a conversation
conversation = await conversation_manager.create_conversation(
agent_id=sarah_agent.id,
@@ -497,13 +502,14 @@ async def test_empty_conversation_message_ids(conversation_manager, server: Sync
actor=default_user,
)
# Get message IDs (should be empty)
# create_conversation auto-creates a system message at position 0,
# so a newly created conversation has exactly one message
message_ids = await conversation_manager.get_message_ids_for_conversation(
conversation_id=conversation.id,
actor=default_user,
)
assert message_ids == []
assert len(message_ids) == 1
@pytest.mark.asyncio
@@ -551,9 +557,11 @@ async def test_list_conversation_messages(conversation_manager, server: SyncServ
actor=default_user,
)
assert len(letta_messages) == 2
# create_conversation auto-creates a system message, so we get 3 total
assert len(letta_messages) == 3
# Check message types
message_types = [m.message_type for m in letta_messages]
assert "system_message" in message_types
assert "user_message" in message_types
assert "assistant_message" in message_types
@@ -902,9 +910,12 @@ async def test_list_conversation_messages_ascending_order(conversation_manager,
reverse=False,
)
# First message should be "Message 0" (oldest)
assert len(letta_messages) == 3
assert "Message 0" in letta_messages[0].content
# create_conversation auto-creates a system message at position 0,
# so we get 4 messages total (system + 3 user messages)
assert len(letta_messages) == 4
# First message is the auto-created system message; "Message 0" is second
assert letta_messages[0].message_type == "system_message"
assert "Message 0" in letta_messages[1].content
@pytest.mark.asyncio
@@ -949,8 +960,9 @@ async def test_list_conversation_messages_descending_order(conversation_manager,
reverse=True,
)
# First message should be "Message 2" (newest)
assert len(letta_messages) == 3
# create_conversation auto-creates a system message, so 4 total
# First message should be "Message 2" (newest) in descending order
assert len(letta_messages) == 4
assert "Message 2" in letta_messages[0].content
@@ -1081,7 +1093,8 @@ async def test_list_conversation_messages_no_group_id_returns_all(conversation_m
actor=default_user,
)
assert len(all_messages) == 3
# create_conversation auto-creates a system message, so 4 total
assert len(all_messages) == 4
@pytest.mark.asyncio
@@ -1137,8 +1150,8 @@ async def test_list_conversation_messages_order_with_pagination(conversation_man
# The first messages should be different
assert page_asc[0].content != page_desc[0].content
# In ascending, first should be "Message 0"
assert "Message 0" in page_asc[0].content
# In ascending, first is the auto-created system message, second is "Message 0"
assert page_asc[0].message_type == "system_message"
# In descending, first should be "Message 4"
assert "Message 4" in page_desc[0].content

View File

@@ -579,8 +579,11 @@ async def test_server_startup_syncs_base_providers(default_user, default_organiz
yield item
# Mock the Anthropic AsyncAnthropic client
# NOTE: list() must be a regular (non-async) method that returns an async iterable,
# because the real Anthropic SDK's models.list() returns an AsyncPage (which has __aiter__)
# directly, and the code uses `async for model in client.models.list()`.
class MockAnthropicModels:
async def list(self):
def list(self):
return MockAnthropicAsyncPage(mock_anthropic_models["data"])
class MockAsyncAnthropic:
@@ -877,8 +880,10 @@ async def test_server_startup_handles_api_errors_gracefully(default_user, defaul
for item in self._items:
yield item
# NOTE: The real SDK's models.list() is a regular (non-async) method that
# returns an AsyncPaginator (which is async-iterable).
class MockAnthropicModels:
async def list(self):
def list(self):
return MockAnthropicAsyncPage(mock_anthropic_data)
class MockAsyncAnthropic:

View File

@@ -0,0 +1,11 @@
{
"handle": "openai/gpt-5.3-chat-latest",
"model_settings": {
"provider_type": "openai",
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "minimal"
}
}
}

View File

@@ -1,11 +1,11 @@
from conftest import create_test_module
from letta_client import UnprocessableEntityError
from letta.constants import CORE_MEMORY_HUMAN_CHAR_LIMIT, CORE_MEMORY_PERSONA_CHAR_LIMIT
from letta.constants import CORE_MEMORY_BLOCK_CHAR_LIMIT
BLOCKS_CREATE_PARAMS = [
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_HUMAN_CHAR_LIMIT}, None),
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_PERSONA_CHAR_LIMIT}, None),
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
]
BLOCKS_UPDATE_PARAMS = [

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -44,7 +44,7 @@
"provider_name": null,
"provider_category": null,
"model_wrapper": null,
"context_window": 32000,
"context_window": 128000,
"put_inner_thoughts_in_kwargs": false,
"handle": "anthropic/claude-3.5-sonnet",
"temperature": 1.0,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -56,7 +56,7 @@
"provider_name": "openai",
"provider_category": "base",
"model_wrapper": null,
"context_window": 32000,
"context_window": 128000,
"put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4o-mini",
"temperature": 1.0,

View File

@@ -55,7 +55,7 @@
"provider_name": "openai",
"provider_category": "base",
"model_wrapper": null,
"context_window": 32000,
"context_window": 128000,
"put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4.1-mini",
"temperature": 1.0,

View File

@@ -16,7 +16,7 @@ def llm_config():
model="claude-3-7-sonnet-20250219",
model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1",
context_window=32000,
context_window=128000,
handle="anthropic/claude-sonnet-4-20250514",
put_inner_thoughts_in_kwargs=False,
max_tokens=4096,

View File

@@ -52,8 +52,17 @@ class TestLogContextMiddleware:
async def get_files(self, agent_id, org_id, ref):
assert ref == "HEAD"
return {
"system/human.md": "---\ndescription: human\nlimit: 20000\n---\nname: sarah",
"system/persona.md": "---\ndescription: persona\nlimit: 20000\n---\nbe helpful",
"system/human.md": "---\ndescription: human\n---\nname: sarah",
"system/persona.md": "---\ndescription: persona\n---\nbe helpful",
"skills/research-helper/SKILL.md": (
"---\n"
"name: research-helper\n"
"description: Search the web and summarize findings.\n"
"---\n"
"# Research Helper\n\n"
"Use this skill to do deep web research and summarize results.\n"
),
"skills/research-helper/references/details.md": "---\ndescription: nested\n---\nShould not be synced",
}
class DummyMemoryRepoManager:
@@ -95,6 +104,12 @@ class TestLogContextMiddleware:
labels = {call["label"] for call in synced_calls}
assert "system/human" in labels
assert "system/persona" in labels
assert "skills/research-helper" in labels
assert "skills/research-helper/references/details" not in labels
by_label = {call["label"]: call for call in synced_calls}
assert by_label["skills/research-helper"]["description"] == "Search the web and summarize findings."
assert by_label["skills/research-helper"]["value"].startswith("# Research Helper")
def test_extracts_actor_id_from_headers(self, client):
response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"})

View File

@@ -25,9 +25,9 @@ def test_chat_memory_init_and_utils(chat_memory: Memory):
def test_memory_limit_validation(chat_memory: Memory):
with pytest.raises(ValueError):
ChatMemory(persona="x " * 50000, human="y " * 50000)
ChatMemory(persona="x " * 60000, human="y " * 60000)
with pytest.raises(ValueError):
chat_memory.get_block("persona").value = "x " * 50000
chat_memory.get_block("persona").value = "x " * 60000
def test_get_block_not_found(chat_memory: Memory):
@@ -253,3 +253,104 @@ def test_compile_git_memory_filesystem_handles_leaf_directory_collisions():
assert "system/" in out
assert "system.md" in out
assert "human.md" in out
def test_compile_git_memory_filesystem_renders_descriptions_for_non_system_files():
"""Files outside system/ should render their description in the filesystem tree.
e.g. `reference/api.md (Contains API specifications)`
System files should NOT render descriptions in the tree.
"""
m = Memory(
agent_type=AgentType.letta_v1_agent,
git_enabled=True,
blocks=[
Block(label="system/human", value="human data", limit=100, description="The human block"),
Block(label="system/persona", value="persona data", limit=100, description="The persona block"),
Block(label="reference/api", value="api specs", limit=100, description="Contains API specifications"),
Block(label="notes", value="my notes", limit=100, description="Personal notes and reminders"),
],
)
out = m.compile()
# Filesystem tree should exist
assert "<memory_filesystem>" in out
# Non-system files should have descriptions rendered
assert "api.md (Contains API specifications)" in out
assert "notes.md (Personal notes and reminders)" in out
# System files should NOT have descriptions in the tree
assert "human.md (The human block)" not in out
assert "persona.md (The persona block)" not in out
# But they should still be in the tree (without description)
assert "human.md" in out
assert "persona.md" in out
def test_compile_git_memory_filesystem_no_description_when_empty():
"""Files outside system/ with no description should render without parentheses."""
m = Memory(
agent_type=AgentType.letta_v1_agent,
git_enabled=True,
blocks=[
Block(label="system/human", value="human data", limit=100),
Block(label="notes", value="my notes", limit=100),
Block(label="reference/api", value="api specs", limit=100, description="API docs"),
],
)
out = m.compile()
# notes.md has no description, so no parentheses
assert "notes.md\n" in out or "notes.md\n" in out
# reference/api.md has a description
assert "api.md (API docs)" in out
def test_compile_git_memory_filesystem_condenses_skills_to_top_level_entries():
"""skills/ should render as top-level skill entries with description.
We intentionally avoid showing nested files under skills/ in the system
prompt tree to keep context concise.
"""
m = Memory(
agent_type=AgentType.letta_v1_agent,
git_enabled=True,
blocks=[
Block(label="system/human", value="human data", limit=100),
Block(
label="skills/searching-messages",
value="# searching messages",
limit=100,
description="Search past messages to recall context.",
),
Block(
label="skills/creating-skills",
value="# creating skills",
limit=100,
description="Guide for creating effective skills.",
),
Block(
label="skills/creating-skills/references/workflows",
value="nested docs",
limit=100,
description="Nested workflow docs (should not appear)",
),
],
)
out = m.compile()
# Condensed top-level skill entries with descriptions.
assert "searching-messages (Search past messages to recall context.)" in out
assert "creating-skills (Guide for creating effective skills.)" in out
# Do not show .md suffixes or nested skill docs in tree.
assert "searching-messages.md" not in out
assert "creating-skills.md" not in out
assert "references/workflows" not in out

View File

@@ -24,6 +24,9 @@ def test_get_headers_user_id_allows_none():
letta_v1_agent=None,
letta_v1_agent_message_async=None,
modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
)
assert isinstance(headers, HeaderParams)
@@ -40,6 +43,9 @@ def test_get_headers_user_id_rejects_invalid_format():
letta_v1_agent=None,
letta_v1_agent_message_async=None,
modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
)
@@ -54,6 +60,9 @@ def test_get_headers_user_id_accepts_valid_format():
letta_v1_agent=None,
letta_v1_agent_message_async=None,
modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
)
assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000"

2
uv.lock generated
View File

@@ -2510,7 +2510,7 @@ wheels = [
[[package]]
name = "letta"
version = "0.16.5"
version = "0.16.6"
source = { editable = "." }
dependencies = [
{ name = "aiofiles" },