chore: bump 0.16.6 (#3211)

This commit is contained in:
cthomas
2026-03-03 19:13:07 -08:00
committed by GitHub
84 changed files with 2540 additions and 407 deletions

View File

@@ -260,6 +260,7 @@ model:
base_url: https://generativelanguage.googleapis.com/
force_minimum_thinking_budget: false
max_retries: 5
timeout_seconds: 600.0
# Google Vertex (-> GOOGLE_CLOUD_*)
# google_cloud:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,220 @@
import * as fs from 'fs';
import * as path from 'path';
import { omit } from 'lodash';
import { execSync } from 'child_process';
import { merge, isErrorResult } from 'openapi-merge';
import type { Swagger } from 'atlassian-openapi';
import { RESTRICTED_ROUTE_BASE_PATHS } from '@letta-cloud/sdk-core';
// Locations of the two OpenAPI documents that get merged below.
const lettaWebOpenAPIPath = path.join(
  __dirname, '..', '..', '..', 'web', 'autogenerated', 'letta-web-openapi.json',
);
const lettaAgentsAPIPath = path.join(
  __dirname, '..', '..', 'letta', 'server', 'openapi_letta.json',
);

// Parse both specs up front; a read/parse failure here should crash the
// script loudly rather than produce a partial merge.
const readSpec = (specPath: string): Swagger.SwaggerV3 =>
  JSON.parse(fs.readFileSync(specPath, 'utf8')) as Swagger.SwaggerV3;

const lettaWebOpenAPI = readSpec(lettaWebOpenAPIPath);
const lettaAgentsAPI = readSpec(lettaAgentsAPIPath);
// Drop any agents-spec routes whose base path is restricted from the public API.
const isRestrictedRoute = (routePath: string): boolean =>
  RESTRICTED_ROUTE_BASE_PATHS.some((base) => routePath.startsWith(base));
lettaAgentsAPI.paths = Object.fromEntries(
  Object.entries(lettaAgentsAPI.paths).filter(
    ([routePath]) => !isRestrictedRoute(routePath),
  ),
);

// Normalize trailing slashes so the two specs can be compared path-by-path.
const stripTrailingSlash = (routePath: string): string =>
  routePath.endsWith('/') ? routePath.slice(0, -1) : routePath;

// Index of the agents-spec paths keyed by their slash-normalized form.
const lettaAgentsAPIWithNoEndslash = Object.keys(lettaAgentsAPI.paths).reduce(
  (acc, routePath) => {
    acc[stripTrailingSlash(routePath)] = lettaAgentsAPI.paths[routePath];
    return acc;
  },
  {} as Swagger.SwaggerV3['paths'],
);

// De-duplicate: any path already defined in the agents spec wins, so drop the
// letta-web copy (treating "/foo" and "/foo/" as the same route).
lettaWebOpenAPI.paths = Object.fromEntries(
  Object.entries(lettaWebOpenAPI.paths).filter(
    ([routePath]) => !lettaAgentsAPIWithNoEndslash[stripTrailingSlash(routePath)],
  ),
);
// Some letta-web endpoints return agent objects whose inline schema drifts
// from the canonical AgentState component; point their `agents` arrays at the
// shared $ref instead. Entries are [route path, success status code].
const agentStatePathsToOverride: Array<[string, string]> = [
  ['/v1/templates/{project}/{template_version}/agents', '201'],
  ['/v1/agents/search', '200'],
];
// NOTE: loop variable renamed from `path` — the original shadowed the
// node:path module import inside this loop.
for (const [overridePath, responseCode] of agentStatePathsToOverride) {
  const responseSchema =
    lettaWebOpenAPI.paths[overridePath]?.post?.responses?.[responseCode];
  // Guard every level: a response entry may legitimately have no JSON body,
  // and the original unguarded `.content['application/json'].schema` access
  // would throw in that case.
  const contentSchema = responseSchema?.content?.['application/json']?.schema;
  if (contentSchema?.properties?.agents) {
    // Replace the entire agents array schema with the component reference.
    contentSchema.properties.agents = {
      type: 'array',
      items: {
        $ref: '#/components/schemas/AgentState',
      },
    };
  }
}
// Strip internal headers (user identity, project/source routing, SDK telemetry,
// experimental and billing flags) from every operation so they never appear in
// the published spec.
const STRIPPED_HEADER_NAMES = new Set([
  'user_id',
  'User-Agent',
  'X-Project-Id',
  'X-Letta-Source',
  'X-Stainless-Package-Version',
]);
const STRIPPED_HEADER_PREFIXES = ['X-Experimental', 'X-Billing'];

// A parameter is dropped iff it is a header AND matches the blocklist.
const isStrippedHeader = (param: Record<string, string>): boolean =>
  param.in === 'header' &&
  (STRIPPED_HEADER_NAMES.has(param.name) ||
    STRIPPED_HEADER_PREFIXES.some((prefix) => param.name.startsWith(prefix)));

for (const routePath of Object.keys(lettaAgentsAPI.paths)) {
  // Cast once instead of sprinkling @ts-expect-error: the Swagger path-item
  // type does not support string indexing by method name.
  const pathItem = lettaAgentsAPI.paths[routePath] as Record<
    string,
    { parameters?: Array<Record<string, string>> }
  >;
  for (const method of Object.keys(pathItem)) {
    const operation = pathItem[method];
    if (operation?.parameters) {
      operation.parameters = operation.parameters.filter(
        (param) => !isStrippedHeader(param),
      );
    }
  }
}
// Combine the two specs. Order matters: the agents spec comes first, so its
// definitions take precedence on any residual conflict.
const result = merge([{ oas: lettaAgentsAPI }, { oas: lettaWebOpenAPI }]);
if (isErrorResult(result)) {
  console.error(`${result.message} (${result.type})`);
  process.exit(1);
}

// Normalize the merged document's top-level metadata.
result.output.openapi = '3.1.0';
result.output.info = { title: 'Letta API', version: '1.0.0' };
result.output.servers = [
  { url: 'https://app.letta.com', description: 'Letta Cloud' },
  { url: 'http://localhost:8283', description: 'Self-hosted' },
];

// Every endpoint is authenticated with a bearer token.
result.output.components = {
  ...result.output.components,
  securitySchemes: {
    bearerAuth: { type: 'http', scheme: 'bearer' },
  },
};
result.output.security = [...(result.output.security ?? []), { bearerAuth: [] }];
/**
 * Recursively remove every property named `key` from a JSON-like value,
 * preserving arrays (a naive deep-omit turns arrays into plain objects).
 *
 * Fix: the original returned `omit(obj, key)` as soon as the key existed at
 * a level and never recursed into the remaining values, so occurrences of the
 * key nested under sibling properties survived. We now drop the key at every
 * level AND keep walking the tree, matching the "omit all instances" intent.
 * (Also removes the lodash `omit` dependency from this function.)
 */
function deepOmitPreserveArrays(obj: unknown, key: string): unknown {
  if (Array.isArray(obj)) {
    return obj.map((item) => deepOmitPreserveArrays(item, key));
  }
  if (typeof obj !== 'object' || obj === null) {
    // Primitives (string, number, boolean, null, undefined) pass through.
    return obj;
  }
  return Object.fromEntries(
    Object.entries(obj)
      .filter(([k]) => k !== key)
      .map(([k, v]) => [k, deepOmitPreserveArrays(v, key)]),
  );
}
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'user_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'actor_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'organization_id',
);
fs.writeFileSync(
path.join(__dirname, '..', 'openapi.json'),
JSON.stringify(result.output, null, 2),
);
/**
 * Run Prettier over the generated openapi.json so the committed output stays
 * stable across regenerations. A formatting failure is treated as fatal so
 * CI catches it.
 */
function formatOpenAPIJson() {
  const target = path.join(__dirname, '..', 'openapi.json');
  try {
    execSync(`npx prettier --write "${target}"`, { stdio: 'inherit' });
    console.log('Successfully formatted openapi.json with Prettier');
  } catch (formatError) {
    console.error('Error formatting openapi.json:', formatError);
    process.exit(1);
  }
}
formatOpenAPIJson();

View File

@@ -5,7 +5,7 @@ try:
__version__ = version("letta")
except PackageNotFoundError:
# Fallback for development installations
__version__ = "0.16.5"
__version__ = "0.16.6"
if os.environ.get("LETTA_VERSION"):
__version__ = os.environ["LETTA_VERSION"]

View File

@@ -7,6 +7,7 @@ from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.telemetry_manager import TelemetryManager
@@ -31,6 +32,7 @@ class LettaLLMAdapter(ABC):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
billing_context: BillingContext | None = None,
) -> None:
self.llm_client: LLMClientBase = llm_client
self.llm_config: LLMConfig = llm_config
@@ -40,6 +42,7 @@ class LettaLLMAdapter(ABC):
self.run_id: str | None = run_id
self.org_id: str | None = org_id
self.user_id: str | None = user_id
self.billing_context: BillingContext | None = billing_context
self.message_id: str | None = None
self.request_data: dict | None = None
self.response_data: dict | None = None

View File

@@ -10,7 +10,7 @@ from letta.otel.tracing import log_attributes, safe_json_dumps, trace_method
from letta.schemas.enums import LLMCallType, ProviderType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.llm_config import LLMConfig
from letta.schemas.provider_trace import ProviderTrace
from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.user import User
from letta.settings import settings
from letta.utils import safe_create_task
@@ -36,6 +36,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
billing_context: "BillingContext | None" = None,
) -> None:
super().__init__(
llm_client,
@@ -46,6 +47,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id=run_id,
org_id=org_id,
user_id=user_id,
billing_context=billing_context,
)
self.interface: OpenAIStreamingInterface | AnthropicStreamingInterface | None = None

View File

@@ -51,6 +51,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
billing_context=self.billing_context,
)
try:
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)

View File

@@ -278,6 +278,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
billing_context=self.billing_context,
),
),
label="create_provider_trace",

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, MessageUpdate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -51,7 +52,11 @@ class BaseAgent(ABC):
@abstractmethod
async def step(
self, input_messages: List[MessageCreate], max_steps: int = DEFAULT_MAX_STEPS, run_id: Optional[str] = None
self,
input_messages: List[MessageCreate],
max_steps: int = DEFAULT_MAX_STEPS,
run_id: Optional[str] = None,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Main execution loop for the agent.

View File

@@ -12,6 +12,7 @@ from letta.schemas.user import User
if TYPE_CHECKING:
from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.provider_trace import BillingContext
class BaseAgentV2(ABC):
@@ -52,6 +53,7 @@ class BaseAgentV2(ABC):
request_start_timestamp_ns: int | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -76,6 +78,7 @@ class BaseAgentV2(ABC):
conversation_id: str | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[LettaMessage | LegacyLettaMessage | MessageStreamStatus, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -192,44 +192,15 @@ async def _prepare_in_context_messages_no_persist_async(
# Otherwise, include the full list of messages from the conversation
current_in_context_messages = await message_manager.get_messages_by_ids_async(message_ids=message_ids, actor=actor)
else:
# No messages in conversation yet - compile a new system message for this conversation
# Each conversation gets its own system message (captures memory state at conversation start)
from letta.prompts.prompt_generator import PromptGenerator
from letta.services.passage_manager import PassageManager
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_state.id)
passage_manager = PassageManager()
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_state.id)
system_message_str = await PromptGenerator.compile_system_message_async(
system_prompt=agent_state.system,
in_context_memory=agent_state.memory,
in_context_memory_last_edit=get_utc_time(),
timezone=agent_state.timezone,
user_defined_variables=None,
append_icm_if_missing=True,
previous_message_count=num_messages,
archival_memory_size=num_archival_memories,
sources=agent_state.sources,
max_files_open=agent_state.max_files_open,
)
system_message = Message.dict_to_message(
agent_id=agent_state.id,
model=agent_state.llm_config.model,
openai_message_dict={"role": "system", "content": system_message_str},
)
# Persist the new system message
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
system_message = persisted_messages[0]
# Add it to the conversation tracking
await conversation_manager.add_messages_to_conversation(
# No messages in conversation yet (fallback) - compile a new system message
# Normally this is handled at conversation creation time, but this covers
# edge cases where a conversation exists without a system message.
system_message = await conversation_manager.compile_and_save_system_message_for_conversation(
conversation_id=conversation_id,
agent_id=agent_state.id,
message_ids=[system_message.id],
actor=actor,
starting_position=0,
agent_state=agent_state,
message_manager=message_manager,
)
current_in_context_messages = [system_message]

View File

@@ -48,6 +48,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -179,6 +180,7 @@ class LettaAgent(BaseAgent):
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
dry_run: bool = False,
billing_context: "BillingContext | None" = None,
) -> Union[LettaResponse, dict]:
# TODO (cliandy): pass in run_id and use at send_message endpoints for all step functions
agent_state = await self.agent_manager.get_agent_by_id_async(

View File

@@ -44,6 +44,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import Step, StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool import Tool
@@ -185,6 +186,7 @@ class LettaAgentV2(BaseAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -290,6 +292,7 @@ class LettaAgentV2(BaseAgentV2):
conversation_id: str | None = None, # Not used in V2, but accepted for API compatibility
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -21,7 +21,7 @@ from letta.agents.helpers import (
)
from letta.agents.letta_agent_v2 import LettaAgentV2
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -45,6 +45,7 @@ from letta.schemas.letta_response import LettaResponse, TurnTokenData
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, ToolReturn
from letta.schemas.openai.chat_completion_response import ChoiceLogprobs, ToolCall, ToolCallDenial, UsageStatistics
from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -149,6 +150,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -232,6 +234,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
credit_task = None
@@ -362,6 +365,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.
@@ -419,6 +423,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
elif use_sglang_native:
# Use SGLang native adapter for multi-turn RL training
@@ -431,6 +436,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
# Reset turns tracking for this step
self.turns = []
@@ -444,6 +450,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
billing_context=billing_context,
)
try:
@@ -764,7 +771,12 @@ class LettaAgentV3(LettaAgentV2):
]
else:
# Old behavior: UserMessage with packed JSON
return list(Message.to_letta_messages(summary_message))
messages = list(Message.to_letta_messages(summary_message))
# Set otid on returned messages (summary Message doesn't have otid set at creation)
for i, msg in enumerate(messages):
if not msg.otid:
msg.otid = Message.generate_otid_from_id(summary_message.id, i)
return messages
@trace_method
async def _step(
@@ -990,6 +1002,9 @@ class LettaAgentV3(LettaAgentV2):
except ValueError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
raise e
except LLMEmptyResponseError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
raise e
except LLMError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
raise e

View File

@@ -134,7 +134,7 @@ def _flatten_model_settings(d: dict, env_vars: dict[str, str]) -> None:
api_base: yyy -> OPENAI_API_BASE
anthropic:
api_key: zzz -> ANTHROPIC_API_KEY
global_max_context_window_limit: 32000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
global_max_context_window_limit: 128000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
"""
for key, value in d.items():
if isinstance(value, dict):

View File

@@ -74,7 +74,7 @@ DEFAULT_MAX_STEPS = 50
# context window size
MIN_CONTEXT_WINDOW = 4096
DEFAULT_CONTEXT_WINDOW = 32000
DEFAULT_CONTEXT_WINDOW = 128000
# Summarization trigger threshold (multiplier of context_window limit)
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
@@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = {
"deepseek-reasoner": 64000,
# glm (Z.AI)
"glm-4.5": 128000,
"glm-4.6": 200000,
"glm-4.7": 200000,
"glm-5": 200000,
"glm-5-code": 200000,
"glm-4.6": 180000,
"glm-4.7": 180000,
"glm-5": 180000,
"glm-5-code": 180000,
## OpenAI models: https://platform.openai.com/docs/models/overview
# gpt-5
"gpt-5": 272000,
@@ -278,6 +278,8 @@ LLM_MAX_CONTEXT_WINDOW = {
"gpt-5.2-pro": 272000,
"gpt-5.2-pro-2025-12-11": 272000,
"gpt-5.2-codex": 272000,
# gpt-5.3
"gpt-5.3-codex": 272000,
# reasoners
"o1": 200000,
# "o1-pro": 200000, # responses API only
@@ -419,7 +421,7 @@ MAX_ERROR_MESSAGE_CHAR_LIMIT = 1000
# Default memory limits
CORE_MEMORY_PERSONA_CHAR_LIMIT: int = 20000
CORE_MEMORY_HUMAN_CHAR_LIMIT: int = 20000
CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 20000
CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 100000
# Function return limits
FUNCTION_RETURN_CHAR_LIMIT = 50000 # ~300 words

View File

@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
while processing the request."""
class LLMEmptyResponseError(LLMServerError):
"""Error when LLM returns an empty response (no content and no tool calls).
This is a subclass of LLMServerError to maintain retry behavior, but allows
specific handling for empty response cases which may benefit from request
modification before retry.
"""
class LLMTimeoutError(LLMError):
"""Error when LLM request times out"""

View File

@@ -13,6 +13,7 @@ from letta.schemas.letta_message import MessageType
from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.message import Message, MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -69,6 +70,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
use_assistant_message: bool = True,
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
run_ids = []
@@ -100,6 +102,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
run_id=run_id,
use_assistant_message=use_assistant_message,
include_return_message_types=include_return_message_types,
billing_context=billing_context,
)
# Get last response messages

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -62,6 +64,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
await self.run_sleeptime_agents()
@@ -81,6 +84,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
include_return_message_types: list[MessageType] | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -99,6 +103,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -14,6 +14,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -63,6 +65,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
run_ids = await self.run_sleeptime_agents()
@@ -82,6 +85,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -101,6 +105,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -30,6 +30,7 @@ from anthropic.types.beta import (
)
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
from letta.errors import LLMEmptyResponseError
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.log import get_logger
from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
self.inner_thoughts_complete = False
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
# Track whether any content was produced (text or tool calls)
# Used to detect empty responses from models like Opus 4.6
self.has_content = False
# Buffer to handle partial XML tags across chunks
self.partial_tag_buffer = ""
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
# message_delta event are *cumulative*." So we assign, not accumulate.
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
# Don't do anything here! We don't want to stop the stream.
pass
# Check if any content was produced during the stream
# Empty responses (no text and no tool calls) should raise an error
if not self.has_content:
raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
)
elif isinstance(event, BetaRawContentBlockStopEvent):
# If we're exiting a tool use block and there are still buffered messages,
# we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
# Don't do anything here! We don't want to stop the stream.
pass
# Check if any content was produced during the stream
# Empty responses (no text and no tool calls) should raise an error
if not self.has_content:
raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
)
elif isinstance(event, BetaRawContentBlockStopEvent):
self.anthropic_mode = None

View File

@@ -19,6 +19,8 @@ from letta.errors import (
LLMAuthenticationError,
LLMBadRequestError,
LLMConnectionError,
LLMEmptyResponseError,
LLMError,
LLMInsufficientCreditsError,
LLMNotFoundError,
LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
@trace_method
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
# Pass through errors that are already LLMError instances unchanged
# This preserves specific error types like LLMEmptyResponseError
if isinstance(e, LLMError):
return e
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
response.stop_reason,
json.dumps(response_data),
)
raise LLMServerError(
raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={

View File

@@ -9,7 +9,7 @@ from letta.llm_api.google_constants import GOOGLE_MODEL_FOR_API_KEY_CHECK
from letta.llm_api.google_vertex_client import GoogleVertexClient
from letta.log import get_logger
from letta.schemas.llm_config import LLMConfig
from letta.settings import model_settings, settings
from letta.settings import model_settings
logger = get_logger(__name__)
@@ -18,7 +18,7 @@ class GoogleAIClient(GoogleVertexClient):
provider_label = "Google AI"
def _get_client(self, llm_config: Optional[LLMConfig] = None):
timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = self.get_byok_overrides(llm_config)
@@ -30,7 +30,7 @@ class GoogleAIClient(GoogleVertexClient):
)
async def _get_client_async(self, llm_config: Optional[LLMConfig] = None):
timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = await self.get_byok_overrides_async(llm_config)

View File

@@ -14,7 +14,7 @@ from letta.schemas.enums import AgentType, LLMCallType, ProviderCategory
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
from letta.schemas.provider_trace import ProviderTrace
from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.usage import LettaUsageStatistics
from letta.services.telemetry_manager import TelemetryManager
from letta.settings import settings
@@ -48,6 +48,7 @@ class LLMClientBase:
self._telemetry_user_id: Optional[str] = None
self._telemetry_compaction_settings: Optional[Dict] = None
self._telemetry_llm_config: Optional[Dict] = None
self._telemetry_billing_context: Optional[BillingContext] = None
def set_telemetry_context(
self,
@@ -62,6 +63,7 @@ class LLMClientBase:
compaction_settings: Optional[Dict] = None,
llm_config: Optional[Dict] = None,
actor: Optional["User"] = None,
billing_context: Optional[BillingContext] = None,
) -> None:
"""Set telemetry context for provider trace logging."""
if actor is not None:
@@ -76,6 +78,7 @@ class LLMClientBase:
self._telemetry_user_id = user_id
self._telemetry_compaction_settings = compaction_settings
self._telemetry_llm_config = llm_config
self._telemetry_billing_context = billing_context
def extract_usage_statistics(self, response_data: Optional[dict], llm_config: LLMConfig) -> LettaUsageStatistics:
"""Provider-specific usage parsing hook (override in subclasses). Returns LettaUsageStatistics."""
@@ -125,6 +128,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
billing_context=self._telemetry_billing_context,
),
)
except Exception as e:
@@ -186,6 +190,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
billing_context=self._telemetry_billing_context,
),
)
except Exception as e:

View File

@@ -88,7 +88,7 @@ def supports_none_reasoning_effort(model: str) -> bool:
Currently, GPT-5.1 and GPT-5.2 models support the 'none' reasoning effort level.
"""
return model.startswith("gpt-5.1") or model.startswith("gpt-5.2")
return model.startswith("gpt-5.1") or model.startswith("gpt-5.2") or model.startswith("gpt-5.3")
def is_openai_5_model(model: str) -> bool:
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
input=openai_messages_list,
tools=responses_tools,
tool_choice=tool_choice,
max_output_tokens=llm_config.max_tokens,
temperature=llm_config.temperature if supports_temperature_param(model) else None,
parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
)
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
# Handle text configuration (verbosity and response format)
text_config_kwargs = {}
# Only set max_output_tokens if explicitly configured
if llm_config.max_tokens is not None:
data.max_output_tokens = llm_config.max_tokens
# Add verbosity control for GPT-5 models
if supports_verbosity_control(model) and llm_config.verbosity:
text_config_kwargs["verbosity"] = llm_config.verbosity
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
)
request_data = data.model_dump(exclude_unset=True)
# print("responses request data", request_data)
return request_data
@trace_method
@@ -639,6 +641,14 @@ class OpenAIClient(LLMClientBase):
tool.function.strict = False
request_data = data.model_dump(exclude_unset=True)
# Fireworks uses strict validation (additionalProperties: false) and rejects
# reasoning fields that are not in their schema.
is_fireworks = llm_config.model_endpoint and "fireworks.ai" in llm_config.model_endpoint
if is_fireworks and "messages" in request_data:
for message in request_data["messages"]:
for field in ("reasoning_content_signature", "redacted_reasoning_content", "omitted_reasoning_content"):
message.pop(field, None)
# If Ollama
# if llm_config.handle.startswith("ollama/") and llm_config.enable_reasoner:
# Sadly, reasoning via the OpenAI proxy on Ollama only works for Harmony/gpt-oss

View File

@@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient):
}
}
# Z.ai's API uses max_tokens, not max_completion_tokens.
# If max_completion_tokens is sent, Z.ai ignores it and falls back to its
# default of 65536, silently truncating input to ~137K of the 200K context window.
if "max_completion_tokens" in data:
data["max_tokens"] = data.pop("max_completion_tokens")
# Sanitize empty text content — ZAI rejects empty text blocks
if "messages" in data:
for msg in data["messages"]:

View File

@@ -17295,6 +17295,58 @@
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5.3-chat-latest": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"max_tokens": 16384,
"mode": "chat",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/chat/completions", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5.3-codex": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": false,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5-mini": {
"cache_read_input_token_cost": 2.5e-8,
"cache_read_input_token_cost_flex": 1.25e-8,

View File

@@ -44,7 +44,7 @@ class Conversation(SqlalchemyBase, OrganizationMixin):
"ConversationMessage",
back_populates="conversation",
cascade="all, delete-orphan",
lazy="selectin",
lazy="raise",
)
isolated_blocks: Mapped[List["Block"]] = relationship(
"Block",

View File

@@ -69,5 +69,5 @@ class ConversationMessage(SqlalchemyBase, OrganizationMixin):
)
message: Mapped["Message"] = relationship(
"Message",
lazy="selectin",
lazy="raise",
)

View File

@@ -88,8 +88,7 @@ class LettaRequest(BaseModel):
)
top_logprobs: Optional[int] = Field(
default=None,
description="Number of most likely tokens to return at each position (0-20). "
"Requires return_logprobs=True.",
description="Number of most likely tokens to return at each position (0-20). Requires return_logprobs=True.",
)
return_token_ids: bool = Field(
default=False,
@@ -155,6 +154,10 @@ class LettaStreamingRequest(LettaRequest):
class ConversationMessageRequest(LettaRequest):
"""Request for sending messages to a conversation. Streams by default."""
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
streaming: bool = Field(
default=True,
description="If True (default), returns a streaming response (Server-Sent Events). If False, returns a complete JSON response.",
@@ -194,6 +197,10 @@ class CreateBatch(BaseModel):
class RetrieveStreamRequest(BaseModel):
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
starting_after: int = Field(
0, description="Sequence id to use as a cursor for pagination. Response will start streaming after this chunk sequence id"
)

View File

@@ -1,3 +1,4 @@
import re
from typing import TYPE_CHECKING, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
# Set max_tokens defaults based on model (only if not explicitly provided)
if "max_tokens" not in values:
if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
values["max_tokens"] = 128000
elif model.startswith("gpt-5"):
values["max_tokens"] = 16384
elif model == "gpt-4.1":
values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
context_window=272000,
reasoning_effort="none", # Default to "none" for GPT-5.2
verbosity="medium",
max_tokens=16384,
max_tokens=128000,
)
elif model_name == "letta":
return cls(

View File

@@ -95,6 +95,11 @@ class LLMTrace(LettaBase):
response_json: str = Field(..., description="Full response payload as JSON string")
llm_config_json: str = Field(default="", description="LLM config as JSON string")
# Billing context
billing_plan_type: Optional[str] = Field(default=None, description="Subscription tier (e.g., 'basic', 'standard', 'max', 'enterprise')")
billing_cost_source: Optional[str] = Field(default=None, description="Cost source: 'quota' or 'credits'")
billing_customer_id: Optional[str] = Field(default=None, description="Customer ID for cross-referencing billing records")
# Timestamp
created_at: datetime = Field(default_factory=get_utc_time, description="When the trace was created")
@@ -128,6 +133,9 @@ class LLMTrace(LettaBase):
self.request_json,
self.response_json,
self.llm_config_json,
self.billing_plan_type or "",
self.billing_cost_source or "",
self.billing_customer_id or "",
self.created_at,
)
@@ -162,5 +170,8 @@ class LLMTrace(LettaBase):
"request_json",
"response_json",
"llm_config_json",
"billing_plan_type",
"billing_cost_source",
"billing_customer_id",
"created_at",
]

View File

@@ -226,8 +226,6 @@ class Memory(BaseModel, validate_assignment=True):
front_lines = []
if block.description:
front_lines.append(f"description: {block.description}")
if block.limit is not None:
front_lines.append(f"limit: {block.limit}")
if getattr(block, "read_only", False):
front_lines.append("read_only: true")
@@ -291,7 +289,40 @@ class Memory(BaseModel, validate_assignment=True):
s.write("\n\n<memory_filesystem>\n")
def _render_tree(node: dict, prefix: str = ""):
def _render_tree(node: dict, prefix: str = "", in_system: bool = False, path_parts: tuple[str, ...] = ()):
# Render skills/ as concise top-level entries only, using both
# current (`skills/<name>`) and legacy (`skills/<name>/SKILL`) labels.
if path_parts == ("skills",):
skill_entries: list[tuple[str, str]] = []
for name, val in node.items():
if name == LEAF_KEY:
continue
block = None
if isinstance(val, dict):
legacy_skill_block = val.get("SKILL")
if legacy_skill_block is not None and not isinstance(legacy_skill_block, dict):
block = legacy_skill_block
elif LEAF_KEY in val and not isinstance(val[LEAF_KEY], dict):
block = val[LEAF_KEY]
else:
block = val
if block is None:
continue
desc = getattr(block, "description", None)
desc_line = (desc or "").strip().split("\n")[0].strip()
skill_entries.append((name, desc_line))
skill_entries.sort(key=lambda e: e[0])
for i, (name, desc_line) in enumerate(skill_entries):
is_last = i == len(skill_entries) - 1
connector = "└── " if is_last else "├── "
desc_suffix = f" ({desc_line})" if desc_line else ""
s.write(f"{prefix}{connector}{name}{desc_suffix}\n")
return
# Sort: directories first, then files. If a node is both a directory and a
# leaf (LEAF_KEY present), show both <name>/ and <name>.md.
dirs = []
@@ -316,9 +347,24 @@ class Memory(BaseModel, validate_assignment=True):
if is_dir:
s.write(f"{prefix}{connector}{name}/\n")
extension = " " if is_last else ""
_render_tree(node[name], prefix + extension)
_render_tree(
node[name],
prefix + extension,
in_system=in_system or name == "system",
path_parts=(*path_parts, name),
)
else:
s.write(f"{prefix}{connector}{name}.md\n")
# For files outside system/, append the block description
desc_suffix = ""
if not in_system:
val = node[name]
block = val[LEAF_KEY] if isinstance(val, dict) else val
desc = getattr(block, "description", None)
if desc:
desc_line = desc.strip().split("\n")[0].strip()
if desc_line:
desc_suffix = f" ({desc_line})"
s.write(f"{prefix}{connector}{name}.md{desc_suffix}\n")
_render_tree(tree)
s.write("</memory_filesystem>")

View File

@@ -282,10 +282,10 @@ class AnthropicModelSettings(ModelSettings):
description="Soft control for how verbose model output should be, used for GPT-5 models.",
)
# Opus 4.5 effort parameter
effort: Optional[Literal["low", "medium", "high"]] = Field(
# Effort parameter for Opus 4.5, Opus 4.6, and Sonnet 4.6
effort: Optional[Literal["low", "medium", "high", "max"]] = Field(
None,
description="Effort level for Opus 4.5 model (controls token conservation). Not setting this gives similar performance to 'high'.",
description="Effort level for supported Anthropic models (controls token spending). 'max' is only available on Opus 4.6. Not setting this gives similar performance to 'high'.",
)
# Anthropic supports strict mode for tool calling - defaults to False

View File

@@ -3,13 +3,21 @@ from __future__ import annotations
from datetime import datetime
from typing import Any, Dict, Optional
from pydantic import Field
from pydantic import BaseModel, Field
from letta.helpers.datetime_helpers import get_utc_time
from letta.schemas.enums import PrimitiveType
from letta.schemas.letta_base import OrmMetadataBase
class BillingContext(BaseModel):
    """Billing context for LLM request cost tracking.

    Built from the optional ``X-Billing-*`` request headers and threaded through
    message-send paths so provider traces can attribute LLM cost to a customer
    and plan. All fields are optional; the object is only constructed when at
    least one billing header is present.
    """

    # Subscription tier name (e.g. "basic", "enterprise") — presumably mirrors
    # LLMTrace.billing_plan_type; confirm against the trace schema.
    plan_type: Optional[str] = Field(None, description="Subscription tier")
    # Which pool the cost draws from; expected values are 'quota' or 'credits'.
    cost_source: Optional[str] = Field(None, description="Cost source: 'quota' or 'credits'")
    # Billing-system customer identifier for cross-referencing billing records.
    customer_id: Optional[str] = Field(None, description="Customer ID for billing records")
class BaseProviderTrace(OrmMetadataBase):
__id_prefix__ = PrimitiveType.PROVIDER_TRACE.value
@@ -53,6 +61,8 @@ class ProviderTrace(BaseProviderTrace):
compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)")
llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)")
billing_context: Optional[BillingContext] = Field(None, description="Billing context from request headers")
created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")

View File

@@ -14,7 +14,7 @@ from letta.schemas.providers.base import Provider
logger = get_logger(__name__)
ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"}
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro", "chat"}
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro"}
DEFAULT_EMBEDDING_BATCH_SIZE = 1024
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
except Exception as e:
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
@staticmethod
def _openai_default_max_output_tokens(model_name: str) -> int:
"""Return a sensible max-output-tokens default for OpenAI models.
gpt-5.2* / gpt-5.3* support 128k output tokens, except the
`-chat` variants which are capped at 16k.
"""
import re
if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
return 128000
return 16384
def get_default_max_output_tokens(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models (sync fallback)."""
# Simple default for openai
return 16384
return self._openai_default_max_output_tokens(model_name)
async def get_default_max_output_tokens_async(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
if max_output is not None:
return max_output
# Simple default for openai
return 16384
return self._openai_default_max_output_tokens(model_name)
async def _get_models_async(self) -> list[dict]:
from letta.llm_api.openai import openai_get_model_list_async

View File

@@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider
# Z.ai model context windows
# Reference: https://docs.z.ai/
# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k) counts against that --> 180k
MODEL_CONTEXT_WINDOWS = {
"glm-4.5": 128000,
"glm-4.6": 200000,
"glm-4.7": 200000,
"glm-5": 200000,
"glm-5-code": 200000,
"glm-4.6": 180000,
"glm-4.7": 180000,
"glm-5": 180000,
"glm-5-code": 180000,
}

View File

@@ -3,7 +3,7 @@ import uuid
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from sqlalchemy import NullPool, text
from sqlalchemy import NullPool
from sqlalchemy.ext.asyncio import (
AsyncEngine,
AsyncSession,
@@ -88,10 +88,6 @@ class DatabaseRegistry:
try:
async with async_session_factory() as session:
try:
result = await session.execute(text("SELECT pg_backend_pid(), current_setting('statement_timeout')"))
pid, timeout = result.one()
logger.warning(f"[stmt_timeout_debug] pid={pid} statement_timeout={timeout}")
await session.rollback()
yield session
await session.commit()
except asyncio.CancelledError:

View File

@@ -6,6 +6,7 @@ from pydantic import BaseModel
from letta.errors import LettaInvalidArgumentError
from letta.otel.tracing import tracer
from letta.schemas.enums import PrimitiveType
from letta.schemas.provider_trace import BillingContext
from letta.validators import PRIMITIVE_ID_PATTERNS
if TYPE_CHECKING:
@@ -30,18 +31,24 @@ class HeaderParams(BaseModel):
letta_source: Optional[str] = None
sdk_version: Optional[str] = None
experimental_params: Optional[ExperimentalParams] = None
billing_context: Optional[BillingContext] = None
def get_headers(
actor_id: Optional[str] = Header(None, alias="user_id"),
user_agent: Optional[str] = Header(None, alias="User-Agent"),
project_id: Optional[str] = Header(None, alias="X-Project-Id"),
letta_source: Optional[str] = Header(None, alias="X-Letta-Source"),
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version"),
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async"),
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent"),
letta_v1_agent_message_async: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent-Message-Async"),
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox"),
letta_source: Optional[str] = Header(None, alias="X-Letta-Source", include_in_schema=False),
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version", include_in_schema=False),
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async", include_in_schema=False),
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent", include_in_schema=False),
letta_v1_agent_message_async: Optional[str] = Header(
None, alias="X-Experimental-Letta-V1-Agent-Message-Async", include_in_schema=False
),
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox", include_in_schema=False),
billing_plan_type: Optional[str] = Header(None, alias="X-Billing-Plan-Type", include_in_schema=False),
billing_cost_source: Optional[str] = Header(None, alias="X-Billing-Cost-Source", include_in_schema=False),
billing_customer_id: Optional[str] = Header(None, alias="X-Billing-Customer-Id", include_in_schema=False),
) -> HeaderParams:
"""Dependency injection function to extract common headers from requests."""
with tracer.start_as_current_span("dependency.get_headers"):
@@ -63,6 +70,13 @@ def get_headers(
letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None,
modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None,
),
billing_context=BillingContext(
plan_type=billing_plan_type,
cost_source=billing_cost_source,
customer_id=billing_customer_id,
)
if any([billing_plan_type, billing_cost_source, billing_customer_id])
else None,
)

View File

@@ -49,6 +49,7 @@ from letta.schemas.memory import (
)
from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult
from letta.schemas.passage import Passage
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.source import Source
from letta.schemas.tool import Tool
@@ -156,7 +157,7 @@ async def list_agents(
order: Literal["asc", "desc"] = Query(
"desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first"
),
order_by: Literal["created_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
order_by: Literal["created_at", "updated_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
ascending: bool = Query(
False,
description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)",
@@ -1697,6 +1698,7 @@ async def send_message(
actor=actor,
request=request,
run_type="send_message",
billing_context=headers.billing_context,
)
return result
@@ -1767,6 +1769,7 @@ async def send_message(
include_return_message_types=request.include_return_message_types,
client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
)
run_status = result.stop_reason.stop_reason.run_status
return result
@@ -1845,6 +1848,7 @@ async def send_message_streaming(
actor=actor,
request=request,
run_type="send_message_streaming",
billing_context=headers.billing_context,
)
return result
@@ -1868,6 +1872,13 @@ async def cancel_message(
"""
# TODO: WHY DOES THIS CANCEL A LIST OF RUNS?
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for agent=%s by actor=%s (org=%s), explicit_run_ids=%s",
agent_id,
actor.id,
actor.organization_id,
request.run_ids if request else None,
)
if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
run_ids = request.run_ids if request else None
@@ -2036,6 +2047,7 @@ async def _process_message_background(
include_return_message_types: list[MessageType] | None = None,
override_model: str | None = None,
include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> None:
"""Background task to process the message and update run status."""
request_start_timestamp_ns = get_utc_timestamp_ns()
@@ -2067,6 +2079,7 @@ async def _process_message_background(
request_start_timestamp_ns=request_start_timestamp_ns,
include_return_message_types=include_return_message_types,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
runs_manager = RunManager()
from letta.schemas.enums import RunStatus
@@ -2235,6 +2248,7 @@ async def send_message_async(
include_return_message_types=request.include_return_message_types,
override_model=request.override_model,
include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
),
label=f"process_message_background_{run.id}",
)
@@ -2419,7 +2433,11 @@ async def summarize_messages(
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode:
if (
"mode" in changed_fields
and "prompt" not in changed_fields
and agent.compaction_settings.mode != request.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
@@ -2439,7 +2457,7 @@ async def summarize_messages(
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
return CompactionResponse(

View File

@@ -1,5 +1,6 @@
from datetime import timedelta
from typing import Annotated, List, Literal, Optional
from uuid import uuid4
from fastapi import APIRouter, Body, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field
@@ -18,6 +19,7 @@ from letta.schemas.job import LettaRequestConfig
from letta.schemas.letta_message import LettaMessageUnion
from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest
from letta.schemas.letta_response import LettaResponse
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator
@@ -32,7 +34,7 @@ from letta.services.run_manager import RunManager
from letta.services.streaming_service import StreamingService
from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.settings import settings
from letta.validators import ConversationId
from letta.validators import ConversationId, ConversationIdOrDefault
router = APIRouter(prefix="/conversations", tags=["conversations"])
@@ -148,7 +150,8 @@ ConversationMessagesResponse = Annotated[
operation_id="list_conversation_messages",
)
async def list_conversation_messages(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
before: Optional[str] = Query(
@@ -172,8 +175,36 @@ async def list_conversation_messages(
Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all
messages in the conversation, with support for cursor-based pagination.
**Agent-direct mode**: Pass conversation_id="default" with agent_id parameter
to list messages from the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
return await server.get_agent_recall_async(
agent_id=resolved_agent_id,
after=after,
before=before,
limit=limit,
group_id=group_id,
conversation_id=None, # Default conversation (no isolation)
reverse=(order == "desc"),
return_message_object=False,
include_err=include_err,
actor=actor,
)
return await conversation_manager.list_conversation_messages(
conversation_id=conversation_id,
actor=actor,
@@ -186,6 +217,108 @@ async def list_conversation_messages(
)
async def _send_agent_direct_message(
    agent_id: str,
    request: ConversationMessageRequest,
    server: SyncServer,
    actor,
    billing_context: "BillingContext | None" = None,
) -> StreamingResponse | LettaResponse:
    """
    Handle agent-direct messaging with locking but without conversation features.

    This is used when the conversation_id in the URL is actually an agent ID,
    providing a unified endpoint while maintaining agent-level locking.

    Streaming requests (the default) are delegated to StreamingService, which
    handles its own locking (should_lock=True). Non-streaming requests acquire
    a Redis conversation lock keyed by agent_id, optionally create a tracking
    run, and execute the agent loop inline, releasing the lock in a finally
    block.
    """
    redis_client = await get_redis_client()

    # Streaming mode (default): translate the conversation-style request into a
    # LettaStreamingRequest and hand it to the streaming service.
    if request.streaming:
        streaming_request = LettaStreamingRequest(
            messages=request.messages,
            streaming=True,
            stream_tokens=request.stream_tokens,
            include_pings=request.include_pings,
            background=request.background,
            max_steps=request.max_steps,
            use_assistant_message=request.use_assistant_message,
            assistant_message_tool_name=request.assistant_message_tool_name,
            assistant_message_tool_kwarg=request.assistant_message_tool_kwarg,
            include_return_message_types=request.include_return_message_types,
            override_model=request.override_model,
            client_tools=request.client_tools,
        )

        streaming_service = StreamingService(server)
        # conversation_id=None targets the agent's default conversation;
        # should_lock=True preserves agent-level serialization of requests.
        run, result = await streaming_service.create_agent_stream(
            agent_id=agent_id,
            actor=actor,
            request=streaming_request,
            run_type="send_message",
            conversation_id=None,
            should_lock=True,
            billing_context=billing_context,
        )
        return result

    # Non-streaming mode with locking
    agent = await server.agent_manager.get_agent_by_id_async(
        agent_id,
        actor,
        include_relationships=["memory", "multi_agent_group", "sources", "tool_exec_environment_variables", "tools", "tags"],
    )

    # Handle model override if specified in the request (copy-on-write: the
    # stored agent state is not mutated).
    if request.override_model:
        override_llm_config = await server.get_llm_config_from_handle_async(
            actor=actor,
            handle=request.override_model,
        )
        agent = agent.model_copy(update={"llm_config": override_llm_config})

    # Acquire lock using agent_id as lock key. Skipped for the no-op Redis
    # client; NOTE(review): the release in the finally block below runs
    # unconditionally — presumably a no-op on NoopAsyncRedisClient, confirm.
    if not isinstance(redis_client, NoopAsyncRedisClient):
        await redis_client.acquire_conversation_lock(
            conversation_id=agent_id,
            token=str(uuid4()),
        )

    try:
        # Create a run for execution tracking (only when run tracking is
        # enabled in settings; otherwise run stays None).
        run = None
        if settings.track_agent_run:
            runs_manager = RunManager()
            run = await runs_manager.create_run(
                pydantic_run=PydanticRun(
                    agent_id=agent_id,
                    background=False,
                    metadata={
                        "run_type": "send_message",
                    },
                    request_config=LettaRequestConfig.from_letta_request(request),
                ),
                actor=actor,
            )

        # Set run_id in Redis for cancellation support
        await redis_client.set(f"{REDIS_RUN_ID_PREFIX}:{agent_id}", run.id if run else None)

        # Execute the agent loop synchronously; conversation_id=None selects
        # the agent's default conversation (no conversation isolation).
        agent_loop = AgentLoop.load(agent_state=agent, actor=actor)
        return await agent_loop.step(
            request.messages,
            max_steps=request.max_steps,
            run_id=run.id if run else None,
            use_assistant_message=request.use_assistant_message,
            include_return_message_types=request.include_return_message_types,
            client_tools=request.client_tools,
            conversation_id=None,
            include_compaction_messages=request.include_compaction_messages,
            billing_context=billing_context,
        )
    finally:
        # Release lock
        await redis_client.release_conversation_lock(agent_id)
@router.post(
"/{conversation_id}/messages",
response_model=LettaResponse,
@@ -201,7 +334,7 @@ async def list_conversation_messages(
},
)
async def send_conversation_message(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
request: ConversationMessageRequest = Body(...),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
@@ -212,12 +345,36 @@ async def send_conversation_message(
This endpoint sends a message to an existing conversation.
By default (streaming=true), returns a streaming response (Server-Sent Events).
Set streaming=false to get a complete JSON response.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to send messages to the agent's default conversation with locking.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
if not request.messages or len(request.messages) == 0:
raise HTTPException(status_code=422, detail="Messages must not be empty")
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: use agent ID, enable locking, skip conversation features
return await _send_agent_direct_message(
agent_id=resolved_agent_id,
request=request,
server=server,
actor=actor,
billing_context=headers.billing_context,
)
# Normal conversation mode
conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id,
actor=actor,
@@ -247,6 +404,7 @@ async def send_conversation_message(
request=streaming_request,
run_type="send_conversation_message",
conversation_id=conversation_id,
billing_context=headers.billing_context,
)
return result
@@ -265,6 +423,10 @@ async def send_conversation_message(
)
if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -305,6 +467,7 @@ async def send_conversation_message(
client_tools=request.client_tools,
conversation_id=conversation_id,
include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
)
@@ -341,7 +504,7 @@ async def send_conversation_message(
},
)
async def retrieve_conversation_stream(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
request: RetrieveStreamRequest = Body(None),
headers: HeaderParams = Depends(get_headers),
server: SyncServer = Depends(get_letta_server),
@@ -351,11 +514,35 @@ async def retrieve_conversation_stream(
This endpoint allows you to reconnect to an active background stream
for a conversation, enabling recovery from network interruptions.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to retrieve the stream for the agent's most recent active run.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
runs_manager = RunManager()
# Find the most recent active run for this conversation
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
# Find the most recent active run
if resolved_agent_id:
# Agent-direct mode: find runs by agent_id
active_runs = await runs_manager.list_runs(
actor=actor,
agent_id=resolved_agent_id,
statuses=[RunStatus.created, RunStatus.running],
limit=1,
ascending=False,
)
else:
# Normal mode: find runs by conversation_id
active_runs = await runs_manager.list_runs(
actor=actor,
conversation_id=conversation_id,
@@ -417,7 +604,8 @@ async def retrieve_conversation_stream(
@router.post("/{conversation_id}/cancel", operation_id="cancel_conversation")
async def cancel_conversation(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
) -> dict:
@@ -425,17 +613,48 @@ async def cancel_conversation(
Cancel runs associated with a conversation.
Note: To cancel active runs, Redis is required.
**Agent-direct mode**: Pass conversation_id="default" with agent_id query parameter
to cancel runs for the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for conversation=%s by actor=%s (org=%s)",
conversation_id,
actor.id,
actor.organization_id,
)
if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: use agent_id directly, skip conversation lookup
# Find active runs for this agent (default conversation has conversation_id=None)
runs = await server.run_manager.list_runs(
actor=actor,
agent_id=resolved_agent_id,
statuses=[RunStatus.created, RunStatus.running],
ascending=False,
limit=100,
)
else:
# Verify conversation exists and get agent_id
conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id,
actor=actor,
)
agent_id = conversation.agent_id
# Find active runs for this conversation
runs = await server.run_manager.list_runs(
@@ -445,6 +664,7 @@ async def cancel_conversation(
conversation_id=conversation_id,
limit=100,
)
run_ids = [run.id for run in runs]
if not run_ids:
@@ -461,7 +681,7 @@ async def cancel_conversation(
except Exception as e:
logger.error(f"Failed to cancel Lettuce run {run_id}: {e}")
await server.run_manager.cancel_run(actor=actor, agent_id=conversation.agent_id, run_id=run_id)
await server.run_manager.cancel_run(actor=actor, agent_id=agent_id, run_id=run_id)
except Exception as e:
results[run_id] = "failed"
logger.error(f"Failed to cancel run {run_id}: {str(e)}")
@@ -473,6 +693,10 @@ async def cancel_conversation(
class CompactionRequest(BaseModel):
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
compaction_settings: Optional[CompactionSettings] = Field(
default=None,
description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.",
@@ -487,7 +711,7 @@ class CompactionResponse(BaseModel):
@router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation")
async def compact_conversation(
conversation_id: ConversationId,
conversation_id: ConversationIdOrDefault,
request: Optional[CompactionRequest] = Body(default=None),
server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers),
@@ -497,9 +721,28 @@ async def compact_conversation(
This endpoint summarizes the in-context messages for a specific conversation,
reducing the message count while preserving important context.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to compact the agent's default conversation messages.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
"""
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: compact agent's default conversation
agent = await server.agent_manager.get_agent_by_id_async(resolved_agent_id, actor, include_relationships=["multi_agent_group"])
in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
else:
# Get the conversation to find the agent_id
conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id,
@@ -515,16 +758,36 @@ async def compact_conversation(
actor=actor,
)
# Create agent loop with conversation context
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
if not in_context_messages:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No in-context messages found for this conversation.",
)
# Create agent loop with conversation context
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
# Merge request compaction_settings with agent's settings (request overrides agent)
if agent.compaction_settings and request and request.compaction_settings:
# Start with agent's settings, override with new values from request
# Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
compaction_settings = agent.compaction_settings.copy() # do not mutate original agent compaction settings
changed_fields = request.compaction_settings.model_fields_set
for field in changed_fields:
setattr(compaction_settings, field, getattr(request.compaction_settings, field))
compaction_settings = request.compaction_settings if request else None
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if (
"mode" in changed_fields
and "prompt" not in changed_fields
and agent.compaction_settings.mode != request.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
else:
compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
num_messages_before = len(in_context_messages)
# Run compaction
@@ -537,13 +800,11 @@ async def compact_conversation(
# Validate compaction reduced messages
if num_messages_before <= num_messages_after:
logger.warning(
f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
)
# raise HTTPException(
# status_code=status.HTTP_400_BAD_REQUEST,
# detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
# )
# Checkpoint the messages (this will update the conversation_messages table)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)

View File

@@ -29,11 +29,23 @@ from starlette.background import BackgroundTask
from letta.log import get_logger
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
logger = get_logger(__name__)
_background_tasks: set[asyncio.Task] = set()
def _is_syncable_block_markdown_path(path: str) -> bool:
"""Return whether a markdown path should be mirrored into block cache.
Special-case skills so only skill definitions are mirrored:
- sync `skills/{skill_name}/SKILL.md` as label `skills/{skill_name}`
- ignore all other markdown under `skills/`
"""
return memory_block_label_from_markdown_path(path) is not None
router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False)
# Global storage for the server instance (set during app startup)
@@ -100,7 +112,7 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
expected_labels = set()
from letta.services.memory_repo.block_markdown import parse_block_markdown
md_file_paths = sorted([file_path for file_path in files if file_path.endswith(".md")])
md_file_paths = sorted([file_path for file_path in files if _is_syncable_block_markdown_path(file_path)])
nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]]
logger.info(
"Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s",
@@ -113,10 +125,12 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
synced = 0
for file_path, content in files.items():
if not file_path.endswith(".md"):
if not _is_syncable_block_markdown_path(file_path):
continue
label = file_path[:-3]
label = memory_block_label_from_markdown_path(file_path)
if label is None:
continue
expected_labels.add(label)
# Parse frontmatter to extract metadata alongside value

View File

@@ -364,6 +364,8 @@ def create_approval_request_message_from_llm_response(
)
if pre_computed_assistant_message_id:
approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id)
# Set otid to match streaming interface pattern (index -1 returns id unchanged)
approval_message.otid = Message.generate_otid_from_id(approval_message.id, -1)
messages.append(approval_message)
return messages

View File

@@ -562,6 +562,10 @@ class SyncServer(object):
# update with model_settings
if request.model_settings is not None:
update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.
if "max_output_tokens" not in request.model_settings.model_fields_set:
update_llm_config_params.pop("max_tokens", None)
request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)
# Copy parallel_tool_calls from request to llm_config if provided
@@ -675,6 +679,12 @@ class SyncServer(object):
# Get the current agent's llm_config if not already set
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config = agent.llm_config.model_copy()
else:
# TODO: Refactor update_agent to accept partial llm_config so we
# don't need to fetch the full agent just to preserve max_tokens.
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config.max_tokens = agent.llm_config.max_tokens
update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.

View File

@@ -24,8 +24,7 @@ from letta.constants import (
INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES,
RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE,
)
from letta.errors import LettaAgentNotFoundError, LettaError, LettaInvalidArgumentError
from letta.errors import LettaError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time
from letta.log import get_logger
@@ -789,6 +788,25 @@ class AgentManager:
agent.agent_type,
)
# Upsert compaction_settings: merge incoming partial update with existing settings
if agent_update.compaction_settings is not None:
# If mode changed, update the prompt to the default for the new mode
changed_fields = agent_update.compaction_settings.model_fields_set
if (
agent.compaction_settings is not None
and "mode" in changed_fields
and agent_update.compaction_settings.mode != agent.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
agent_update.compaction_settings.prompt = get_default_prompt_for_mode(agent_update.compaction_settings.mode)
# Fill in unchanged fields from existing settings
if agent.compaction_settings is not None:
for field in agent.compaction_settings.model_fields:
if field not in changed_fields:
setattr(agent_update.compaction_settings, field, getattr(agent.compaction_settings, field))
scalar_updates = {
"name": agent_update.name,
"system": agent_update.system,

View File

@@ -7,6 +7,7 @@ if TYPE_CHECKING:
from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select
from letta.errors import LettaInvalidArgumentError
from letta.helpers.datetime_helpers import get_utc_time
from letta.orm.agent import Agent as AgentModel
from letta.orm.block import Block as BlockModel
from letta.orm.blocks_conversations import BlocksConversations
@@ -29,6 +30,21 @@ from letta.utils import enforce_types
class ConversationManager:
"""Manager class to handle business logic related to Conversations."""
@staticmethod
def _serialize_model_settings(model_settings) -> Optional[dict]:
"""Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
Uses model_dump() to preserve all fields (including the provider_type discriminator),
but removes max_output_tokens when it wasn't explicitly provided by the caller so we
don't persist the Pydantic default (4096) and later overwrite the agent's own value.
"""
if model_settings is None:
return None
data = model_settings.model_dump()
if "max_output_tokens" not in model_settings.model_fields_set:
data.pop("max_output_tokens", None)
return data
@enforce_types
@trace_method
async def create_conversation(
@@ -56,7 +72,7 @@ class ConversationManager:
summary=conversation_create.summary,
organization_id=actor.organization_id,
model=conversation_create.model,
model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None,
model_settings=self._serialize_model_settings(conversation_create.model_settings),
)
await conversation.create_async(session, actor=actor)
@@ -73,8 +89,102 @@ class ConversationManager:
pydantic_conversation = conversation.to_pydantic()
pydantic_conversation.isolated_block_ids = isolated_block_ids
# Compile and persist the initial system message for this conversation
# This ensures the conversation captures the latest memory block state at creation time
await self.compile_and_save_system_message_for_conversation(
conversation_id=pydantic_conversation.id,
agent_id=agent_id,
actor=actor,
)
return pydantic_conversation
@trace_method
async def compile_and_save_system_message_for_conversation(
self,
conversation_id: str,
agent_id: str,
actor: PydanticUser,
agent_state: Optional["AgentState"] = None,
message_manager: Optional[object] = None,
) -> PydanticMessage:
"""Compile and persist the initial system message for a conversation.
This recompiles the system prompt with the latest memory block values
and metadata, ensuring the conversation starts with an up-to-date
system message.
This is the single source of truth for creating a conversation's system
message — used both at conversation creation time and as a fallback
when a conversation has no messages yet.
Args:
conversation_id: The conversation to add the system message to
agent_id: The agent this conversation belongs to
actor: The user performing the action
agent_state: Optional pre-loaded agent state (avoids redundant DB load)
message_manager: Optional pre-loaded MessageManager instance
Returns:
The persisted system message
"""
# Lazy imports to avoid circular dependencies
from letta.prompts.prompt_generator import PromptGenerator
from letta.services.message_manager import MessageManager
from letta.services.passage_manager import PassageManager
if message_manager is None:
message_manager = MessageManager()
if agent_state is None:
from letta.services.agent_manager import AgentManager
agent_state = await AgentManager().get_agent_by_id_async(
agent_id=agent_id,
include_relationships=["memory", "sources"],
actor=actor,
)
passage_manager = PassageManager()
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_id)
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_id)
# Compile the system message with current memory state
system_message_str = await PromptGenerator.compile_system_message_async(
system_prompt=agent_state.system,
in_context_memory=agent_state.memory,
in_context_memory_last_edit=get_utc_time(),
timezone=agent_state.timezone,
user_defined_variables=None,
append_icm_if_missing=True,
previous_message_count=num_messages,
archival_memory_size=num_archival_memories,
sources=agent_state.sources,
max_files_open=agent_state.max_files_open,
)
system_message = PydanticMessage.dict_to_message(
agent_id=agent_id,
model=agent_state.llm_config.model,
openai_message_dict={"role": "system", "content": system_message_str},
)
# Persist the new system message
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
system_message = persisted_messages[0]
# Add it to the conversation tracking at position 0
await self.add_messages_to_conversation(
conversation_id=conversation_id,
agent_id=agent_id,
message_ids=[system_message.id],
actor=actor,
starting_position=0,
)
return system_message
@enforce_types
@trace_method
async def get_conversation_by_id(
@@ -133,22 +243,15 @@ class ConversationManager:
if sort_by == "last_run_completion":
# Subquery to get the latest completed_at for each conversation
latest_run_subquery = (
select(
RunModel.conversation_id,
func.max(RunModel.completed_at).label("last_run_completion")
)
select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
.where(RunModel.conversation_id.isnot(None))
.group_by(RunModel.conversation_id)
.subquery()
)
# Join conversations with the subquery
stmt = (
select(ConversationModel)
.outerjoin(
latest_run_subquery,
ConversationModel.id == latest_run_subquery.c.conversation_id
)
stmt = select(ConversationModel).outerjoin(
latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
)
sort_column = latest_run_subquery.c.last_run_completion
sort_nulls_last = True
@@ -170,10 +273,12 @@ class ConversationManager:
# Add summary search filter if provided
if summary_search:
conditions.extend([
conditions.extend(
[
ConversationModel.summary.isnot(None),
ConversationModel.summary.contains(summary_search),
])
]
)
stmt = stmt.where(and_(*conditions))
@@ -182,10 +287,7 @@ class ConversationManager:
# Get the sort value for the cursor conversation
if sort_by == "last_run_completion":
cursor_query = (
select(
ConversationModel.id,
func.max(RunModel.completed_at).label("last_run_completion")
)
select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
.outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
.where(ConversationModel.id == after)
.group_by(ConversationModel.id)
@@ -198,16 +300,11 @@ class ConversationManager:
# Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
if ascending:
stmt = stmt.where(
or_(
and_(sort_column.is_(None), ConversationModel.id > after_id),
sort_column.isnot(None)
)
or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
)
else:
# If descending, get NULLs with smaller ID
stmt = stmt.where(
and_(sort_column.is_(None), ConversationModel.id < after_id)
)
stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
else:
# Cursor is at non-NULL
if ascending:
@@ -217,8 +314,8 @@ class ConversationManager:
sort_column.isnot(None),
or_(
sort_column > after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id > after_id)
)
and_(sort_column == after_sort_value, ConversationModel.id > after_id),
),
)
)
else:
@@ -227,7 +324,7 @@ class ConversationManager:
or_(
sort_column.is_(None),
sort_column < after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id < after_id)
and_(sort_column == after_sort_value, ConversationModel.id < after_id),
)
)
else:
@@ -277,7 +374,11 @@ class ConversationManager:
for key, value in update_data.items():
# model_settings needs to be serialized to dict for the JSON column
if key == "model_settings" and value is not None:
setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value)
setattr(
conversation,
key,
self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
)
else:
setattr(conversation, key, value)

View File

@@ -604,6 +604,9 @@ def _apply_pagination(
if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else:
sort_column = AgentModel.created_at
sort_nulls_last = False
@@ -637,6 +640,9 @@ async def _apply_pagination_async(
if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else:
sort_column = AgentModel.created_at
sort_nulls_last = False

View File

@@ -73,7 +73,6 @@ class LLMTraceWriter:
def __init__(self):
self._client = None
self._shutdown = False
self._write_lock = asyncio.Lock() # Serialize writes - clickhouse_connect isn't thread-safe
# Check if ClickHouse is configured - if not, writing is disabled
self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password)
@@ -82,11 +81,7 @@ class LLMTraceWriter:
atexit.register(self._sync_shutdown)
def _get_client(self):
"""Initialize ClickHouse client on first use (lazy loading).
Configures async_insert with wait_for_async_insert=1 for reliable
server-side batching with acknowledgment.
"""
"""Initialize ClickHouse client on first use (lazy loading)."""
if self._client is not None:
return self._client
@@ -108,8 +103,10 @@ class LLMTraceWriter:
settings={
# Enable server-side batching
"async_insert": 1,
# Wait for acknowledgment (reliable)
"wait_for_async_insert": 1,
# Don't wait for server-side flush acknowledgment — fire and forget.
# Waiting (value=1) caused each insert to hold an asyncio.Lock for ~1s,
# creating unbounded task queues that saturated the event loop under load.
"wait_for_async_insert": 0,
# Flush after 1 second if batch not full
"async_insert_busy_timeout_ms": 1000,
},
@@ -148,9 +145,9 @@ class LLMTraceWriter:
row = trace.to_clickhouse_row()
columns = LLMTrace.clickhouse_columns()
# Serialize writes - clickhouse_connect client isn't thread-safe
async with self._write_lock:
# Run synchronous insert in thread pool
# Run synchronous insert in thread pool. clickhouse-connect supports
# multithreaded use via a thread-safe connection pool:
# https://clickhouse.com/docs/integrations/language-clients/python/advanced-usage#multithreaded-multiprocess-and-asyncevent-driven-use-cases
await asyncio.to_thread(
client.insert,
"llm_traces",

View File

@@ -3,11 +3,11 @@
File format:
---
description: "Who I am and how I approach work"
limit: 20000
---
My name is Memo. I'm a stateful coding assistant...
- Frontmatter fields are only rendered when they differ from defaults.
- ``limit`` is intentionally excluded from frontmatter (deprecated for git-base memory).
- Files without frontmatter are treated as value-only (backward compat).
"""
@@ -37,12 +37,12 @@ def serialize_block(
This is used for initial file creation. For updates to existing files,
prefer `merge_frontmatter_with_body` to preserve user formatting.
"""
# description and limit are always included in frontmatter.
# description is always included in frontmatter.
# read_only and metadata are only included when non-default.
# limit is intentionally excluded (deprecated for git-base memory).
front: Dict[str, Any] = {}
front["description"] = description
front["limit"] = limit if limit is not None else _get_field_default("limit")
if read_only != _get_field_default("read_only"):
front["read_only"] = read_only
@@ -111,7 +111,6 @@ def merge_frontmatter_with_body(
# Desired values
desired_description = description
desired_limit = limit if limit is not None else _get_field_default("limit")
desired_read_only = read_only
desired_metadata = metadata if metadata is not None else _get_field_default("metadata")
@@ -122,8 +121,9 @@ def merge_frontmatter_with_body(
parsed["description"] = desired_description
changed = True
if "limit" not in parsed or parsed.get("limit") != desired_limit:
parsed["limit"] = desired_limit
# Remove limit from frontmatter if it exists (deprecated for git-base memory)
if "limit" in parsed:
del parsed["limit"]
changed = True
if desired_read_only != _get_field_default("read_only"):

View File

@@ -21,6 +21,7 @@ from letta.schemas.memory_repo import MemoryCommit
from letta.schemas.user import User as PydanticUser
from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block
from letta.services.memory_repo.git_operations import GitOperations
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
from letta.services.memory_repo.storage.local import LocalStorageBackend
from letta.utils import enforce_types
@@ -133,11 +134,14 @@ class MemfsClient:
except FileNotFoundError:
return []
# Convert block files to PydanticBlock (metadata is in frontmatter)
# Convert block files to PydanticBlock (metadata is in frontmatter).
# skills/{skill_name}/SKILL.md is mapped to block label skills/{skill_name};
# other files under skills/ are intentionally ignored.
blocks = []
for file_path, content in files.items():
if file_path.endswith(".md"):
label = file_path[:-3]
label = memory_block_label_from_markdown_path(file_path)
if label is None:
continue
parsed = parse_block_markdown(content)

View File

@@ -0,0 +1,29 @@
"""Helpers for mapping memory-repo markdown paths to block labels.
Special handling for skills:
- sync `skills/{skill_name}/SKILL.md` as block label `skills/{skill_name}`
- ignore all other markdown files under `skills/`
"""
from __future__ import annotations
def memory_block_label_from_markdown_path(path: str) -> str | None:
"""Return block label for a syncable markdown path, else None.
Rules:
- Non-`.md` files are ignored.
- `skills/{skill_name}/SKILL.md` -> `skills/{skill_name}`
- Other `skills/**` markdown files are ignored.
- All other markdown files map to `path[:-3]`.
"""
if not path.endswith(".md"):
return None
if path.startswith("skills/"):
parts = path.split("/")
if len(parts) == 3 and parts[0] == "skills" and parts[1] and parts[2] == "SKILL.md":
return f"skills/{parts[1]}"
return None
return path[:-3]

View File

@@ -141,6 +141,9 @@ class ClickhouseProviderTraceBackend(ProviderTraceBackendClient):
request_json=request_json_str,
response_json=response_json_str,
llm_config_json=llm_config_json_str,
billing_plan_type=provider_trace.billing_context.plan_type if provider_trace.billing_context else None,
billing_cost_source=provider_trace.billing_context.cost_source if provider_trace.billing_context else None,
billing_customer_id=provider_trace.billing_context.customer_id if provider_trace.billing_context else None,
)
def _extract_usage(self, response_json: dict, provider: str) -> dict:

View File

@@ -29,7 +29,7 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
) -> ProviderTrace:
"""Write full provider trace to provider_traces table."""
async with db_registry.async_session() as session:
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump())
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump(exclude={"billing_context"}))
provider_trace_model.organization_id = actor.organization_id
if provider_trace.request_json:

View File

@@ -638,7 +638,13 @@ class RunManager:
raise NoResultFound(f"Run with id {run_id} not found")
agent_id = run.agent_id
logger.debug(f"Cancelling run {run_id} for agent {agent_id}")
logger.info(
"[Interrupt] Processing cancellation for run=%s, agent=%s, current_status=%s, current_stop_reason=%s",
run_id,
agent_id,
run.status if run else "unknown",
run.stop_reason if run else "unknown",
)
# Cancellation should be idempotent: if a run is already terminated, treat this as a no-op.
# This commonly happens when a run finishes between client request and server handling.

View File

@@ -15,6 +15,7 @@ from letta.errors import (
LettaInvalidArgumentError,
LettaServiceUnavailableError,
LLMAuthenticationError,
LLMEmptyResponseError,
LLMError,
LLMRateLimitError,
LLMTimeoutError,
@@ -33,6 +34,7 @@ from letta.schemas.letta_request import ClientToolSchema, LettaStreamingRequest
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
@@ -76,6 +78,8 @@ class StreamingService:
request: LettaStreamingRequest,
run_type: str = "streaming",
conversation_id: Optional[str] = None,
should_lock: bool = False,
billing_context: "BillingContext | None" = None,
) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]:
"""
Create a streaming response for an agent.
@@ -86,6 +90,7 @@ class StreamingService:
request: The LettaStreamingRequest containing all request parameters
run_type: Type of run for tracking
conversation_id: Optional conversation ID for conversation-scoped messaging
should_lock: If True and conversation_id is None, use agent_id as lock key
Returns:
Tuple of (run object or None, streaming response)
@@ -116,6 +121,10 @@ class StreamingService:
)
if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -130,12 +139,15 @@ class StreamingService:
model_compatible_token_streaming = self._is_token_streaming_compatible(agent)
# Attempt to acquire conversation lock if conversation_id is provided
# This prevents concurrent message processing for the same conversation
# Determine lock key: use conversation_id if provided, else agent_id if should_lock
lock_key = conversation_id if conversation_id else (agent_id if should_lock else None)
# Attempt to acquire lock if lock_key is set
# This prevents concurrent message processing for the same conversation/agent
# Skip locking if Redis is not available (graceful degradation)
if conversation_id and not isinstance(redis_client, NoopAsyncRedisClient):
if lock_key and not isinstance(redis_client, NoopAsyncRedisClient):
await redis_client.acquire_conversation_lock(
conversation_id=conversation_id,
conversation_id=lock_key,
token=str(uuid4()),
)
@@ -163,8 +175,10 @@ class StreamingService:
include_return_message_types=request.include_return_message_types,
actor=actor,
conversation_id=conversation_id,
lock_key=lock_key, # For lock release (may differ from conversation_id)
client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages,
billing_context=billing_context,
)
# handle background streaming if requested
@@ -195,7 +209,7 @@ class StreamingService:
run_id=run.id,
run_manager=self.server.run_manager,
actor=actor,
conversation_id=conversation_id,
conversation_id=lock_key, # Use lock_key for lock release
),
label=f"background_stream_processor_{run.id}",
)
@@ -251,7 +265,7 @@ class StreamingService:
if settings.track_agent_run and run and run_status:
await self.server.run_manager.update_run_by_id_async(
run_id=run.id,
conversation_id=conversation_id,
conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, metadata=run_update_metadata),
actor=actor,
)
@@ -326,8 +340,10 @@ class StreamingService:
include_return_message_types: Optional[list[MessageType]],
actor: User,
conversation_id: Optional[str] = None,
lock_key: Optional[str] = None,
client_tools: Optional[list[ClientToolSchema]] = None,
include_compaction_messages: bool = False,
billing_context: BillingContext | None = None,
) -> AsyncIterator:
"""
Create a stream with unified error handling.
@@ -356,6 +372,7 @@ class StreamingService:
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
)
async for chunk in stream:
@@ -442,6 +459,21 @@ class StreamingService:
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
except LLMEmptyResponseError as e:
run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response)
error_message = LettaErrorMessage(
run_id=run_id,
error_type="llm_empty_response",
message="LLM returned an empty response.",
detail=str(e),
)
error_data = {"error": error_message.model_dump()}
logger.warning(f"Run {run_id} stopped with LLM empty response: {e}, error_data: {error_message.model_dump()}")
yield f"data: {stop_reason.model_dump_json()}\n\n"
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
except LLMError as e:
run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error)
@@ -491,7 +523,7 @@ class StreamingService:
stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value
await self.runs_manager.update_run_by_id_async(
run_id=run_id,
conversation_id=conversation_id,
conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data),
actor=actor,
)

View File

@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
# them just like server.create_agent_async does for agents.
if summarizer_config.model_settings is not None:
update_params = summarizer_config.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
return base.model_copy(update=update_params)
return base

View File

@@ -196,7 +196,7 @@ async def self_summarize_sliding_window(
return message.tool_calls is not None and len(message.tool_calls) > 0
return False
post_summarization_buffer = [system_prompt]
post_summarization_buffer = []
while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
# more eviction percentage
eviction_percentage += 0.10
@@ -217,8 +217,8 @@ async def self_summarize_sliding_window(
# update token count
logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]]
approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer)
post_summarization_buffer = list(messages[assistant_message_index:])
approx_token_count = await count_tokens(actor, agent_llm_config, [system_prompt, *post_summarization_buffer])
logger.info(
f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
)

View File

@@ -11,7 +11,7 @@ from letta.settings import summarizer_settings
def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
"""Get default model for summarization for given provider type."""
summarizer_defaults = {
ProviderType.anthropic: "anthropic/claude-haiku-4-5-20251001",
ProviderType.anthropic: "anthropic/claude-haiku-4-5",
ProviderType.openai: "openai/gpt-5-mini",
ProviderType.google_ai: "google_ai/gemini-2.5-flash",
}

View File

@@ -114,7 +114,7 @@ class SummarizerSettings(BaseSettings):
class ModelSettings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="ignore")
global_max_context_window_limit: int = 32000
global_max_context_window_limit: int = 128000
inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.")
@@ -204,6 +204,7 @@ class ModelSettings(BaseSettings):
gemini_base_url: str = "https://generativelanguage.googleapis.com/"
gemini_force_minimum_thinking_budget: bool = False
gemini_max_retries: int = 5
gemini_timeout_seconds: float = 600.0
# google vertex
google_cloud_project: Optional[str] = None

View File

@@ -45,30 +45,36 @@ PATH_VALIDATORS = {primitive_type.value: _create_path_validator_factory(primitiv
def _create_conversation_id_or_default_path_validator_factory():
"""Conversation IDs accept the usual primitive format or the special value 'default'."""
"""Conversation IDs with support for 'default' and agent IDs (backwards compatibility)."""
primitive = PrimitiveType.CONVERSATION.value
prefix_pattern = PRIMITIVE_ID_PATTERNS[primitive].pattern
# Make the full regex accept either the primitive ID format or 'default'.
# `prefix_pattern` already contains the ^...$ anchors.
conversation_or_default_pattern = f"^(default|{prefix_pattern[1:-1]})$"
conversation_primitive = PrimitiveType.CONVERSATION.value
agent_primitive = PrimitiveType.AGENT.value
conversation_pattern = PRIMITIVE_ID_PATTERNS[conversation_primitive].pattern
agent_pattern = PRIMITIVE_ID_PATTERNS[agent_primitive].pattern
# Make the full regex accept: conversation ID, agent ID, or 'default'.
# Patterns already contain ^...$ anchors, so strip them for the alternation.
conversation_or_agent_or_default_pattern = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"
def factory():
return Path(
description=(f"The conversation identifier. Either the special value 'default' or an ID in the format '{primitive}-<uuid4>'"),
pattern=conversation_or_default_pattern,
examples=["default", f"{primitive}-123e4567-e89b-42d3-8456-426614174000"],
description=(
f"The conversation identifier. Can be a conversation ID ('{conversation_primitive}-<uuid4>'), "
f"'default' for agent-direct mode (with agent_id parameter), "
f"or an agent ID ('{agent_primitive}-<uuid4>') for backwards compatibility (deprecated)."
),
pattern=conversation_or_agent_or_default_pattern,
examples=[
"default",
f"{conversation_primitive}-123e4567-e89b-42d3-8456-426614174000",
f"{agent_primitive}-123e4567-e89b-42d3-8456-426614174000",
],
min_length=1,
max_length=len(primitive) + 1 + 36,
max_length=max(len(conversation_primitive), len(agent_primitive)) + 1 + 36,
)
return factory
# Override conversation ID path validation to also allow the special value 'default'.
PATH_VALIDATORS[PrimitiveType.CONVERSATION.value] = _create_conversation_id_or_default_path_validator_factory()
# Type aliases for common ID types
# These can be used directly in route handler signatures for cleaner code
AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()]
@@ -89,6 +95,10 @@ StepId = Annotated[str, PATH_VALIDATORS[PrimitiveType.STEP.value]()]
IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()]
ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()]
# Conversation ID with support for 'default' and agent IDs (for agent-direct mode endpoints)
# Backwards compatible - agent-* will be deprecated in favor of conversation_id='default' + agent_id param
ConversationIdOrDefault = Annotated[str, _create_conversation_id_or_default_path_validator_factory()()]
# Infrastructure types
McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()]
McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()]

View File

@@ -1,6 +1,6 @@
[project]
name = "letta"
version = "0.16.5"
version = "0.16.6"
description = "Create LLM agents with long-term memory and custom tools"
authors = [
{name = "Letta Team", email = "contact@letta.com"},

View File

@@ -2,6 +2,12 @@ import anthropic
import httpx
import openai
import pytest
from anthropic.types.beta import (
BetaMessage,
BetaRawMessageStartEvent,
BetaRawMessageStopEvent,
BetaUsage,
)
from google.genai import errors as google_errors
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
ContextWindowExceededError,
LLMBadRequestError,
LLMConnectionError,
LLMEmptyResponseError,
LLMInsufficientCreditsError,
LLMServerError,
)
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
result = client.handle_llm_error(error)
assert isinstance(result, LLMBadRequestError)
assert not isinstance(result, LLMInsufficientCreditsError)
@pytest.mark.asyncio
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
"""LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
This tests the case where Opus 4.6 returns a response with:
- BetaRawMessageStartEvent (with usage tokens)
- BetaRawMessageStopEvent (end_turn)
- NO content blocks in between
This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
"""
class FakeAsyncStream:
"""Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
def __init__(self):
self.events = [
# Message start with some usage info
BetaRawMessageStartEvent(
type="message_start",
message=BetaMessage(
id="msg_test_empty",
type="message",
role="assistant",
content=[], # Empty content
model="claude-opus-4-6",
stop_reason="end_turn",
stop_sequence=None,
usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
),
),
# Message stop immediately after start - no content blocks
BetaRawMessageStopEvent(type="message_stop"),
]
self.index = 0
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc, tb):
return None
def __aiter__(self):
return self
async def __anext__(self):
if self.index >= len(self.events):
raise StopAsyncIteration
event = self.events[self.index]
self.index += 1
return event
async def fake_stream_async(self, request_data: dict, llm_config):
return FakeAsyncStream()
monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
llm_client = AnthropicClient()
llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
with pytest.raises(LLMEmptyResponseError):
async for _ in gen:
pass

View File

@@ -0,0 +1,8 @@
{
"context_window": 32000,
"model": "gpt-5.3-codex",
"model_endpoint_type": "openai",
"model_endpoint": "https://api.openai.com/v1",
"model_wrapper": null,
"reasoning_effort": "low"
}

View File

@@ -141,7 +141,7 @@ async def create_test_agent(name, actor, test_id: Optional[str] = None, model="a
model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1",
context_window=32000,
context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True,
max_tokens=4096,
@@ -193,7 +193,7 @@ async def create_test_batch_item(server, batch_id, agent_id, default_user):
model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1",
context_window=32000,
context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True,
max_tokens=4096,

View File

@@ -62,12 +62,14 @@ class TestConversationsSDK:
# Create a conversation
created = client.conversations.create(agent_id=agent.id)
# Retrieve it (should have empty in_context_message_ids initially)
# Retrieve it (should have system message from creation)
retrieved = client.conversations.retrieve(conversation_id=created.id)
assert retrieved.id == created.id
assert retrieved.agent_id == created.agent_id
assert retrieved.in_context_message_ids == []
# Conversation should have 1 system message immediately after creation
assert len(retrieved.in_context_message_ids) == 1
assert retrieved.in_context_message_ids[0].startswith("message-")
# Send a message to the conversation
list(
@@ -566,6 +568,289 @@ class TestConversationsSDK:
# Should not contain the cursor message
assert first_message_id not in [m.id for m in messages_after]
def test_agent_direct_messaging_via_conversations_endpoint(self, client: Letta, agent):
"""Test sending messages using agent ID as conversation_id (agent-direct mode).
This allows clients to use a unified endpoint pattern without managing conversation IDs.
"""
# Send a message using the agent ID directly as conversation_id
# This should route to agent-direct mode with locking
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Using agent ID instead of conversation ID
messages=[{"role": "user", "content": "Hello via agent-direct mode!"}],
)
)
# Verify we got a response
assert len(messages) > 0, "Should receive response messages"
# Verify we got an assistant message in the response
assistant_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_agent_direct_messaging_with_locking(self, client: Letta, agent):
"""Test that agent-direct mode properly acquires and releases locks.
Sequential requests should both succeed if locks are properly released.
"""
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
# Send first message via agent-direct mode
messages1 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "First message"}],
)
)
assert len(messages1) > 0, "First message should succeed"
# Send second message - should succeed if lock was released
messages2 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Second message"}],
)
)
assert len(messages2) > 0, "Second message should succeed after lock released"
def test_agent_direct_concurrent_requests_blocked(self, client: Letta, agent):
"""Test that concurrent requests to agent-direct mode are properly serialized.
One request should succeed and one should get a 409 CONVERSATION_BUSY error.
"""
import concurrent.futures
from letta_client import ConflictError
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
results = {"success": 0, "conflict": 0, "other_error": 0}
def send_message(msg: str):
try:
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Agent-direct mode
messages=[{"role": "user", "content": msg}],
)
)
return ("success", messages)
except ConflictError:
return ("conflict", None)
except Exception as e:
return ("other_error", str(e))
# Fire off two messages concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
future1 = executor.submit(send_message, "Concurrent message 1")
future2 = executor.submit(send_message, "Concurrent message 2")
result1 = future1.result()
result2 = future2.result()
# Count results
for result_type, _ in [result1, result2]:
results[result_type] += 1
# One should succeed and one should get conflict
assert results["success"] == 1, f"Expected 1 success, got {results['success']}"
assert results["conflict"] == 1, f"Expected 1 conflict, got {results['conflict']}"
assert results["other_error"] == 0, f"Unexpected errors: {results['other_error']}"
# Now send another message - should succeed since lock is released
messages = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Message after concurrent requests"}],
)
)
assert len(messages) > 0, "Should be able to send message after concurrent requests complete"
def test_agent_direct_list_messages(self, client: Letta, agent):
"""Test listing messages using agent ID as conversation_id."""
# First send a message via agent-direct mode
list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Test message for listing"}],
)
)
# List messages using agent ID
messages_page = client.conversations.messages.list(conversation_id=agent.id)
messages = list(messages_page)
# Should have messages (at least system + user + assistant)
assert len(messages) >= 3, f"Expected at least 3 messages, got {len(messages)}"
# Verify we can find our test message
user_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "user_message"]
assert any("Test message for listing" in str(m.content) for m in user_messages), "Should find our test message"
def test_agent_direct_cancel(self, client: Letta, agent):
"""Test canceling runs using agent ID as conversation_id."""
from letta.settings import settings
# Skip if run tracking is disabled
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# Start a background request that we can cancel
try:
# Send a message in background mode
stream = client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Background message to cancel"}],
background=True,
)
# Consume a bit of the stream to ensure it started
next(iter(stream), None)
# Cancel using agent ID
result = client.conversations.cancel(conversation_id=agent.id)
# Should return results (may be empty if run already completed)
assert isinstance(result, dict), "Cancel should return a dict of results"
except Exception as e:
# If no active runs, that's okay - the run may have completed quickly
if "No active runs" not in str(e):
raise
def test_backwards_compatibility_old_pattern(self, client: Letta, agent, server_url: str):
"""Test that the old pattern (agent_id as conversation_id) still works for backwards compatibility."""
# OLD PATTERN: conversation_id=agent.id (should still work)
# Use raw HTTP requests since SDK might not be up to date
# Test 1: Send message using old pattern
response = requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Testing old pattern still works"}],
"streaming": False,
},
)
assert response.status_code == 200, f"Old pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Test 2: List messages using old pattern
response = requests.get(f"{server_url}/v1/conversations/{agent.id}/messages")
assert response.status_code == 200, f"Old pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
# Verify our message is there
user_messages = [m for m in data if m.get("message_type") == "user_message"]
assert any("Testing old pattern still works" in str(m.get("content", "")) for m in user_messages), "Should find our test message"
def test_new_pattern_send_message(self, client: Letta, agent, server_url: str):
"""Test sending messages using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/messages",
json={
"agent_id": agent.id,
"messages": [{"role": "user", "content": "Testing new pattern send message"}],
"streaming": False,
},
)
assert response.status_code == 200, f"New pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Verify we got an assistant message
assistant_messages = [m for m in data["messages"] if m.get("message_type") == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_new_pattern_list_messages(self, client: Letta, agent, server_url: str):
"""Test listing messages using the new pattern: conversation_id='default' + agent_id query param."""
# First send a message to populate the conversation
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Setup message for list test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.get(
f"{server_url}/v1/conversations/default/messages",
params={"agent_id": agent.id},
)
assert response.status_code == 200, f"New pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
def test_new_pattern_cancel(self, client: Letta, agent, server_url: str):
"""Test canceling runs using the new pattern: conversation_id='default' + agent_id query param."""
from letta.settings import settings
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.post(
f"{server_url}/v1/conversations/default/cancel",
params={"agent_id": agent.id},
)
# Returns 200 with results if runs exist, or 409 if no active runs
assert response.status_code in [200, 409], f"New pattern should work for cancel: {response.text}"
if response.status_code == 200:
data = response.json()
assert isinstance(data, dict), "Cancel should return a dict"
def test_new_pattern_compact(self, client: Letta, agent, server_url: str):
"""Test compacting conversation using the new pattern: conversation_id='default' + agent_id in body."""
# Send many messages to have enough for compaction
for i in range(10):
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": f"Message {i} for compaction test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/compact",
json={"agent_id": agent.id},
)
# May return 200 (success) or 400 (not enough messages to compact)
assert response.status_code in [200, 400], f"New pattern should accept agent_id parameter: {response.text}"
if response.status_code == 200:
data = response.json()
assert "summary" in data, "Response should contain summary"
assert "num_messages_before" in data, "Response should contain num_messages_before"
assert "num_messages_after" in data, "Response should contain num_messages_after"
def test_new_pattern_stream_retrieve(self, client: Letta, agent, server_url: str):
"""Test retrieving stream using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
# Note: This will likely return 400 if no active run exists, which is expected
response = requests.post(
f"{server_url}/v1/conversations/default/stream",
json={"agent_id": agent.id},
)
# Either 200 (if run exists) or 400 (no active run) are both acceptable
assert response.status_code in [200, 400], f"Stream retrieve should accept new pattern: {response.text}"
class TestConversationDelete:
"""Tests for the conversation delete endpoint."""
@@ -834,3 +1119,130 @@ class TestConversationCompact:
)
assert response.status_code == 404
class TestConversationSystemMessageRecompilation:
"""Tests that verify the system message is recompiled with latest memory state on new conversation creation."""
def test_new_conversation_recompiles_system_message_with_updated_memory(self, client: Letta, server_url: str):
"""Test the full workflow:
1. Agent is created
2. Send message to agent (through a conversation)
3. Modify the memory block -> check system message is NOT updated with the modified value
4. Create a new conversation
5. Check new conversation system message DOES have the modified value
"""
unique_marker = f"UNIQUE_MARKER_{uuid.uuid4().hex[:8]}"
# Step 1: Create an agent with known memory blocks
agent = client.agents.create(
name=f"test_sys_msg_recompile_{uuid.uuid4().hex[:8]}",
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{"label": "human", "value": "The user is a test user."},
{"label": "persona", "value": "You are a helpful assistant."},
],
)
try:
# Step 2: Create a conversation and send a message to it
conv1 = client.conversations.create(agent_id=agent.id)
list(
client.conversations.messages.create(
conversation_id=conv1.id,
messages=[{"role": "user", "content": "Hello, just a quick test."}],
)
)
# Verify the conversation has messages including a system message
conv1_messages = client.conversations.messages.list(
conversation_id=conv1.id,
order="asc",
)
assert len(conv1_messages) >= 3 # system + user + assistant
assert conv1_messages[0].message_type == "system_message"
# Get the original system message content
original_system_content = conv1_messages[0].content
assert unique_marker not in original_system_content, "Marker should not be in original system message"
# Step 3: Modify the memory block with a unique marker
client.agents.blocks.update(
agent_id=agent.id,
block_label="human",
value=f"The user is a test user. {unique_marker}",
)
# Verify the block was actually updated
updated_block = client.agents.blocks.retrieve(agent_id=agent.id, block_label="human")
assert unique_marker in updated_block.value
# Check that the OLD conversation's system message is NOT updated
conv1_messages_after_update = client.conversations.messages.list(
conversation_id=conv1.id,
order="asc",
)
old_system_content = conv1_messages_after_update[0].content
assert unique_marker not in old_system_content, "Old conversation system message should NOT contain the updated memory value"
# Step 4: Create a new conversation
conv2 = client.conversations.create(agent_id=agent.id)
# Step 5: Check the new conversation's system message has the updated value
# The system message should be compiled at creation time with the latest memory
conv2_retrieved = client.conversations.retrieve(conversation_id=conv2.id)
assert len(conv2_retrieved.in_context_message_ids) == 1, (
f"New conversation should have exactly 1 system message, got {len(conv2_retrieved.in_context_message_ids)}"
)
conv2_messages = client.conversations.messages.list(
conversation_id=conv2.id,
order="asc",
)
assert len(conv2_messages) >= 1
assert conv2_messages[0].message_type == "system_message"
new_system_content = conv2_messages[0].content
assert unique_marker in new_system_content, (
f"New conversation system message should contain the updated memory value '{unique_marker}', "
f"but system message content did not include it"
)
finally:
client.agents.delete(agent_id=agent.id)
def test_conversation_creation_initializes_system_message(self, client: Letta, server_url: str):
"""Test that creating a conversation immediately initializes it with a system message."""
agent = client.agents.create(
name=f"test_conv_init_{uuid.uuid4().hex[:8]}",
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{"label": "human", "value": "Test user for system message init."},
{"label": "persona", "value": "You are a helpful assistant."},
],
)
try:
# Create a conversation (without sending any messages)
conversation = client.conversations.create(agent_id=agent.id)
# Verify the conversation has a system message immediately
retrieved = client.conversations.retrieve(conversation_id=conversation.id)
assert len(retrieved.in_context_message_ids) == 1, (
f"Expected 1 system message after conversation creation, got {len(retrieved.in_context_message_ids)}"
)
# Verify the system message content contains memory block values
messages = client.conversations.messages.list(
conversation_id=conversation.id,
order="asc",
)
assert len(messages) == 1
assert messages[0].message_type == "system_message"
assert "Test user for system message init." in messages[0].content
finally:
client.agents.delete(agent_id=agent.id)

View File

@@ -93,7 +93,7 @@ def agent_obj(client: Letta) -> AgentState:
tool_ids=[send_message_to_agent_tool.id],
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
context_window_limit=32000,
context_window_limit=128000,
)
yield agent_state_instance
@@ -107,7 +107,7 @@ def other_agent_obj(client: Letta) -> AgentState:
include_multi_agent_tools=False,
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
context_window_limit=32000,
context_window_limit=128000,
)
yield agent_state_instance

View File

@@ -366,6 +366,8 @@ async def test_compaction_settings_model_uses_separate_llm_config_for_summarizat
async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user):
"""When no compaction_settings provided for Anthropic agent, default haiku model should be set."""
from letta.schemas.agent import CreateAgent
from letta.schemas.enums import ProviderType
from letta.services.summarizer.summarizer_config import get_default_summarizer_model
await server.init_async(init_with_default_org_and_user=True)
@@ -384,7 +386,7 @@ async def test_create_agent_sets_default_compaction_model_anthropic(server: Sync
# Should have default haiku model set
assert agent.compaction_settings is not None
assert agent.compaction_settings.model == "anthropic/claude-haiku-4-5-20251001"
assert agent.compaction_settings.model == get_default_summarizer_model(ProviderType.anthropic)
@pytest.mark.asyncio
@@ -808,6 +810,79 @@ async def test_update_agent_compaction_settings(server: SyncServer, comprehensiv
assert updated_agent.compaction_settings.prompt_acknowledgement == False
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings(server: SyncServer, comprehensive_test_agent_fixture, default_user):
"""Test that an agent's compaction_settings can be upserted."""
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
agent, _ = comprehensive_test_agent_fixture
# Create new compaction settings
original_compaction_settings = agent.compaction_settings.model_copy()
new_compaction_settings = CompactionSettings(
mode="all",
prompt_acknowledgement=True,
clip_chars=3000,
)
# Update agent with compaction settings
update_agent_request = UpdateAgent(
compaction_settings=new_compaction_settings,
)
updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)
# Verify compaction settings were updated correctly
assert updated_agent.compaction_settings is not None
assert updated_agent.compaction_settings.model == original_compaction_settings.model
assert updated_agent.compaction_settings.model_settings == original_compaction_settings.model_settings
assert updated_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
assert updated_agent.compaction_settings.mode == "all"
assert updated_agent.compaction_settings.clip_chars == 3000
assert updated_agent.compaction_settings.prompt == get_default_prompt_for_mode("all")
assert updated_agent.compaction_settings.prompt_acknowledgement == True
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings_same_mode(server: SyncServer, comprehensive_test_agent_fixture, default_user):
"""Test that if the mode stays the same without a prompt passed in, the prompt is not updated."""
agent, _ = comprehensive_test_agent_fixture
update_agent_request = UpdateAgent(
compaction_settings=CompactionSettings(mode="sliding_window", prompt="This is a fake prompt."),
)
updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)
assert updated_agent.compaction_settings is not None
assert updated_agent.compaction_settings.prompt == "This is a fake prompt."
# Create new compaction settings
original_compaction_settings = updated_agent.compaction_settings.model_copy()
new_compaction_settings = CompactionSettings(
mode="sliding_window",
model="openai/gpt-4o-mini",
)
# Update agent with compaction settings
update_agent_request = UpdateAgent(
compaction_settings=new_compaction_settings,
)
final_agent = await server.agent_manager.update_agent_async(updated_agent.id, update_agent_request, actor=default_user)
# Verify compaction settings were updated correctly
assert final_agent.compaction_settings is not None
assert final_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
assert final_agent.compaction_settings.prompt == original_compaction_settings.prompt
assert final_agent.compaction_settings.clip_chars == original_compaction_settings.clip_chars
assert final_agent.compaction_settings.prompt_acknowledgement == original_compaction_settings.prompt_acknowledgement
assert final_agent.compaction_settings.mode == "sliding_window"
assert final_agent.compaction_settings.model == "openai/gpt-4o-mini"
@pytest.mark.asyncio
async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block):
"""Test that file-related defaults are set based on the model's context window size"""

View File

@@ -562,7 +562,9 @@ async def test_update_block(server: SyncServer, default_user):
@pytest.mark.asyncio
async def test_update_block_limit(server: SyncServer, default_user):
block_manager = BlockManager()
block = await block_manager.create_or_update_block_async(PydanticBlock(label="persona", value="Original Content"), actor=default_user)
block = await block_manager.create_or_update_block_async(
PydanticBlock(label="persona", value="Original Content", limit=20000), actor=default_user
)
limit = len("Updated Content") * 2000
update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description")

View File

@@ -355,8 +355,9 @@ async def test_add_messages_to_conversation(
actor=default_user,
)
assert len(message_ids) == 1
assert message_ids[0] == hello_world_message_fixture.id
# create_conversation auto-creates a system message at position 0
assert len(message_ids) == 2
assert hello_world_message_fixture.id in message_ids
@pytest.mark.asyncio
@@ -385,8 +386,9 @@ async def test_get_messages_for_conversation(
actor=default_user,
)
assert len(messages) == 1
assert messages[0].id == hello_world_message_fixture.id
# create_conversation auto-creates a system message at position 0
assert len(messages) == 2
assert any(m.id == hello_world_message_fixture.id for m in messages)
@pytest.mark.asyncio
@@ -430,7 +432,10 @@ async def test_message_ordering_in_conversation(conversation_manager, server: Sy
actor=default_user,
)
assert retrieved_ids == [m.id for m in messages]
# create_conversation auto-creates a system message at position 0,
# so the user messages start at index 1
assert len(retrieved_ids) == len(messages) + 1
assert retrieved_ids[1:] == [m.id for m in messages]
@pytest.mark.asyncio
@@ -489,7 +494,7 @@ async def test_update_in_context_messages(conversation_manager, server: SyncServ
@pytest.mark.asyncio
async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user):
"""Test getting message IDs from an empty conversation."""
"""Test getting message IDs from a newly created conversation (has auto-created system message)."""
# Create a conversation
conversation = await conversation_manager.create_conversation(
agent_id=sarah_agent.id,
@@ -497,13 +502,14 @@ async def test_empty_conversation_message_ids(conversation_manager, server: Sync
actor=default_user,
)
# Get message IDs (should be empty)
# create_conversation auto-creates a system message at position 0,
# so a newly created conversation has exactly one message
message_ids = await conversation_manager.get_message_ids_for_conversation(
conversation_id=conversation.id,
actor=default_user,
)
assert message_ids == []
assert len(message_ids) == 1
@pytest.mark.asyncio
@@ -551,9 +557,11 @@ async def test_list_conversation_messages(conversation_manager, server: SyncServ
actor=default_user,
)
assert len(letta_messages) == 2
# create_conversation auto-creates a system message, so we get 3 total
assert len(letta_messages) == 3
# Check message types
message_types = [m.message_type for m in letta_messages]
assert "system_message" in message_types
assert "user_message" in message_types
assert "assistant_message" in message_types
@@ -902,9 +910,12 @@ async def test_list_conversation_messages_ascending_order(conversation_manager,
reverse=False,
)
# First message should be "Message 0" (oldest)
assert len(letta_messages) == 3
assert "Message 0" in letta_messages[0].content
# create_conversation auto-creates a system message at position 0,
# so we get 4 messages total (system + 3 user messages)
assert len(letta_messages) == 4
# First message is the auto-created system message; "Message 0" is second
assert letta_messages[0].message_type == "system_message"
assert "Message 0" in letta_messages[1].content
@pytest.mark.asyncio
@@ -949,8 +960,9 @@ async def test_list_conversation_messages_descending_order(conversation_manager,
reverse=True,
)
# First message should be "Message 2" (newest)
assert len(letta_messages) == 3
# create_conversation auto-creates a system message, so 4 total
# First message should be "Message 2" (newest) in descending order
assert len(letta_messages) == 4
assert "Message 2" in letta_messages[0].content
@@ -1081,7 +1093,8 @@ async def test_list_conversation_messages_no_group_id_returns_all(conversation_m
actor=default_user,
)
assert len(all_messages) == 3
# create_conversation auto-creates a system message, so 4 total
assert len(all_messages) == 4
@pytest.mark.asyncio
@@ -1137,8 +1150,8 @@ async def test_list_conversation_messages_order_with_pagination(conversation_man
# The first messages should be different
assert page_asc[0].content != page_desc[0].content
# In ascending, first should be "Message 0"
assert "Message 0" in page_asc[0].content
# In ascending, first is the auto-created system message, second is "Message 0"
assert page_asc[0].message_type == "system_message"
# In descending, first should be "Message 4"
assert "Message 4" in page_desc[0].content

View File

@@ -579,8 +579,11 @@ async def test_server_startup_syncs_base_providers(default_user, default_organiz
yield item
# Mock the Anthropic AsyncAnthropic client
# NOTE: list() must be a regular (non-async) method that returns an async iterable,
# because the real Anthropic SDK's models.list() returns an AsyncPage (which has __aiter__)
# directly, and the code uses `async for model in client.models.list()`.
class MockAnthropicModels:
async def list(self):
def list(self):
return MockAnthropicAsyncPage(mock_anthropic_models["data"])
class MockAsyncAnthropic:
@@ -877,8 +880,10 @@ async def test_server_startup_handles_api_errors_gracefully(default_user, defaul
for item in self._items:
yield item
# NOTE: The real SDK's models.list() is a regular (non-async) method that
# returns an AsyncPaginator (which is async-iterable).
class MockAnthropicModels:
async def list(self):
def list(self):
return MockAnthropicAsyncPage(mock_anthropic_data)
class MockAsyncAnthropic:

View File

@@ -0,0 +1,11 @@
{
"handle": "openai/gpt-5.3-chat-latest",
"model_settings": {
"provider_type": "openai",
"max_output_tokens": 4096,
"parallel_tool_calls": false,
"reasoning": {
"reasoning_effort": "minimal"
}
}
}

View File

@@ -1,11 +1,11 @@
from conftest import create_test_module
from letta_client import UnprocessableEntityError
from letta.constants import CORE_MEMORY_HUMAN_CHAR_LIMIT, CORE_MEMORY_PERSONA_CHAR_LIMIT
from letta.constants import CORE_MEMORY_BLOCK_CHAR_LIMIT
BLOCKS_CREATE_PARAMS = [
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_HUMAN_CHAR_LIMIT}, None),
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_PERSONA_CHAR_LIMIT}, None),
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
]
BLOCKS_UPDATE_PARAMS = [

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -44,7 +44,7 @@
"provider_name": null,
"provider_category": null,
"model_wrapper": null,
"context_window": 32000,
"context_window": 128000,
"put_inner_thoughts_in_kwargs": false,
"handle": "anthropic/claude-3.5-sonnet",
"temperature": 1.0,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -56,7 +56,7 @@
"provider_name": "openai",
"provider_category": "base",
"model_wrapper": null,
"context_window": 32000,
"context_window": 128000,
"put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4o-mini",
"temperature": 1.0,

View File

@@ -55,7 +55,7 @@
"provider_name": "openai",
"provider_category": "base",
"model_wrapper": null,
"context_window": 32000,
"context_window": 128000,
"put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4.1-mini",
"temperature": 1.0,

View File

@@ -16,7 +16,7 @@ def llm_config():
model="claude-3-7-sonnet-20250219",
model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1",
context_window=32000,
context_window=128000,
handle="anthropic/claude-sonnet-4-20250514",
put_inner_thoughts_in_kwargs=False,
max_tokens=4096,

View File

@@ -52,8 +52,17 @@ class TestLogContextMiddleware:
async def get_files(self, agent_id, org_id, ref):
assert ref == "HEAD"
return {
"system/human.md": "---\ndescription: human\nlimit: 20000\n---\nname: sarah",
"system/persona.md": "---\ndescription: persona\nlimit: 20000\n---\nbe helpful",
"system/human.md": "---\ndescription: human\n---\nname: sarah",
"system/persona.md": "---\ndescription: persona\n---\nbe helpful",
"skills/research-helper/SKILL.md": (
"---\n"
"name: research-helper\n"
"description: Search the web and summarize findings.\n"
"---\n"
"# Research Helper\n\n"
"Use this skill to do deep web research and summarize results.\n"
),
"skills/research-helper/references/details.md": "---\ndescription: nested\n---\nShould not be synced",
}
class DummyMemoryRepoManager:
@@ -95,6 +104,12 @@ class TestLogContextMiddleware:
labels = {call["label"] for call in synced_calls}
assert "system/human" in labels
assert "system/persona" in labels
assert "skills/research-helper" in labels
assert "skills/research-helper/references/details" not in labels
by_label = {call["label"]: call for call in synced_calls}
assert by_label["skills/research-helper"]["description"] == "Search the web and summarize findings."
assert by_label["skills/research-helper"]["value"].startswith("# Research Helper")
def test_extracts_actor_id_from_headers(self, client):
response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"})

View File

@@ -25,9 +25,9 @@ def test_chat_memory_init_and_utils(chat_memory: Memory):
def test_memory_limit_validation(chat_memory: Memory):
with pytest.raises(ValueError):
ChatMemory(persona="x " * 50000, human="y " * 50000)
ChatMemory(persona="x " * 60000, human="y " * 60000)
with pytest.raises(ValueError):
chat_memory.get_block("persona").value = "x " * 50000
chat_memory.get_block("persona").value = "x " * 60000
def test_get_block_not_found(chat_memory: Memory):
@@ -253,3 +253,104 @@ def test_compile_git_memory_filesystem_handles_leaf_directory_collisions():
assert "system/" in out
assert "system.md" in out
assert "human.md" in out
def test_compile_git_memory_filesystem_renders_descriptions_for_non_system_files():
"""Files outside system/ should render their description in the filesystem tree.
e.g. `reference/api.md (Contains API specifications)`
System files should NOT render descriptions in the tree.
"""
m = Memory(
agent_type=AgentType.letta_v1_agent,
git_enabled=True,
blocks=[
Block(label="system/human", value="human data", limit=100, description="The human block"),
Block(label="system/persona", value="persona data", limit=100, description="The persona block"),
Block(label="reference/api", value="api specs", limit=100, description="Contains API specifications"),
Block(label="notes", value="my notes", limit=100, description="Personal notes and reminders"),
],
)
out = m.compile()
# Filesystem tree should exist
assert "<memory_filesystem>" in out
# Non-system files should have descriptions rendered
assert "api.md (Contains API specifications)" in out
assert "notes.md (Personal notes and reminders)" in out
# System files should NOT have descriptions in the tree
assert "human.md (The human block)" not in out
assert "persona.md (The persona block)" not in out
# But they should still be in the tree (without description)
assert "human.md" in out
assert "persona.md" in out
def test_compile_git_memory_filesystem_no_description_when_empty():
"""Files outside system/ with no description should render without parentheses."""
m = Memory(
agent_type=AgentType.letta_v1_agent,
git_enabled=True,
blocks=[
Block(label="system/human", value="human data", limit=100),
Block(label="notes", value="my notes", limit=100),
Block(label="reference/api", value="api specs", limit=100, description="API docs"),
],
)
out = m.compile()
# notes.md has no description, so no parentheses
assert "notes.md\n" in out or "notes.md\n" in out
# reference/api.md has a description
assert "api.md (API docs)" in out
def test_compile_git_memory_filesystem_condenses_skills_to_top_level_entries():
"""skills/ should render as top-level skill entries with description.
We intentionally avoid showing nested files under skills/ in the system
prompt tree to keep context concise.
"""
m = Memory(
agent_type=AgentType.letta_v1_agent,
git_enabled=True,
blocks=[
Block(label="system/human", value="human data", limit=100),
Block(
label="skills/searching-messages",
value="# searching messages",
limit=100,
description="Search past messages to recall context.",
),
Block(
label="skills/creating-skills",
value="# creating skills",
limit=100,
description="Guide for creating effective skills.",
),
Block(
label="skills/creating-skills/references/workflows",
value="nested docs",
limit=100,
description="Nested workflow docs (should not appear)",
),
],
)
out = m.compile()
# Condensed top-level skill entries with descriptions.
assert "searching-messages (Search past messages to recall context.)" in out
assert "creating-skills (Guide for creating effective skills.)" in out
# Do not show .md suffixes or nested skill docs in tree.
assert "searching-messages.md" not in out
assert "creating-skills.md" not in out
assert "references/workflows" not in out

View File

@@ -24,6 +24,9 @@ def test_get_headers_user_id_allows_none():
letta_v1_agent=None,
letta_v1_agent_message_async=None,
modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
)
assert isinstance(headers, HeaderParams)
@@ -40,6 +43,9 @@ def test_get_headers_user_id_rejects_invalid_format():
letta_v1_agent=None,
letta_v1_agent_message_async=None,
modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
)
@@ -54,6 +60,9 @@ def test_get_headers_user_id_accepts_valid_format():
letta_v1_agent=None,
letta_v1_agent_message_async=None,
modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
)
assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000"

2
uv.lock generated
View File

@@ -2510,7 +2510,7 @@ wheels = [
[[package]]
name = "letta"
version = "0.16.5"
version = "0.16.6"
source = { editable = "." }
dependencies = [
{ name = "aiofiles" },