chore: bump 0.16.6 (#3211)

cthomas authored 2026-03-03 19:13:07 -08:00, committed by GitHub
84 changed files with 2540 additions and 407 deletions

View File

@@ -260,6 +260,7 @@ model:
base_url: https://generativelanguage.googleapis.com/
force_minimum_thinking_budget: false
max_retries: 5
+timeout_seconds: 600.0
# Google Vertex (-> GOOGLE_CLOUD_*)
# google_cloud:

File diff suppressed because it is too large

View File

@@ -0,0 +1,220 @@
import * as fs from 'fs';
import * as path from 'path';
import { omit } from 'lodash';
import { execSync } from 'child_process';
import { merge, isErrorResult } from 'openapi-merge';
import type { Swagger } from 'atlassian-openapi';
import { RESTRICTED_ROUTE_BASE_PATHS } from '@letta-cloud/sdk-core';
const lettaWebOpenAPIPath = path.join(
__dirname,
'..',
'..',
'..',
'web',
'autogenerated',
'letta-web-openapi.json',
);
const lettaAgentsAPIPath = path.join(
__dirname,
'..',
'..',
'letta',
'server',
'openapi_letta.json',
);
const lettaWebOpenAPI = JSON.parse(
fs.readFileSync(lettaWebOpenAPIPath, 'utf8'),
) as Swagger.SwaggerV3;
const lettaAgentsAPI = JSON.parse(
fs.readFileSync(lettaAgentsAPIPath, 'utf8'),
) as Swagger.SwaggerV3;
// removes any routes that are restricted
lettaAgentsAPI.paths = Object.fromEntries(
Object.entries(lettaAgentsAPI.paths).filter(([path]) =>
RESTRICTED_ROUTE_BASE_PATHS.every(
(restrictedPath) => !path.startsWith(restrictedPath),
),
),
);
const lettaAgentsAPIWithNoEndslash = Object.keys(lettaAgentsAPI.paths).reduce(
(acc, path) => {
const pathWithoutSlash = path.endsWith('/')
? path.slice(0, path.length - 1)
: path;
acc[pathWithoutSlash] = lettaAgentsAPI.paths[path];
return acc;
},
{} as Swagger.SwaggerV3['paths'],
);
// remove duplicate paths, delete from letta-web-openapi if it exists in sdk-core
// some paths will have an extra / at the end, so we need to remove that as well
lettaWebOpenAPI.paths = Object.fromEntries(
Object.entries(lettaWebOpenAPI.paths).filter(([path]) => {
const pathWithoutSlash = path.endsWith('/')
? path.slice(0, path.length - 1)
: path;
return !lettaAgentsAPIWithNoEndslash[pathWithoutSlash];
}),
);
const agentStatePathsToOverride: Array<[string, string]> = [
['/v1/templates/{project}/{template_version}/agents', '201'],
['/v1/agents/search', '200'],
];
for (const [path, responseCode] of agentStatePathsToOverride) {
if (lettaWebOpenAPI.paths[path]?.post?.responses?.[responseCode]) {
// Get direct reference to the schema object
const responseSchema =
lettaWebOpenAPI.paths[path].post.responses[responseCode];
const contentSchema = responseSchema.content['application/json'].schema;
// Replace the entire agents array schema with the reference
if (contentSchema.properties?.agents) {
contentSchema.properties.agents = {
type: 'array',
items: {
$ref: '#/components/schemas/AgentState',
},
};
}
}
}
// go through the paths and strip internal headers (user_id, User-Agent, X-Project-Id, X-Letta-Source, X-Stainless-Package-Version, X-Experimental-*, X-Billing-*) from the parameters
for (const path of Object.keys(lettaAgentsAPI.paths)) {
for (const method of Object.keys(lettaAgentsAPI.paths[path])) {
// @ts-expect-error - path item values are not typed as operation objects here
if (lettaAgentsAPI.paths[path][method]?.parameters) {
// @ts-expect-error - same loose typing as above
lettaAgentsAPI.paths[path][method].parameters = lettaAgentsAPI.paths[
path
][method].parameters.filter(
(param: Record<string, string>) =>
param.in !== 'header' ||
(
param.name !== 'user_id' &&
param.name !== 'User-Agent' &&
param.name !== 'X-Project-Id' &&
param.name !== 'X-Letta-Source' &&
param.name !== 'X-Stainless-Package-Version' &&
!param.name.startsWith('X-Experimental') &&
!param.name.startsWith('X-Billing')
),
);
}
}
}
const result = merge([
{
oas: lettaAgentsAPI,
},
{
oas: lettaWebOpenAPI,
},
]);
if (isErrorResult(result)) {
console.error(`${result.message} (${result.type})`);
process.exit(1);
}
result.output.openapi = '3.1.0';
result.output.info = {
title: 'Letta API',
version: '1.0.0',
};
result.output.servers = [
{
url: 'https://app.letta.com',
description: 'Letta Cloud',
},
{
url: 'http://localhost:8283',
description: 'Self-hosted',
},
];
result.output.components = {
...result.output.components,
securitySchemes: {
bearerAuth: {
type: 'http',
scheme: 'bearer',
},
},
};
result.output.security = [
...(result.output.security || []),
{
bearerAuth: [],
},
];
// omit all instances of "user_id" from the openapi.json file
function deepOmitPreserveArrays(obj: unknown, key: string): unknown {
if (Array.isArray(obj)) {
return obj.map((item) => deepOmitPreserveArrays(item, key));
}
if (typeof obj !== 'object' || obj === null) {
return obj;
}
if (key in obj) {
return omit(obj, key);
}
return Object.fromEntries(
Object.entries(obj).map(([k, v]) => [k, deepOmitPreserveArrays(v, key)]),
);
}
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'user_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'actor_id',
);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
result.output.components = deepOmitPreserveArrays(
result.output.components,
'organization_id',
);
fs.writeFileSync(
path.join(__dirname, '..', 'openapi.json'),
JSON.stringify(result.output, null, 2),
);
function formatOpenAPIJson() {
const openApiPath = path.join(__dirname, '..', 'openapi.json');
try {
execSync(`npx prettier --write "${openApiPath}"`, { stdio: 'inherit' });
console.log('Successfully formatted openapi.json with Prettier');
} catch (error) {
console.error('Error formatting openapi.json:', error);
process.exit(1);
}
}
formatOpenAPIJson();

View File

@@ -5,7 +5,7 @@ try:
__version__ = version("letta")
except PackageNotFoundError:
# Fallback for development installations
-__version__ = "0.16.5"
+__version__ = "0.16.6"
if os.environ.get("LETTA_VERSION"):
__version__ = os.environ["LETTA_VERSION"]

View File

@@ -7,6 +7,7 @@ from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.telemetry_manager import TelemetryManager
@@ -31,6 +32,7 @@ class LettaLLMAdapter(ABC):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
+billing_context: BillingContext | None = None,
) -> None:
self.llm_client: LLMClientBase = llm_client
self.llm_config: LLMConfig = llm_config
@@ -40,6 +42,7 @@ class LettaLLMAdapter(ABC):
self.run_id: str | None = run_id
self.org_id: str | None = org_id
self.user_id: str | None = user_id
+self.billing_context: BillingContext | None = billing_context
self.message_id: str | None = None
self.request_data: dict | None = None
self.response_data: dict | None = None

View File

@@ -10,7 +10,7 @@ from letta.otel.tracing import log_attributes, safe_json_dumps, trace_method
from letta.schemas.enums import LLMCallType, ProviderType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.llm_config import LLMConfig
-from letta.schemas.provider_trace import ProviderTrace
+from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.user import User
from letta.settings import settings
from letta.utils import safe_create_task
@@ -36,6 +36,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id: str | None = None,
org_id: str | None = None,
user_id: str | None = None,
+billing_context: "BillingContext | None" = None,
) -> None:
super().__init__(
llm_client,
@@ -46,6 +47,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
run_id=run_id,
org_id=org_id,
user_id=user_id,
+billing_context=billing_context,
)
self.interface: OpenAIStreamingInterface | AnthropicStreamingInterface | None = None

View File

@@ -51,6 +51,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
+billing_context=self.billing_context,
)
try:
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)

View File

@@ -278,6 +278,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
org_id=self.org_id,
user_id=self.user_id,
llm_config=self.llm_config.model_dump() if self.llm_config else None,
+billing_context=self.billing_context,
),
),
label="create_provider_trace",

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, MessageUpdate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -51,7 +52,11 @@ class BaseAgent(ABC):
@abstractmethod
async def step(
-self, input_messages: List[MessageCreate], max_steps: int = DEFAULT_MAX_STEPS, run_id: Optional[str] = None
+self,
+input_messages: List[MessageCreate],
+max_steps: int = DEFAULT_MAX_STEPS,
+run_id: Optional[str] = None,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Main execution loop for the agent.

View File

@@ -12,6 +12,7 @@ from letta.schemas.user import User
if TYPE_CHECKING:
from letta.schemas.letta_request import ClientToolSchema
+from letta.schemas.provider_trace import BillingContext
class BaseAgentV2(ABC):
@@ -52,6 +53,7 @@ class BaseAgentV2(ABC):
request_start_timestamp_ns: int | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -76,6 +78,7 @@ class BaseAgentV2(ABC):
conversation_id: str | None = None,
client_tools: list["ClientToolSchema"] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[LettaMessage | LegacyLettaMessage | MessageStreamStatus, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -192,44 +192,15 @@ async def _prepare_in_context_messages_no_persist_async(
# Otherwise, include the full list of messages from the conversation
current_in_context_messages = await message_manager.get_messages_by_ids_async(message_ids=message_ids, actor=actor)
else:
-# No messages in conversation yet - compile a new system message for this conversation
-# Each conversation gets its own system message (captures memory state at conversation start)
-from letta.prompts.prompt_generator import PromptGenerator
-from letta.services.passage_manager import PassageManager
-num_messages = await message_manager.size_async(actor=actor, agent_id=agent_state.id)
-passage_manager = PassageManager()
-num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_state.id)
-system_message_str = await PromptGenerator.compile_system_message_async(
-system_prompt=agent_state.system,
-in_context_memory=agent_state.memory,
-in_context_memory_last_edit=get_utc_time(),
-timezone=agent_state.timezone,
-user_defined_variables=None,
-append_icm_if_missing=True,
-previous_message_count=num_messages,
-archival_memory_size=num_archival_memories,
-sources=agent_state.sources,
-max_files_open=agent_state.max_files_open,
-)
-system_message = Message.dict_to_message(
-agent_id=agent_state.id,
-model=agent_state.llm_config.model,
-openai_message_dict={"role": "system", "content": system_message_str},
-)
-# Persist the new system message
-persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
-system_message = persisted_messages[0]
-# Add it to the conversation tracking
-await conversation_manager.add_messages_to_conversation(
-conversation_id=conversation_id,
-agent_id=agent_state.id,
-message_ids=[system_message.id],
-actor=actor,
-starting_position=0,
-)
+# No messages in conversation yet (fallback) - compile a new system message
+# Normally this is handled at conversation creation time, but this covers
+# edge cases where a conversation exists without a system message.
+system_message = await conversation_manager.compile_and_save_system_message_for_conversation(
+conversation_id=conversation_id,
+agent_id=agent_state.id,
+actor=actor,
+agent_state=agent_state,
+message_manager=message_manager,
+)
current_in_context_messages = [system_message]

View File

@@ -48,6 +48,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -179,6 +180,7 @@ class LettaAgent(BaseAgent):
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
dry_run: bool = False,
+billing_context: "BillingContext | None" = None,
) -> Union[LettaResponse, dict]:
# TODO (cliandy): pass in run_id and use at send_message endpoints for all step functions
agent_state = await self.agent_manager.get_agent_by_id_async(

View File

@@ -44,6 +44,7 @@ from letta.schemas.openai.chat_completion_response import (
UsageStatisticsCompletionTokenDetails,
UsageStatisticsPromptTokenDetails,
)
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import Step, StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool import Tool
@@ -185,6 +186,7 @@ class LettaAgentV2(BaseAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -290,6 +292,7 @@ class LettaAgentV2(BaseAgentV2):
conversation_id: str | None = None, # Not used in V2, but accepted for API compatibility
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
+billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.

View File

@@ -21,7 +21,7 @@ from letta.agents.helpers import (
)
from letta.agents.letta_agent_v2 import LettaAgentV2
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
-from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
+from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
from letta.helpers.tool_execution_helper import enable_strict_mode
@@ -45,6 +45,7 @@ from letta.schemas.letta_response import LettaResponse, TurnTokenData
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import Message, MessageCreate, ToolReturn
from letta.schemas.openai.chat_completion_response import ChoiceLogprobs, ToolCall, ToolCallDenial, UsageStatistics
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.step import StepProgression
from letta.schemas.step_metrics import StepMetrics
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -149,6 +150,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
"""
Execute the agent loop in blocking mode, returning all messages at once.
@@ -232,6 +234,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
credit_task = None
@@ -362,6 +365,7 @@ class LettaAgentV3(LettaAgentV2):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: BillingContext | None = None,
) -> AsyncGenerator[str, None]:
"""
Execute the agent loop in streaming mode, yielding chunks as they become available.
@@ -419,6 +423,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
elif use_sglang_native:
# Use SGLang native adapter for multi-turn RL training
@@ -431,6 +436,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
# Reset turns tracking for this step
self.turns = []
@@ -444,6 +450,7 @@ class LettaAgentV3(LettaAgentV2):
run_id=run_id,
org_id=self.actor.organization_id,
user_id=self.actor.id,
+billing_context=billing_context,
)
try:
@@ -764,7 +771,12 @@ class LettaAgentV3(LettaAgentV2):
]
else:
# Old behavior: UserMessage with packed JSON
-return list(Message.to_letta_messages(summary_message))
+messages = list(Message.to_letta_messages(summary_message))
+# Set otid on returned messages (summary Message doesn't have otid set at creation)
+for i, msg in enumerate(messages):
+if not msg.otid:
+msg.otid = Message.generate_otid_from_id(summary_message.id, i)
+return messages
@trace_method
async def _step(
@@ -990,6 +1002,9 @@ class LettaAgentV3(LettaAgentV2):
except ValueError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
raise e
+except LLMEmptyResponseError as e:
+self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
+raise e
except LLMError as e:
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
raise e

View File

@@ -134,7 +134,7 @@ def _flatten_model_settings(d: dict, env_vars: dict[str, str]) -> None:
api_base: yyy -> OPENAI_API_BASE
anthropic:
api_key: zzz -> ANTHROPIC_API_KEY
-global_max_context_window_limit: 32000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
+global_max_context_window_limit: 128000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
"""
for key, value in d.items():
if isinstance(value, dict):

View File

@@ -74,7 +74,7 @@ DEFAULT_MAX_STEPS = 50
# context window size
MIN_CONTEXT_WINDOW = 4096
-DEFAULT_CONTEXT_WINDOW = 32000
+DEFAULT_CONTEXT_WINDOW = 128000
# Summarization trigger threshold (multiplier of context_window limit)
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
@@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = {
"deepseek-reasoner": 64000, "deepseek-reasoner": 64000,
# glm (Z.AI) # glm (Z.AI)
"glm-4.5": 128000, "glm-4.5": 128000,
"glm-4.6": 200000, "glm-4.6": 180000,
"glm-4.7": 200000, "glm-4.7": 180000,
"glm-5": 200000, "glm-5": 180000,
"glm-5-code": 200000, "glm-5-code": 180000,
## OpenAI models: https://platform.openai.com/docs/models/overview ## OpenAI models: https://platform.openai.com/docs/models/overview
# gpt-5 # gpt-5
"gpt-5": 272000, "gpt-5": 272000,
@@ -278,6 +278,8 @@ LLM_MAX_CONTEXT_WINDOW = {
"gpt-5.2-pro": 272000, "gpt-5.2-pro": 272000,
"gpt-5.2-pro-2025-12-11": 272000, "gpt-5.2-pro-2025-12-11": 272000,
"gpt-5.2-codex": 272000, "gpt-5.2-codex": 272000,
# gpt-5.3
"gpt-5.3-codex": 272000,
# reasoners # reasoners
"o1": 200000, "o1": 200000,
# "o1-pro": 200000, # responses API only # "o1-pro": 200000, # responses API only
@@ -419,7 +421,7 @@ MAX_ERROR_MESSAGE_CHAR_LIMIT = 1000
# Default memory limits
CORE_MEMORY_PERSONA_CHAR_LIMIT: int = 20000
CORE_MEMORY_HUMAN_CHAR_LIMIT: int = 20000
-CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 20000
+CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 100000
# Function return limits
FUNCTION_RETURN_CHAR_LIMIT = 50000 # ~300 words

View File

@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
while processing the request."""
+class LLMEmptyResponseError(LLMServerError):
+"""Error when LLM returns an empty response (no content and no tool calls).
+This is a subclass of LLMServerError to maintain retry behavior, but allows
+specific handling for empty response cases which may benefit from request
+modification before retry.
+"""
class LLMTimeoutError(LLMError):
"""Error when LLM request times out"""

View File

@@ -13,6 +13,7 @@ from letta.schemas.letta_message import MessageType
from letta.schemas.letta_message_content import TextContent
from letta.schemas.letta_response import LettaResponse
from letta.schemas.message import Message, MessageCreate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run
from letta.schemas.user import User
from letta.services.agent_manager import AgentManager
@@ -69,6 +70,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
use_assistant_message: bool = True,
request_start_timestamp_ns: int | None = None,
include_return_message_types: list[MessageType] | None = None,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
run_ids = []
@@ -100,6 +102,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
run_id=run_id,
use_assistant_message=use_assistant_message,
include_return_message_types=include_return_message_types,
+billing_context=billing_context,
)
# Get last response messages

View File

@@ -15,6 +15,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns: int | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -62,6 +64,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
)
await self.run_sleeptime_agents()
@@ -81,6 +84,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
include_return_message_types: list[MessageType] | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -99,6 +103,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
request_start_timestamp_ns=request_start_timestamp_ns,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -14,6 +14,7 @@ from letta.schemas.letta_request import ClientToolSchema
from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import StopReasonType
from letta.schemas.message import Message, MessageCreate
+from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run, RunUpdate
from letta.schemas.user import User
from letta.services.group_manager import GroupManager
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> LettaResponse:
self.run_ids = []
@@ -63,6 +65,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
)
run_ids = await self.run_sleeptime_agents()
@@ -82,6 +85,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id: str | None = None,
client_tools: list[ClientToolSchema] | None = None,
include_compaction_messages: bool = False,
+billing_context: "BillingContext | None" = None,
) -> AsyncGenerator[str, None]:
self.run_ids = []
@@ -101,6 +105,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
conversation_id=conversation_id,
client_tools=client_tools,
include_compaction_messages=include_compaction_messages,
+billing_context=billing_context,
):
yield chunk
finally:

View File

@@ -30,6 +30,7 @@ from anthropic.types.beta import (
)
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
+from letta.errors import LLMEmptyResponseError
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.log import get_logger
from letta.schemas.letta_message import (
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
self.inner_thoughts_complete = False
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
+# Track whether any content was produced (text or tool calls)
+# Used to detect empty responses from models like Opus 4.6
+self.has_content = False
# Buffer to handle partial XML tags across chunks
self.partial_tag_buffer = ""
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
+self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
+self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
self.inner_thoughts_complete = False
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
# message_delta event are *cumulative*." So we assign, not accumulate.
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
-# Don't do anything here! We don't want to stop the stream.
-pass
+# Check if any content was produced during the stream
+# Empty responses (no text and no tool calls) should raise an error
+if not self.has_content:
+raise LLMEmptyResponseError(
+message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+)
elif isinstance(event, BetaRawContentBlockStopEvent):
# If we're exiting a tool use block and there are still buffered messages,
# we should flush them now.
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
if isinstance(content, BetaTextBlock):
self.anthropic_mode = EventMode.TEXT
+self.has_content = True # Track that we received text content
# TODO: Can capture citations, etc.
elif isinstance(content, BetaToolUseBlock):
self.anthropic_mode = EventMode.TOOL_USE
+self.has_content = True # Track that we received tool use content
self.tool_call_id = content.id
self.tool_call_name = content.name
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
self.output_tokens = event.usage.output_tokens
elif isinstance(event, BetaRawMessageStopEvent):
-# Don't do anything here! We don't want to stop the stream.
-pass
+# Check if any content was produced during the stream
+# Empty responses (no text and no tool calls) should raise an error
+if not self.has_content:
+raise LLMEmptyResponseError(
+message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
+)
elif isinstance(event, BetaRawContentBlockStopEvent):
self.anthropic_mode = None

View File

@@ -19,6 +19,8 @@ from letta.errors import (
LLMAuthenticationError,
LLMBadRequestError,
LLMConnectionError,
+LLMEmptyResponseError,
+LLMError,
LLMInsufficientCreditsError,
LLMNotFoundError,
LLMPermissionDeniedError,
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
@trace_method
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
+# Pass through errors that are already LLMError instances unchanged
+# This preserves specific error types like LLMEmptyResponseError
+if isinstance(e, LLMError):
+return e
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
# make sure to check for overflow errors, regardless of error type
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
response.stop_reason,
json.dumps(response_data),
)
-raise LLMServerError(
+raise LLMEmptyResponseError(
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
code=ErrorCode.INTERNAL_SERVER_ERROR,
details={

View File

@@ -9,7 +9,7 @@ from letta.llm_api.google_constants import GOOGLE_MODEL_FOR_API_KEY_CHECK
from letta.llm_api.google_vertex_client import GoogleVertexClient
from letta.log import get_logger
from letta.schemas.llm_config import LLMConfig
-from letta.settings import model_settings, settings
+from letta.settings import model_settings
logger = get_logger(__name__)
@@ -18,7 +18,7 @@ class GoogleAIClient(GoogleVertexClient):
provider_label = "Google AI"
def _get_client(self, llm_config: Optional[LLMConfig] = None):
-timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
+timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = self.get_byok_overrides(llm_config)
@@ -30,7 +30,7 @@ class GoogleAIClient(GoogleVertexClient):
)
async def _get_client_async(self, llm_config: Optional[LLMConfig] = None):
-timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
+timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
api_key = None
if llm_config:
api_key, _, _ = await self.get_byok_overrides_async(llm_config)

View File

@@ -14,7 +14,7 @@ from letta.schemas.enums import AgentType, LLMCallType, ProviderCategory
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
-from letta.schemas.provider_trace import ProviderTrace
+from letta.schemas.provider_trace import BillingContext, ProviderTrace
from letta.schemas.usage import LettaUsageStatistics
from letta.services.telemetry_manager import TelemetryManager
from letta.settings import settings
@@ -48,6 +48,7 @@ class LLMClientBase:
self._telemetry_user_id: Optional[str] = None
self._telemetry_compaction_settings: Optional[Dict] = None
self._telemetry_llm_config: Optional[Dict] = None
+self._telemetry_billing_context: Optional[BillingContext] = None
def set_telemetry_context(
self,
@@ -62,6 +63,7 @@ class LLMClientBase:
compaction_settings: Optional[Dict] = None,
llm_config: Optional[Dict] = None,
actor: Optional["User"] = None,
+billing_context: Optional[BillingContext] = None,
) -> None:
"""Set telemetry context for provider trace logging."""
if actor is not None:
@@ -76,6 +78,7 @@ class LLMClientBase:
self._telemetry_user_id = user_id
self._telemetry_compaction_settings = compaction_settings
self._telemetry_llm_config = llm_config
+self._telemetry_billing_context = billing_context
def extract_usage_statistics(self, response_data: Optional[dict], llm_config: LLMConfig) -> LettaUsageStatistics:
"""Provider-specific usage parsing hook (override in subclasses). Returns LettaUsageStatistics."""
@@ -125,6 +128,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
+billing_context=self._telemetry_billing_context,
),
)
except Exception as e:
@@ -186,6 +190,7 @@ class LLMClientBase:
user_id=self._telemetry_user_id,
compaction_settings=self._telemetry_compaction_settings,
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
+billing_context=self._telemetry_billing_context,
),
)
except Exception as e:

View File

@@ -88,7 +88,7 @@ def supports_none_reasoning_effort(model: str) -> bool:
Currently, GPT-5.1 and GPT-5.2 models support the 'none' reasoning effort level.
"""
-return model.startswith("gpt-5.1") or model.startswith("gpt-5.2")
+return model.startswith("gpt-5.1") or model.startswith("gpt-5.2") or model.startswith("gpt-5.3")
def is_openai_5_model(model: str) -> bool:
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
input=openai_messages_list,
tools=responses_tools,
tool_choice=tool_choice,
-max_output_tokens=llm_config.max_tokens,
temperature=llm_config.temperature if supports_temperature_param(model) else None,
parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
)
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
# Handle text configuration (verbosity and response format)
text_config_kwargs = {}
+# Only set max_output_tokens if explicitly configured
+if llm_config.max_tokens is not None:
+data.max_output_tokens = llm_config.max_tokens
# Add verbosity control for GPT-5 models
if supports_verbosity_control(model) and llm_config.verbosity:
text_config_kwargs["verbosity"] = llm_config.verbosity
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
)
request_data = data.model_dump(exclude_unset=True)
-# print("responses request data", request_data)
return request_data
@trace_method
@@ -639,6 +641,14 @@ class OpenAIClient(LLMClientBase):
tool.function.strict = False
request_data = data.model_dump(exclude_unset=True)
+# Fireworks uses strict validation (additionalProperties: false) and rejects
+# reasoning fields that are not in their schema.
+is_fireworks = llm_config.model_endpoint and "fireworks.ai" in llm_config.model_endpoint
+if is_fireworks and "messages" in request_data:
+for message in request_data["messages"]:
+for field in ("reasoning_content_signature", "redacted_reasoning_content", "omitted_reasoning_content"):
+message.pop(field, None)
# If Ollama
# if llm_config.handle.startswith("ollama/") and llm_config.enable_reasoner:
# Sadly, reasoning via the OpenAI proxy on Ollama only works for Harmony/gpt-oss
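For context on the max_output_tokens change above: the request object is dumped with exclude_unset=True, so passing max_tokens=None through the constructor would presumably still mark the field as set and emit a null value, while assigning it only when configured keeps it out of the payload entirely. A small sketch of that Pydantic behavior; the Req model here is illustrative, not the actual request class:

from typing import Optional
from pydantic import BaseModel

class Req(BaseModel):
    model: str
    max_output_tokens: Optional[int] = None

explicit = Req(model="gpt-5.3-codex", max_output_tokens=None)
print(explicit.model_dump(exclude_unset=True))  # {'model': 'gpt-5.3-codex', 'max_output_tokens': None}

lazy = Req(model="gpt-5.3-codex")
configured = None  # e.g. llm_config.max_tokens was never set
if configured is not None:
    lazy.max_output_tokens = configured
print(lazy.model_dump(exclude_unset=True))  # {'model': 'gpt-5.3-codex'}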

View File

@@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient):
}
}
+# Z.ai's API uses max_tokens, not max_completion_tokens.
+# If max_completion_tokens is sent, Z.ai ignores it and falls back to its
+# default of 65536, silently truncating input to ~137K of the 200K context window.
+if "max_completion_tokens" in data:
+data["max_tokens"] = data.pop("max_completion_tokens")
# Sanitize empty text content — ZAI rejects empty text blocks
if "messages" in data:
for msg in data["messages"]:
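A tiny self-contained illustration of the payload rewrite above; the model name and token count are arbitrary examples, not taken from this change:

# Rename the OpenAI-style field so Z.ai does not ignore it and silently
# fall back to its 65536-token default.
data = {"model": "glm-4.6", "max_completion_tokens": 98304, "messages": []}
if "max_completion_tokens" in data:
    data["max_tokens"] = data.pop("max_completion_tokens")
assert data == {"model": "glm-4.6", "max_tokens": 98304, "messages": []}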

View File

@@ -17295,6 +17295,58 @@
"supports_tool_choice": true, "supports_tool_choice": true,
"supports_vision": true "supports_vision": true
}, },
"gpt-5.3-chat-latest": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"max_tokens": 16384,
"mode": "chat",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/chat/completions", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5.3-codex": {
"cache_read_input_token_cost": 1.75e-7,
"cache_read_input_token_cost_priority": 3.5e-7,
"input_cost_per_token": 1.75e-6,
"input_cost_per_token_priority": 3.5e-6,
"litellm_provider": "openai",
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
"output_cost_per_token": 1.4e-5,
"output_cost_per_token_priority": 2.8e-5,
"supported_endpoints": ["/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_pdf_input": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_system_messages": false,
"supports_tool_choice": true,
"supports_vision": true
},
"gpt-5-mini": { "gpt-5-mini": {
"cache_read_input_token_cost": 2.5e-8, "cache_read_input_token_cost": 2.5e-8,
"cache_read_input_token_cost_flex": 1.25e-8, "cache_read_input_token_cost_flex": 1.25e-8,

View File

@@ -44,7 +44,7 @@ class Conversation(SqlalchemyBase, OrganizationMixin):
"ConversationMessage", "ConversationMessage",
back_populates="conversation", back_populates="conversation",
cascade="all, delete-orphan", cascade="all, delete-orphan",
lazy="selectin", lazy="raise",
) )
isolated_blocks: Mapped[List["Block"]] = relationship( isolated_blocks: Mapped[List["Block"]] = relationship(
"Block", "Block",

View File

@@ -69,5 +69,5 @@ class ConversationMessage(SqlalchemyBase, OrganizationMixin):
)
message: Mapped["Message"] = relationship(
"Message",
-lazy="selectin",
+lazy="raise",
)
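Switching these relationships from lazy="selectin" to lazy="raise" means SQLAlchemy raises InvalidRequestError on any implicit lazy access, so call sites have to request the load explicitly per query. A minimal sketch of what that looks like, assuming SQLAlchemy 2.x async usage; Conversation is the mapped class from the file above, and the relationship attribute name messages and the session variable are assumptions, not taken from the diff:

from sqlalchemy import select
from sqlalchemy.orm import selectinload

async def get_conversation_with_messages(session, conversation_id: str):
    # With lazy="raise", touching conversation.messages without an eager-load
    # option raises, so the load is made explicit in the query.
    stmt = (
        select(Conversation)
        .where(Conversation.id == conversation_id)
        .options(selectinload(Conversation.messages))
    )
    result = await session.execute(stmt)
    return result.scalar_one()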

View File

@@ -88,8 +88,7 @@ class LettaRequest(BaseModel):
)
top_logprobs: Optional[int] = Field(
default=None,
-description="Number of most likely tokens to return at each position (0-20). "
-"Requires return_logprobs=True.",
+description="Number of most likely tokens to return at each position (0-20). Requires return_logprobs=True.",
)
return_token_ids: bool = Field(
default=False,
@@ -155,6 +154,10 @@ class LettaStreamingRequest(LettaRequest):
class ConversationMessageRequest(LettaRequest):
"""Request for sending messages to a conversation. Streams by default."""
+agent_id: Optional[str] = Field(
+default=None,
+description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
+)
streaming: bool = Field(
default=True,
description="If True (default), returns a streaming response (Server-Sent Events). If False, returns a complete JSON response.",
@@ -194,6 +197,10 @@ class CreateBatch(BaseModel):
class RetrieveStreamRequest(BaseModel):
+agent_id: Optional[str] = Field(
+default=None,
+description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
+)
starting_after: int = Field(
0, description="Sequence id to use as a cursor for pagination. Response will start streaming after this chunk sequence id"
)

View File

@@ -1,3 +1,4 @@
+import re
from typing import TYPE_CHECKING, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
# Set max_tokens defaults based on model (only if not explicitly provided)
if "max_tokens" not in values:
-if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
+if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
+values["max_tokens"] = 128000
+elif model.startswith("gpt-5"):
values["max_tokens"] = 16384
elif model == "gpt-4.1":
values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
context_window=272000,
reasoning_effort="none", # Default to "none" for GPT-5.2
verbosity="medium",
-max_tokens=16384,
+max_tokens=128000,
)
elif model_name == "letta":
return cls(
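The new default logic above reads as: GPT-5.2/5.3 models that are not "-chat" variants default to 128k output tokens, while everything else keeps the previous defaults. A standalone mirror of that branch for a few example handles; the function name and the final fallback value are illustrative, not from this file:

import re

def default_max_tokens(model: str) -> int:
    if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
        return 128000
    if model.startswith("gpt-5"):
        return 16384
    if model == "gpt-4.1":
        return 8192
    return 4096  # illustrative fallback only

assert default_max_tokens("gpt-5.3-codex") == 128000
assert default_max_tokens("gpt-5.2-chat-latest") == 16384
assert default_max_tokens("gpt-5-mini") == 16384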

View File

@@ -95,6 +95,11 @@ class LLMTrace(LettaBase):
response_json: str = Field(..., description="Full response payload as JSON string")
llm_config_json: str = Field(default="", description="LLM config as JSON string")
+# Billing context
+billing_plan_type: Optional[str] = Field(default=None, description="Subscription tier (e.g., 'basic', 'standard', 'max', 'enterprise')")
+billing_cost_source: Optional[str] = Field(default=None, description="Cost source: 'quota' or 'credits'")
+billing_customer_id: Optional[str] = Field(default=None, description="Customer ID for cross-referencing billing records")
# Timestamp
created_at: datetime = Field(default_factory=get_utc_time, description="When the trace was created")
@@ -128,6 +133,9 @@ class LLMTrace(LettaBase):
self.request_json, self.request_json,
self.response_json, self.response_json,
self.llm_config_json, self.llm_config_json,
self.billing_plan_type or "",
self.billing_cost_source or "",
self.billing_customer_id or "",
self.created_at, self.created_at,
) )
@@ -162,5 +170,8 @@ class LLMTrace(LettaBase):
"request_json", "request_json",
"response_json", "response_json",
"llm_config_json", "llm_config_json",
"billing_plan_type",
"billing_cost_source",
"billing_customer_id",
"created_at", "created_at",
] ]

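A rough sketch of how the three new billing columns could be filled from request-level billing data (the glue function below is illustrative, not part of the diff; empty strings mirror the "or ''" fallback used in the column tuple above):

from typing import Optional, Tuple

def billing_columns(
    plan_type: Optional[str], cost_source: Optional[str], customer_id: Optional[str]
) -> Tuple[str, str, str]:
    # Missing values collapse to empty strings, matching the fallback above
    return (plan_type or "", cost_source or "", customer_id or "")

print(billing_columns("standard", "credits", None))  # ('standard', 'credits', '')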

@@ -226,8 +226,6 @@ class Memory(BaseModel, validate_assignment=True):
front_lines = [] front_lines = []
if block.description: if block.description:
front_lines.append(f"description: {block.description}") front_lines.append(f"description: {block.description}")
if block.limit is not None:
front_lines.append(f"limit: {block.limit}")
if getattr(block, "read_only", False): if getattr(block, "read_only", False):
front_lines.append("read_only: true") front_lines.append("read_only: true")
@@ -291,7 +289,40 @@ class Memory(BaseModel, validate_assignment=True):
s.write("\n\n<memory_filesystem>\n") s.write("\n\n<memory_filesystem>\n")
def _render_tree(node: dict, prefix: str = ""): def _render_tree(node: dict, prefix: str = "", in_system: bool = False, path_parts: tuple[str, ...] = ()):
# Render skills/ as concise top-level entries only, using both
# current (`skills/<name>`) and legacy (`skills/<name>/SKILL`) labels.
if path_parts == ("skills",):
skill_entries: list[tuple[str, str]] = []
for name, val in node.items():
if name == LEAF_KEY:
continue
block = None
if isinstance(val, dict):
legacy_skill_block = val.get("SKILL")
if legacy_skill_block is not None and not isinstance(legacy_skill_block, dict):
block = legacy_skill_block
elif LEAF_KEY in val and not isinstance(val[LEAF_KEY], dict):
block = val[LEAF_KEY]
else:
block = val
if block is None:
continue
desc = getattr(block, "description", None)
desc_line = (desc or "").strip().split("\n")[0].strip()
skill_entries.append((name, desc_line))
skill_entries.sort(key=lambda e: e[0])
for i, (name, desc_line) in enumerate(skill_entries):
is_last = i == len(skill_entries) - 1
connector = "└── " if is_last else "├── "
desc_suffix = f" ({desc_line})" if desc_line else ""
s.write(f"{prefix}{connector}{name}{desc_suffix}\n")
return
# Sort: directories first, then files. If a node is both a directory and a # Sort: directories first, then files. If a node is both a directory and a
# leaf (LEAF_KEY present), show both <name>/ and <name>.md. # leaf (LEAF_KEY present), show both <name>/ and <name>.md.
dirs = [] dirs = []
@@ -316,9 +347,24 @@ class Memory(BaseModel, validate_assignment=True):
if is_dir: if is_dir:
s.write(f"{prefix}{connector}{name}/\n") s.write(f"{prefix}{connector}{name}/\n")
extension = " " if is_last else "" extension = " " if is_last else ""
_render_tree(node[name], prefix + extension) _render_tree(
node[name],
prefix + extension,
in_system=in_system or name == "system",
path_parts=(*path_parts, name),
)
else: else:
s.write(f"{prefix}{connector}{name}.md\n") # For files outside system/, append the block description
desc_suffix = ""
if not in_system:
val = node[name]
block = val[LEAF_KEY] if isinstance(val, dict) else val
desc = getattr(block, "description", None)
if desc:
desc_line = desc.strip().split("\n")[0].strip()
if desc_line:
desc_suffix = f" ({desc_line})"
s.write(f"{prefix}{connector}{name}.md{desc_suffix}\n")
_render_tree(tree) _render_tree(tree)
s.write("</memory_filesystem>") s.write("</memory_filesystem>")
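A minimal, self-contained sketch of the connector logic the skills/ branch uses above (entry names and descriptions are invented; the real renderer walks the Memory block tree):

entries = [
    ("web_search", "Search the web and summarize results"),
    ("calendar", "Read and update the user's calendar"),
]
entries.sort(key=lambda e: e[0])
for i, (name, desc) in enumerate(entries):
    connector = "└── " if i == len(entries) - 1 else "├── "
    suffix = f" ({desc})" if desc else ""
    print(f"{connector}{name}{suffix}")
# ├── calendar (Read and update the user's calendar)
# └── web_search (Search the web and summarize results)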


@@ -282,10 +282,10 @@ class AnthropicModelSettings(ModelSettings):
description="Soft control for how verbose model output should be, used for GPT-5 models.", description="Soft control for how verbose model output should be, used for GPT-5 models.",
) )
# Opus 4.5 effort parameter # Effort parameter for Opus 4.5, Opus 4.6, and Sonnet 4.6
effort: Optional[Literal["low", "medium", "high"]] = Field( effort: Optional[Literal["low", "medium", "high", "max"]] = Field(
None, None,
description="Effort level for Opus 4.5 model (controls token conservation). Not setting this gives similar performance to 'high'.", description="Effort level for supported Anthropic models (controls token spending). 'max' is only available on Opus 4.6. Not setting this gives similar performance to 'high'.",
) )
# Anthropic supports strict mode for tool calling - defaults to False # Anthropic supports strict mode for tool calling - defaults to False


@@ -3,13 +3,21 @@ from __future__ import annotations
from datetime import datetime from datetime import datetime
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from pydantic import Field from pydantic import BaseModel, Field
from letta.helpers.datetime_helpers import get_utc_time from letta.helpers.datetime_helpers import get_utc_time
from letta.schemas.enums import PrimitiveType from letta.schemas.enums import PrimitiveType
from letta.schemas.letta_base import OrmMetadataBase from letta.schemas.letta_base import OrmMetadataBase
class BillingContext(BaseModel):
"""Billing context for LLM request cost tracking."""
plan_type: Optional[str] = Field(None, description="Subscription tier")
cost_source: Optional[str] = Field(None, description="Cost source: 'quota' or 'credits'")
customer_id: Optional[str] = Field(None, description="Customer ID for billing records")
class BaseProviderTrace(OrmMetadataBase): class BaseProviderTrace(OrmMetadataBase):
__id_prefix__ = PrimitiveType.PROVIDER_TRACE.value __id_prefix__ = PrimitiveType.PROVIDER_TRACE.value
@@ -53,6 +61,8 @@ class ProviderTrace(BaseProviderTrace):
compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)") compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)")
llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)") llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)")
billing_context: Optional[BillingContext] = Field(None, description="Billing context from request headers")
created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.") created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")
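Since BillingContext is a plain Pydantic model with all-optional fields, it can be constructed from whatever billing data is present (values below are invented):

from letta.schemas.provider_trace import BillingContext

ctx = BillingContext(plan_type="standard", cost_source="credits", customer_id="cus_123")
print(ctx.model_dump())
# {'plan_type': 'standard', 'cost_source': 'credits', 'customer_id': 'cus_123'}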


@@ -14,7 +14,7 @@ from letta.schemas.providers.base import Provider
logger = get_logger(__name__) logger = get_logger(__name__)
ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"} ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"}
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro", "chat"} DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro"}
DEFAULT_EMBEDDING_BATCH_SIZE = 1024 DEFAULT_EMBEDDING_BATCH_SIZE = 1024
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
except Exception as e: except Exception as e:
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR) raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
@staticmethod
def _openai_default_max_output_tokens(model_name: str) -> int:
"""Return a sensible max-output-tokens default for OpenAI models.
gpt-5.2* / gpt-5.3* support 128k output tokens, except the
`-chat` variants which are capped at 16k.
"""
import re
if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
return 128000
return 16384
def get_default_max_output_tokens(self, model_name: str) -> int: def get_default_max_output_tokens(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models (sync fallback).""" """Get the default max output tokens for OpenAI models (sync fallback)."""
# Simple default for openai return self._openai_default_max_output_tokens(model_name)
return 16384
async def get_default_max_output_tokens_async(self, model_name: str) -> int: async def get_default_max_output_tokens_async(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models. """Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
if max_output is not None: if max_output is not None:
return max_output return max_output
# Simple default for openai return self._openai_default_max_output_tokens(model_name)
return 16384
async def _get_models_async(self) -> list[dict]: async def _get_models_async(self) -> list[dict]:
from letta.llm_api.openai import openai_get_model_list_async from letta.llm_api.openai import openai_get_model_list_async
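A quick sanity check of the shared helper (the import path matches the one used elsewhere in this diff; the helper is a @staticmethod, so no provider instance is needed):

from letta.schemas.providers.openai import OpenAIProvider

assert OpenAIProvider._openai_default_max_output_tokens("gpt-5.3") == 128000
assert OpenAIProvider._openai_default_max_output_tokens("gpt-5.2-chat") == 16384
assert OpenAIProvider._openai_default_max_output_tokens("gpt-4o") == 16384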


@@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider
# Z.ai model context windows # Z.ai model context windows
# Reference: https://docs.z.ai/ # Reference: https://docs.z.ai/
# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k) counts against that --> 180k
MODEL_CONTEXT_WINDOWS = { MODEL_CONTEXT_WINDOWS = {
"glm-4.5": 128000, "glm-4.5": 128000,
"glm-4.6": 200000, "glm-4.6": 180000,
"glm-4.7": 200000, "glm-4.7": 180000,
"glm-5": 200000, "glm-5": 180000,
"glm-5-code": 200000, "glm-5-code": 180000,
} }


@@ -3,7 +3,7 @@ import uuid
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import AsyncGenerator from typing import AsyncGenerator
from sqlalchemy import NullPool, text from sqlalchemy import NullPool
from sqlalchemy.ext.asyncio import ( from sqlalchemy.ext.asyncio import (
AsyncEngine, AsyncEngine,
AsyncSession, AsyncSession,
@@ -88,10 +88,6 @@ class DatabaseRegistry:
try: try:
async with async_session_factory() as session: async with async_session_factory() as session:
try: try:
result = await session.execute(text("SELECT pg_backend_pid(), current_setting('statement_timeout')"))
pid, timeout = result.one()
logger.warning(f"[stmt_timeout_debug] pid={pid} statement_timeout={timeout}")
await session.rollback()
yield session yield session
await session.commit() await session.commit()
except asyncio.CancelledError: except asyncio.CancelledError:


@@ -6,6 +6,7 @@ from pydantic import BaseModel
from letta.errors import LettaInvalidArgumentError from letta.errors import LettaInvalidArgumentError
from letta.otel.tracing import tracer from letta.otel.tracing import tracer
from letta.schemas.enums import PrimitiveType from letta.schemas.enums import PrimitiveType
from letta.schemas.provider_trace import BillingContext
from letta.validators import PRIMITIVE_ID_PATTERNS from letta.validators import PRIMITIVE_ID_PATTERNS
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -30,18 +31,24 @@ class HeaderParams(BaseModel):
letta_source: Optional[str] = None letta_source: Optional[str] = None
sdk_version: Optional[str] = None sdk_version: Optional[str] = None
experimental_params: Optional[ExperimentalParams] = None experimental_params: Optional[ExperimentalParams] = None
billing_context: Optional[BillingContext] = None
def get_headers( def get_headers(
actor_id: Optional[str] = Header(None, alias="user_id"), actor_id: Optional[str] = Header(None, alias="user_id"),
user_agent: Optional[str] = Header(None, alias="User-Agent"), user_agent: Optional[str] = Header(None, alias="User-Agent"),
project_id: Optional[str] = Header(None, alias="X-Project-Id"), project_id: Optional[str] = Header(None, alias="X-Project-Id"),
letta_source: Optional[str] = Header(None, alias="X-Letta-Source"), letta_source: Optional[str] = Header(None, alias="X-Letta-Source", include_in_schema=False),
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version"), sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version", include_in_schema=False),
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async"), message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async", include_in_schema=False),
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent"), letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent", include_in_schema=False),
letta_v1_agent_message_async: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent-Message-Async"), letta_v1_agent_message_async: Optional[str] = Header(
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox"), None, alias="X-Experimental-Letta-V1-Agent-Message-Async", include_in_schema=False
),
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox", include_in_schema=False),
billing_plan_type: Optional[str] = Header(None, alias="X-Billing-Plan-Type", include_in_schema=False),
billing_cost_source: Optional[str] = Header(None, alias="X-Billing-Cost-Source", include_in_schema=False),
billing_customer_id: Optional[str] = Header(None, alias="X-Billing-Customer-Id", include_in_schema=False),
) -> HeaderParams: ) -> HeaderParams:
"""Dependency injection function to extract common headers from requests.""" """Dependency injection function to extract common headers from requests."""
with tracer.start_as_current_span("dependency.get_headers"): with tracer.start_as_current_span("dependency.get_headers"):
@@ -63,6 +70,13 @@ def get_headers(
letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None, letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None,
modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None, modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None,
), ),
billing_context=BillingContext(
plan_type=billing_plan_type,
cost_source=billing_cost_source,
customer_id=billing_customer_id,
)
if any([billing_plan_type, billing_cost_source, billing_customer_id])
else None,
) )
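A sketch of a caller supplying the new billing headers, which are hidden from the OpenAPI schema but still parsed by get_headers. The base URL, port, and endpoint path below are assumptions for illustration:

import httpx

resp = httpx.post(
    "http://localhost:8283/v1/agents/agent-123/messages",  # endpoint/port assumed
    json={"messages": [{"role": "user", "content": "hi"}]},
    headers={
        "X-Billing-Plan-Type": "standard",
        "X-Billing-Cost-Source": "quota",
        "X-Billing-Customer-Id": "cus_123",
    },
)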


@@ -49,6 +49,7 @@ from letta.schemas.memory import (
) )
from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult
from letta.schemas.passage import Passage from letta.schemas.passage import Passage
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.source import Source from letta.schemas.source import Source
from letta.schemas.tool import Tool from letta.schemas.tool import Tool
@@ -156,7 +157,7 @@ async def list_agents(
order: Literal["asc", "desc"] = Query( order: Literal["asc", "desc"] = Query(
"desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first" "desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first"
), ),
order_by: Literal["created_at", "last_run_completion"] = Query("created_at", description="Field to sort by"), order_by: Literal["created_at", "updated_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
ascending: bool = Query( ascending: bool = Query(
False, False,
description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)", description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)",
@@ -1697,6 +1698,7 @@ async def send_message(
actor=actor, actor=actor,
request=request, request=request,
run_type="send_message", run_type="send_message",
billing_context=headers.billing_context,
) )
return result return result
@@ -1767,6 +1769,7 @@ async def send_message(
include_return_message_types=request.include_return_message_types, include_return_message_types=request.include_return_message_types,
client_tools=request.client_tools, client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
) )
run_status = result.stop_reason.stop_reason.run_status run_status = result.stop_reason.stop_reason.run_status
return result return result
@@ -1845,6 +1848,7 @@ async def send_message_streaming(
actor=actor, actor=actor,
request=request, request=request,
run_type="send_message_streaming", run_type="send_message_streaming",
billing_context=headers.billing_context,
) )
return result return result
@@ -1868,6 +1872,13 @@ async def cancel_message(
""" """
# TODO: WHY DOES THIS CANCEL A LIST OF RUNS? # TODO: WHY DOES THIS CANCEL A LIST OF RUNS?
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for agent=%s by actor=%s (org=%s), explicit_run_ids=%s",
agent_id,
actor.id,
actor.organization_id,
request.run_ids if request else None,
)
if not settings.track_agent_run: if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled") raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
run_ids = request.run_ids if request else None run_ids = request.run_ids if request else None
@@ -2036,6 +2047,7 @@ async def _process_message_background(
include_return_message_types: list[MessageType] | None = None, include_return_message_types: list[MessageType] | None = None,
override_model: str | None = None, override_model: str | None = None,
include_compaction_messages: bool = False, include_compaction_messages: bool = False,
billing_context: "BillingContext | None" = None,
) -> None: ) -> None:
"""Background task to process the message and update run status.""" """Background task to process the message and update run status."""
request_start_timestamp_ns = get_utc_timestamp_ns() request_start_timestamp_ns = get_utc_timestamp_ns()
@@ -2067,6 +2079,7 @@ async def _process_message_background(
request_start_timestamp_ns=request_start_timestamp_ns, request_start_timestamp_ns=request_start_timestamp_ns,
include_return_message_types=include_return_message_types, include_return_message_types=include_return_message_types,
include_compaction_messages=include_compaction_messages, include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
) )
runs_manager = RunManager() runs_manager = RunManager()
from letta.schemas.enums import RunStatus from letta.schemas.enums import RunStatus
@@ -2235,6 +2248,7 @@ async def send_message_async(
include_return_message_types=request.include_return_message_types, include_return_message_types=request.include_return_message_types,
override_model=request.override_model, override_model=request.override_model,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
), ),
label=f"process_message_background_{run.id}", label=f"process_message_background_{run.id}",
) )
@@ -2419,7 +2433,11 @@ async def summarize_messages(
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode # If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode # Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode: if (
"mode" in changed_fields
and "prompt" not in changed_fields
and agent.compaction_settings.mode != request.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode) compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
@@ -2439,7 +2457,7 @@ async def summarize_messages(
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.") logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException( raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, status_code=status.HTTP_400_BAD_REQUEST,
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).", detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
) )
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages) await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
return CompactionResponse( return CompactionResponse(

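The list_agents change adds "updated_at" as an order_by value; a minimal client-side sketch (base URL and port assumed, response shape not shown):

import httpx

resp = httpx.get(
    "http://localhost:8283/v1/agents",  # base URL/port assumed
    params={"order_by": "updated_at", "order": "desc", "limit": 10},
)
print(resp.json())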

@@ -1,5 +1,6 @@
from datetime import timedelta from datetime import timedelta
from typing import Annotated, List, Literal, Optional from typing import Annotated, List, Literal, Optional
from uuid import uuid4
from fastapi import APIRouter, Body, Depends, HTTPException, Query, status from fastapi import APIRouter, Body, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@@ -18,6 +19,7 @@ from letta.schemas.job import LettaRequestConfig
from letta.schemas.letta_message import LettaMessageUnion from letta.schemas.letta_message import LettaMessageUnion
from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest
from letta.schemas.letta_response import LettaResponse from letta.schemas.letta_response import LettaResponse
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun from letta.schemas.run import Run as PydanticRun
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator
@@ -32,7 +34,7 @@ from letta.services.run_manager import RunManager
from letta.services.streaming_service import StreamingService from letta.services.streaming_service import StreamingService
from letta.services.summarizer.summarizer_config import CompactionSettings from letta.services.summarizer.summarizer_config import CompactionSettings
from letta.settings import settings from letta.settings import settings
from letta.validators import ConversationId from letta.validators import ConversationId, ConversationIdOrDefault
router = APIRouter(prefix="/conversations", tags=["conversations"]) router = APIRouter(prefix="/conversations", tags=["conversations"])
@@ -148,7 +150,8 @@ ConversationMessagesResponse = Annotated[
operation_id="list_conversation_messages", operation_id="list_conversation_messages",
) )
async def list_conversation_messages( async def list_conversation_messages(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
before: Optional[str] = Query( before: Optional[str] = Query(
@@ -172,8 +175,36 @@ async def list_conversation_messages(
Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all
messages in the conversation, with support for cursor-based pagination. messages in the conversation, with support for cursor-based pagination.
**Agent-direct mode**: Pass conversation_id="default" with agent_id parameter
to list messages from the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
return await server.get_agent_recall_async(
agent_id=resolved_agent_id,
after=after,
before=before,
limit=limit,
group_id=group_id,
conversation_id=None, # Default conversation (no isolation)
reverse=(order == "desc"),
return_message_object=False,
include_err=include_err,
actor=actor,
)
return await conversation_manager.list_conversation_messages( return await conversation_manager.list_conversation_messages(
conversation_id=conversation_id, conversation_id=conversation_id,
actor=actor, actor=actor,
@@ -186,6 +217,108 @@ async def list_conversation_messages(
) )
async def _send_agent_direct_message(
agent_id: str,
request: ConversationMessageRequest,
server: SyncServer,
actor,
billing_context: "BillingContext | None" = None,
) -> StreamingResponse | LettaResponse:
"""
Handle agent-direct messaging with locking but without conversation features.
This is used when the conversation_id in the URL is actually an agent ID,
providing a unified endpoint while maintaining agent-level locking.
"""
redis_client = await get_redis_client()
# Streaming mode (default)
if request.streaming:
streaming_request = LettaStreamingRequest(
messages=request.messages,
streaming=True,
stream_tokens=request.stream_tokens,
include_pings=request.include_pings,
background=request.background,
max_steps=request.max_steps,
use_assistant_message=request.use_assistant_message,
assistant_message_tool_name=request.assistant_message_tool_name,
assistant_message_tool_kwarg=request.assistant_message_tool_kwarg,
include_return_message_types=request.include_return_message_types,
override_model=request.override_model,
client_tools=request.client_tools,
)
streaming_service = StreamingService(server)
run, result = await streaming_service.create_agent_stream(
agent_id=agent_id,
actor=actor,
request=streaming_request,
run_type="send_message",
conversation_id=None,
should_lock=True,
billing_context=billing_context,
)
return result
# Non-streaming mode with locking
agent = await server.agent_manager.get_agent_by_id_async(
agent_id,
actor,
include_relationships=["memory", "multi_agent_group", "sources", "tool_exec_environment_variables", "tools", "tags"],
)
# Handle model override if specified in the request
if request.override_model:
override_llm_config = await server.get_llm_config_from_handle_async(
actor=actor,
handle=request.override_model,
)
agent = agent.model_copy(update={"llm_config": override_llm_config})
# Acquire lock using agent_id as lock key
if not isinstance(redis_client, NoopAsyncRedisClient):
await redis_client.acquire_conversation_lock(
conversation_id=agent_id,
token=str(uuid4()),
)
try:
# Create a run for execution tracking
run = None
if settings.track_agent_run:
runs_manager = RunManager()
run = await runs_manager.create_run(
pydantic_run=PydanticRun(
agent_id=agent_id,
background=False,
metadata={
"run_type": "send_message",
},
request_config=LettaRequestConfig.from_letta_request(request),
),
actor=actor,
)
# Set run_id in Redis for cancellation support
await redis_client.set(f"{REDIS_RUN_ID_PREFIX}:{agent_id}", run.id if run else None)
agent_loop = AgentLoop.load(agent_state=agent, actor=actor)
return await agent_loop.step(
request.messages,
max_steps=request.max_steps,
run_id=run.id if run else None,
use_assistant_message=request.use_assistant_message,
include_return_message_types=request.include_return_message_types,
client_tools=request.client_tools,
conversation_id=None,
include_compaction_messages=request.include_compaction_messages,
billing_context=billing_context,
)
finally:
# Release lock
await redis_client.release_conversation_lock(agent_id)
@router.post( @router.post(
"/{conversation_id}/messages", "/{conversation_id}/messages",
response_model=LettaResponse, response_model=LettaResponse,
@@ -201,7 +334,7 @@ async def list_conversation_messages(
}, },
) )
async def send_conversation_message( async def send_conversation_message(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
request: ConversationMessageRequest = Body(...), request: ConversationMessageRequest = Body(...),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
@@ -212,12 +345,36 @@ async def send_conversation_message(
This endpoint sends a message to an existing conversation. This endpoint sends a message to an existing conversation.
By default (streaming=true), returns a streaming response (Server-Sent Events). By default (streaming=true), returns a streaming response (Server-Sent Events).
Set streaming=false to get a complete JSON response. Set streaming=false to get a complete JSON response.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to send messages to the agent's default conversation with locking.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
if not request.messages or len(request.messages) == 0: if not request.messages or len(request.messages) == 0:
raise HTTPException(status_code=422, detail="Messages must not be empty") raise HTTPException(status_code=422, detail="Messages must not be empty")
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request.agent_id:
resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
resolved_agent_id = conversation_id
if resolved_agent_id:
# Agent-direct mode: use agent ID, enable locking, skip conversation features
return await _send_agent_direct_message(
agent_id=resolved_agent_id,
request=request,
server=server,
actor=actor,
billing_context=headers.billing_context,
)
# Normal conversation mode
conversation = await conversation_manager.get_conversation_by_id( conversation = await conversation_manager.get_conversation_by_id(
conversation_id=conversation_id, conversation_id=conversation_id,
actor=actor, actor=actor,
@@ -247,6 +404,7 @@ async def send_conversation_message(
request=streaming_request, request=streaming_request,
run_type="send_conversation_message", run_type="send_conversation_message",
conversation_id=conversation_id, conversation_id=conversation_id,
billing_context=headers.billing_context,
) )
return result return result
@@ -265,6 +423,10 @@ async def send_conversation_message(
) )
if conversation.model_settings is not None: if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params() update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params) conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config}) agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -305,6 +467,7 @@ async def send_conversation_message(
client_tools=request.client_tools, client_tools=request.client_tools,
conversation_id=conversation_id, conversation_id=conversation_id,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=headers.billing_context,
) )
@@ -341,7 +504,7 @@ async def send_conversation_message(
}, },
) )
async def retrieve_conversation_stream( async def retrieve_conversation_stream(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
request: RetrieveStreamRequest = Body(None), request: RetrieveStreamRequest = Body(None),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
@@ -351,18 +514,42 @@ async def retrieve_conversation_stream(
This endpoint allows you to reconnect to an active background stream This endpoint allows you to reconnect to an active background stream
for a conversation, enabling recovery from network interruptions. for a conversation, enabling recovery from network interruptions.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to retrieve the stream for the agent's most recent active run.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
runs_manager = RunManager() runs_manager = RunManager()
# Find the most recent active run for this conversation
active_runs = await runs_manager.list_runs(
    actor=actor,
    conversation_id=conversation_id,
    statuses=[RunStatus.created, RunStatus.running],
    limit=1,
    ascending=False,
)

# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
    resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
    resolved_agent_id = conversation_id

# Find the most recent active run
if resolved_agent_id:
    # Agent-direct mode: find runs by agent_id
    active_runs = await runs_manager.list_runs(
        actor=actor,
        agent_id=resolved_agent_id,
        statuses=[RunStatus.created, RunStatus.running],
        limit=1,
        ascending=False,
    )
else:
    # Normal mode: find runs by conversation_id
    active_runs = await runs_manager.list_runs(
        actor=actor,
        conversation_id=conversation_id,
        statuses=[RunStatus.created, RunStatus.running],
        limit=1,
        ascending=False,
    )
if not active_runs: if not active_runs:
raise LettaInvalidArgumentError("No active runs found for this conversation.") raise LettaInvalidArgumentError("No active runs found for this conversation.")
@@ -417,7 +604,8 @@ async def retrieve_conversation_stream(
@router.post("/{conversation_id}/cancel", operation_id="cancel_conversation") @router.post("/{conversation_id}/cancel", operation_id="cancel_conversation")
async def cancel_conversation( async def cancel_conversation(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
) -> dict: ) -> dict:
@@ -425,26 +613,58 @@ async def cancel_conversation(
Cancel runs associated with a conversation. Cancel runs associated with a conversation.
Note: To cancel active runs, Redis is required. Note: To cancel active runs, Redis is required.
**Agent-direct mode**: Pass conversation_id="default" with agent_id query parameter
to cancel runs for the agent's default conversation.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
logger.info(
"[Interrupt] Cancel request received for conversation=%s by actor=%s (org=%s)",
conversation_id,
actor.id,
actor.organization_id,
)
if not settings.track_agent_run: if not settings.track_agent_run:
raise HTTPException(status_code=400, detail="Agent run tracking is disabled") raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
# Verify conversation exists and get agent_id
conversation = await conversation_manager.get_conversation_by_id(
    conversation_id=conversation_id,
    actor=actor,
)

# Find active runs for this conversation
runs = await server.run_manager.list_runs(
    actor=actor,
    statuses=[RunStatus.created, RunStatus.running],
    ascending=False,
    conversation_id=conversation_id,
    limit=100,
)

# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and agent_id:
    resolved_agent_id = agent_id
elif conversation_id.startswith("agent-"):
    resolved_agent_id = conversation_id

if resolved_agent_id:
    # Agent-direct mode: use agent_id directly, skip conversation lookup
    # Find active runs for this agent (default conversation has conversation_id=None)
    runs = await server.run_manager.list_runs(
        actor=actor,
        agent_id=resolved_agent_id,
        statuses=[RunStatus.created, RunStatus.running],
        ascending=False,
        limit=100,
    )
else:
    # Verify conversation exists and get agent_id
    conversation = await conversation_manager.get_conversation_by_id(
        conversation_id=conversation_id,
        actor=actor,
    )
    agent_id = conversation.agent_id

    # Find active runs for this conversation
    runs = await server.run_manager.list_runs(
        actor=actor,
        statuses=[RunStatus.created, RunStatus.running],
        ascending=False,
        conversation_id=conversation_id,
        limit=100,
    )
run_ids = [run.id for run in runs] run_ids = [run.id for run in runs]
if not run_ids: if not run_ids:
@@ -461,7 +681,7 @@ async def cancel_conversation(
except Exception as e: except Exception as e:
logger.error(f"Failed to cancel Lettuce run {run_id}: {e}") logger.error(f"Failed to cancel Lettuce run {run_id}: {e}")
await server.run_manager.cancel_run(actor=actor, agent_id=conversation.agent_id, run_id=run_id) await server.run_manager.cancel_run(actor=actor, agent_id=agent_id, run_id=run_id)
except Exception as e: except Exception as e:
results[run_id] = "failed" results[run_id] = "failed"
logger.error(f"Failed to cancel run {run_id}: {str(e)}") logger.error(f"Failed to cancel run {run_id}: {str(e)}")
@@ -473,6 +693,10 @@ async def cancel_conversation(
class CompactionRequest(BaseModel): class CompactionRequest(BaseModel):
agent_id: Optional[str] = Field(
default=None,
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
)
compaction_settings: Optional[CompactionSettings] = Field( compaction_settings: Optional[CompactionSettings] = Field(
default=None, default=None,
description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.", description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.",
@@ -487,7 +711,7 @@ class CompactionResponse(BaseModel):
@router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation") @router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation")
async def compact_conversation( async def compact_conversation(
conversation_id: ConversationId, conversation_id: ConversationIdOrDefault,
request: Optional[CompactionRequest] = Body(default=None), request: Optional[CompactionRequest] = Body(default=None),
server: SyncServer = Depends(get_letta_server), server: SyncServer = Depends(get_letta_server),
headers: HeaderParams = Depends(get_headers), headers: HeaderParams = Depends(get_headers),
@@ -497,23 +721,45 @@ async def compact_conversation(
This endpoint summarizes the in-context messages for a specific conversation, This endpoint summarizes the in-context messages for a specific conversation,
reducing the message count while preserving important context. reducing the message count while preserving important context.
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
to compact the agent's default conversation messages.
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
""" """
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id) actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
# Get the conversation to find the agent_id
conversation = await conversation_manager.get_conversation_by_id(
    conversation_id=conversation_id,
    actor=actor,
)

# Get the agent state
agent = await server.agent_manager.get_agent_by_id_async(conversation.agent_id, actor, include_relationships=["multi_agent_group"])

# Get in-context messages for this conversation
in_context_messages = await conversation_manager.get_messages_for_conversation(
    conversation_id=conversation_id,
    actor=actor,
)

# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
# OR conversation_id="agent-*" (backwards compat, deprecated)
resolved_agent_id = None
if conversation_id == "default" and request and request.agent_id:
    resolved_agent_id = request.agent_id
elif conversation_id.startswith("agent-"):
    resolved_agent_id = conversation_id

if resolved_agent_id:
    # Agent-direct mode: compact agent's default conversation
    agent = await server.agent_manager.get_agent_by_id_async(resolved_agent_id, actor, include_relationships=["multi_agent_group"])
    in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
    agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
else:
    # Get the conversation to find the agent_id
    conversation = await conversation_manager.get_conversation_by_id(
        conversation_id=conversation_id,
        actor=actor,
    )

    # Get the agent state
    agent = await server.agent_manager.get_agent_by_id_async(conversation.agent_id, actor, include_relationships=["multi_agent_group"])

    # Get in-context messages for this conversation
    in_context_messages = await conversation_manager.get_messages_for_conversation(
        conversation_id=conversation_id,
        actor=actor,
    )

    # Create agent loop with conversation context
    agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
if not in_context_messages: if not in_context_messages:
raise HTTPException( raise HTTPException(
@@ -521,10 +767,27 @@ async def compact_conversation(
detail="No in-context messages found for this conversation.", detail="No in-context messages found for this conversation.",
) )
# Create agent loop with conversation context
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)

compaction_settings = request.compaction_settings if request else None

# Merge request compaction_settings with agent's settings (request overrides agent)
if agent.compaction_settings and request and request.compaction_settings:
    # Start with agent's settings, override with new values from request
    # Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
    compaction_settings = agent.compaction_settings.copy()  # do not mutate original agent compaction settings
    changed_fields = request.compaction_settings.model_fields_set
    for field in changed_fields:
        setattr(compaction_settings, field, getattr(request.compaction_settings, field))

    # If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
    # Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
    if (
        "mode" in changed_fields
        and "prompt" not in changed_fields
        and agent.compaction_settings.mode != request.compaction_settings.mode
    ):
        from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode

        compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
else:
    compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
num_messages_before = len(in_context_messages) num_messages_before = len(in_context_messages)
# Run compaction # Run compaction
@@ -537,13 +800,11 @@ async def compact_conversation(
# Validate compaction reduced messages # Validate compaction reduced messages
if num_messages_before <= num_messages_after: if num_messages_before <= num_messages_after:
logger.warning(
    f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
)
# raise HTTPException(
#     status_code=status.HTTP_400_BAD_REQUEST,
#     detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
# )

logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
raise HTTPException(
    status_code=status.HTTP_400_BAD_REQUEST,
    detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
)
# Checkpoint the messages (this will update the conversation_messages table) # Checkpoint the messages (this will update the conversation_messages table)
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages) await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
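A sketch of the new agent-direct mode described above: the URL uses conversation_id="default" and the agent is identified in the request body (agent id, base URL, port, and the /v1 prefix are assumptions for illustration):

import httpx

resp = httpx.post(
    "http://localhost:8283/v1/conversations/default/messages",  # base URL/port assumed
    json={
        "agent_id": "agent-123",  # invented id; required for agent-direct mode
        "streaming": False,       # request a complete JSON response instead of SSE
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.json())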


@@ -29,11 +29,23 @@ from starlette.background import BackgroundTask
from letta.log import get_logger from letta.log import get_logger
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
logger = get_logger(__name__) logger = get_logger(__name__)
_background_tasks: set[asyncio.Task] = set() _background_tasks: set[asyncio.Task] = set()
def _is_syncable_block_markdown_path(path: str) -> bool:
"""Return whether a markdown path should be mirrored into block cache.
Special-case skills so only skill definitions are mirrored:
- sync `skills/{skill_name}/SKILL.md` as label `skills/{skill_name}`
- ignore all other markdown under `skills/`
"""
return memory_block_label_from_markdown_path(path) is not None
router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False) router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False)
# Global storage for the server instance (set during app startup) # Global storage for the server instance (set during app startup)
@@ -100,7 +112,7 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
expected_labels = set() expected_labels = set()
from letta.services.memory_repo.block_markdown import parse_block_markdown from letta.services.memory_repo.block_markdown import parse_block_markdown
md_file_paths = sorted([file_path for file_path in files if file_path.endswith(".md")]) md_file_paths = sorted([file_path for file_path in files if _is_syncable_block_markdown_path(file_path)])
nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]] nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]]
logger.info( logger.info(
"Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s", "Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s",
@@ -113,10 +125,12 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
synced = 0 synced = 0
for file_path, content in files.items(): for file_path, content in files.items():
if not file_path.endswith(".md"): if not _is_syncable_block_markdown_path(file_path):
continue continue
label = file_path[:-3] label = memory_block_label_from_markdown_path(file_path)
if label is None:
continue
expected_labels.add(label) expected_labels.add(label)
# Parse frontmatter to extract metadata alongside value # Parse frontmatter to extract metadata alongside value
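Expected behaviour of the new markdown-path filter (illustrative mapping only; the exact rules live in memory_block_label_from_markdown_path):

# "persona.md"                  -> synced as block label "persona"
# "skills/web_search/SKILL.md"  -> synced as block label "skills/web_search"
# "skills/web_search/notes.md"  -> not synced (only SKILL.md under skills/ is mirrored)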


@@ -364,6 +364,8 @@ def create_approval_request_message_from_llm_response(
) )
if pre_computed_assistant_message_id: if pre_computed_assistant_message_id:
approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id) approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id)
# Set otid to match streaming interface pattern (index -1 returns id unchanged)
approval_message.otid = Message.generate_otid_from_id(approval_message.id, -1)
messages.append(approval_message) messages.append(approval_message)
return messages return messages


@@ -562,6 +562,10 @@ class SyncServer(object):
# update with model_settings # update with model_settings
if request.model_settings is not None: if request.model_settings is not None:
update_llm_config_params = request.model_settings._to_legacy_config_params() update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.
if "max_output_tokens" not in request.model_settings.model_fields_set:
update_llm_config_params.pop("max_tokens", None)
request.llm_config = request.llm_config.model_copy(update=update_llm_config_params) request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)
# Copy parallel_tool_calls from request to llm_config if provided # Copy parallel_tool_calls from request to llm_config if provided
@@ -675,6 +679,12 @@ class SyncServer(object):
# Get the current agent's llm_config if not already set # Get the current agent's llm_config if not already set
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor) agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config = agent.llm_config.model_copy() request.llm_config = agent.llm_config.model_copy()
else:
# TODO: Refactor update_agent to accept partial llm_config so we
# don't need to fetch the full agent just to preserve max_tokens.
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config.max_tokens = agent.llm_config.max_tokens
update_llm_config_params = request.model_settings._to_legacy_config_params() update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller # Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request. # didn't explicitly provide max_output_tokens in the request.
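The guard above relies on Pydantic's model_fields_set, which only contains fields the caller explicitly passed; a standalone toy model (not the real ModelSettings schema) illustrates the behaviour:

from typing import Optional
from pydantic import BaseModel

class ToyModelSettings(BaseModel):
    max_output_tokens: int = 4096
    temperature: Optional[float] = None

s = ToyModelSettings(temperature=0.2)
print(s.max_output_tokens)                        # 4096 (the Pydantic default)
print("max_output_tokens" in s.model_fields_set)  # False -> default value, so don't clobber the agent's own max_tokens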


@@ -24,8 +24,7 @@ from letta.constants import (
INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES, INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES,
RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE, RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE,
) )
from letta.errors import LettaError
from letta.errors import LettaAgentNotFoundError, LettaError, LettaInvalidArgumentError
from letta.helpers import ToolRulesSolver from letta.helpers import ToolRulesSolver
from letta.helpers.datetime_helpers import get_utc_time from letta.helpers.datetime_helpers import get_utc_time
from letta.log import get_logger from letta.log import get_logger
@@ -789,6 +788,25 @@ class AgentManager:
agent.agent_type, agent.agent_type,
) )
# Upsert compaction_settings: merge incoming partial update with existing settings
if agent_update.compaction_settings is not None:
# If mode changed, update the prompt to the default for the new mode
changed_fields = agent_update.compaction_settings.model_fields_set
if (
agent.compaction_settings is not None
and "mode" in changed_fields
and agent_update.compaction_settings.mode != agent.compaction_settings.mode
):
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
agent_update.compaction_settings.prompt = get_default_prompt_for_mode(agent_update.compaction_settings.mode)
# Fill in unchanged fields from existing settings
if agent.compaction_settings is not None:
for field in agent.compaction_settings.model_fields:
if field not in changed_fields:
setattr(agent_update.compaction_settings, field, getattr(agent.compaction_settings, field))
scalar_updates = { scalar_updates = {
"name": agent_update.name, "name": agent_update.name,
"system": agent_update.system, "system": agent_update.system,

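The compaction_settings upsert above follows a common partial-update pattern: keep fields the caller did not touch, overwrite the ones they did. A toy model (not the real CompactionSettings) shows the mechanics:

from pydantic import BaseModel

class ToySettings(BaseModel):
    mode: str = "sliding_window"
    prompt: str = "default prompt"

existing = ToySettings(mode="sliding_window", prompt="custom prompt")
update = ToySettings(mode="all")  # caller only set "mode"
for field in existing.model_fields:
    if field not in update.model_fields_set:
        setattr(update, field, getattr(existing, field))
print(update)  # mode='all' prompt='custom prompt'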

@@ -7,6 +7,7 @@ if TYPE_CHECKING:
from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select
from letta.errors import LettaInvalidArgumentError from letta.errors import LettaInvalidArgumentError
from letta.helpers.datetime_helpers import get_utc_time
from letta.orm.agent import Agent as AgentModel from letta.orm.agent import Agent as AgentModel
from letta.orm.block import Block as BlockModel from letta.orm.block import Block as BlockModel
from letta.orm.blocks_conversations import BlocksConversations from letta.orm.blocks_conversations import BlocksConversations
@@ -29,6 +30,21 @@ from letta.utils import enforce_types
class ConversationManager: class ConversationManager:
"""Manager class to handle business logic related to Conversations.""" """Manager class to handle business logic related to Conversations."""
@staticmethod
def _serialize_model_settings(model_settings) -> Optional[dict]:
"""Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
Uses model_dump() to preserve all fields (including the provider_type discriminator),
but removes max_output_tokens when it wasn't explicitly provided by the caller so we
don't persist the Pydantic default (4096) and later overwrite the agent's own value.
"""
if model_settings is None:
return None
data = model_settings.model_dump()
if "max_output_tokens" not in model_settings.model_fields_set:
data.pop("max_output_tokens", None)
return data
@enforce_types @enforce_types
@trace_method @trace_method
async def create_conversation( async def create_conversation(
@@ -56,7 +72,7 @@ class ConversationManager:
summary=conversation_create.summary, summary=conversation_create.summary,
organization_id=actor.organization_id, organization_id=actor.organization_id,
model=conversation_create.model, model=conversation_create.model,
model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None, model_settings=self._serialize_model_settings(conversation_create.model_settings),
) )
await conversation.create_async(session, actor=actor) await conversation.create_async(session, actor=actor)
@@ -73,7 +89,101 @@ class ConversationManager:
pydantic_conversation = conversation.to_pydantic() pydantic_conversation = conversation.to_pydantic()
pydantic_conversation.isolated_block_ids = isolated_block_ids pydantic_conversation.isolated_block_ids = isolated_block_ids
return pydantic_conversation
# Compile and persist the initial system message for this conversation
# This ensures the conversation captures the latest memory block state at creation time
await self.compile_and_save_system_message_for_conversation(
conversation_id=pydantic_conversation.id,
agent_id=agent_id,
actor=actor,
)
return pydantic_conversation
@trace_method
async def compile_and_save_system_message_for_conversation(
self,
conversation_id: str,
agent_id: str,
actor: PydanticUser,
agent_state: Optional["AgentState"] = None,
message_manager: Optional[object] = None,
) -> PydanticMessage:
"""Compile and persist the initial system message for a conversation.
This recompiles the system prompt with the latest memory block values
and metadata, ensuring the conversation starts with an up-to-date
system message.
This is the single source of truth for creating a conversation's system
message — used both at conversation creation time and as a fallback
when a conversation has no messages yet.
Args:
conversation_id: The conversation to add the system message to
agent_id: The agent this conversation belongs to
actor: The user performing the action
agent_state: Optional pre-loaded agent state (avoids redundant DB load)
message_manager: Optional pre-loaded MessageManager instance
Returns:
The persisted system message
"""
# Lazy imports to avoid circular dependencies
from letta.prompts.prompt_generator import PromptGenerator
from letta.services.message_manager import MessageManager
from letta.services.passage_manager import PassageManager
if message_manager is None:
message_manager = MessageManager()
if agent_state is None:
from letta.services.agent_manager import AgentManager
agent_state = await AgentManager().get_agent_by_id_async(
agent_id=agent_id,
include_relationships=["memory", "sources"],
actor=actor,
)
passage_manager = PassageManager()
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_id)
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_id)
# Compile the system message with current memory state
system_message_str = await PromptGenerator.compile_system_message_async(
system_prompt=agent_state.system,
in_context_memory=agent_state.memory,
in_context_memory_last_edit=get_utc_time(),
timezone=agent_state.timezone,
user_defined_variables=None,
append_icm_if_missing=True,
previous_message_count=num_messages,
archival_memory_size=num_archival_memories,
sources=agent_state.sources,
max_files_open=agent_state.max_files_open,
)
system_message = PydanticMessage.dict_to_message(
agent_id=agent_id,
model=agent_state.llm_config.model,
openai_message_dict={"role": "system", "content": system_message_str},
)
# Persist the new system message
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
system_message = persisted_messages[0]
# Add it to the conversation tracking at position 0
await self.add_messages_to_conversation(
conversation_id=conversation_id,
agent_id=agent_id,
message_ids=[system_message.id],
actor=actor,
starting_position=0,
)
return system_message
@enforce_types @enforce_types
@trace_method @trace_method
@@ -133,22 +243,15 @@ class ConversationManager:
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
# Subquery to get the latest completed_at for each conversation # Subquery to get the latest completed_at for each conversation
latest_run_subquery = ( latest_run_subquery = (
select( select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
RunModel.conversation_id,
func.max(RunModel.completed_at).label("last_run_completion")
)
.where(RunModel.conversation_id.isnot(None)) .where(RunModel.conversation_id.isnot(None))
.group_by(RunModel.conversation_id) .group_by(RunModel.conversation_id)
.subquery() .subquery()
) )
# Join conversations with the subquery # Join conversations with the subquery
stmt = ( stmt = select(ConversationModel).outerjoin(
select(ConversationModel) latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
.outerjoin(
latest_run_subquery,
ConversationModel.id == latest_run_subquery.c.conversation_id
)
) )
sort_column = latest_run_subquery.c.last_run_completion sort_column = latest_run_subquery.c.last_run_completion
sort_nulls_last = True sort_nulls_last = True
@@ -170,10 +273,12 @@ class ConversationManager:
# Add summary search filter if provided # Add summary search filter if provided
if summary_search: if summary_search:
conditions.extend([ conditions.extend(
ConversationModel.summary.isnot(None), [
ConversationModel.summary.contains(summary_search), ConversationModel.summary.isnot(None),
]) ConversationModel.summary.contains(summary_search),
]
)
stmt = stmt.where(and_(*conditions)) stmt = stmt.where(and_(*conditions))
@@ -182,10 +287,7 @@ class ConversationManager:
# Get the sort value for the cursor conversation # Get the sort value for the cursor conversation
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
cursor_query = ( cursor_query = (
select( select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
ConversationModel.id,
func.max(RunModel.completed_at).label("last_run_completion")
)
.outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id) .outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
.where(ConversationModel.id == after) .where(ConversationModel.id == after)
.group_by(ConversationModel.id) .group_by(ConversationModel.id)
@@ -198,16 +300,11 @@ class ConversationManager:
# Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID # Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
if ascending: if ascending:
stmt = stmt.where( stmt = stmt.where(
or_( or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
and_(sort_column.is_(None), ConversationModel.id > after_id),
sort_column.isnot(None)
)
) )
else: else:
# If descending, get NULLs with smaller ID # If descending, get NULLs with smaller ID
stmt = stmt.where( stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
and_(sort_column.is_(None), ConversationModel.id < after_id)
)
else: else:
# Cursor is at non-NULL # Cursor is at non-NULL
if ascending: if ascending:
@@ -217,8 +314,8 @@ class ConversationManager:
sort_column.isnot(None), sort_column.isnot(None),
or_( or_(
sort_column > after_sort_value, sort_column > after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id > after_id) and_(sort_column == after_sort_value, ConversationModel.id > after_id),
) ),
) )
) )
else: else:
@@ -227,7 +324,7 @@ class ConversationManager:
or_( or_(
sort_column.is_(None), sort_column.is_(None),
sort_column < after_sort_value, sort_column < after_sort_value,
and_(sort_column == after_sort_value, ConversationModel.id < after_id) and_(sort_column == after_sort_value, ConversationModel.id < after_id),
) )
) )
else: else:
@@ -277,7 +374,11 @@ class ConversationManager:
for key, value in update_data.items(): for key, value in update_data.items():
# model_settings needs to be serialized to dict for the JSON column # model_settings needs to be serialized to dict for the JSON column
if key == "model_settings" and value is not None: if key == "model_settings" and value is not None:
setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value) setattr(
conversation,
key,
self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
)
else: else:
setattr(conversation, key, value) setattr(conversation, key, value)

View File

@@ -604,6 +604,9 @@ def _apply_pagination(
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else: else:
sort_column = AgentModel.created_at sort_column = AgentModel.created_at
sort_nulls_last = False sort_nulls_last = False
@@ -637,6 +640,9 @@ async def _apply_pagination_async(
if sort_by == "last_run_completion": if sort_by == "last_run_completion":
sort_column = AgentModel.last_run_completion sort_column = AgentModel.last_run_completion
sort_nulls_last = True # TODO: handle this as a query param eventually sort_nulls_last = True # TODO: handle this as a query param eventually
elif sort_by == "updated_at":
sort_column = AgentModel.updated_at
sort_nulls_last = False
else: else:
sort_column = AgentModel.created_at sort_column = AgentModel.created_at
sort_nulls_last = False sort_nulls_last = False
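For context, a hedged sketch of how a chosen sort column and the sort_nulls_last flag are typically combined into an ORDER BY clause with SQLAlchemy (simplified, not copied from this file):

from sqlalchemy import asc, desc, nulls_last

def build_order_by(sort_column, ascending: bool, sort_nulls_last: bool):
    # Pick the direction first, then optionally push NULL sort values to the end of the results.
    clause = asc(sort_column) if ascending else desc(sort_column)
    return nulls_last(clause) if sort_nulls_last else clause

# e.g. stmt = stmt.order_by(build_order_by(AgentModel.updated_at, ascending=False, sort_nulls_last=False))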

View File

@@ -73,7 +73,6 @@ class LLMTraceWriter:
def __init__(self): def __init__(self):
self._client = None self._client = None
self._shutdown = False self._shutdown = False
self._write_lock = asyncio.Lock() # Serialize writes - clickhouse_connect isn't thread-safe
# Check if ClickHouse is configured - if not, writing is disabled # Check if ClickHouse is configured - if not, writing is disabled
self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password) self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password)
@@ -82,11 +81,7 @@ class LLMTraceWriter:
atexit.register(self._sync_shutdown) atexit.register(self._sync_shutdown)
def _get_client(self): def _get_client(self):
"""Initialize ClickHouse client on first use (lazy loading). """Initialize ClickHouse client on first use (lazy loading)."""
Configures async_insert with wait_for_async_insert=1 for reliable
server-side batching with acknowledgment.
"""
if self._client is not None: if self._client is not None:
return self._client return self._client
@@ -108,8 +103,10 @@ class LLMTraceWriter:
settings={ settings={
# Enable server-side batching # Enable server-side batching
"async_insert": 1, "async_insert": 1,
# Wait for acknowledgment (reliable) # Don't wait for server-side flush acknowledgment — fire and forget.
"wait_for_async_insert": 1, # Waiting (value=1) caused each insert to hold an asyncio.Lock for ~1s,
# creating unbounded task queues that saturated the event loop under load.
"wait_for_async_insert": 0,
# Flush after 1 second if batch not full # Flush after 1 second if batch not full
"async_insert_busy_timeout_ms": 1000, "async_insert_busy_timeout_ms": 1000,
}, },
@@ -148,15 +145,15 @@ class LLMTraceWriter:
row = trace.to_clickhouse_row() row = trace.to_clickhouse_row()
columns = LLMTrace.clickhouse_columns() columns = LLMTrace.clickhouse_columns()
# Serialize writes - clickhouse_connect client isn't thread-safe # Run synchronous insert in thread pool. clickhouse-connect supports
async with self._write_lock: # multithreaded use via a thread-safe connection pool:
# Run synchronous insert in thread pool # https://clickhouse.com/docs/integrations/language-clients/python/advanced-usage#multithreaded-multiprocess-and-asyncevent-driven-use-cases
await asyncio.to_thread( await asyncio.to_thread(
client.insert, client.insert,
"llm_traces", "llm_traces",
[row], [row],
column_names=columns, column_names=columns,
) )
return # Success return # Success
except Exception as e: except Exception as e:
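A minimal sketch of the write path described above, assuming the clickhouse-connect client (host and credential wiring simplified); illustrative only, not the exact LLMTraceWriter code:

import asyncio
import clickhouse_connect  # assumed dependency

def make_trace_client(host: str, password: str):
    return clickhouse_connect.get_client(
        host=host,
        password=password,
        settings={
            "async_insert": 1,                     # let the server batch rows
            "wait_for_async_insert": 0,            # fire and forget: don't block on the server-side flush
            "async_insert_busy_timeout_ms": 1000,  # flush after 1s if the batch isn't full
        },
    )

async def write_trace(client, row: list, columns: list) -> None:
    # clickhouse-connect supports multithreaded use via its connection pool,
    # so a plain thread-pool hop suffices and no asyncio.Lock is needed.
    await asyncio.to_thread(client.insert, "llm_traces", [row], column_names=columns)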

View File

@@ -3,11 +3,11 @@
File format: File format:
--- ---
description: "Who I am and how I approach work" description: "Who I am and how I approach work"
limit: 20000
--- ---
My name is Memo. I'm a stateful coding assistant... My name is Memo. I'm a stateful coding assistant...
- Frontmatter fields are only rendered when they differ from defaults. - Frontmatter fields are only rendered when they differ from defaults.
- ``limit`` is intentionally excluded from frontmatter (deprecated for git-based memory).
- Files without frontmatter are treated as value-only (backward compat). - Files without frontmatter are treated as value-only (backward compat).
""" """
@@ -37,12 +37,12 @@ def serialize_block(
This is used for initial file creation. For updates to existing files, This is used for initial file creation. For updates to existing files,
prefer `merge_frontmatter_with_body` to preserve user formatting. prefer `merge_frontmatter_with_body` to preserve user formatting.
""" """
# description and limit are always included in frontmatter. # description is always included in frontmatter.
# read_only and metadata are only included when non-default. # read_only and metadata are only included when non-default.
# limit is intentionally excluded (deprecated for git-based memory).
front: Dict[str, Any] = {} front: Dict[str, Any] = {}
front["description"] = description front["description"] = description
front["limit"] = limit if limit is not None else _get_field_default("limit")
if read_only != _get_field_default("read_only"): if read_only != _get_field_default("read_only"):
front["read_only"] = read_only front["read_only"] = read_only
@@ -111,7 +111,6 @@ def merge_frontmatter_with_body(
# Desired values # Desired values
desired_description = description desired_description = description
desired_limit = limit if limit is not None else _get_field_default("limit")
desired_read_only = read_only desired_read_only = read_only
desired_metadata = metadata if metadata is not None else _get_field_default("metadata") desired_metadata = metadata if metadata is not None else _get_field_default("metadata")
@@ -122,8 +121,9 @@ def merge_frontmatter_with_body(
parsed["description"] = desired_description parsed["description"] = desired_description
changed = True changed = True
if "limit" not in parsed or parsed.get("limit") != desired_limit: # Remove limit from frontmatter if it exists (deprecated for git-base memory)
parsed["limit"] = desired_limit if "limit" in parsed:
del parsed["limit"]
changed = True changed = True
if desired_read_only != _get_field_default("read_only"): if desired_read_only != _get_field_default("read_only"):
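To make the resulting file shape concrete, a hedged sketch of roughly what a serialized block is expected to look like after this change (PyYAML used for illustration, defaults simplified):

import yaml  # PyYAML, illustration only

def sketch_serialize_block(value: str, description: str, read_only: bool = False) -> str:
    front = {"description": description}
    if read_only:  # only rendered when it differs from the default
        front["read_only"] = True
    # No "limit" key: it is deprecated for git-based memory and never written to frontmatter.
    return f"---\n{yaml.safe_dump(front, sort_keys=False)}---\n{value}\n"

print(sketch_serialize_block("My name is Memo. I'm a stateful coding assistant...", "Who I am and how I approach work"))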

View File

@@ -21,6 +21,7 @@ from letta.schemas.memory_repo import MemoryCommit
from letta.schemas.user import User as PydanticUser from letta.schemas.user import User as PydanticUser
from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block
from letta.services.memory_repo.git_operations import GitOperations from letta.services.memory_repo.git_operations import GitOperations
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
from letta.services.memory_repo.storage.local import LocalStorageBackend from letta.services.memory_repo.storage.local import LocalStorageBackend
from letta.utils import enforce_types from letta.utils import enforce_types
@@ -133,26 +134,29 @@ class MemfsClient:
except FileNotFoundError: except FileNotFoundError:
return [] return []
# Convert block files to PydanticBlock (metadata is in frontmatter) # Convert block files to PydanticBlock (metadata is in frontmatter).
# skills/{skill_name}/SKILL.md is mapped to block label skills/{skill_name};
# other files under skills/ are intentionally ignored.
blocks = [] blocks = []
for file_path, content in files.items(): for file_path, content in files.items():
if file_path.endswith(".md"): label = memory_block_label_from_markdown_path(file_path)
label = file_path[:-3] if label is None:
continue
parsed = parse_block_markdown(content) parsed = parse_block_markdown(content)
synthetic_uuid = uuid.UUID(hashlib.md5(f"{agent_id}:{label}".encode()).hexdigest()) synthetic_uuid = uuid.UUID(hashlib.md5(f"{agent_id}:{label}".encode()).hexdigest())
blocks.append( blocks.append(
PydanticBlock( PydanticBlock(
id=f"block-{synthetic_uuid}", id=f"block-{synthetic_uuid}",
label=label, label=label,
value=parsed["value"], value=parsed["value"],
description=parsed.get("description"), description=parsed.get("description"),
limit=parsed.get("limit", CORE_MEMORY_BLOCK_CHAR_LIMIT), limit=parsed.get("limit", CORE_MEMORY_BLOCK_CHAR_LIMIT),
read_only=parsed.get("read_only", False), read_only=parsed.get("read_only", False),
metadata=parsed.get("metadata", {}), metadata=parsed.get("metadata", {}),
)
) )
)
return blocks return blocks
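One property worth noting in the block-construction loop above: the synthetic IDs are deterministic, so re-reading the repo yields stable block IDs. A small self-contained illustration mirroring the hashing shown above:

import hashlib
import uuid

def synthetic_block_id(agent_id: str, label: str) -> str:
    # The same agent/label pair always maps to the same synthetic UUID.
    return f"block-{uuid.UUID(hashlib.md5(f'{agent_id}:{label}'.encode()).hexdigest())}"

assert synthetic_block_id("agent-1", "persona") == synthetic_block_id("agent-1", "persona")
assert synthetic_block_id("agent-1", "persona") != synthetic_block_id("agent-2", "persona")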

View File

@@ -0,0 +1,29 @@
"""Helpers for mapping memory-repo markdown paths to block labels.
Special handling for skills:
- sync `skills/{skill_name}/SKILL.md` as block label `skills/{skill_name}`
- ignore all other markdown files under `skills/`
"""
from __future__ import annotations
def memory_block_label_from_markdown_path(path: str) -> str | None:
"""Return block label for a syncable markdown path, else None.
Rules:
- Non-`.md` files are ignored.
- `skills/{skill_name}/SKILL.md` -> `skills/{skill_name}`
- Other `skills/**` markdown files are ignored.
- All other markdown files map to `path[:-3]`.
"""
if not path.endswith(".md"):
return None
if path.startswith("skills/"):
parts = path.split("/")
if len(parts) == 3 and parts[0] == "skills" and parts[1] and parts[2] == "SKILL.md":
return f"skills/{parts[1]}"
return None
return path[:-3]
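The rules above are easiest to read off a few concrete calls; the expected results follow directly from the function body:

assert memory_block_label_from_markdown_path("persona.md") == "persona"
assert memory_block_label_from_markdown_path("skills/search/SKILL.md") == "skills/search"
assert memory_block_label_from_markdown_path("skills/search/notes.md") is None  # non-SKILL.md under skills/
assert memory_block_label_from_markdown_path("skills/README.md") is None        # wrong depth under skills/
assert memory_block_label_from_markdown_path("image.png") is None               # not a markdown file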

View File

@@ -141,6 +141,9 @@ class ClickhouseProviderTraceBackend(ProviderTraceBackendClient):
request_json=request_json_str, request_json=request_json_str,
response_json=response_json_str, response_json=response_json_str,
llm_config_json=llm_config_json_str, llm_config_json=llm_config_json_str,
billing_plan_type=provider_trace.billing_context.plan_type if provider_trace.billing_context else None,
billing_cost_source=provider_trace.billing_context.cost_source if provider_trace.billing_context else None,
billing_customer_id=provider_trace.billing_context.customer_id if provider_trace.billing_context else None,
) )
def _extract_usage(self, response_json: dict, provider: str) -> dict: def _extract_usage(self, response_json: dict, provider: str) -> dict:

View File

@@ -29,7 +29,7 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
) -> ProviderTrace: ) -> ProviderTrace:
"""Write full provider trace to provider_traces table.""" """Write full provider trace to provider_traces table."""
async with db_registry.async_session() as session: async with db_registry.async_session() as session:
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump()) provider_trace_model = ProviderTraceModel(**provider_trace.model_dump(exclude={"billing_context"}))
provider_trace_model.organization_id = actor.organization_id provider_trace_model.organization_id = actor.organization_id
if provider_trace.request_json: if provider_trace.request_json:

View File

@@ -638,7 +638,13 @@ class RunManager:
raise NoResultFound(f"Run with id {run_id} not found") raise NoResultFound(f"Run with id {run_id} not found")
agent_id = run.agent_id agent_id = run.agent_id
logger.debug(f"Cancelling run {run_id} for agent {agent_id}") logger.info(
"[Interrupt] Processing cancellation for run=%s, agent=%s, current_status=%s, current_stop_reason=%s",
run_id,
agent_id,
run.status if run else "unknown",
run.stop_reason if run else "unknown",
)
# Cancellation should be idempotent: if a run is already terminated, treat this as a no-op. # Cancellation should be idempotent: if a run is already terminated, treat this as a no-op.
# This commonly happens when a run finishes between client request and server handling. # This commonly happens when a run finishes between client request and server handling.

View File

@@ -15,6 +15,7 @@ from letta.errors import (
LettaInvalidArgumentError, LettaInvalidArgumentError,
LettaServiceUnavailableError, LettaServiceUnavailableError,
LLMAuthenticationError, LLMAuthenticationError,
LLMEmptyResponseError,
LLMError, LLMError,
LLMRateLimitError, LLMRateLimitError,
LLMTimeoutError, LLMTimeoutError,
@@ -33,6 +34,7 @@ from letta.schemas.letta_request import ClientToolSchema, LettaStreamingRequest
from letta.schemas.letta_response import LettaResponse from letta.schemas.letta_response import LettaResponse
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
from letta.schemas.message import MessageCreate from letta.schemas.message import MessageCreate
from letta.schemas.provider_trace import BillingContext
from letta.schemas.run import Run as PydanticRun, RunUpdate from letta.schemas.run import Run as PydanticRun, RunUpdate
from letta.schemas.usage import LettaUsageStatistics from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User from letta.schemas.user import User
@@ -76,6 +78,8 @@ class StreamingService:
request: LettaStreamingRequest, request: LettaStreamingRequest,
run_type: str = "streaming", run_type: str = "streaming",
conversation_id: Optional[str] = None, conversation_id: Optional[str] = None,
should_lock: bool = False,
billing_context: "BillingContext | None" = None,
) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]: ) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]:
""" """
Create a streaming response for an agent. Create a streaming response for an agent.
@@ -86,6 +90,7 @@ class StreamingService:
request: The LettaStreamingRequest containing all request parameters request: The LettaStreamingRequest containing all request parameters
run_type: Type of run for tracking run_type: Type of run for tracking
conversation_id: Optional conversation ID for conversation-scoped messaging conversation_id: Optional conversation ID for conversation-scoped messaging
should_lock: If True and conversation_id is None, use agent_id as lock key
Returns: Returns:
Tuple of (run object or None, streaming response) Tuple of (run object or None, streaming response)
@@ -116,6 +121,10 @@ class StreamingService:
) )
if conversation.model_settings is not None: if conversation.model_settings is not None:
update_params = conversation.model_settings._to_legacy_config_params() update_params = conversation.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
conversation_llm_config = conversation_llm_config.model_copy(update=update_params) conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
agent = agent.model_copy(update={"llm_config": conversation_llm_config}) agent = agent.model_copy(update={"llm_config": conversation_llm_config})
@@ -130,12 +139,15 @@ class StreamingService:
model_compatible_token_streaming = self._is_token_streaming_compatible(agent) model_compatible_token_streaming = self._is_token_streaming_compatible(agent)
# Attempt to acquire conversation lock if conversation_id is provided # Determine lock key: use conversation_id if provided, else agent_id if should_lock
# This prevents concurrent message processing for the same conversation lock_key = conversation_id if conversation_id else (agent_id if should_lock else None)
# Attempt to acquire lock if lock_key is set
# This prevents concurrent message processing for the same conversation/agent
# Skip locking if Redis is not available (graceful degradation) # Skip locking if Redis is not available (graceful degradation)
if conversation_id and not isinstance(redis_client, NoopAsyncRedisClient): if lock_key and not isinstance(redis_client, NoopAsyncRedisClient):
await redis_client.acquire_conversation_lock( await redis_client.acquire_conversation_lock(
conversation_id=conversation_id, conversation_id=lock_key,
token=str(uuid4()), token=str(uuid4()),
) )
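The lock-key selection above reduces to a small decision table. A sketch in pure logic, mirroring the one-liner in the diff:

def resolve_lock_key(conversation_id, agent_id, should_lock):
    # conversation_id set             -> lock the conversation
    # no conversation_id, should_lock -> lock the agent (agent-direct mode)
    # neither                         -> no lock (Redis locking is skipped entirely)
    return conversation_id if conversation_id else (agent_id if should_lock else None)

assert resolve_lock_key("conversation-123", "agent-456", False) == "conversation-123"
assert resolve_lock_key(None, "agent-456", True) == "agent-456"
assert resolve_lock_key(None, "agent-456", False) is None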
@@ -163,8 +175,10 @@ class StreamingService:
include_return_message_types=request.include_return_message_types, include_return_message_types=request.include_return_message_types,
actor=actor, actor=actor,
conversation_id=conversation_id, conversation_id=conversation_id,
lock_key=lock_key, # For lock release (may differ from conversation_id)
client_tools=request.client_tools, client_tools=request.client_tools,
include_compaction_messages=request.include_compaction_messages, include_compaction_messages=request.include_compaction_messages,
billing_context=billing_context,
) )
# handle background streaming if requested # handle background streaming if requested
@@ -195,7 +209,7 @@ class StreamingService:
run_id=run.id, run_id=run.id,
run_manager=self.server.run_manager, run_manager=self.server.run_manager,
actor=actor, actor=actor,
conversation_id=conversation_id, conversation_id=lock_key, # Use lock_key for lock release
), ),
label=f"background_stream_processor_{run.id}", label=f"background_stream_processor_{run.id}",
) )
@@ -251,7 +265,7 @@ class StreamingService:
if settings.track_agent_run and run and run_status: if settings.track_agent_run and run and run_status:
await self.server.run_manager.update_run_by_id_async( await self.server.run_manager.update_run_by_id_async(
run_id=run.id, run_id=run.id,
conversation_id=conversation_id, conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, metadata=run_update_metadata), update=RunUpdate(status=run_status, metadata=run_update_metadata),
actor=actor, actor=actor,
) )
@@ -326,8 +340,10 @@ class StreamingService:
include_return_message_types: Optional[list[MessageType]], include_return_message_types: Optional[list[MessageType]],
actor: User, actor: User,
conversation_id: Optional[str] = None, conversation_id: Optional[str] = None,
lock_key: Optional[str] = None,
client_tools: Optional[list[ClientToolSchema]] = None, client_tools: Optional[list[ClientToolSchema]] = None,
include_compaction_messages: bool = False, include_compaction_messages: bool = False,
billing_context: BillingContext | None = None,
) -> AsyncIterator: ) -> AsyncIterator:
""" """
Create a stream with unified error handling. Create a stream with unified error handling.
@@ -356,6 +372,7 @@ class StreamingService:
conversation_id=conversation_id, conversation_id=conversation_id,
client_tools=client_tools, client_tools=client_tools,
include_compaction_messages=include_compaction_messages, include_compaction_messages=include_compaction_messages,
billing_context=billing_context,
) )
async for chunk in stream: async for chunk in stream:
@@ -442,6 +459,21 @@ class StreamingService:
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n" yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream # Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"
except LLMEmptyResponseError as e:
run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response)
error_message = LettaErrorMessage(
run_id=run_id,
error_type="llm_empty_response",
message="LLM returned an empty response.",
detail=str(e),
)
error_data = {"error": error_message.model_dump()}
logger.warning(f"Run {run_id} stopped with LLM empty response: {e}, error_data: {error_message.model_dump()}")
yield f"data: {stop_reason.model_dump_json()}\n\n"
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
# Send [DONE] marker to properly close the stream
yield "data: [DONE]\n\n"
except LLMError as e: except LLMError as e:
run_status = RunStatus.failed run_status = RunStatus.failed
stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error) stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error)
@@ -491,7 +523,7 @@ class StreamingService:
stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value
await self.runs_manager.update_run_by_id_async( await self.runs_manager.update_run_by_id_async(
run_id=run_id, run_id=run_id,
conversation_id=conversation_id, conversation_id=lock_key, # Use lock_key for lock release
update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data), update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data),
actor=actor, actor=actor,
) )

View File

@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
# them just like server.create_agent_async does for agents. # them just like server.create_agent_async does for agents.
if summarizer_config.model_settings is not None: if summarizer_config.model_settings is not None:
update_params = summarizer_config.model_settings._to_legacy_config_params() update_params = summarizer_config.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens.
if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
update_params.pop("max_tokens", None)
return base.model_copy(update=update_params) return base.model_copy(update=update_params)
return base return base
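A short sketch of why the pop matters before model_copy: Pydantic's model_copy(update=...) overwrites every key present in the update dict, so an un-requested default would clobber the base config (FakeLLMConfig is a hypothetical stand-in):

from pydantic import BaseModel

class FakeLLMConfig(BaseModel):  # hypothetical stand-in for the real LLMConfig
    model: str = "some-model"
    max_tokens: int = 8192

base = FakeLLMConfig(max_tokens=8192)
update_params = {"max_tokens": 4096}   # a Pydantic default the caller never asked for
update_params.pop("max_tokens", None)  # dropped when max_output_tokens wasn't explicitly set
assert base.model_copy(update=update_params).max_tokens == 8192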

View File

@@ -196,7 +196,7 @@ async def self_summarize_sliding_window(
return message.tool_calls is not None and len(message.tool_calls) > 0 return message.tool_calls is not None and len(message.tool_calls) > 0
return False return False
post_summarization_buffer = [system_prompt] post_summarization_buffer = []
while approx_token_count >= goal_tokens and eviction_percentage < 1.0: while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
# more eviction percentage # more eviction percentage
eviction_percentage += 0.10 eviction_percentage += 0.10
@@ -217,8 +217,8 @@ async def self_summarize_sliding_window(
# update token count # update token count
logger.info(f"Attempting to compact messages to index {assistant_message_index} messages") logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]] post_summarization_buffer = list(messages[assistant_message_index:])
approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer) approx_token_count = await count_tokens(actor, agent_llm_config, [system_prompt, *post_summarization_buffer])
logger.info( logger.info(
f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}" f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
) )
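A simplified sketch of the accounting after this change: the system prompt counts against the token budget but is never stored in the post-summarization buffer (the real code cuts at an assistant-message boundary; a plain index is used here for brevity):

async def shrink_to_goal(messages, system_prompt, goal_tokens, count_tokens):
    eviction_percentage, buffer = 0.0, list(messages)
    approx = await count_tokens([system_prompt, *buffer])
    while approx >= goal_tokens and eviction_percentage < 1.0:
        eviction_percentage += 0.10
        cut = int(len(messages) * eviction_percentage)  # simplified cut point
        buffer = list(messages[cut:])
        approx = await count_tokens([system_prompt, *buffer])
    return buffer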

View File

@@ -11,7 +11,7 @@ from letta.settings import summarizer_settings
def get_default_summarizer_model(provider_type: ProviderType) -> str | None: def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
"""Get default model for summarization for given provider type.""" """Get default model for summarization for given provider type."""
summarizer_defaults = { summarizer_defaults = {
ProviderType.anthropic: "anthropic/claude-haiku-4-5-20251001", ProviderType.anthropic: "anthropic/claude-haiku-4-5",
ProviderType.openai: "openai/gpt-5-mini", ProviderType.openai: "openai/gpt-5-mini",
ProviderType.google_ai: "google_ai/gemini-2.5-flash", ProviderType.google_ai: "google_ai/gemini-2.5-flash",
} }

View File

@@ -114,7 +114,7 @@ class SummarizerSettings(BaseSettings):
class ModelSettings(BaseSettings): class ModelSettings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="ignore") model_config = SettingsConfigDict(env_file=".env", extra="ignore")
global_max_context_window_limit: int = 32000 global_max_context_window_limit: int = 128000
inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.") inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.")
@@ -204,6 +204,7 @@ class ModelSettings(BaseSettings):
gemini_base_url: str = "https://generativelanguage.googleapis.com/" gemini_base_url: str = "https://generativelanguage.googleapis.com/"
gemini_force_minimum_thinking_budget: bool = False gemini_force_minimum_thinking_budget: bool = False
gemini_max_retries: int = 5 gemini_max_retries: int = 5
gemini_timeout_seconds: float = 600.0
# google vertex # google vertex
google_cloud_project: Optional[str] = None google_cloud_project: Optional[str] = None

View File

@@ -45,30 +45,36 @@ PATH_VALIDATORS = {primitive_type.value: _create_path_validator_factory(primitiv
def _create_conversation_id_or_default_path_validator_factory(): def _create_conversation_id_or_default_path_validator_factory():
"""Conversation IDs accept the usual primitive format or the special value 'default'.""" """Conversation IDs with support for 'default' and agent IDs (backwards compatibility)."""
primitive = PrimitiveType.CONVERSATION.value conversation_primitive = PrimitiveType.CONVERSATION.value
prefix_pattern = PRIMITIVE_ID_PATTERNS[primitive].pattern agent_primitive = PrimitiveType.AGENT.value
# Make the full regex accept either the primitive ID format or 'default'. conversation_pattern = PRIMITIVE_ID_PATTERNS[conversation_primitive].pattern
# `prefix_pattern` already contains the ^...$ anchors. agent_pattern = PRIMITIVE_ID_PATTERNS[agent_primitive].pattern
conversation_or_default_pattern = f"^(default|{prefix_pattern[1:-1]})$" # Make the full regex accept: conversation ID, agent ID, or 'default'.
# Patterns already contain ^...$ anchors, so strip them for the alternation.
conversation_or_agent_or_default_pattern = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"
def factory(): def factory():
return Path( return Path(
description=(f"The conversation identifier. Either the special value 'default' or an ID in the format '{primitive}-<uuid4>'"), description=(
pattern=conversation_or_default_pattern, f"The conversation identifier. Can be a conversation ID ('{conversation_primitive}-<uuid4>'), "
examples=["default", f"{primitive}-123e4567-e89b-42d3-8456-426614174000"], f"'default' for agent-direct mode (with agent_id parameter), "
f"or an agent ID ('{agent_primitive}-<uuid4>') for backwards compatibility (deprecated)."
),
pattern=conversation_or_agent_or_default_pattern,
examples=[
"default",
f"{conversation_primitive}-123e4567-e89b-42d3-8456-426614174000",
f"{agent_primitive}-123e4567-e89b-42d3-8456-426614174000",
],
min_length=1, min_length=1,
max_length=len(primitive) + 1 + 36, max_length=max(len(conversation_primitive), len(agent_primitive)) + 1 + 36,
) )
return factory return factory
# Override conversation ID path validation to also allow the special value 'default'.
PATH_VALIDATORS[PrimitiveType.CONVERSATION.value] = _create_conversation_id_or_default_path_validator_factory()
# Type aliases for common ID types # Type aliases for common ID types
# These can be used directly in route handler signatures for cleaner code # These can be used directly in route handler signatures for cleaner code
AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()] AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()]
@@ -89,6 +95,10 @@ StepId = Annotated[str, PATH_VALIDATORS[PrimitiveType.STEP.value]()]
IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()] IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()]
ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()] ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()]
# Conversation ID with support for 'default' and agent IDs (for agent-direct mode endpoints)
# Backwards compatible - agent-* will be deprecated in favor of conversation_id='default' + agent_id param
ConversationIdOrDefault = Annotated[str, _create_conversation_id_or_default_path_validator_factory()()]
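For a concrete sense of the combined pattern, a hedged sketch (the 'conversation'/'agent' prefix literals and the UUID sub-pattern are assumptions for illustration; the real patterns come from PRIMITIVE_ID_PATTERNS):

import re

UUID4 = r"[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}"
conversation_pattern = rf"^conversation-{UUID4}$"
agent_pattern = rf"^agent-{UUID4}$"
# Strip the ^...$ anchors before alternating, exactly as the factory above does.
combined = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"

assert re.match(combined, "default")
assert re.match(combined, "conversation-123e4567-e89b-42d3-8456-426614174000")
assert re.match(combined, "agent-123e4567-e89b-42d3-8456-426614174000")
assert not re.match(combined, "run-123e4567-e89b-42d3-8456-426614174000")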
# Infrastructure types # Infrastructure types
McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()] McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()]
McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()] McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()]

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "letta" name = "letta"
version = "0.16.5" version = "0.16.6"
description = "Create LLM agents with long-term memory and custom tools" description = "Create LLM agents with long-term memory and custom tools"
authors = [ authors = [
{name = "Letta Team", email = "contact@letta.com"}, {name = "Letta Team", email = "contact@letta.com"},

View File

@@ -2,6 +2,12 @@ import anthropic
import httpx import httpx
import openai import openai
import pytest import pytest
from anthropic.types.beta import (
BetaMessage,
BetaRawMessageStartEvent,
BetaRawMessageStopEvent,
BetaUsage,
)
from google.genai import errors as google_errors from google.genai import errors as google_errors
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
@@ -9,6 +15,7 @@ from letta.errors import (
ContextWindowExceededError, ContextWindowExceededError,
LLMBadRequestError, LLMBadRequestError,
LLMConnectionError, LLMConnectionError,
LLMEmptyResponseError,
LLMInsufficientCreditsError, LLMInsufficientCreditsError,
LLMServerError, LLMServerError,
) )
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
result = client.handle_llm_error(error) result = client.handle_llm_error(error)
assert isinstance(result, LLMBadRequestError) assert isinstance(result, LLMBadRequestError)
assert not isinstance(result, LLMInsufficientCreditsError) assert not isinstance(result, LLMInsufficientCreditsError)
@pytest.mark.asyncio
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
"""LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.
This tests the case where Opus 4.6 returns a response with:
- BetaRawMessageStartEvent (with usage tokens)
- BetaRawMessageStopEvent (end_turn)
- NO content blocks in between
This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
"""
class FakeAsyncStream:
"""Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""
def __init__(self):
self.events = [
# Message start with some usage info
BetaRawMessageStartEvent(
type="message_start",
message=BetaMessage(
id="msg_test_empty",
type="message",
role="assistant",
content=[], # Empty content
model="claude-opus-4-6",
stop_reason="end_turn",
stop_sequence=None,
usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
),
),
# Message stop immediately after start - no content blocks
BetaRawMessageStopEvent(type="message_stop"),
]
self.index = 0
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc, tb):
return None
def __aiter__(self):
return self
async def __anext__(self):
if self.index >= len(self.events):
raise StopAsyncIteration
event = self.events[self.index]
self.index += 1
return event
async def fake_stream_async(self, request_data: dict, llm_config):
return FakeAsyncStream()
monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)
llm_client = AnthropicClient()
llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)
gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
with pytest.raises(LLMEmptyResponseError):
async for _ in gen:
pass

View File

@@ -0,0 +1,8 @@
{
"context_window": 32000,
"model": "gpt-5.3-codex",
"model_endpoint_type": "openai",
"model_endpoint": "https://api.openai.com/v1",
"model_wrapper": null,
"reasoning_effort": "low"
}

View File

@@ -141,7 +141,7 @@ async def create_test_agent(name, actor, test_id: Optional[str] = None, model="a
model="claude-3-7-sonnet-latest", model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic", model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1", model_endpoint="https://api.anthropic.com/v1",
context_window=32000, context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest", handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True, put_inner_thoughts_in_kwargs=True,
max_tokens=4096, max_tokens=4096,
@@ -193,7 +193,7 @@ async def create_test_batch_item(server, batch_id, agent_id, default_user):
model="claude-3-7-sonnet-latest", model="claude-3-7-sonnet-latest",
model_endpoint_type="anthropic", model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1", model_endpoint="https://api.anthropic.com/v1",
context_window=32000, context_window=128000,
handle="anthropic/claude-3-7-sonnet-latest", handle="anthropic/claude-3-7-sonnet-latest",
put_inner_thoughts_in_kwargs=True, put_inner_thoughts_in_kwargs=True,
max_tokens=4096, max_tokens=4096,

View File

@@ -62,12 +62,14 @@ class TestConversationsSDK:
# Create a conversation # Create a conversation
created = client.conversations.create(agent_id=agent.id) created = client.conversations.create(agent_id=agent.id)
# Retrieve it (should have empty in_context_message_ids initially) # Retrieve it (should have system message from creation)
retrieved = client.conversations.retrieve(conversation_id=created.id) retrieved = client.conversations.retrieve(conversation_id=created.id)
assert retrieved.id == created.id assert retrieved.id == created.id
assert retrieved.agent_id == created.agent_id assert retrieved.agent_id == created.agent_id
assert retrieved.in_context_message_ids == [] # Conversation should have 1 system message immediately after creation
assert len(retrieved.in_context_message_ids) == 1
assert retrieved.in_context_message_ids[0].startswith("message-")
# Send a message to the conversation # Send a message to the conversation
list( list(
@@ -566,6 +568,289 @@ class TestConversationsSDK:
# Should not contain the cursor message # Should not contain the cursor message
assert first_message_id not in [m.id for m in messages_after] assert first_message_id not in [m.id for m in messages_after]
def test_agent_direct_messaging_via_conversations_endpoint(self, client: Letta, agent):
"""Test sending messages using agent ID as conversation_id (agent-direct mode).
This allows clients to use a unified endpoint pattern without managing conversation IDs.
"""
# Send a message using the agent ID directly as conversation_id
# This should route to agent-direct mode with locking
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Using agent ID instead of conversation ID
messages=[{"role": "user", "content": "Hello via agent-direct mode!"}],
)
)
# Verify we got a response
assert len(messages) > 0, "Should receive response messages"
# Verify we got an assistant message in the response
assistant_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_agent_direct_messaging_with_locking(self, client: Letta, agent):
"""Test that agent-direct mode properly acquires and releases locks.
Sequential requests should both succeed if locks are properly released.
"""
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
# Send first message via agent-direct mode
messages1 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "First message"}],
)
)
assert len(messages1) > 0, "First message should succeed"
# Send second message - should succeed if lock was released
messages2 = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Second message"}],
)
)
assert len(messages2) > 0, "Second message should succeed after lock released"
def test_agent_direct_concurrent_requests_blocked(self, client: Letta, agent):
"""Test that concurrent requests to agent-direct mode are properly serialized.
One request should succeed and one should get a 409 CONVERSATION_BUSY error.
"""
import concurrent.futures
from letta_client import ConflictError
from letta.settings import settings
# Skip if Redis is not configured
if settings.redis_host is None or settings.redis_port is None:
pytest.skip("Redis not configured - skipping agent-direct lock test")
results = {"success": 0, "conflict": 0, "other_error": 0}
def send_message(msg: str):
try:
messages = list(
client.conversations.messages.create(
conversation_id=agent.id, # Agent-direct mode
messages=[{"role": "user", "content": msg}],
)
)
return ("success", messages)
except ConflictError:
return ("conflict", None)
except Exception as e:
return ("other_error", str(e))
# Fire off two messages concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
future1 = executor.submit(send_message, "Concurrent message 1")
future2 = executor.submit(send_message, "Concurrent message 2")
result1 = future1.result()
result2 = future2.result()
# Count results
for result_type, _ in [result1, result2]:
results[result_type] += 1
# One should succeed and one should get conflict
assert results["success"] == 1, f"Expected 1 success, got {results['success']}"
assert results["conflict"] == 1, f"Expected 1 conflict, got {results['conflict']}"
assert results["other_error"] == 0, f"Unexpected errors: {results['other_error']}"
# Now send another message - should succeed since lock is released
messages = list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Message after concurrent requests"}],
)
)
assert len(messages) > 0, "Should be able to send message after concurrent requests complete"
def test_agent_direct_list_messages(self, client: Letta, agent):
"""Test listing messages using agent ID as conversation_id."""
# First send a message via agent-direct mode
list(
client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Test message for listing"}],
)
)
# List messages using agent ID
messages_page = client.conversations.messages.list(conversation_id=agent.id)
messages = list(messages_page)
# Should have messages (at least system + user + assistant)
assert len(messages) >= 3, f"Expected at least 3 messages, got {len(messages)}"
# Verify we can find our test message
user_messages = [m for m in messages if hasattr(m, "message_type") and m.message_type == "user_message"]
assert any("Test message for listing" in str(m.content) for m in user_messages), "Should find our test message"
def test_agent_direct_cancel(self, client: Letta, agent):
"""Test canceling runs using agent ID as conversation_id."""
from letta.settings import settings
# Skip if run tracking is disabled
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# Start a background request that we can cancel
try:
# Send a message in background mode
stream = client.conversations.messages.create(
conversation_id=agent.id,
messages=[{"role": "user", "content": "Background message to cancel"}],
background=True,
)
# Consume a bit of the stream to ensure it started
next(iter(stream), None)
# Cancel using agent ID
result = client.conversations.cancel(conversation_id=agent.id)
# Should return results (may be empty if run already completed)
assert isinstance(result, dict), "Cancel should return a dict of results"
except Exception as e:
# If no active runs, that's okay - the run may have completed quickly
if "No active runs" not in str(e):
raise
def test_backwards_compatibility_old_pattern(self, client: Letta, agent, server_url: str):
"""Test that the old pattern (agent_id as conversation_id) still works for backwards compatibility."""
# OLD PATTERN: conversation_id=agent.id (should still work)
# Use raw HTTP requests since SDK might not be up to date
# Test 1: Send message using old pattern
response = requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Testing old pattern still works"}],
"streaming": False,
},
)
assert response.status_code == 200, f"Old pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Test 2: List messages using old pattern
response = requests.get(f"{server_url}/v1/conversations/{agent.id}/messages")
assert response.status_code == 200, f"Old pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
# Verify our message is there
user_messages = [m for m in data if m.get("message_type") == "user_message"]
assert any("Testing old pattern still works" in str(m.get("content", "")) for m in user_messages), "Should find our test message"
def test_new_pattern_send_message(self, client: Letta, agent, server_url: str):
"""Test sending messages using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/messages",
json={
"agent_id": agent.id,
"messages": [{"role": "user", "content": "Testing new pattern send message"}],
"streaming": False,
},
)
assert response.status_code == 200, f"New pattern should work for sending messages: {response.text}"
data = response.json()
assert "messages" in data, "Response should contain messages"
assert len(data["messages"]) > 0, "Should receive response messages"
# Verify we got an assistant message
assistant_messages = [m for m in data["messages"] if m.get("message_type") == "assistant_message"]
assert len(assistant_messages) > 0, "Should receive at least one assistant message"
def test_new_pattern_list_messages(self, client: Letta, agent, server_url: str):
"""Test listing messages using the new pattern: conversation_id='default' + agent_id query param."""
# First send a message to populate the conversation
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": "Setup message for list test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.get(
f"{server_url}/v1/conversations/default/messages",
params={"agent_id": agent.id},
)
assert response.status_code == 200, f"New pattern should work for listing messages: {response.text}"
data = response.json()
# Response is a list of messages directly
assert isinstance(data, list), "Response should be a list of messages"
assert len(data) >= 3, "Should have at least system + user + assistant messages"
def test_new_pattern_cancel(self, client: Letta, agent, server_url: str):
"""Test canceling runs using the new pattern: conversation_id='default' + agent_id query param."""
from letta.settings import settings
if not settings.track_agent_run:
pytest.skip("Run tracking disabled - skipping cancel test")
# NEW PATTERN: conversation_id='default' + agent_id as query param
response = requests.post(
f"{server_url}/v1/conversations/default/cancel",
params={"agent_id": agent.id},
)
# Returns 200 with results if runs exist, or 409 if no active runs
assert response.status_code in [200, 409], f"New pattern should work for cancel: {response.text}"
if response.status_code == 200:
data = response.json()
assert isinstance(data, dict), "Cancel should return a dict"
def test_new_pattern_compact(self, client: Letta, agent, server_url: str):
"""Test compacting conversation using the new pattern: conversation_id='default' + agent_id in body."""
# Send many messages to have enough for compaction
for i in range(10):
requests.post(
f"{server_url}/v1/conversations/{agent.id}/messages",
json={
"messages": [{"role": "user", "content": f"Message {i} for compaction test"}],
"streaming": False,
},
)
# NEW PATTERN: conversation_id='default' + agent_id in request body
response = requests.post(
f"{server_url}/v1/conversations/default/compact",
json={"agent_id": agent.id},
)
# May return 200 (success) or 400 (not enough messages to compact)
assert response.status_code in [200, 400], f"New pattern should accept agent_id parameter: {response.text}"
if response.status_code == 200:
data = response.json()
assert "summary" in data, "Response should contain summary"
assert "num_messages_before" in data, "Response should contain num_messages_before"
assert "num_messages_after" in data, "Response should contain num_messages_after"
def test_new_pattern_stream_retrieve(self, client: Letta, agent, server_url: str):
"""Test retrieving stream using the new pattern: conversation_id='default' + agent_id in body."""
# NEW PATTERN: conversation_id='default' + agent_id in request body
# Note: This will likely return 400 if no active run exists, which is expected
response = requests.post(
f"{server_url}/v1/conversations/default/stream",
json={"agent_id": agent.id},
)
# Both 200 (if a run exists) and 400 (no active run) are acceptable
assert response.status_code in [200, 400], f"Stream retrieve should accept new pattern: {response.text}"
class TestConversationDelete: class TestConversationDelete:
"""Tests for the conversation delete endpoint.""" """Tests for the conversation delete endpoint."""
@@ -834,3 +1119,130 @@ class TestConversationCompact:
) )
assert response.status_code == 404 assert response.status_code == 404
class TestConversationSystemMessageRecompilation:
    """Tests that verify the system message is recompiled with latest memory state on new conversation creation."""

    def test_new_conversation_recompiles_system_message_with_updated_memory(self, client: Letta, server_url: str):
        """Test the full workflow:
        1. Agent is created
        2. Send message to agent (through a conversation)
        3. Modify the memory block -> check system message is NOT updated with the modified value
        4. Create a new conversation
        5. Check new conversation system message DOES have the modified value
        """
        unique_marker = f"UNIQUE_MARKER_{uuid.uuid4().hex[:8]}"

        # Step 1: Create an agent with known memory blocks
        agent = client.agents.create(
            name=f"test_sys_msg_recompile_{uuid.uuid4().hex[:8]}",
            model="openai/gpt-4o-mini",
            embedding="openai/text-embedding-3-small",
            memory_blocks=[
                {"label": "human", "value": "The user is a test user."},
                {"label": "persona", "value": "You are a helpful assistant."},
            ],
        )
        try:
            # Step 2: Create a conversation and send a message to it
            conv1 = client.conversations.create(agent_id=agent.id)
            list(
                client.conversations.messages.create(
                    conversation_id=conv1.id,
                    messages=[{"role": "user", "content": "Hello, just a quick test."}],
                )
            )

            # Verify the conversation has messages including a system message
            conv1_messages = client.conversations.messages.list(
                conversation_id=conv1.id,
                order="asc",
            )
            assert len(conv1_messages) >= 3  # system + user + assistant
            assert conv1_messages[0].message_type == "system_message"

            # Get the original system message content
            original_system_content = conv1_messages[0].content
            assert unique_marker not in original_system_content, "Marker should not be in original system message"

            # Step 3: Modify the memory block with a unique marker
            client.agents.blocks.update(
                agent_id=agent.id,
                block_label="human",
                value=f"The user is a test user. {unique_marker}",
            )

            # Verify the block was actually updated
            updated_block = client.agents.blocks.retrieve(agent_id=agent.id, block_label="human")
            assert unique_marker in updated_block.value

            # Check that the OLD conversation's system message is NOT updated
            conv1_messages_after_update = client.conversations.messages.list(
                conversation_id=conv1.id,
                order="asc",
            )
            old_system_content = conv1_messages_after_update[0].content
            assert unique_marker not in old_system_content, "Old conversation system message should NOT contain the updated memory value"

            # Step 4: Create a new conversation
            conv2 = client.conversations.create(agent_id=agent.id)

            # Step 5: Check the new conversation's system message has the updated value
            # The system message should be compiled at creation time with the latest memory
            conv2_retrieved = client.conversations.retrieve(conversation_id=conv2.id)
            assert len(conv2_retrieved.in_context_message_ids) == 1, (
                f"New conversation should have exactly 1 system message, got {len(conv2_retrieved.in_context_message_ids)}"
            )

            conv2_messages = client.conversations.messages.list(
                conversation_id=conv2.id,
                order="asc",
            )
            assert len(conv2_messages) >= 1
            assert conv2_messages[0].message_type == "system_message"
            new_system_content = conv2_messages[0].content
            assert unique_marker in new_system_content, (
                f"New conversation system message should contain the updated memory value '{unique_marker}', "
                f"but system message content did not include it"
            )
        finally:
            client.agents.delete(agent_id=agent.id)

    def test_conversation_creation_initializes_system_message(self, client: Letta, server_url: str):
        """Test that creating a conversation immediately initializes it with a system message."""
        agent = client.agents.create(
            name=f"test_conv_init_{uuid.uuid4().hex[:8]}",
            model="openai/gpt-4o-mini",
            embedding="openai/text-embedding-3-small",
            memory_blocks=[
                {"label": "human", "value": "Test user for system message init."},
                {"label": "persona", "value": "You are a helpful assistant."},
            ],
        )
        try:
            # Create a conversation (without sending any messages)
            conversation = client.conversations.create(agent_id=agent.id)

            # Verify the conversation has a system message immediately
            retrieved = client.conversations.retrieve(conversation_id=conversation.id)
            assert len(retrieved.in_context_message_ids) == 1, (
                f"Expected 1 system message after conversation creation, got {len(retrieved.in_context_message_ids)}"
            )

            # Verify the system message content contains memory block values
            messages = client.conversations.messages.list(
                conversation_id=conversation.id,
                order="asc",
            )
            assert len(messages) == 1
            assert messages[0].message_type == "system_message"
            assert "Test user for system message init." in messages[0].content
        finally:
            client.agents.delete(agent_id=agent.id)
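
The behavior these tests pin down can be condensed into the following sketch, using the same client calls as above (illustrative only; names, values, and cleanup are simplified from the tests, not an additional test):

# Sketch of the recompilation behavior exercised by the tests above.
agent = client.agents.create(
    model="openai/gpt-4o-mini",
    embedding="openai/text-embedding-3-small",
    memory_blocks=[{"label": "human", "value": "v1"}],
)
conv1 = client.conversations.create(agent_id=agent.id)  # system message compiled with "v1"
client.agents.blocks.update(agent_id=agent.id, block_label="human", value="v2")
conv2 = client.conversations.create(agent_id=agent.id)  # system message compiled with "v2"
# conv1's existing system message is left untouched; only conv2 reflects the updated block.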

View File

@@ -93,7 +93,7 @@ def agent_obj(client: Letta) -> AgentState:
tool_ids=[send_message_to_agent_tool.id], tool_ids=[send_message_to_agent_tool.id],
model="openai/gpt-4o", model="openai/gpt-4o",
embedding="openai/text-embedding-3-small", embedding="openai/text-embedding-3-small",
context_window_limit=32000, context_window_limit=128000,
) )
yield agent_state_instance yield agent_state_instance
@@ -107,7 +107,7 @@ def other_agent_obj(client: Letta) -> AgentState:
include_multi_agent_tools=False, include_multi_agent_tools=False,
model="openai/gpt-4o", model="openai/gpt-4o",
embedding="openai/text-embedding-3-small", embedding="openai/text-embedding-3-small",
context_window_limit=32000, context_window_limit=128000,
) )
yield agent_state_instance yield agent_state_instance

View File

@@ -366,6 +366,8 @@ async def test_compaction_settings_model_uses_separate_llm_config_for_summarizat
async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user): async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user):
"""When no compaction_settings provided for Anthropic agent, default haiku model should be set.""" """When no compaction_settings provided for Anthropic agent, default haiku model should be set."""
from letta.schemas.agent import CreateAgent from letta.schemas.agent import CreateAgent
from letta.schemas.enums import ProviderType
from letta.services.summarizer.summarizer_config import get_default_summarizer_model
await server.init_async(init_with_default_org_and_user=True) await server.init_async(init_with_default_org_and_user=True)
@@ -384,7 +386,7 @@ async def test_create_agent_sets_default_compaction_model_anthropic(server: Sync
# Should have default haiku model set # Should have default haiku model set
assert agent.compaction_settings is not None assert agent.compaction_settings is not None
assert agent.compaction_settings.model == "anthropic/claude-haiku-4-5-20251001" assert agent.compaction_settings.model == get_default_summarizer_model(ProviderType.anthropic)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -808,6 +810,79 @@ async def test_update_agent_compaction_settings(server: SyncServer, comprehensiv
assert updated_agent.compaction_settings.prompt_acknowledgement == False assert updated_agent.compaction_settings.prompt_acknowledgement == False
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings(server: SyncServer, comprehensive_test_agent_fixture, default_user):
    """Test that an agent's compaction_settings can be upserted."""
    from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode

    agent, _ = comprehensive_test_agent_fixture

    # Create new compaction settings
    original_compaction_settings = agent.compaction_settings.model_copy()
    new_compaction_settings = CompactionSettings(
        mode="all",
        prompt_acknowledgement=True,
        clip_chars=3000,
    )

    # Update agent with compaction settings
    update_agent_request = UpdateAgent(
        compaction_settings=new_compaction_settings,
    )
    updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)

    # Verify compaction settings were updated correctly
    assert updated_agent.compaction_settings is not None
    assert updated_agent.compaction_settings.model == original_compaction_settings.model
    assert updated_agent.compaction_settings.model_settings == original_compaction_settings.model_settings
    assert updated_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
    assert updated_agent.compaction_settings.mode == "all"
    assert updated_agent.compaction_settings.clip_chars == 3000
    assert updated_agent.compaction_settings.prompt == get_default_prompt_for_mode("all")
    assert updated_agent.compaction_settings.prompt_acknowledgement == True


@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings_same_mode(server: SyncServer, comprehensive_test_agent_fixture, default_user):
    """Test that if the mode stays the same without a prompt passed in, the prompt is not updated."""
    agent, _ = comprehensive_test_agent_fixture

    update_agent_request = UpdateAgent(
        compaction_settings=CompactionSettings(mode="sliding_window", prompt="This is a fake prompt."),
    )
    updated_agent = await server.agent_manager.update_agent_async(agent.id, update_agent_request, actor=default_user)
    assert updated_agent.compaction_settings is not None
    assert updated_agent.compaction_settings.prompt == "This is a fake prompt."

    # Create new compaction settings
    original_compaction_settings = updated_agent.compaction_settings.model_copy()
    new_compaction_settings = CompactionSettings(
        mode="sliding_window",
        model="openai/gpt-4o-mini",
    )

    # Update agent with compaction settings
    update_agent_request = UpdateAgent(
        compaction_settings=new_compaction_settings,
    )
    final_agent = await server.agent_manager.update_agent_async(updated_agent.id, update_agent_request, actor=default_user)

    # Verify compaction settings were updated correctly
    assert final_agent.compaction_settings is not None
    assert final_agent.compaction_settings.sliding_window_percentage == original_compaction_settings.sliding_window_percentage
    assert final_agent.compaction_settings.prompt == original_compaction_settings.prompt
    assert final_agent.compaction_settings.clip_chars == original_compaction_settings.clip_chars
    assert final_agent.compaction_settings.prompt_acknowledgement == original_compaction_settings.prompt_acknowledgement
    assert final_agent.compaction_settings.mode == "sliding_window"
    assert final_agent.compaction_settings.model == "openai/gpt-4o-mini"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block): async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block):
"""Test that file-related defaults are set based on the model's context window size""" """Test that file-related defaults are set based on the model's context window size"""

View File

@@ -562,7 +562,9 @@ async def test_update_block(server: SyncServer, default_user):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_update_block_limit(server: SyncServer, default_user): async def test_update_block_limit(server: SyncServer, default_user):
block_manager = BlockManager() block_manager = BlockManager()
block = await block_manager.create_or_update_block_async(PydanticBlock(label="persona", value="Original Content"), actor=default_user) block = await block_manager.create_or_update_block_async(
PydanticBlock(label="persona", value="Original Content", limit=20000), actor=default_user
)
limit = len("Updated Content") * 2000 limit = len("Updated Content") * 2000
update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description") update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description")

View File

@@ -355,8 +355,9 @@ async def test_add_messages_to_conversation(
actor=default_user, actor=default_user,
) )
assert len(message_ids) == 1 # create_conversation auto-creates a system message at position 0
assert message_ids[0] == hello_world_message_fixture.id assert len(message_ids) == 2
assert hello_world_message_fixture.id in message_ids
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -385,8 +386,9 @@ async def test_get_messages_for_conversation(
actor=default_user, actor=default_user,
) )
assert len(messages) == 1 # create_conversation auto-creates a system message at position 0
assert messages[0].id == hello_world_message_fixture.id assert len(messages) == 2
assert any(m.id == hello_world_message_fixture.id for m in messages)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -430,7 +432,10 @@ async def test_message_ordering_in_conversation(conversation_manager, server: Sy
actor=default_user, actor=default_user,
) )
assert retrieved_ids == [m.id for m in messages] # create_conversation auto-creates a system message at position 0,
# so the user messages start at index 1
assert len(retrieved_ids) == len(messages) + 1
assert retrieved_ids[1:] == [m.id for m in messages]
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -489,7 +494,7 @@ async def test_update_in_context_messages(conversation_manager, server: SyncServ
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user): async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user):
"""Test getting message IDs from an empty conversation.""" """Test getting message IDs from a newly created conversation (has auto-created system message)."""
# Create a conversation # Create a conversation
conversation = await conversation_manager.create_conversation( conversation = await conversation_manager.create_conversation(
agent_id=sarah_agent.id, agent_id=sarah_agent.id,
@@ -497,13 +502,14 @@ async def test_empty_conversation_message_ids(conversation_manager, server: Sync
actor=default_user, actor=default_user,
) )
# Get message IDs (should be empty) # create_conversation auto-creates a system message at position 0,
# so a newly created conversation has exactly one message
message_ids = await conversation_manager.get_message_ids_for_conversation( message_ids = await conversation_manager.get_message_ids_for_conversation(
conversation_id=conversation.id, conversation_id=conversation.id,
actor=default_user, actor=default_user,
) )
assert message_ids == [] assert len(message_ids) == 1
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -551,9 +557,11 @@ async def test_list_conversation_messages(conversation_manager, server: SyncServ
actor=default_user, actor=default_user,
) )
assert len(letta_messages) == 2 # create_conversation auto-creates a system message, so we get 3 total
assert len(letta_messages) == 3
# Check message types # Check message types
message_types = [m.message_type for m in letta_messages] message_types = [m.message_type for m in letta_messages]
assert "system_message" in message_types
assert "user_message" in message_types assert "user_message" in message_types
assert "assistant_message" in message_types assert "assistant_message" in message_types
@@ -902,9 +910,12 @@ async def test_list_conversation_messages_ascending_order(conversation_manager,
reverse=False, reverse=False,
) )
# First message should be "Message 0" (oldest) # create_conversation auto-creates a system message at position 0,
assert len(letta_messages) == 3 # so we get 4 messages total (system + 3 user messages)
assert "Message 0" in letta_messages[0].content assert len(letta_messages) == 4
# First message is the auto-created system message; "Message 0" is second
assert letta_messages[0].message_type == "system_message"
assert "Message 0" in letta_messages[1].content
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -949,8 +960,9 @@ async def test_list_conversation_messages_descending_order(conversation_manager,
reverse=True, reverse=True,
) )
# First message should be "Message 2" (newest) # create_conversation auto-creates a system message, so 4 total
assert len(letta_messages) == 3 # First message should be "Message 2" (newest) in descending order
assert len(letta_messages) == 4
assert "Message 2" in letta_messages[0].content assert "Message 2" in letta_messages[0].content
@@ -1081,7 +1093,8 @@ async def test_list_conversation_messages_no_group_id_returns_all(conversation_m
actor=default_user, actor=default_user,
) )
assert len(all_messages) == 3 # create_conversation auto-creates a system message, so 4 total
assert len(all_messages) == 4
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -1137,8 +1150,8 @@ async def test_list_conversation_messages_order_with_pagination(conversation_man
# The first messages should be different # The first messages should be different
assert page_asc[0].content != page_desc[0].content assert page_asc[0].content != page_desc[0].content
# In ascending, first should be "Message 0" # In ascending, first is the auto-created system message, second is "Message 0"
assert "Message 0" in page_asc[0].content assert page_asc[0].message_type == "system_message"
# In descending, first should be "Message 4" # In descending, first should be "Message 4"
assert "Message 4" in page_desc[0].content assert "Message 4" in page_desc[0].content

View File

@@ -579,8 +579,11 @@ async def test_server_startup_syncs_base_providers(default_user, default_organiz
yield item yield item
# Mock the Anthropic AsyncAnthropic client # Mock the Anthropic AsyncAnthropic client
# NOTE: list() must be a regular (non-async) method that returns an async iterable,
# because the real Anthropic SDK's models.list() returns an AsyncPage (which has __aiter__)
# directly, and the code uses `async for model in client.models.list()`.
class MockAnthropicModels: class MockAnthropicModels:
async def list(self): def list(self):
return MockAnthropicAsyncPage(mock_anthropic_models["data"]) return MockAnthropicAsyncPage(mock_anthropic_models["data"])
class MockAsyncAnthropic: class MockAsyncAnthropic:
@@ -877,8 +880,10 @@ async def test_server_startup_handles_api_errors_gracefully(default_user, defaul
for item in self._items: for item in self._items:
yield item yield item
# NOTE: The real SDK's models.list() is a regular (non-async) method that
# returns an AsyncPaginator (which is async-iterable).
class MockAnthropicModels: class MockAnthropicModels:
async def list(self): def list(self):
return MockAnthropicAsyncPage(mock_anthropic_data) return MockAnthropicAsyncPage(mock_anthropic_data)
class MockAsyncAnthropic: class MockAsyncAnthropic:
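
The mocking pattern described in the notes above, as a self-contained sketch (class names mirror the test mocks; the model ids and the demo driver are placeholders, not part of the test suite):

import asyncio


class MockAnthropicAsyncPage:
    """Async-iterable page, standing in for the SDK's AsyncPage."""

    def __init__(self, items):
        self._items = items

    async def __aiter__(self):
        for item in self._items:
            yield item


class MockAnthropicModels:
    # Regular (non-async) method that returns an async iterable, so callers can do
    # `async for model in client.models.list()` without awaiting list() itself.
    def list(self):
        return MockAnthropicAsyncPage([{"id": "claude-placeholder-1"}, {"id": "claude-placeholder-2"}])


async def _demo():
    models = MockAnthropicModels()
    async for model in models.list():
        print(model["id"])


asyncio.run(_demo())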

View File

@@ -0,0 +1,11 @@
{
  "handle": "openai/gpt-5.3-chat-latest",
  "model_settings": {
    "provider_type": "openai",
    "max_output_tokens": 4096,
    "parallel_tool_calls": false,
    "reasoning": {
      "reasoning_effort": "minimal"
    }
  }
}
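
A hypothetical use of this handle, mirroring the client.agents.create calls in the tests above (the agent name and memory block are made up for illustration, and a connected `client` fixture is assumed):

# Hypothetical: creating an agent against the new handle defined in this fixture.
agent = client.agents.create(
    name="gpt53_chat_smoke_test",  # illustrative name, not from the fixture
    model="openai/gpt-5.3-chat-latest",
    embedding="openai/text-embedding-3-small",
    memory_blocks=[{"label": "human", "value": "The user is a test user."}],
)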

View File

@@ -1,11 +1,11 @@
from conftest import create_test_module from conftest import create_test_module
from letta_client import UnprocessableEntityError from letta_client import UnprocessableEntityError
from letta.constants import CORE_MEMORY_HUMAN_CHAR_LIMIT, CORE_MEMORY_PERSONA_CHAR_LIMIT from letta.constants import CORE_MEMORY_BLOCK_CHAR_LIMIT
BLOCKS_CREATE_PARAMS = [ BLOCKS_CREATE_PARAMS = [
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_HUMAN_CHAR_LIMIT}, None), ("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_PERSONA_CHAR_LIMIT}, None), ("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
] ]
BLOCKS_UPDATE_PARAMS = [ BLOCKS_UPDATE_PARAMS = [

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -44,7 +44,7 @@
"provider_name": null, "provider_name": null,
"provider_category": null, "provider_category": null,
"model_wrapper": null, "model_wrapper": null,
"context_window": 32000, "context_window": 128000,
"put_inner_thoughts_in_kwargs": false, "put_inner_thoughts_in_kwargs": false,
"handle": "anthropic/claude-3.5-sonnet", "handle": "anthropic/claude-3.5-sonnet",
"temperature": 1.0, "temperature": 1.0,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -56,7 +56,7 @@
"provider_name": "openai", "provider_name": "openai",
"provider_category": "base", "provider_category": "base",
"model_wrapper": null, "model_wrapper": null,
"context_window": 32000, "context_window": 128000,
"put_inner_thoughts_in_kwargs": true, "put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4o-mini", "handle": "openai/gpt-4o-mini",
"temperature": 1.0, "temperature": 1.0,

View File

@@ -55,7 +55,7 @@
"provider_name": "openai", "provider_name": "openai",
"provider_category": "base", "provider_category": "base",
"model_wrapper": null, "model_wrapper": null,
"context_window": 32000, "context_window": 128000,
"put_inner_thoughts_in_kwargs": true, "put_inner_thoughts_in_kwargs": true,
"handle": "openai/gpt-4.1-mini", "handle": "openai/gpt-4.1-mini",
"temperature": 1.0, "temperature": 1.0,

View File

@@ -16,7 +16,7 @@ def llm_config():
model="claude-3-7-sonnet-20250219", model="claude-3-7-sonnet-20250219",
model_endpoint_type="anthropic", model_endpoint_type="anthropic",
model_endpoint="https://api.anthropic.com/v1", model_endpoint="https://api.anthropic.com/v1",
context_window=32000, context_window=128000,
handle="anthropic/claude-sonnet-4-20250514", handle="anthropic/claude-sonnet-4-20250514",
put_inner_thoughts_in_kwargs=False, put_inner_thoughts_in_kwargs=False,
max_tokens=4096, max_tokens=4096,

View File

@@ -52,8 +52,17 @@ class TestLogContextMiddleware:
async def get_files(self, agent_id, org_id, ref): async def get_files(self, agent_id, org_id, ref):
assert ref == "HEAD" assert ref == "HEAD"
return { return {
"system/human.md": "---\ndescription: human\nlimit: 20000\n---\nname: sarah", "system/human.md": "---\ndescription: human\n---\nname: sarah",
"system/persona.md": "---\ndescription: persona\nlimit: 20000\n---\nbe helpful", "system/persona.md": "---\ndescription: persona\n---\nbe helpful",
"skills/research-helper/SKILL.md": (
"---\n"
"name: research-helper\n"
"description: Search the web and summarize findings.\n"
"---\n"
"# Research Helper\n\n"
"Use this skill to do deep web research and summarize results.\n"
),
"skills/research-helper/references/details.md": "---\ndescription: nested\n---\nShould not be synced",
} }
class DummyMemoryRepoManager: class DummyMemoryRepoManager:
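
For readability, the skills/research-helper/SKILL.md fixture in the get_files stub above expands to a file like this (escaped newlines rendered):

---
name: research-helper
description: Search the web and summarize findings.
---
# Research Helper

Use this skill to do deep web research and summarize results.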
@@ -95,6 +104,12 @@ class TestLogContextMiddleware:
labels = {call["label"] for call in synced_calls} labels = {call["label"] for call in synced_calls}
assert "system/human" in labels assert "system/human" in labels
assert "system/persona" in labels assert "system/persona" in labels
assert "skills/research-helper" in labels
assert "skills/research-helper/references/details" not in labels
by_label = {call["label"]: call for call in synced_calls}
assert by_label["skills/research-helper"]["description"] == "Search the web and summarize findings."
assert by_label["skills/research-helper"]["value"].startswith("# Research Helper")
def test_extracts_actor_id_from_headers(self, client): def test_extracts_actor_id_from_headers(self, client):
response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"}) response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"})

View File

@@ -25,9 +25,9 @@ def test_chat_memory_init_and_utils(chat_memory: Memory):
def test_memory_limit_validation(chat_memory: Memory): def test_memory_limit_validation(chat_memory: Memory):
with pytest.raises(ValueError): with pytest.raises(ValueError):
ChatMemory(persona="x " * 50000, human="y " * 50000) ChatMemory(persona="x " * 60000, human="y " * 60000)
with pytest.raises(ValueError): with pytest.raises(ValueError):
chat_memory.get_block("persona").value = "x " * 50000 chat_memory.get_block("persona").value = "x " * 60000
def test_get_block_not_found(chat_memory: Memory): def test_get_block_not_found(chat_memory: Memory):
@@ -253,3 +253,104 @@ def test_compile_git_memory_filesystem_handles_leaf_directory_collisions():
assert "system/" in out assert "system/" in out
assert "system.md" in out assert "system.md" in out
assert "human.md" in out assert "human.md" in out
def test_compile_git_memory_filesystem_renders_descriptions_for_non_system_files():
    """Files outside system/ should render their description in the filesystem tree.
    e.g. `reference/api.md (Contains API specifications)`
    System files should NOT render descriptions in the tree.
    """
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100, description="The human block"),
            Block(label="system/persona", value="persona data", limit=100, description="The persona block"),
            Block(label="reference/api", value="api specs", limit=100, description="Contains API specifications"),
            Block(label="notes", value="my notes", limit=100, description="Personal notes and reminders"),
        ],
    )
    out = m.compile()

    # Filesystem tree should exist
    assert "<memory_filesystem>" in out
    # Non-system files should have descriptions rendered
    assert "api.md (Contains API specifications)" in out
    assert "notes.md (Personal notes and reminders)" in out
    # System files should NOT have descriptions in the tree
    assert "human.md (The human block)" not in out
    assert "persona.md (The persona block)" not in out
    # But they should still be in the tree (without description)
    assert "human.md" in out
    assert "persona.md" in out


def test_compile_git_memory_filesystem_no_description_when_empty():
    """Files outside system/ with no description should render without parentheses."""
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100),
            Block(label="notes", value="my notes", limit=100),
            Block(label="reference/api", value="api specs", limit=100, description="API docs"),
        ],
    )
    out = m.compile()

    # notes.md has no description, so no parentheses
    assert "notes.md\n" in out
    # reference/api.md has a description
    assert "api.md (API docs)" in out


def test_compile_git_memory_filesystem_condenses_skills_to_top_level_entries():
    """skills/ should render as top-level skill entries with description.
    We intentionally avoid showing nested files under skills/ in the system
    prompt tree to keep context concise.
    """
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100),
            Block(
                label="skills/searching-messages",
                value="# searching messages",
                limit=100,
                description="Search past messages to recall context.",
            ),
            Block(
                label="skills/creating-skills",
                value="# creating skills",
                limit=100,
                description="Guide for creating effective skills.",
            ),
            Block(
                label="skills/creating-skills/references/workflows",
                value="nested docs",
                limit=100,
                description="Nested workflow docs (should not appear)",
            ),
        ],
    )
    out = m.compile()

    # Condensed top-level skill entries with descriptions.
    assert "searching-messages (Search past messages to recall context.)" in out
    assert "creating-skills (Guide for creating effective skills.)" in out
    # Do not show .md suffixes or nested skill docs in tree.
    assert "searching-messages.md" not in out
    assert "creating-skills.md" not in out
    assert "references/workflows" not in out

View File

@@ -24,6 +24,9 @@ def test_get_headers_user_id_allows_none():
letta_v1_agent=None, letta_v1_agent=None,
letta_v1_agent_message_async=None, letta_v1_agent_message_async=None,
modal_sandbox=None, modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
) )
assert isinstance(headers, HeaderParams) assert isinstance(headers, HeaderParams)
@@ -40,6 +43,9 @@ def test_get_headers_user_id_rejects_invalid_format():
letta_v1_agent=None, letta_v1_agent=None,
letta_v1_agent_message_async=None, letta_v1_agent_message_async=None,
modal_sandbox=None, modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
) )
@@ -54,6 +60,9 @@ def test_get_headers_user_id_accepts_valid_format():
letta_v1_agent=None, letta_v1_agent=None,
letta_v1_agent_message_async=None, letta_v1_agent_message_async=None,
modal_sandbox=None, modal_sandbox=None,
billing_plan_type=None,
billing_cost_source=None,
billing_customer_id=None,
) )
assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000" assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000"

uv.lock generated
View File

@@ -2510,7 +2510,7 @@ wheels = [
[[package]] [[package]]
name = "letta" name = "letta"
version = "0.16.5" version = "0.16.6"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "aiofiles" }, { name = "aiofiles" },