chore: bump 0.16.6 (#3211)
This commit is contained in:
@@ -260,6 +260,7 @@ model:
|
||||
base_url: https://generativelanguage.googleapis.com/
|
||||
force_minimum_thinking_budget: false
|
||||
max_retries: 5
|
||||
timeout_seconds: 600.0
|
||||
|
||||
# Google Vertex (-> GOOGLE_CLOUD_*)
|
||||
# google_cloud:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
220
fern/scripts/prepare-openapi.ts
Normal file
220
fern/scripts/prepare-openapi.ts
Normal file
@@ -0,0 +1,220 @@
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
import { omit } from 'lodash';
|
||||
import { execSync } from 'child_process';
|
||||
import { merge, isErrorResult } from 'openapi-merge';
|
||||
import type { Swagger } from 'atlassian-openapi';
|
||||
import { RESTRICTED_ROUTE_BASE_PATHS } from '@letta-cloud/sdk-core';
|
||||
|
||||
/**
 * Reads the file at `filePath` as UTF-8 text and parses it as JSON.
 * The result is treated as an OpenAPI v3 document; no validation is done.
 */
function readOpenAPIDocument(filePath: string): Swagger.SwaggerV3 {
  const rawText = fs.readFileSync(filePath, 'utf8');
  return JSON.parse(rawText) as Swagger.SwaggerV3;
}

// <three levels up>/web/autogenerated/letta-web-openapi.json
const lettaWebOpenAPIPath = path.join(
  __dirname,
  ...['..', '..', '..'],
  ...['web', 'autogenerated', 'letta-web-openapi.json'],
);

// <two levels up>/letta/server/openapi_letta.json
const lettaAgentsAPIPath = path.join(
  __dirname,
  ...['..', '..'],
  ...['letta', 'server', 'openapi_letta.json'],
);

// Both documents are loaded eagerly; a missing or malformed file throws here.
const lettaWebOpenAPI = readOpenAPIDocument(lettaWebOpenAPIPath);
const lettaAgentsAPI = readOpenAPIDocument(lettaAgentsAPIPath);
|
||||
|
||||
// Keep only the agents-API routes whose path does not begin with any of the
// restricted base paths.
const isRestrictedRoute = (routePath: string): boolean =>
  RESTRICTED_ROUTE_BASE_PATHS.some((restrictedPath) =>
    routePath.startsWith(restrictedPath),
  );

const unrestrictedRoutes = Object.entries(lettaAgentsAPI.paths).filter(
  ([routePath]) => !isRestrictedRoute(routePath),
);
lettaAgentsAPI.paths = Object.fromEntries(unrestrictedRoutes);
|
||||
|
||||
/** Returns `routePath` minus one trailing '/' when present, else unchanged. */
const stripTrailingSlash = (routePath: string): string =>
  routePath.endsWith('/') ? routePath.slice(0, -1) : routePath;

// Agents-API path items re-keyed by their path with any trailing '/' removed.
const lettaAgentsAPIWithNoEndslash: Swagger.SwaggerV3['paths'] = {};
for (const [routePath, pathItem] of Object.entries(lettaAgentsAPI.paths)) {
  lettaAgentsAPIWithNoEndslash[stripTrailingSlash(routePath)] = pathItem;
}

// De-duplicate: a route in the web spec is dropped when the agents spec
// already defines the same path. Both sides are compared without a trailing
// '/', because some paths carry an extra one.
const webOnlyRoutes = Object.entries(lettaWebOpenAPI.paths).filter(
  ([routePath]) => !lettaAgentsAPIWithNoEndslash[stripTrailingSlash(routePath)],
);
lettaWebOpenAPI.paths = Object.fromEntries(webOnlyRoutes);
|
||||
|
||||
// [route path, response status code] pairs. For each, the POST response's
// JSON schema has its `agents` property (when present) replaced by an array
// of references to the shared AgentState component schema.
const agentStatePathsToOverride: Array<[string, string]> = [
  ['/v1/templates/{project}/{template_version}/agents', '201'],
  ['/v1/agents/search', '200'],
];

for (const [routePath, statusCode] of agentStatePathsToOverride) {
  const response =
    lettaWebOpenAPI.paths[routePath]?.post?.responses?.[statusCode];
  if (!response) {
    // Route, POST operation, or status code is absent: nothing to override.
    continue;
  }

  // Same object that lives inside the spec, so the assignment below edits
  // the spec in place.
  const jsonSchema = response.content['application/json'].schema;
  if (!jsonSchema.properties?.agents) {
    continue;
  }

  jsonSchema.properties.agents = {
    type: 'array',
    items: { $ref: '#/components/schemas/AgentState' },
  };
}
|
||||
|
||||
// Walk every entry of every agents-API path and drop header parameters that
// should not appear in the published spec: the exact names listed below,
// plus any header whose name starts with one of the listed prefixes.
// Parameters that are not headers are always kept.
const STRIPPED_HEADER_NAMES = new Set([
  'user_id',
  'User-Agent',
  'X-Project-Id',
  'X-Letta-Source',
  'X-Stainless-Package-Version',
]);
const STRIPPED_HEADER_PREFIXES = ['X-Experimental', 'X-Billing'];

const shouldKeepParameter = (param: Record<string, string>): boolean => {
  if (param.in !== 'header') {
    return true;
  }
  const isStripped =
    STRIPPED_HEADER_NAMES.has(param.name) ||
    STRIPPED_HEADER_PREFIXES.some((prefix) => param.name.startsWith(prefix));
  return !isStripped;
};

for (const routePath of Object.keys(lettaAgentsAPI.paths)) {
  for (const method of Object.keys(lettaAgentsAPI.paths[routePath])) {
    // @ts-expect-error - a path item is indexed here with a plain string key
    const operation = lettaAgentsAPI.paths[routePath][method];
    if (!operation?.parameters) {
      continue;
    }
    operation.parameters = operation.parameters.filter(shouldKeepParameter);
  }
}
|
||||
|
||||
// Merge the two specs, agents API first and web API second.
const result = merge([{ oas: lettaAgentsAPI }, { oas: lettaWebOpenAPI }]);

// A failed merge is fatal: report the message and its type, then exit
// with a non-zero status.
if (isErrorResult(result)) {
  const failure = `${result.message} (${result.type})`;
  console.error(failure);
  process.exit(1);
}
|
||||
|
||||
// Stamp top-level metadata onto the merged document.
const mergedSpec = result.output;

mergedSpec.openapi = '3.1.0';
mergedSpec.info = { title: 'Letta API', version: '1.0.0' };
mergedSpec.servers = [
  { url: 'https://app.letta.com', description: 'Letta Cloud' },
  { url: 'http://localhost:8283', description: 'Self-hosted' },
];

// Existing components are kept, but `securitySchemes` is replaced wholesale
// by a single HTTP bearer scheme.
mergedSpec.components = {
  ...mergedSpec.components,
  securitySchemes: {
    bearerAuth: { type: 'http', scheme: 'bearer' },
  },
};

// Append the bearer requirement after any security requirements that the
// merge already produced.
const existingSecurity = mergedSpec.security || [];
mergedSpec.security = [...existingSecurity, { bearerAuth: [] }];
|
||||
|
||||
/**
 * Returns a deep copy of `obj` in which every object property named `key`
 * has been removed, at every nesting depth.
 *
 * - Arrays stay arrays: each element is processed recursively, and element
 *   order and length are preserved. Primitive elements (including a string
 *   equal to `key`) are left untouched.
 * - Primitives and `null` are returned as-is.
 * - Plain objects are rebuilt from their own enumerable entries without
 *   `key`; the input is never mutated.
 *
 * The walk always continues into the remaining properties of an object that
 * itself contained `key`, so occurrences nested under sibling properties are
 * removed as well.
 *
 * @param obj - Value to clean (typically parsed JSON).
 * @param key - Property name to strip wherever it appears.
 * @returns The cleaned copy.
 */
function deepOmitPreserveArrays(obj: unknown, key: string): unknown {
  if (Array.isArray(obj)) {
    return obj.map((item) => deepOmitPreserveArrays(item, key));
  }

  if (typeof obj !== 'object' || obj === null) {
    return obj;
  }

  return Object.fromEntries(
    Object.entries(obj)
      .filter(([k]) => k !== key)
      .map(([k, v]) => [k, deepOmitPreserveArrays(v, key)]),
  );
}
|
||||
|
||||
// Strip these property names from the merged components, one pass per key,
// in this order.
for (const omittedKey of ['user_id', 'actor_id', 'organization_id']) {
  // eslint-disable-next-line @typescript-eslint/ban-ts-comment
  // @ts-ignore
  result.output.components = deepOmitPreserveArrays(result.output.components, omittedKey);
}
|
||||
|
||||
// Write the merged spec to ../openapi.json (relative to this script),
// pretty-printed with two-space indentation.
const mergedSpecOutputPath = path.join(__dirname, '..', 'openapi.json');
const mergedSpecJson = JSON.stringify(result.output, null, 2);
fs.writeFileSync(mergedSpecOutputPath, mergedSpecJson);
|
||||
|
||||
/**
 * Formats ../openapi.json (relative to this script) in place by running
 * Prettier through `npx`, with the child process sharing this process's
 * stdio. Logs a confirmation on success; on failure logs the error and
 * terminates the process with exit status 1.
 */
function formatOpenAPIJson() {
  const target = path.join(__dirname, '..', 'openapi.json');
  const command = `npx prettier --write "${target}"`;

  try {
    execSync(command, { stdio: 'inherit' });
    console.log('Successfully formatted openapi.json with Prettier');
  } catch (error) {
    console.error('Error formatting openapi.json:', error);
    process.exit(1);
  }
}

// Final step of the script: format the file written above.
formatOpenAPIJson();
|
||||
@@ -5,7 +5,7 @@ try:
|
||||
__version__ = version("letta")
|
||||
except PackageNotFoundError:
|
||||
# Fallback for development installations
|
||||
__version__ = "0.16.5"
|
||||
__version__ = "0.16.6"
|
||||
|
||||
if os.environ.get("LETTA_VERSION"):
|
||||
__version__ = os.environ["LETTA_VERSION"]
|
||||
|
||||
@@ -7,6 +7,7 @@ from letta.schemas.letta_message import LettaMessage
|
||||
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.usage import LettaUsageStatistics
|
||||
from letta.schemas.user import User
|
||||
from letta.services.telemetry_manager import TelemetryManager
|
||||
@@ -31,6 +32,7 @@ class LettaLLMAdapter(ABC):
|
||||
run_id: str | None = None,
|
||||
org_id: str | None = None,
|
||||
user_id: str | None = None,
|
||||
billing_context: BillingContext | None = None,
|
||||
) -> None:
|
||||
self.llm_client: LLMClientBase = llm_client
|
||||
self.llm_config: LLMConfig = llm_config
|
||||
@@ -40,6 +42,7 @@ class LettaLLMAdapter(ABC):
|
||||
self.run_id: str | None = run_id
|
||||
self.org_id: str | None = org_id
|
||||
self.user_id: str | None = user_id
|
||||
self.billing_context: BillingContext | None = billing_context
|
||||
self.message_id: str | None = None
|
||||
self.request_data: dict | None = None
|
||||
self.response_data: dict | None = None
|
||||
|
||||
@@ -10,7 +10,7 @@ from letta.otel.tracing import log_attributes, safe_json_dumps, trace_method
|
||||
from letta.schemas.enums import LLMCallType, ProviderType
|
||||
from letta.schemas.letta_message import LettaMessage
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.provider_trace import ProviderTrace
|
||||
from letta.schemas.provider_trace import BillingContext, ProviderTrace
|
||||
from letta.schemas.user import User
|
||||
from letta.settings import settings
|
||||
from letta.utils import safe_create_task
|
||||
@@ -36,6 +36,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
|
||||
run_id: str | None = None,
|
||||
org_id: str | None = None,
|
||||
user_id: str | None = None,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
llm_client,
|
||||
@@ -46,6 +47,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
|
||||
run_id=run_id,
|
||||
org_id=org_id,
|
||||
user_id=user_id,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
self.interface: OpenAIStreamingInterface | AnthropicStreamingInterface | None = None
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
|
||||
org_id=self.org_id,
|
||||
user_id=self.user_id,
|
||||
llm_config=self.llm_config.model_dump() if self.llm_config else None,
|
||||
billing_context=self.billing_context,
|
||||
)
|
||||
try:
|
||||
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
|
||||
|
||||
@@ -278,6 +278,7 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
|
||||
org_id=self.org_id,
|
||||
user_id=self.user_id,
|
||||
llm_config=self.llm_config.model_dump() if self.llm_config else None,
|
||||
billing_context=self.billing_context,
|
||||
),
|
||||
),
|
||||
label="create_provider_trace",
|
||||
|
||||
@@ -15,6 +15,7 @@ from letta.schemas.letta_message_content import TextContent
|
||||
from letta.schemas.letta_response import LettaResponse
|
||||
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
|
||||
from letta.schemas.message import Message, MessageCreate, MessageUpdate
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.usage import LettaUsageStatistics
|
||||
from letta.schemas.user import User
|
||||
from letta.services.agent_manager import AgentManager
|
||||
@@ -51,7 +52,11 @@ class BaseAgent(ABC):
|
||||
|
||||
@abstractmethod
|
||||
async def step(
|
||||
self, input_messages: List[MessageCreate], max_steps: int = DEFAULT_MAX_STEPS, run_id: Optional[str] = None
|
||||
self,
|
||||
input_messages: List[MessageCreate],
|
||||
max_steps: int = DEFAULT_MAX_STEPS,
|
||||
run_id: Optional[str] = None,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
"""
|
||||
Main execution loop for the agent.
|
||||
|
||||
@@ -12,6 +12,7 @@ from letta.schemas.user import User
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from letta.schemas.letta_request import ClientToolSchema
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
|
||||
|
||||
class BaseAgentV2(ABC):
|
||||
@@ -52,6 +53,7 @@ class BaseAgentV2(ABC):
|
||||
request_start_timestamp_ns: int | None = None,
|
||||
client_tools: list["ClientToolSchema"] | None = None,
|
||||
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
"""
|
||||
Execute the agent loop in blocking mode, returning all messages at once.
|
||||
@@ -76,6 +78,7 @@ class BaseAgentV2(ABC):
|
||||
conversation_id: str | None = None,
|
||||
client_tools: list["ClientToolSchema"] | None = None,
|
||||
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> AsyncGenerator[LettaMessage | LegacyLettaMessage | MessageStreamStatus, None]:
|
||||
"""
|
||||
Execute the agent loop in streaming mode, yielding chunks as they become available.
|
||||
|
||||
@@ -192,44 +192,15 @@ async def _prepare_in_context_messages_no_persist_async(
|
||||
# Otherwise, include the full list of messages from the conversation
|
||||
current_in_context_messages = await message_manager.get_messages_by_ids_async(message_ids=message_ids, actor=actor)
|
||||
else:
|
||||
# No messages in conversation yet - compile a new system message for this conversation
|
||||
# Each conversation gets its own system message (captures memory state at conversation start)
|
||||
from letta.prompts.prompt_generator import PromptGenerator
|
||||
from letta.services.passage_manager import PassageManager
|
||||
|
||||
num_messages = await message_manager.size_async(actor=actor, agent_id=agent_state.id)
|
||||
passage_manager = PassageManager()
|
||||
num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_state.id)
|
||||
|
||||
system_message_str = await PromptGenerator.compile_system_message_async(
|
||||
system_prompt=agent_state.system,
|
||||
in_context_memory=agent_state.memory,
|
||||
in_context_memory_last_edit=get_utc_time(),
|
||||
timezone=agent_state.timezone,
|
||||
user_defined_variables=None,
|
||||
append_icm_if_missing=True,
|
||||
previous_message_count=num_messages,
|
||||
archival_memory_size=num_archival_memories,
|
||||
sources=agent_state.sources,
|
||||
max_files_open=agent_state.max_files_open,
|
||||
)
|
||||
system_message = Message.dict_to_message(
|
||||
agent_id=agent_state.id,
|
||||
model=agent_state.llm_config.model,
|
||||
openai_message_dict={"role": "system", "content": system_message_str},
|
||||
)
|
||||
|
||||
# Persist the new system message
|
||||
persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
|
||||
system_message = persisted_messages[0]
|
||||
|
||||
# Add it to the conversation tracking
|
||||
await conversation_manager.add_messages_to_conversation(
|
||||
# No messages in conversation yet (fallback) - compile a new system message
|
||||
# Normally this is handled at conversation creation time, but this covers
|
||||
# edge cases where a conversation exists without a system message.
|
||||
system_message = await conversation_manager.compile_and_save_system_message_for_conversation(
|
||||
conversation_id=conversation_id,
|
||||
agent_id=agent_state.id,
|
||||
message_ids=[system_message.id],
|
||||
actor=actor,
|
||||
starting_position=0,
|
||||
agent_state=agent_state,
|
||||
message_manager=message_manager,
|
||||
)
|
||||
|
||||
current_in_context_messages = [system_message]
|
||||
|
||||
@@ -48,6 +48,7 @@ from letta.schemas.openai.chat_completion_response import (
|
||||
UsageStatisticsCompletionTokenDetails,
|
||||
UsageStatisticsPromptTokenDetails,
|
||||
)
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.step import StepProgression
|
||||
from letta.schemas.step_metrics import StepMetrics
|
||||
from letta.schemas.tool_execution_result import ToolExecutionResult
|
||||
@@ -179,6 +180,7 @@ class LettaAgent(BaseAgent):
|
||||
request_start_timestamp_ns: int | None = None,
|
||||
include_return_message_types: list[MessageType] | None = None,
|
||||
dry_run: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> Union[LettaResponse, dict]:
|
||||
# TODO (cliandy): pass in run_id and use at send_message endpoints for all step functions
|
||||
agent_state = await self.agent_manager.get_agent_by_id_async(
|
||||
|
||||
@@ -44,6 +44,7 @@ from letta.schemas.openai.chat_completion_response import (
|
||||
UsageStatisticsCompletionTokenDetails,
|
||||
UsageStatisticsPromptTokenDetails,
|
||||
)
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.step import Step, StepProgression
|
||||
from letta.schemas.step_metrics import StepMetrics
|
||||
from letta.schemas.tool import Tool
|
||||
@@ -185,6 +186,7 @@ class LettaAgentV2(BaseAgentV2):
|
||||
request_start_timestamp_ns: int | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
"""
|
||||
Execute the agent loop in blocking mode, returning all messages at once.
|
||||
@@ -290,6 +292,7 @@ class LettaAgentV2(BaseAgentV2):
|
||||
conversation_id: str | None = None, # Not used in V2, but accepted for API compatibility
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False, # Not used in V2, but accepted for API compatibility
|
||||
billing_context: BillingContext | None = None,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
"""
|
||||
Execute the agent loop in streaming mode, yielding chunks as they become available.
|
||||
|
||||
@@ -21,7 +21,7 @@ from letta.agents.helpers import (
|
||||
)
|
||||
from letta.agents.letta_agent_v2 import LettaAgentV2
|
||||
from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX, REQUEST_HEARTBEAT_PARAM
|
||||
from letta.errors import ContextWindowExceededError, LLMError, SystemPromptTokenExceededError
|
||||
from letta.errors import ContextWindowExceededError, LLMEmptyResponseError, LLMError, SystemPromptTokenExceededError
|
||||
from letta.helpers import ToolRulesSolver
|
||||
from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns
|
||||
from letta.helpers.tool_execution_helper import enable_strict_mode
|
||||
@@ -45,6 +45,7 @@ from letta.schemas.letta_response import LettaResponse, TurnTokenData
|
||||
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
|
||||
from letta.schemas.message import Message, MessageCreate, ToolReturn
|
||||
from letta.schemas.openai.chat_completion_response import ChoiceLogprobs, ToolCall, ToolCallDenial, UsageStatistics
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.step import StepProgression
|
||||
from letta.schemas.step_metrics import StepMetrics
|
||||
from letta.schemas.tool_execution_result import ToolExecutionResult
|
||||
@@ -149,6 +150,7 @@ class LettaAgentV3(LettaAgentV2):
|
||||
conversation_id: str | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
"""
|
||||
Execute the agent loop in blocking mode, returning all messages at once.
|
||||
@@ -232,6 +234,7 @@ class LettaAgentV3(LettaAgentV2):
|
||||
run_id=run_id,
|
||||
org_id=self.actor.organization_id,
|
||||
user_id=self.actor.id,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
credit_task = None
|
||||
@@ -362,6 +365,7 @@ class LettaAgentV3(LettaAgentV2):
|
||||
conversation_id: str | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: BillingContext | None = None,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
"""
|
||||
Execute the agent loop in streaming mode, yielding chunks as they become available.
|
||||
@@ -419,6 +423,7 @@ class LettaAgentV3(LettaAgentV2):
|
||||
run_id=run_id,
|
||||
org_id=self.actor.organization_id,
|
||||
user_id=self.actor.id,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
elif use_sglang_native:
|
||||
# Use SGLang native adapter for multi-turn RL training
|
||||
@@ -431,6 +436,7 @@ class LettaAgentV3(LettaAgentV2):
|
||||
run_id=run_id,
|
||||
org_id=self.actor.organization_id,
|
||||
user_id=self.actor.id,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
# Reset turns tracking for this step
|
||||
self.turns = []
|
||||
@@ -444,6 +450,7 @@ class LettaAgentV3(LettaAgentV2):
|
||||
run_id=run_id,
|
||||
org_id=self.actor.organization_id,
|
||||
user_id=self.actor.id,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -764,7 +771,12 @@ class LettaAgentV3(LettaAgentV2):
|
||||
]
|
||||
else:
|
||||
# Old behavior: UserMessage with packed JSON
|
||||
return list(Message.to_letta_messages(summary_message))
|
||||
messages = list(Message.to_letta_messages(summary_message))
|
||||
# Set otid on returned messages (summary Message doesn't have otid set at creation)
|
||||
for i, msg in enumerate(messages):
|
||||
if not msg.otid:
|
||||
msg.otid = Message.generate_otid_from_id(summary_message.id, i)
|
||||
return messages
|
||||
|
||||
@trace_method
|
||||
async def _step(
|
||||
@@ -990,6 +1002,9 @@ class LettaAgentV3(LettaAgentV2):
|
||||
except ValueError as e:
|
||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
||||
raise e
|
||||
except LLMEmptyResponseError as e:
|
||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
|
||||
raise e
|
||||
except LLMError as e:
|
||||
self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
|
||||
raise e
|
||||
|
||||
@@ -134,7 +134,7 @@ def _flatten_model_settings(d: dict, env_vars: dict[str, str]) -> None:
|
||||
api_base: yyy -> OPENAI_API_BASE
|
||||
anthropic:
|
||||
api_key: zzz -> ANTHROPIC_API_KEY
|
||||
global_max_context_window_limit: 32000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
|
||||
global_max_context_window_limit: 128000 -> GLOBAL_MAX_CONTEXT_WINDOW_LIMIT
|
||||
"""
|
||||
for key, value in d.items():
|
||||
if isinstance(value, dict):
|
||||
|
||||
@@ -74,7 +74,7 @@ DEFAULT_MAX_STEPS = 50
|
||||
|
||||
# context window size
|
||||
MIN_CONTEXT_WINDOW = 4096
|
||||
DEFAULT_CONTEXT_WINDOW = 32000
|
||||
DEFAULT_CONTEXT_WINDOW = 128000
|
||||
|
||||
# Summarization trigger threshold (multiplier of context_window limit)
|
||||
# Summarization triggers when step usage > context_window * SUMMARIZATION_TRIGGER_MULTIPLIER
|
||||
@@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = {
|
||||
"deepseek-reasoner": 64000,
|
||||
# glm (Z.AI)
|
||||
"glm-4.5": 128000,
|
||||
"glm-4.6": 200000,
|
||||
"glm-4.7": 200000,
|
||||
"glm-5": 200000,
|
||||
"glm-5-code": 200000,
|
||||
"glm-4.6": 180000,
|
||||
"glm-4.7": 180000,
|
||||
"glm-5": 180000,
|
||||
"glm-5-code": 180000,
|
||||
## OpenAI models: https://platform.openai.com/docs/models/overview
|
||||
# gpt-5
|
||||
"gpt-5": 272000,
|
||||
@@ -278,6 +278,8 @@ LLM_MAX_CONTEXT_WINDOW = {
|
||||
"gpt-5.2-pro": 272000,
|
||||
"gpt-5.2-pro-2025-12-11": 272000,
|
||||
"gpt-5.2-codex": 272000,
|
||||
# gpt-5.3
|
||||
"gpt-5.3-codex": 272000,
|
||||
# reasoners
|
||||
"o1": 200000,
|
||||
# "o1-pro": 200000, # responses API only
|
||||
@@ -419,7 +421,7 @@ MAX_ERROR_MESSAGE_CHAR_LIMIT = 1000
|
||||
# Default memory limits
|
||||
CORE_MEMORY_PERSONA_CHAR_LIMIT: int = 20000
|
||||
CORE_MEMORY_HUMAN_CHAR_LIMIT: int = 20000
|
||||
CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 20000
|
||||
CORE_MEMORY_BLOCK_CHAR_LIMIT: int = 100000
|
||||
|
||||
# Function return limits
|
||||
FUNCTION_RETURN_CHAR_LIMIT = 50000 # ~300 words
|
||||
|
||||
@@ -283,6 +283,15 @@ class LLMServerError(LLMError):
|
||||
while processing the request."""
|
||||
|
||||
|
||||
class LLMEmptyResponseError(LLMServerError):
|
||||
"""Error when LLM returns an empty response (no content and no tool calls).
|
||||
|
||||
This is a subclass of LLMServerError to maintain retry behavior, but allows
|
||||
specific handling for empty response cases which may benefit from request
|
||||
modification before retry.
|
||||
"""
|
||||
|
||||
|
||||
class LLMTimeoutError(LLMError):
|
||||
"""Error when LLM request times out"""
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from letta.schemas.letta_message import MessageType
|
||||
from letta.schemas.letta_message_content import TextContent
|
||||
from letta.schemas.letta_response import LettaResponse
|
||||
from letta.schemas.message import Message, MessageCreate
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.run import Run
|
||||
from letta.schemas.user import User
|
||||
from letta.services.agent_manager import AgentManager
|
||||
@@ -69,6 +70,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
|
||||
use_assistant_message: bool = True,
|
||||
request_start_timestamp_ns: int | None = None,
|
||||
include_return_message_types: list[MessageType] | None = None,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
run_ids = []
|
||||
|
||||
@@ -100,6 +102,7 @@ class SleeptimeMultiAgentV2(BaseAgent):
|
||||
run_id=run_id,
|
||||
use_assistant_message=use_assistant_message,
|
||||
include_return_message_types=include_return_message_types,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
# Get last response messages
|
||||
|
||||
@@ -15,6 +15,7 @@ from letta.schemas.letta_request import ClientToolSchema
|
||||
from letta.schemas.letta_response import LettaResponse
|
||||
from letta.schemas.letta_stop_reason import StopReasonType
|
||||
from letta.schemas.message import Message, MessageCreate
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.run import Run, RunUpdate
|
||||
from letta.schemas.user import User
|
||||
from letta.services.group_manager import GroupManager
|
||||
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
|
||||
request_start_timestamp_ns: int | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
self.run_ids = []
|
||||
|
||||
@@ -62,6 +64,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
|
||||
request_start_timestamp_ns=request_start_timestamp_ns,
|
||||
client_tools=client_tools,
|
||||
include_compaction_messages=include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
await self.run_sleeptime_agents()
|
||||
@@ -81,6 +84,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
|
||||
include_return_message_types: list[MessageType] | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
self.run_ids = []
|
||||
|
||||
@@ -99,6 +103,7 @@ class SleeptimeMultiAgentV3(LettaAgentV2):
|
||||
request_start_timestamp_ns=request_start_timestamp_ns,
|
||||
client_tools=client_tools,
|
||||
include_compaction_messages=include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
):
|
||||
yield chunk
|
||||
finally:
|
||||
|
||||
@@ -14,6 +14,7 @@ from letta.schemas.letta_request import ClientToolSchema
|
||||
from letta.schemas.letta_response import LettaResponse
|
||||
from letta.schemas.letta_stop_reason import StopReasonType
|
||||
from letta.schemas.message import Message, MessageCreate
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.run import Run, RunUpdate
|
||||
from letta.schemas.user import User
|
||||
from letta.services.group_manager import GroupManager
|
||||
@@ -47,6 +48,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
|
||||
conversation_id: str | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> LettaResponse:
|
||||
self.run_ids = []
|
||||
|
||||
@@ -63,6 +65,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
|
||||
conversation_id=conversation_id,
|
||||
client_tools=client_tools,
|
||||
include_compaction_messages=include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
run_ids = await self.run_sleeptime_agents()
|
||||
@@ -82,6 +85,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
|
||||
conversation_id: str | None = None,
|
||||
client_tools: list[ClientToolSchema] | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
self.run_ids = []
|
||||
|
||||
@@ -101,6 +105,7 @@ class SleeptimeMultiAgentV4(LettaAgentV3):
|
||||
conversation_id=conversation_id,
|
||||
client_tools=client_tools,
|
||||
include_compaction_messages=include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
):
|
||||
yield chunk
|
||||
finally:
|
||||
|
||||
@@ -30,6 +30,7 @@ from anthropic.types.beta import (
|
||||
)
|
||||
|
||||
from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
|
||||
from letta.errors import LLMEmptyResponseError
|
||||
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
|
||||
from letta.log import get_logger
|
||||
from letta.schemas.letta_message import (
|
||||
@@ -104,6 +105,10 @@ class AnthropicStreamingInterface:
|
||||
self.inner_thoughts_complete = False
|
||||
self.put_inner_thoughts_in_kwarg = put_inner_thoughts_in_kwarg
|
||||
|
||||
# Track whether any content was produced (text or tool calls)
|
||||
# Used to detect empty responses from models like Opus 4.6
|
||||
self.has_content = False
|
||||
|
||||
# Buffer to handle partial XML tags across chunks
|
||||
self.partial_tag_buffer = ""
|
||||
|
||||
@@ -298,9 +303,11 @@ class AnthropicStreamingInterface:
|
||||
|
||||
if isinstance(content, BetaTextBlock):
|
||||
self.anthropic_mode = EventMode.TEXT
|
||||
self.has_content = True # Track that we received text content
|
||||
# TODO: Can capture citations, etc.
|
||||
elif isinstance(content, BetaToolUseBlock):
|
||||
self.anthropic_mode = EventMode.TOOL_USE
|
||||
self.has_content = True # Track that we received tool use content
|
||||
self.tool_call_id = content.id
|
||||
self.tool_call_name = content.name
|
||||
self.inner_thoughts_complete = False
|
||||
@@ -589,8 +596,12 @@ class AnthropicStreamingInterface:
|
||||
# message_delta event are *cumulative*." So we assign, not accumulate.
|
||||
self.output_tokens = event.usage.output_tokens
|
||||
elif isinstance(event, BetaRawMessageStopEvent):
|
||||
# Don't do anything here! We don't want to stop the stream.
|
||||
pass
|
||||
# Check if any content was produced during the stream
|
||||
# Empty responses (no text and no tool calls) should raise an error
|
||||
if not self.has_content:
|
||||
raise LLMEmptyResponseError(
|
||||
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
|
||||
)
|
||||
elif isinstance(event, BetaRawContentBlockStopEvent):
|
||||
# If we're exiting a tool use block and there are still buffered messages,
|
||||
# we should flush them now.
|
||||
@@ -837,10 +848,12 @@ class SimpleAnthropicStreamingInterface:
|
||||
|
||||
if isinstance(content, BetaTextBlock):
|
||||
self.anthropic_mode = EventMode.TEXT
|
||||
self.has_content = True # Track that we received text content
|
||||
# TODO: Can capture citations, etc.
|
||||
|
||||
elif isinstance(content, BetaToolUseBlock):
|
||||
self.anthropic_mode = EventMode.TOOL_USE
|
||||
self.has_content = True # Track that we received tool use content
|
||||
self.tool_call_id = content.id
|
||||
self.tool_call_name = content.name
|
||||
|
||||
@@ -1014,8 +1027,12 @@ class SimpleAnthropicStreamingInterface:
|
||||
self.output_tokens = event.usage.output_tokens
|
||||
|
||||
elif isinstance(event, BetaRawMessageStopEvent):
|
||||
# Don't do anything here! We don't want to stop the stream.
|
||||
pass
|
||||
# Check if any content was produced during the stream
|
||||
# Empty responses (no text and no tool calls) should raise an error
|
||||
if not self.has_content:
|
||||
raise LLMEmptyResponseError(
|
||||
message=f"LLM provider returned empty content in streaming response (model: {self.model}, message_id: {self.message_id})"
|
||||
)
|
||||
|
||||
elif isinstance(event, BetaRawContentBlockStopEvent):
|
||||
self.anthropic_mode = None
|
||||
|
||||
@@ -19,6 +19,8 @@ from letta.errors import (
|
||||
LLMAuthenticationError,
|
||||
LLMBadRequestError,
|
||||
LLMConnectionError,
|
||||
LLMEmptyResponseError,
|
||||
LLMError,
|
||||
LLMInsufficientCreditsError,
|
||||
LLMNotFoundError,
|
||||
LLMPermissionDeniedError,
|
||||
@@ -957,6 +959,11 @@ class AnthropicClient(LLMClientBase):
|
||||
|
||||
@trace_method
|
||||
def handle_llm_error(self, e: Exception, llm_config: Optional[LLMConfig] = None) -> Exception:
|
||||
# Pass through errors that are already LLMError instances unchanged
|
||||
# This preserves specific error types like LLMEmptyResponseError
|
||||
if isinstance(e, LLMError):
|
||||
return e
|
||||
|
||||
is_byok = (llm_config.provider_category == ProviderCategory.byok) if llm_config else None
|
||||
|
||||
# make sure to check for overflow errors, regardless of error type
|
||||
@@ -1278,7 +1285,7 @@ class AnthropicClient(LLMClientBase):
|
||||
response.stop_reason,
|
||||
json.dumps(response_data),
|
||||
)
|
||||
raise LLMServerError(
|
||||
raise LLMEmptyResponseError(
|
||||
message=f"LLM provider returned empty content in response (ID: {response.id}, model: {response.model}, stop_reason: {response.stop_reason})",
|
||||
code=ErrorCode.INTERNAL_SERVER_ERROR,
|
||||
details={
|
||||
|
||||
@@ -9,7 +9,7 @@ from letta.llm_api.google_constants import GOOGLE_MODEL_FOR_API_KEY_CHECK
|
||||
from letta.llm_api.google_vertex_client import GoogleVertexClient
|
||||
from letta.log import get_logger
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.settings import model_settings, settings
|
||||
from letta.settings import model_settings
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -18,7 +18,7 @@ class GoogleAIClient(GoogleVertexClient):
|
||||
provider_label = "Google AI"
|
||||
|
||||
def _get_client(self, llm_config: Optional[LLMConfig] = None):
|
||||
timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
|
||||
timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
|
||||
api_key = None
|
||||
if llm_config:
|
||||
api_key, _, _ = self.get_byok_overrides(llm_config)
|
||||
@@ -30,7 +30,7 @@ class GoogleAIClient(GoogleVertexClient):
|
||||
)
|
||||
|
||||
async def _get_client_async(self, llm_config: Optional[LLMConfig] = None):
|
||||
timeout_ms = int(settings.llm_request_timeout_seconds * 1000)
|
||||
timeout_ms = int(model_settings.gemini_timeout_seconds * 1000)
|
||||
api_key = None
|
||||
if llm_config:
|
||||
api_key, _, _ = await self.get_byok_overrides_async(llm_config)
|
||||
|
||||
@@ -14,7 +14,7 @@ from letta.schemas.enums import AgentType, LLMCallType, ProviderCategory
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
|
||||
from letta.schemas.provider_trace import ProviderTrace
|
||||
from letta.schemas.provider_trace import BillingContext, ProviderTrace
|
||||
from letta.schemas.usage import LettaUsageStatistics
|
||||
from letta.services.telemetry_manager import TelemetryManager
|
||||
from letta.settings import settings
|
||||
@@ -48,6 +48,7 @@ class LLMClientBase:
|
||||
self._telemetry_user_id: Optional[str] = None
|
||||
self._telemetry_compaction_settings: Optional[Dict] = None
|
||||
self._telemetry_llm_config: Optional[Dict] = None
|
||||
self._telemetry_billing_context: Optional[BillingContext] = None
|
||||
|
||||
def set_telemetry_context(
|
||||
self,
|
||||
@@ -62,6 +63,7 @@ class LLMClientBase:
|
||||
compaction_settings: Optional[Dict] = None,
|
||||
llm_config: Optional[Dict] = None,
|
||||
actor: Optional["User"] = None,
|
||||
billing_context: Optional[BillingContext] = None,
|
||||
) -> None:
|
||||
"""Set telemetry context for provider trace logging."""
|
||||
if actor is not None:
|
||||
@@ -76,6 +78,7 @@ class LLMClientBase:
|
||||
self._telemetry_user_id = user_id
|
||||
self._telemetry_compaction_settings = compaction_settings
|
||||
self._telemetry_llm_config = llm_config
|
||||
self._telemetry_billing_context = billing_context
|
||||
|
||||
def extract_usage_statistics(self, response_data: Optional[dict], llm_config: LLMConfig) -> LettaUsageStatistics:
|
||||
"""Provider-specific usage parsing hook (override in subclasses). Returns LettaUsageStatistics."""
|
||||
@@ -125,6 +128,7 @@ class LLMClientBase:
|
||||
user_id=self._telemetry_user_id,
|
||||
compaction_settings=self._telemetry_compaction_settings,
|
||||
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
|
||||
billing_context=self._telemetry_billing_context,
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -186,6 +190,7 @@ class LLMClientBase:
|
||||
user_id=self._telemetry_user_id,
|
||||
compaction_settings=self._telemetry_compaction_settings,
|
||||
llm_config=llm_config.model_dump() if llm_config else self._telemetry_llm_config,
|
||||
billing_context=self._telemetry_billing_context,
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
|
||||
@@ -88,7 +88,7 @@ def supports_none_reasoning_effort(model: str) -> bool:
|
||||
|
||||
Currently, GPT-5.1 and GPT-5.2 models support the 'none' reasoning effort level.
|
||||
"""
|
||||
return model.startswith("gpt-5.1") or model.startswith("gpt-5.2")
|
||||
return model.startswith("gpt-5.1") or model.startswith("gpt-5.2") or model.startswith("gpt-5.3")
|
||||
|
||||
|
||||
def is_openai_5_model(model: str) -> bool:
|
||||
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
|
||||
input=openai_messages_list,
|
||||
tools=responses_tools,
|
||||
tool_choice=tool_choice,
|
||||
max_output_tokens=llm_config.max_tokens,
|
||||
temperature=llm_config.temperature if supports_temperature_param(model) else None,
|
||||
parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
|
||||
)
|
||||
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
|
||||
# Handle text configuration (verbosity and response format)
|
||||
text_config_kwargs = {}
|
||||
|
||||
# Only set max_output_tokens if explicitly configured
|
||||
if llm_config.max_tokens is not None:
|
||||
data.max_output_tokens = llm_config.max_tokens
|
||||
|
||||
# Add verbosity control for GPT-5 models
|
||||
if supports_verbosity_control(model) and llm_config.verbosity:
|
||||
text_config_kwargs["verbosity"] = llm_config.verbosity
|
||||
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
|
||||
)
|
||||
|
||||
request_data = data.model_dump(exclude_unset=True)
|
||||
# print("responses request data", request_data)
|
||||
return request_data
|
||||
|
||||
@trace_method
|
||||
@@ -639,6 +641,14 @@ class OpenAIClient(LLMClientBase):
|
||||
tool.function.strict = False
|
||||
request_data = data.model_dump(exclude_unset=True)
|
||||
|
||||
# Fireworks uses strict validation (additionalProperties: false) and rejects
|
||||
# reasoning fields that are not in their schema.
|
||||
is_fireworks = llm_config.model_endpoint and "fireworks.ai" in llm_config.model_endpoint
|
||||
if is_fireworks and "messages" in request_data:
|
||||
for message in request_data["messages"]:
|
||||
for field in ("reasoning_content_signature", "redacted_reasoning_content", "omitted_reasoning_content"):
|
||||
message.pop(field, None)
|
||||
|
||||
# If Ollama
|
||||
# if llm_config.handle.startswith("ollama/") and llm_config.enable_reasoner:
|
||||
# Sadly, reasoning via the OpenAI proxy on Ollama only works for Harmony/gpt-oss
|
||||
|
||||
@@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient):
|
||||
}
|
||||
}
|
||||
|
||||
# Z.ai's API uses max_tokens, not max_completion_tokens.
|
||||
# If max_completion_tokens is sent, Z.ai ignores it and falls back to its
|
||||
# default of 65536, silently truncating input to ~137K of the 200K context window.
|
||||
if "max_completion_tokens" in data:
|
||||
data["max_tokens"] = data.pop("max_completion_tokens")
|
||||
|
||||
# Sanitize empty text content — ZAI rejects empty text blocks
|
||||
if "messages" in data:
|
||||
for msg in data["messages"]:
|
||||
|
||||
@@ -17295,6 +17295,58 @@
|
||||
"supports_tool_choice": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"gpt-5.3-chat-latest": {
|
||||
"cache_read_input_token_cost": 1.75e-7,
|
||||
"cache_read_input_token_cost_priority": 3.5e-7,
|
||||
"input_cost_per_token": 1.75e-6,
|
||||
"input_cost_per_token_priority": 3.5e-6,
|
||||
"litellm_provider": "openai",
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 16384,
|
||||
"max_tokens": 16384,
|
||||
"mode": "chat",
|
||||
"output_cost_per_token": 1.4e-5,
|
||||
"output_cost_per_token_priority": 2.8e-5,
|
||||
"supported_endpoints": ["/v1/chat/completions", "/v1/responses"],
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text"],
|
||||
"supports_function_calling": true,
|
||||
"supports_native_streaming": true,
|
||||
"supports_parallel_function_calling": true,
|
||||
"supports_pdf_input": true,
|
||||
"supports_prompt_caching": true,
|
||||
"supports_reasoning": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_system_messages": true,
|
||||
"supports_tool_choice": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"gpt-5.3-codex": {
|
||||
"cache_read_input_token_cost": 1.75e-7,
|
||||
"cache_read_input_token_cost_priority": 3.5e-7,
|
||||
"input_cost_per_token": 1.75e-6,
|
||||
"input_cost_per_token_priority": 3.5e-6,
|
||||
"litellm_provider": "openai",
|
||||
"max_input_tokens": 272000,
|
||||
"max_output_tokens": 128000,
|
||||
"max_tokens": 128000,
|
||||
"mode": "responses",
|
||||
"output_cost_per_token": 1.4e-5,
|
||||
"output_cost_per_token_priority": 2.8e-5,
|
||||
"supported_endpoints": ["/v1/responses"],
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text"],
|
||||
"supports_function_calling": true,
|
||||
"supports_native_streaming": true,
|
||||
"supports_parallel_function_calling": true,
|
||||
"supports_pdf_input": true,
|
||||
"supports_prompt_caching": true,
|
||||
"supports_reasoning": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_system_messages": false,
|
||||
"supports_tool_choice": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"gpt-5-mini": {
|
||||
"cache_read_input_token_cost": 2.5e-8,
|
||||
"cache_read_input_token_cost_flex": 1.25e-8,
|
||||
|
||||
@@ -44,7 +44,7 @@ class Conversation(SqlalchemyBase, OrganizationMixin):
|
||||
"ConversationMessage",
|
||||
back_populates="conversation",
|
||||
cascade="all, delete-orphan",
|
||||
lazy="selectin",
|
||||
lazy="raise",
|
||||
)
|
||||
isolated_blocks: Mapped[List["Block"]] = relationship(
|
||||
"Block",
|
||||
|
||||
@@ -69,5 +69,5 @@ class ConversationMessage(SqlalchemyBase, OrganizationMixin):
|
||||
)
|
||||
message: Mapped["Message"] = relationship(
|
||||
"Message",
|
||||
lazy="selectin",
|
||||
lazy="raise",
|
||||
)
|
||||
|
||||
@@ -88,8 +88,7 @@ class LettaRequest(BaseModel):
|
||||
)
|
||||
top_logprobs: Optional[int] = Field(
|
||||
default=None,
|
||||
description="Number of most likely tokens to return at each position (0-20). "
|
||||
"Requires return_logprobs=True.",
|
||||
description="Number of most likely tokens to return at each position (0-20). Requires return_logprobs=True.",
|
||||
)
|
||||
return_token_ids: bool = Field(
|
||||
default=False,
|
||||
@@ -155,6 +154,10 @@ class LettaStreamingRequest(LettaRequest):
|
||||
class ConversationMessageRequest(LettaRequest):
|
||||
"""Request for sending messages to a conversation. Streams by default."""
|
||||
|
||||
agent_id: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
|
||||
)
|
||||
streaming: bool = Field(
|
||||
default=True,
|
||||
description="If True (default), returns a streaming response (Server-Sent Events). If False, returns a complete JSON response.",
|
||||
@@ -194,6 +197,10 @@ class CreateBatch(BaseModel):
|
||||
|
||||
|
||||
class RetrieveStreamRequest(BaseModel):
|
||||
agent_id: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
|
||||
)
|
||||
starting_after: int = Field(
|
||||
0, description="Sequence id to use as a cursor for pagination. Response will start streaming after this chunk sequence id"
|
||||
)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
|
||||
|
||||
# Set max_tokens defaults based on model (only if not explicitly provided)
|
||||
if "max_tokens" not in values:
|
||||
if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
|
||||
if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
|
||||
values["max_tokens"] = 128000
|
||||
elif model.startswith("gpt-5"):
|
||||
values["max_tokens"] = 16384
|
||||
elif model == "gpt-4.1":
|
||||
values["max_tokens"] = 8192
|
||||
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
|
||||
context_window=272000,
|
||||
reasoning_effort="none", # Default to "none" for GPT-5.2
|
||||
verbosity="medium",
|
||||
max_tokens=16384,
|
||||
max_tokens=128000,
|
||||
)
|
||||
elif model_name == "letta":
|
||||
return cls(
|
||||
|
||||
@@ -95,6 +95,11 @@ class LLMTrace(LettaBase):
|
||||
response_json: str = Field(..., description="Full response payload as JSON string")
|
||||
llm_config_json: str = Field(default="", description="LLM config as JSON string")
|
||||
|
||||
# Billing context
|
||||
billing_plan_type: Optional[str] = Field(default=None, description="Subscription tier (e.g., 'basic', 'standard', 'max', 'enterprise')")
|
||||
billing_cost_source: Optional[str] = Field(default=None, description="Cost source: 'quota' or 'credits'")
|
||||
billing_customer_id: Optional[str] = Field(default=None, description="Customer ID for cross-referencing billing records")
|
||||
|
||||
# Timestamp
|
||||
created_at: datetime = Field(default_factory=get_utc_time, description="When the trace was created")
|
||||
|
||||
@@ -128,6 +133,9 @@ class LLMTrace(LettaBase):
|
||||
self.request_json,
|
||||
self.response_json,
|
||||
self.llm_config_json,
|
||||
self.billing_plan_type or "",
|
||||
self.billing_cost_source or "",
|
||||
self.billing_customer_id or "",
|
||||
self.created_at,
|
||||
)
|
||||
|
||||
@@ -162,5 +170,8 @@ class LLMTrace(LettaBase):
|
||||
"request_json",
|
||||
"response_json",
|
||||
"llm_config_json",
|
||||
"billing_plan_type",
|
||||
"billing_cost_source",
|
||||
"billing_customer_id",
|
||||
"created_at",
|
||||
]
|
||||
|
||||
@@ -226,8 +226,6 @@ class Memory(BaseModel, validate_assignment=True):
|
||||
front_lines = []
|
||||
if block.description:
|
||||
front_lines.append(f"description: {block.description}")
|
||||
if block.limit is not None:
|
||||
front_lines.append(f"limit: {block.limit}")
|
||||
if getattr(block, "read_only", False):
|
||||
front_lines.append("read_only: true")
|
||||
|
||||
@@ -291,7 +289,40 @@ class Memory(BaseModel, validate_assignment=True):
|
||||
|
||||
s.write("\n\n<memory_filesystem>\n")
|
||||
|
||||
def _render_tree(node: dict, prefix: str = ""):
|
||||
def _render_tree(node: dict, prefix: str = "", in_system: bool = False, path_parts: tuple[str, ...] = ()):
|
||||
# Render skills/ as concise top-level entries only, using both
|
||||
# current (`skills/<name>`) and legacy (`skills/<name>/SKILL`) labels.
|
||||
if path_parts == ("skills",):
|
||||
skill_entries: list[tuple[str, str]] = []
|
||||
for name, val in node.items():
|
||||
if name == LEAF_KEY:
|
||||
continue
|
||||
|
||||
block = None
|
||||
if isinstance(val, dict):
|
||||
legacy_skill_block = val.get("SKILL")
|
||||
if legacy_skill_block is not None and not isinstance(legacy_skill_block, dict):
|
||||
block = legacy_skill_block
|
||||
elif LEAF_KEY in val and not isinstance(val[LEAF_KEY], dict):
|
||||
block = val[LEAF_KEY]
|
||||
else:
|
||||
block = val
|
||||
|
||||
if block is None:
|
||||
continue
|
||||
|
||||
desc = getattr(block, "description", None)
|
||||
desc_line = (desc or "").strip().split("\n")[0].strip()
|
||||
skill_entries.append((name, desc_line))
|
||||
|
||||
skill_entries.sort(key=lambda e: e[0])
|
||||
for i, (name, desc_line) in enumerate(skill_entries):
|
||||
is_last = i == len(skill_entries) - 1
|
||||
connector = "└── " if is_last else "├── "
|
||||
desc_suffix = f" ({desc_line})" if desc_line else ""
|
||||
s.write(f"{prefix}{connector}{name}{desc_suffix}\n")
|
||||
return
|
||||
|
||||
# Sort: directories first, then files. If a node is both a directory and a
|
||||
# leaf (LEAF_KEY present), show both <name>/ and <name>.md.
|
||||
dirs = []
|
||||
@@ -316,9 +347,24 @@ class Memory(BaseModel, validate_assignment=True):
|
||||
if is_dir:
|
||||
s.write(f"{prefix}{connector}{name}/\n")
|
||||
extension = " " if is_last else "│ "
|
||||
_render_tree(node[name], prefix + extension)
|
||||
_render_tree(
|
||||
node[name],
|
||||
prefix + extension,
|
||||
in_system=in_system or name == "system",
|
||||
path_parts=(*path_parts, name),
|
||||
)
|
||||
else:
|
||||
s.write(f"{prefix}{connector}{name}.md\n")
|
||||
# For files outside system/, append the block description
|
||||
desc_suffix = ""
|
||||
if not in_system:
|
||||
val = node[name]
|
||||
block = val[LEAF_KEY] if isinstance(val, dict) else val
|
||||
desc = getattr(block, "description", None)
|
||||
if desc:
|
||||
desc_line = desc.strip().split("\n")[0].strip()
|
||||
if desc_line:
|
||||
desc_suffix = f" ({desc_line})"
|
||||
s.write(f"{prefix}{connector}{name}.md{desc_suffix}\n")
|
||||
|
||||
_render_tree(tree)
|
||||
s.write("</memory_filesystem>")
|
||||
|
||||
@@ -282,10 +282,10 @@ class AnthropicModelSettings(ModelSettings):
|
||||
description="Soft control for how verbose model output should be, used for GPT-5 models.",
|
||||
)
|
||||
|
||||
# Opus 4.5 effort parameter
|
||||
effort: Optional[Literal["low", "medium", "high"]] = Field(
|
||||
# Effort parameter for Opus 4.5, Opus 4.6, and Sonnet 4.6
|
||||
effort: Optional[Literal["low", "medium", "high", "max"]] = Field(
|
||||
None,
|
||||
description="Effort level for Opus 4.5 model (controls token conservation). Not setting this gives similar performance to 'high'.",
|
||||
description="Effort level for supported Anthropic models (controls token spending). 'max' is only available on Opus 4.6. Not setting this gives similar performance to 'high'.",
|
||||
)
|
||||
|
||||
# Anthropic supports strict mode for tool calling - defaults to False
|
||||
|
||||
@@ -3,13 +3,21 @@ from __future__ import annotations
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.schemas.enums import PrimitiveType
|
||||
from letta.schemas.letta_base import OrmMetadataBase
|
||||
|
||||
|
||||
class BillingContext(BaseModel):
|
||||
"""Billing context for LLM request cost tracking."""
|
||||
|
||||
plan_type: Optional[str] = Field(None, description="Subscription tier")
|
||||
cost_source: Optional[str] = Field(None, description="Cost source: 'quota' or 'credits'")
|
||||
customer_id: Optional[str] = Field(None, description="Customer ID for billing records")
|
||||
|
||||
|
||||
class BaseProviderTrace(OrmMetadataBase):
|
||||
__id_prefix__ = PrimitiveType.PROVIDER_TRACE.value
|
||||
|
||||
@@ -53,6 +61,8 @@ class ProviderTrace(BaseProviderTrace):
|
||||
compaction_settings: Optional[Dict[str, Any]] = Field(None, description="Compaction/summarization settings (summarization calls only)")
|
||||
llm_config: Optional[Dict[str, Any]] = Field(None, description="LLM configuration used for this call (non-summarization calls only)")
|
||||
|
||||
billing_context: Optional[BillingContext] = Field(None, description="Billing context from request headers")
|
||||
|
||||
created_at: datetime = Field(default_factory=get_utc_time, description="The timestamp when the object was created.")
|
||||
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ from letta.schemas.providers.base import Provider
|
||||
logger = get_logger(__name__)
|
||||
|
||||
ALLOWED_PREFIXES = {"gpt-4", "gpt-5", "o1", "o3", "o4"}
|
||||
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro", "chat"}
|
||||
DISALLOWED_KEYWORDS = {"transcribe", "search", "realtime", "tts", "audio", "computer", "o1-mini", "o1-preview", "o1-pro"}
|
||||
DEFAULT_EMBEDDING_BATCH_SIZE = 1024
|
||||
|
||||
|
||||
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
|
||||
except Exception as e:
|
||||
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
|
||||
|
||||
@staticmethod
|
||||
def _openai_default_max_output_tokens(model_name: str) -> int:
|
||||
"""Return a sensible max-output-tokens default for OpenAI models.
|
||||
|
||||
gpt-5.2* / gpt-5.3* support 128k output tokens, except the
|
||||
`-chat` variants which are capped at 16k.
|
||||
"""
|
||||
import re
|
||||
|
||||
if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
|
||||
return 128000
|
||||
return 16384
|
||||
|
||||
def get_default_max_output_tokens(self, model_name: str) -> int:
|
||||
"""Get the default max output tokens for OpenAI models (sync fallback)."""
|
||||
# Simple default for openai
|
||||
return 16384
|
||||
return self._openai_default_max_output_tokens(model_name)
|
||||
|
||||
async def get_default_max_output_tokens_async(self, model_name: str) -> int:
|
||||
"""Get the default max output tokens for OpenAI models.
|
||||
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
|
||||
if max_output is not None:
|
||||
return max_output
|
||||
|
||||
# Simple default for openai
|
||||
return 16384
|
||||
return self._openai_default_max_output_tokens(model_name)
|
||||
|
||||
async def _get_models_async(self) -> list[dict]:
|
||||
from letta.llm_api.openai import openai_get_model_list_async
|
||||
|
||||
@@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider
|
||||
|
||||
# Z.ai model context windows
|
||||
# Reference: https://docs.z.ai/
|
||||
# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k) counts against that --> 180k
|
||||
MODEL_CONTEXT_WINDOWS = {
|
||||
"glm-4.5": 128000,
|
||||
"glm-4.6": 200000,
|
||||
"glm-4.7": 200000,
|
||||
"glm-5": 200000,
|
||||
"glm-5-code": 200000,
|
||||
"glm-4.6": 180000,
|
||||
"glm-4.7": 180000,
|
||||
"glm-5": 180000,
|
||||
"glm-5-code": 180000,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from sqlalchemy import NullPool, text
|
||||
from sqlalchemy import NullPool
|
||||
from sqlalchemy.ext.asyncio import (
|
||||
AsyncEngine,
|
||||
AsyncSession,
|
||||
@@ -88,10 +88,6 @@ class DatabaseRegistry:
|
||||
try:
|
||||
async with async_session_factory() as session:
|
||||
try:
|
||||
result = await session.execute(text("SELECT pg_backend_pid(), current_setting('statement_timeout')"))
|
||||
pid, timeout = result.one()
|
||||
logger.warning(f"[stmt_timeout_debug] pid={pid} statement_timeout={timeout}")
|
||||
await session.rollback()
|
||||
yield session
|
||||
await session.commit()
|
||||
except asyncio.CancelledError:
|
||||
|
||||
@@ -6,6 +6,7 @@ from pydantic import BaseModel
|
||||
from letta.errors import LettaInvalidArgumentError
|
||||
from letta.otel.tracing import tracer
|
||||
from letta.schemas.enums import PrimitiveType
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.validators import PRIMITIVE_ID_PATTERNS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -30,18 +31,24 @@ class HeaderParams(BaseModel):
|
||||
letta_source: Optional[str] = None
|
||||
sdk_version: Optional[str] = None
|
||||
experimental_params: Optional[ExperimentalParams] = None
|
||||
billing_context: Optional[BillingContext] = None
|
||||
|
||||
|
||||
def get_headers(
|
||||
actor_id: Optional[str] = Header(None, alias="user_id"),
|
||||
user_agent: Optional[str] = Header(None, alias="User-Agent"),
|
||||
project_id: Optional[str] = Header(None, alias="X-Project-Id"),
|
||||
letta_source: Optional[str] = Header(None, alias="X-Letta-Source"),
|
||||
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version"),
|
||||
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async"),
|
||||
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent"),
|
||||
letta_v1_agent_message_async: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent-Message-Async"),
|
||||
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox"),
|
||||
letta_source: Optional[str] = Header(None, alias="X-Letta-Source", include_in_schema=False),
|
||||
sdk_version: Optional[str] = Header(None, alias="X-Stainless-Package-Version", include_in_schema=False),
|
||||
message_async: Optional[str] = Header(None, alias="X-Experimental-Message-Async", include_in_schema=False),
|
||||
letta_v1_agent: Optional[str] = Header(None, alias="X-Experimental-Letta-V1-Agent", include_in_schema=False),
|
||||
letta_v1_agent_message_async: Optional[str] = Header(
|
||||
None, alias="X-Experimental-Letta-V1-Agent-Message-Async", include_in_schema=False
|
||||
),
|
||||
modal_sandbox: Optional[str] = Header(None, alias="X-Experimental-Modal-Sandbox", include_in_schema=False),
|
||||
billing_plan_type: Optional[str] = Header(None, alias="X-Billing-Plan-Type", include_in_schema=False),
|
||||
billing_cost_source: Optional[str] = Header(None, alias="X-Billing-Cost-Source", include_in_schema=False),
|
||||
billing_customer_id: Optional[str] = Header(None, alias="X-Billing-Customer-Id", include_in_schema=False),
|
||||
) -> HeaderParams:
|
||||
"""Dependency injection function to extract common headers from requests."""
|
||||
with tracer.start_as_current_span("dependency.get_headers"):
|
||||
@@ -63,6 +70,13 @@ def get_headers(
|
||||
letta_v1_agent_message_async=(letta_v1_agent_message_async == "true") if letta_v1_agent_message_async else None,
|
||||
modal_sandbox=(modal_sandbox == "true") if modal_sandbox else None,
|
||||
),
|
||||
billing_context=BillingContext(
|
||||
plan_type=billing_plan_type,
|
||||
cost_source=billing_cost_source,
|
||||
customer_id=billing_customer_id,
|
||||
)
|
||||
if any([billing_plan_type, billing_cost_source, billing_customer_id])
|
||||
else None,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ from letta.schemas.memory import (
|
||||
)
|
||||
from letta.schemas.message import Message, MessageCreate, MessageCreateType, MessageSearchRequest, MessageSearchResult
|
||||
from letta.schemas.passage import Passage
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.run import Run as PydanticRun, RunUpdate
|
||||
from letta.schemas.source import Source
|
||||
from letta.schemas.tool import Tool
|
||||
@@ -156,7 +157,7 @@ async def list_agents(
|
||||
order: Literal["asc", "desc"] = Query(
|
||||
"desc", description="Sort order for agents by creation time. 'asc' for oldest first, 'desc' for newest first"
|
||||
),
|
||||
order_by: Literal["created_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
|
||||
order_by: Literal["created_at", "updated_at", "last_run_completion"] = Query("created_at", description="Field to sort by"),
|
||||
ascending: bool = Query(
|
||||
False,
|
||||
description="Whether to sort agents oldest to newest (True) or newest to oldest (False, default)",
|
||||
@@ -1697,6 +1698,7 @@ async def send_message(
|
||||
actor=actor,
|
||||
request=request,
|
||||
run_type="send_message",
|
||||
billing_context=headers.billing_context,
|
||||
)
|
||||
return result
|
||||
|
||||
@@ -1767,6 +1769,7 @@ async def send_message(
|
||||
include_return_message_types=request.include_return_message_types,
|
||||
client_tools=request.client_tools,
|
||||
include_compaction_messages=request.include_compaction_messages,
|
||||
billing_context=headers.billing_context,
|
||||
)
|
||||
run_status = result.stop_reason.stop_reason.run_status
|
||||
return result
|
||||
@@ -1845,6 +1848,7 @@ async def send_message_streaming(
|
||||
actor=actor,
|
||||
request=request,
|
||||
run_type="send_message_streaming",
|
||||
billing_context=headers.billing_context,
|
||||
)
|
||||
|
||||
return result
|
||||
@@ -1868,6 +1872,13 @@ async def cancel_message(
|
||||
"""
|
||||
# TODO: WHY DOES THIS CANCEL A LIST OF RUNS?
|
||||
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
|
||||
logger.info(
|
||||
"[Interrupt] Cancel request received for agent=%s by actor=%s (org=%s), explicit_run_ids=%s",
|
||||
agent_id,
|
||||
actor.id,
|
||||
actor.organization_id,
|
||||
request.run_ids if request else None,
|
||||
)
|
||||
if not settings.track_agent_run:
|
||||
raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
|
||||
run_ids = request.run_ids if request else None
|
||||
@@ -2036,6 +2047,7 @@ async def _process_message_background(
|
||||
include_return_message_types: list[MessageType] | None = None,
|
||||
override_model: str | None = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> None:
|
||||
"""Background task to process the message and update run status."""
|
||||
request_start_timestamp_ns = get_utc_timestamp_ns()
|
||||
@@ -2067,6 +2079,7 @@ async def _process_message_background(
|
||||
request_start_timestamp_ns=request_start_timestamp_ns,
|
||||
include_return_message_types=include_return_message_types,
|
||||
include_compaction_messages=include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
runs_manager = RunManager()
|
||||
from letta.schemas.enums import RunStatus
|
||||
@@ -2235,6 +2248,7 @@ async def send_message_async(
|
||||
include_return_message_types=request.include_return_message_types,
|
||||
override_model=request.override_model,
|
||||
include_compaction_messages=request.include_compaction_messages,
|
||||
billing_context=headers.billing_context,
|
||||
),
|
||||
label=f"process_message_background_{run.id}",
|
||||
)
|
||||
@@ -2419,7 +2433,11 @@ async def summarize_messages(
|
||||
|
||||
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
|
||||
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
|
||||
if "mode" in changed_fields and agent.compaction_settings.mode != request.compaction_settings.mode:
|
||||
if (
|
||||
"mode" in changed_fields
|
||||
and "prompt" not in changed_fields
|
||||
and agent.compaction_settings.mode != request.compaction_settings.mode
|
||||
):
|
||||
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
|
||||
|
||||
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
|
||||
@@ -2439,7 +2457,7 @@ async def summarize_messages(
|
||||
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
|
||||
detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
|
||||
)
|
||||
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
|
||||
return CompactionResponse(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from datetime import timedelta
|
||||
from typing import Annotated, List, Literal, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
from fastapi import APIRouter, Body, Depends, HTTPException, Query, status
|
||||
from pydantic import BaseModel, Field
|
||||
@@ -18,6 +19,7 @@ from letta.schemas.job import LettaRequestConfig
|
||||
from letta.schemas.letta_message import LettaMessageUnion
|
||||
from letta.schemas.letta_request import ConversationMessageRequest, LettaStreamingRequest, RetrieveStreamRequest
|
||||
from letta.schemas.letta_response import LettaResponse
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.run import Run as PydanticRun
|
||||
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
|
||||
from letta.server.rest_api.redis_stream_manager import redis_sse_stream_generator
|
||||
@@ -32,7 +34,7 @@ from letta.services.run_manager import RunManager
|
||||
from letta.services.streaming_service import StreamingService
|
||||
from letta.services.summarizer.summarizer_config import CompactionSettings
|
||||
from letta.settings import settings
|
||||
from letta.validators import ConversationId
|
||||
from letta.validators import ConversationId, ConversationIdOrDefault
|
||||
|
||||
router = APIRouter(prefix="/conversations", tags=["conversations"])
|
||||
|
||||
@@ -148,7 +150,8 @@ ConversationMessagesResponse = Annotated[
|
||||
operation_id="list_conversation_messages",
|
||||
)
|
||||
async def list_conversation_messages(
|
||||
conversation_id: ConversationId,
|
||||
conversation_id: ConversationIdOrDefault,
|
||||
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
|
||||
server: SyncServer = Depends(get_letta_server),
|
||||
headers: HeaderParams = Depends(get_headers),
|
||||
before: Optional[str] = Query(
|
||||
@@ -172,8 +175,36 @@ async def list_conversation_messages(
|
||||
|
||||
Returns LettaMessage objects (UserMessage, AssistantMessage, etc.) for all
|
||||
messages in the conversation, with support for cursor-based pagination.
|
||||
|
||||
**Agent-direct mode**: Pass conversation_id="default" with agent_id parameter
|
||||
to list messages from the agent's default conversation.
|
||||
|
||||
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
|
||||
"""
|
||||
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
|
||||
|
||||
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
|
||||
# OR conversation_id="agent-*" (backwards compat, deprecated)
|
||||
resolved_agent_id = None
|
||||
if conversation_id == "default" and agent_id:
|
||||
resolved_agent_id = agent_id
|
||||
elif conversation_id.startswith("agent-"):
|
||||
resolved_agent_id = conversation_id
|
||||
|
||||
if resolved_agent_id:
|
||||
return await server.get_agent_recall_async(
|
||||
agent_id=resolved_agent_id,
|
||||
after=after,
|
||||
before=before,
|
||||
limit=limit,
|
||||
group_id=group_id,
|
||||
conversation_id=None, # Default conversation (no isolation)
|
||||
reverse=(order == "desc"),
|
||||
return_message_object=False,
|
||||
include_err=include_err,
|
||||
actor=actor,
|
||||
)
|
||||
|
||||
return await conversation_manager.list_conversation_messages(
|
||||
conversation_id=conversation_id,
|
||||
actor=actor,
|
||||
@@ -186,6 +217,108 @@ async def list_conversation_messages(
|
||||
)
|
||||
|
||||
|
||||
async def _send_agent_direct_message(
|
||||
agent_id: str,
|
||||
request: ConversationMessageRequest,
|
||||
server: SyncServer,
|
||||
actor,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> StreamingResponse | LettaResponse:
|
||||
"""
|
||||
Handle agent-direct messaging with locking but without conversation features.
|
||||
|
||||
This is used when the conversation_id in the URL is actually an agent ID,
|
||||
providing a unified endpoint while maintaining agent-level locking.
|
||||
"""
|
||||
redis_client = await get_redis_client()
|
||||
|
||||
# Streaming mode (default)
|
||||
if request.streaming:
|
||||
streaming_request = LettaStreamingRequest(
|
||||
messages=request.messages,
|
||||
streaming=True,
|
||||
stream_tokens=request.stream_tokens,
|
||||
include_pings=request.include_pings,
|
||||
background=request.background,
|
||||
max_steps=request.max_steps,
|
||||
use_assistant_message=request.use_assistant_message,
|
||||
assistant_message_tool_name=request.assistant_message_tool_name,
|
||||
assistant_message_tool_kwarg=request.assistant_message_tool_kwarg,
|
||||
include_return_message_types=request.include_return_message_types,
|
||||
override_model=request.override_model,
|
||||
client_tools=request.client_tools,
|
||||
)
|
||||
streaming_service = StreamingService(server)
|
||||
run, result = await streaming_service.create_agent_stream(
|
||||
agent_id=agent_id,
|
||||
actor=actor,
|
||||
request=streaming_request,
|
||||
run_type="send_message",
|
||||
conversation_id=None,
|
||||
should_lock=True,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
return result
|
||||
|
||||
# Non-streaming mode with locking
|
||||
agent = await server.agent_manager.get_agent_by_id_async(
|
||||
agent_id,
|
||||
actor,
|
||||
include_relationships=["memory", "multi_agent_group", "sources", "tool_exec_environment_variables", "tools", "tags"],
|
||||
)
|
||||
|
||||
# Handle model override if specified in the request
|
||||
if request.override_model:
|
||||
override_llm_config = await server.get_llm_config_from_handle_async(
|
||||
actor=actor,
|
||||
handle=request.override_model,
|
||||
)
|
||||
agent = agent.model_copy(update={"llm_config": override_llm_config})
|
||||
|
||||
# Acquire lock using agent_id as lock key
|
||||
if not isinstance(redis_client, NoopAsyncRedisClient):
|
||||
await redis_client.acquire_conversation_lock(
|
||||
conversation_id=agent_id,
|
||||
token=str(uuid4()),
|
||||
)
|
||||
|
||||
try:
|
||||
# Create a run for execution tracking
|
||||
run = None
|
||||
if settings.track_agent_run:
|
||||
runs_manager = RunManager()
|
||||
run = await runs_manager.create_run(
|
||||
pydantic_run=PydanticRun(
|
||||
agent_id=agent_id,
|
||||
background=False,
|
||||
metadata={
|
||||
"run_type": "send_message",
|
||||
},
|
||||
request_config=LettaRequestConfig.from_letta_request(request),
|
||||
),
|
||||
actor=actor,
|
||||
)
|
||||
|
||||
# Set run_id in Redis for cancellation support
|
||||
await redis_client.set(f"{REDIS_RUN_ID_PREFIX}:{agent_id}", run.id if run else None)
|
||||
|
||||
agent_loop = AgentLoop.load(agent_state=agent, actor=actor)
|
||||
return await agent_loop.step(
|
||||
request.messages,
|
||||
max_steps=request.max_steps,
|
||||
run_id=run.id if run else None,
|
||||
use_assistant_message=request.use_assistant_message,
|
||||
include_return_message_types=request.include_return_message_types,
|
||||
client_tools=request.client_tools,
|
||||
conversation_id=None,
|
||||
include_compaction_messages=request.include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
finally:
|
||||
# Release lock
|
||||
await redis_client.release_conversation_lock(agent_id)
|
||||
|
||||
|
||||
@router.post(
|
||||
"/{conversation_id}/messages",
|
||||
response_model=LettaResponse,
|
||||
@@ -201,7 +334,7 @@ async def list_conversation_messages(
|
||||
},
|
||||
)
|
||||
async def send_conversation_message(
|
||||
conversation_id: ConversationId,
|
||||
conversation_id: ConversationIdOrDefault,
|
||||
request: ConversationMessageRequest = Body(...),
|
||||
server: SyncServer = Depends(get_letta_server),
|
||||
headers: HeaderParams = Depends(get_headers),
|
||||
@@ -212,12 +345,36 @@ async def send_conversation_message(
|
||||
This endpoint sends a message to an existing conversation.
|
||||
By default (streaming=true), returns a streaming response (Server-Sent Events).
|
||||
Set streaming=false to get a complete JSON response.
|
||||
|
||||
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
|
||||
to send messages to the agent's default conversation with locking.
|
||||
|
||||
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
|
||||
"""
|
||||
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
|
||||
|
||||
if not request.messages or len(request.messages) == 0:
|
||||
raise HTTPException(status_code=422, detail="Messages must not be empty")
|
||||
|
||||
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
|
||||
# OR conversation_id="agent-*" (backwards compat, deprecated)
|
||||
resolved_agent_id = None
|
||||
if conversation_id == "default" and request.agent_id:
|
||||
resolved_agent_id = request.agent_id
|
||||
elif conversation_id.startswith("agent-"):
|
||||
resolved_agent_id = conversation_id
|
||||
|
||||
if resolved_agent_id:
|
||||
# Agent-direct mode: use agent ID, enable locking, skip conversation features
|
||||
return await _send_agent_direct_message(
|
||||
agent_id=resolved_agent_id,
|
||||
request=request,
|
||||
server=server,
|
||||
actor=actor,
|
||||
billing_context=headers.billing_context,
|
||||
)
|
||||
|
||||
# Normal conversation mode
|
||||
conversation = await conversation_manager.get_conversation_by_id(
|
||||
conversation_id=conversation_id,
|
||||
actor=actor,
|
||||
@@ -247,6 +404,7 @@ async def send_conversation_message(
|
||||
request=streaming_request,
|
||||
run_type="send_conversation_message",
|
||||
conversation_id=conversation_id,
|
||||
billing_context=headers.billing_context,
|
||||
)
|
||||
return result
|
||||
|
||||
@@ -265,6 +423,10 @@ async def send_conversation_message(
|
||||
)
|
||||
if conversation.model_settings is not None:
|
||||
update_params = conversation.model_settings._to_legacy_config_params()
|
||||
# Don't clobber max_tokens with the Pydantic default when the caller
|
||||
# didn't explicitly provide max_output_tokens.
|
||||
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
|
||||
update_params.pop("max_tokens", None)
|
||||
conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
|
||||
agent = agent.model_copy(update={"llm_config": conversation_llm_config})
|
||||
|
||||
@@ -305,6 +467,7 @@ async def send_conversation_message(
|
||||
client_tools=request.client_tools,
|
||||
conversation_id=conversation_id,
|
||||
include_compaction_messages=request.include_compaction_messages,
|
||||
billing_context=headers.billing_context,
|
||||
)
|
||||
|
||||
|
||||
@@ -341,7 +504,7 @@ async def send_conversation_message(
|
||||
},
|
||||
)
|
||||
async def retrieve_conversation_stream(
|
||||
conversation_id: ConversationId,
|
||||
conversation_id: ConversationIdOrDefault,
|
||||
request: RetrieveStreamRequest = Body(None),
|
||||
headers: HeaderParams = Depends(get_headers),
|
||||
server: SyncServer = Depends(get_letta_server),
|
||||
@@ -351,11 +514,35 @@ async def retrieve_conversation_stream(
|
||||
|
||||
This endpoint allows you to reconnect to an active background stream
|
||||
for a conversation, enabling recovery from network interruptions.
|
||||
|
||||
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
|
||||
to retrieve the stream for the agent's most recent active run.
|
||||
|
||||
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
|
||||
"""
|
||||
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
|
||||
runs_manager = RunManager()
|
||||
|
||||
# Find the most recent active run for this conversation
|
||||
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
|
||||
# OR conversation_id="agent-*" (backwards compat, deprecated)
|
||||
resolved_agent_id = None
|
||||
if conversation_id == "default" and request and request.agent_id:
|
||||
resolved_agent_id = request.agent_id
|
||||
elif conversation_id.startswith("agent-"):
|
||||
resolved_agent_id = conversation_id
|
||||
|
||||
# Find the most recent active run
|
||||
if resolved_agent_id:
|
||||
# Agent-direct mode: find runs by agent_id
|
||||
active_runs = await runs_manager.list_runs(
|
||||
actor=actor,
|
||||
agent_id=resolved_agent_id,
|
||||
statuses=[RunStatus.created, RunStatus.running],
|
||||
limit=1,
|
||||
ascending=False,
|
||||
)
|
||||
else:
|
||||
# Normal mode: find runs by conversation_id
|
||||
active_runs = await runs_manager.list_runs(
|
||||
actor=actor,
|
||||
conversation_id=conversation_id,
|
||||
@@ -417,7 +604,8 @@ async def retrieve_conversation_stream(
|
||||
|
||||
@router.post("/{conversation_id}/cancel", operation_id="cancel_conversation")
|
||||
async def cancel_conversation(
|
||||
conversation_id: ConversationId,
|
||||
conversation_id: ConversationIdOrDefault,
|
||||
agent_id: Optional[str] = Query(None, description="Agent ID for agent-direct mode with 'default' conversation"),
|
||||
server: SyncServer = Depends(get_letta_server),
|
||||
headers: HeaderParams = Depends(get_headers),
|
||||
) -> dict:
|
||||
@@ -425,17 +613,48 @@ async def cancel_conversation(
|
||||
Cancel runs associated with a conversation.
|
||||
|
||||
Note: To cancel active runs, Redis is required.
|
||||
|
||||
**Agent-direct mode**: Pass conversation_id="default" with agent_id query parameter
|
||||
to cancel runs for the agent's default conversation.
|
||||
|
||||
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
|
||||
"""
|
||||
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
|
||||
logger.info(
|
||||
"[Interrupt] Cancel request received for conversation=%s by actor=%s (org=%s)",
|
||||
conversation_id,
|
||||
actor.id,
|
||||
actor.organization_id,
|
||||
)
|
||||
|
||||
if not settings.track_agent_run:
|
||||
raise HTTPException(status_code=400, detail="Agent run tracking is disabled")
|
||||
|
||||
# Agent-direct mode: conversation_id="default" + agent_id param (preferred)
|
||||
# OR conversation_id="agent-*" (backwards compat, deprecated)
|
||||
resolved_agent_id = None
|
||||
if conversation_id == "default" and agent_id:
|
||||
resolved_agent_id = agent_id
|
||||
elif conversation_id.startswith("agent-"):
|
||||
resolved_agent_id = conversation_id
|
||||
|
||||
if resolved_agent_id:
|
||||
# Agent-direct mode: use agent_id directly, skip conversation lookup
|
||||
# Find active runs for this agent (default conversation has conversation_id=None)
|
||||
runs = await server.run_manager.list_runs(
|
||||
actor=actor,
|
||||
agent_id=resolved_agent_id,
|
||||
statuses=[RunStatus.created, RunStatus.running],
|
||||
ascending=False,
|
||||
limit=100,
|
||||
)
|
||||
else:
|
||||
# Verify conversation exists and get agent_id
|
||||
conversation = await conversation_manager.get_conversation_by_id(
|
||||
conversation_id=conversation_id,
|
||||
actor=actor,
|
||||
)
|
||||
agent_id = conversation.agent_id
|
||||
|
||||
# Find active runs for this conversation
|
||||
runs = await server.run_manager.list_runs(
|
||||
@@ -445,6 +664,7 @@ async def cancel_conversation(
|
||||
conversation_id=conversation_id,
|
||||
limit=100,
|
||||
)
|
||||
|
||||
run_ids = [run.id for run in runs]
|
||||
|
||||
if not run_ids:
|
||||
@@ -461,7 +681,7 @@ async def cancel_conversation(
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to cancel Lettuce run {run_id}: {e}")
|
||||
|
||||
await server.run_manager.cancel_run(actor=actor, agent_id=conversation.agent_id, run_id=run_id)
|
||||
await server.run_manager.cancel_run(actor=actor, agent_id=agent_id, run_id=run_id)
|
||||
except Exception as e:
|
||||
results[run_id] = "failed"
|
||||
logger.error(f"Failed to cancel run {run_id}: {str(e)}")
|
||||
@@ -473,6 +693,10 @@ async def cancel_conversation(
|
||||
|
||||
|
||||
class CompactionRequest(BaseModel):
|
||||
agent_id: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Agent ID for agent-direct mode with 'default' conversation. Use with conversation_id='default' in the URL path.",
|
||||
)
|
||||
compaction_settings: Optional[CompactionSettings] = Field(
|
||||
default=None,
|
||||
description="Optional compaction settings to use for this summarization request. If not provided, the agent's default settings will be used.",
|
||||
@@ -487,7 +711,7 @@ class CompactionResponse(BaseModel):
|
||||
|
||||
@router.post("/{conversation_id}/compact", response_model=CompactionResponse, operation_id="compact_conversation")
|
||||
async def compact_conversation(
|
||||
conversation_id: ConversationId,
|
||||
conversation_id: ConversationIdOrDefault,
|
||||
request: Optional[CompactionRequest] = Body(default=None),
|
||||
server: SyncServer = Depends(get_letta_server),
|
||||
headers: HeaderParams = Depends(get_headers),
|
||||
@@ -497,9 +721,28 @@ async def compact_conversation(
|
||||
|
||||
This endpoint summarizes the in-context messages for a specific conversation,
|
||||
reducing the message count while preserving important context.
|
||||
|
||||
**Agent-direct mode**: Pass conversation_id="default" with agent_id in request body
|
||||
to compact the agent's default conversation messages.
|
||||
|
||||
**Deprecated**: Passing an agent ID as conversation_id still works but will be removed.
|
||||
"""
|
||||
actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
|
||||
|
||||
# Agent-direct mode: conversation_id="default" + agent_id in body (preferred)
|
||||
# OR conversation_id="agent-*" (backwards compat, deprecated)
|
||||
resolved_agent_id = None
|
||||
if conversation_id == "default" and request and request.agent_id:
|
||||
resolved_agent_id = request.agent_id
|
||||
elif conversation_id.startswith("agent-"):
|
||||
resolved_agent_id = conversation_id
|
||||
|
||||
if resolved_agent_id:
|
||||
# Agent-direct mode: compact agent's default conversation
|
||||
agent = await server.agent_manager.get_agent_by_id_async(resolved_agent_id, actor, include_relationships=["multi_agent_group"])
|
||||
in_context_messages = await server.message_manager.get_messages_by_ids_async(message_ids=agent.message_ids, actor=actor)
|
||||
agent_loop = LettaAgentV3(agent_state=agent, actor=actor)
|
||||
else:
|
||||
# Get the conversation to find the agent_id
|
||||
conversation = await conversation_manager.get_conversation_by_id(
|
||||
conversation_id=conversation_id,
|
||||
@@ -515,16 +758,36 @@ async def compact_conversation(
|
||||
actor=actor,
|
||||
)
|
||||
|
||||
# Create agent loop with conversation context
|
||||
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
|
||||
|
||||
if not in_context_messages:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="No in-context messages found for this conversation.",
|
||||
)
|
||||
|
||||
# Create agent loop with conversation context
|
||||
agent_loop = LettaAgentV3(agent_state=agent, actor=actor, conversation_id=conversation_id)
|
||||
# Merge request compaction_settings with agent's settings (request overrides agent)
|
||||
if agent.compaction_settings and request and request.compaction_settings:
|
||||
# Start with agent's settings, override with new values from request
|
||||
# Use model_fields_set to get the fields that were changed in the request (want to ignore the defaults that get set automatically)
|
||||
compaction_settings = agent.compaction_settings.copy() # do not mutate original agent compaction settings
|
||||
changed_fields = request.compaction_settings.model_fields_set
|
||||
for field in changed_fields:
|
||||
setattr(compaction_settings, field, getattr(request.compaction_settings, field))
|
||||
|
||||
compaction_settings = request.compaction_settings if request else None
|
||||
# If mode changed from agent's original settings and prompt not explicitly set in request, then use the default prompt for the new mode
|
||||
# Ex: previously was sliding_window, now is all, so we need to use the default prompt for all mode
|
||||
if (
|
||||
"mode" in changed_fields
|
||||
and "prompt" not in changed_fields
|
||||
and agent.compaction_settings.mode != request.compaction_settings.mode
|
||||
):
|
||||
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
|
||||
|
||||
compaction_settings.prompt = get_default_prompt_for_mode(compaction_settings.mode)
|
||||
else:
|
||||
compaction_settings = (request and request.compaction_settings) or agent.compaction_settings
|
||||
num_messages_before = len(in_context_messages)
|
||||
|
||||
# Run compaction
|
||||
@@ -537,13 +800,11 @@ async def compact_conversation(
|
||||
|
||||
# Validate compaction reduced messages
|
||||
if num_messages_before <= num_messages_after:
|
||||
logger.warning(
|
||||
f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after} (only expected if drop_tool_returns is True)."
|
||||
logger.warning(f"Summarization failed to reduce the number of messages. {num_messages_before} messages -> {num_messages_after}.")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="Summarization failed to reduce the number of messages. You may not have enough messages to compact or need to use a different CompactionSettings (e.g. using `all` mode).",
|
||||
)
|
||||
# raise HTTPException(
|
||||
# status_code=status.HTTP_400_BAD_REQUEST,
|
||||
# detail="Summarization failed to reduce the number of messages. You may need to use a different CompactionSettings (e.g. using `all` mode).",
|
||||
# )
|
||||
|
||||
# Checkpoint the messages (this will update the conversation_messages table)
|
||||
await agent_loop._checkpoint_messages(run_id=None, step_id=None, new_messages=[summary_message], in_context_messages=messages)
|
||||
|
||||
@@ -29,11 +29,23 @@ from starlette.background import BackgroundTask
|
||||
|
||||
from letta.log import get_logger
|
||||
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
|
||||
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_background_tasks: set[asyncio.Task] = set()
|
||||
|
||||
|
||||
def _is_syncable_block_markdown_path(path: str) -> bool:
|
||||
"""Return whether a markdown path should be mirrored into block cache.
|
||||
|
||||
Special-case skills so only skill definitions are mirrored:
|
||||
- sync `skills/{skill_name}/SKILL.md` as label `skills/{skill_name}`
|
||||
- ignore all other markdown under `skills/`
|
||||
"""
|
||||
return memory_block_label_from_markdown_path(path) is not None
|
||||
|
||||
|
||||
router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False)
|
||||
|
||||
# Global storage for the server instance (set during app startup)
|
||||
@@ -100,7 +112,7 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
|
||||
expected_labels = set()
|
||||
from letta.services.memory_repo.block_markdown import parse_block_markdown
|
||||
|
||||
md_file_paths = sorted([file_path for file_path in files if file_path.endswith(".md")])
|
||||
md_file_paths = sorted([file_path for file_path in files if _is_syncable_block_markdown_path(file_path)])
|
||||
nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]]
|
||||
logger.info(
|
||||
"Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s",
|
||||
@@ -113,10 +125,12 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None:
|
||||
|
||||
synced = 0
|
||||
for file_path, content in files.items():
|
||||
if not file_path.endswith(".md"):
|
||||
if not _is_syncable_block_markdown_path(file_path):
|
||||
continue
|
||||
|
||||
label = file_path[:-3]
|
||||
label = memory_block_label_from_markdown_path(file_path)
|
||||
if label is None:
|
||||
continue
|
||||
expected_labels.add(label)
|
||||
|
||||
# Parse frontmatter to extract metadata alongside value
|
||||
|
||||
@@ -364,6 +364,8 @@ def create_approval_request_message_from_llm_response(
|
||||
)
|
||||
if pre_computed_assistant_message_id:
|
||||
approval_message.id = decrement_message_uuid(pre_computed_assistant_message_id)
|
||||
# Set otid to match streaming interface pattern (index -1 returns id unchanged)
|
||||
approval_message.otid = Message.generate_otid_from_id(approval_message.id, -1)
|
||||
messages.append(approval_message)
|
||||
return messages
|
||||
|
||||
|
||||
@@ -562,6 +562,10 @@ class SyncServer(object):
|
||||
# update with model_settings
|
||||
if request.model_settings is not None:
|
||||
update_llm_config_params = request.model_settings._to_legacy_config_params()
|
||||
# Don't clobber max_tokens with the Pydantic default when the caller
|
||||
# didn't explicitly provide max_output_tokens in the request.
|
||||
if "max_output_tokens" not in request.model_settings.model_fields_set:
|
||||
update_llm_config_params.pop("max_tokens", None)
|
||||
request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)
|
||||
|
||||
# Copy parallel_tool_calls from request to llm_config if provided
|
||||
@@ -675,6 +679,12 @@ class SyncServer(object):
|
||||
# Get the current agent's llm_config if not already set
|
||||
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
|
||||
request.llm_config = agent.llm_config.model_copy()
|
||||
else:
|
||||
# TODO: Refactor update_agent to accept partial llm_config so we
|
||||
# don't need to fetch the full agent just to preserve max_tokens.
|
||||
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
|
||||
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
|
||||
request.llm_config.max_tokens = agent.llm_config.max_tokens
|
||||
update_llm_config_params = request.model_settings._to_legacy_config_params()
|
||||
# Don't clobber max_tokens with the Pydantic default when the caller
|
||||
# didn't explicitly provide max_output_tokens in the request.
|
||||
|
||||
@@ -24,8 +24,7 @@ from letta.constants import (
|
||||
INCLUDE_MODEL_KEYWORDS_BASE_TOOL_RULES,
|
||||
RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE,
|
||||
)
|
||||
|
||||
from letta.errors import LettaAgentNotFoundError, LettaError, LettaInvalidArgumentError
|
||||
from letta.errors import LettaError
|
||||
from letta.helpers import ToolRulesSolver
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.log import get_logger
|
||||
@@ -789,6 +788,25 @@ class AgentManager:
|
||||
agent.agent_type,
|
||||
)
|
||||
|
||||
# Upsert compaction_settings: merge incoming partial update with existing settings
|
||||
if agent_update.compaction_settings is not None:
|
||||
# If mode changed, update the prompt to the default for the new mode
|
||||
changed_fields = agent_update.compaction_settings.model_fields_set
|
||||
if (
|
||||
agent.compaction_settings is not None
|
||||
and "mode" in changed_fields
|
||||
and agent_update.compaction_settings.mode != agent.compaction_settings.mode
|
||||
):
|
||||
from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode
|
||||
|
||||
agent_update.compaction_settings.prompt = get_default_prompt_for_mode(agent_update.compaction_settings.mode)
|
||||
|
||||
# Fill in unchanged fields from existing settings
|
||||
if agent.compaction_settings is not None:
|
||||
for field in agent.compaction_settings.model_fields:
|
||||
if field not in changed_fields:
|
||||
setattr(agent_update.compaction_settings, field, getattr(agent.compaction_settings, field))
|
||||
|
||||
scalar_updates = {
|
||||
"name": agent_update.name,
|
||||
"system": agent_update.system,
|
||||
|
||||
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
|
||||
from sqlalchemy import and_, asc, delete, desc, func, nulls_last, or_, select
|
||||
|
||||
from letta.errors import LettaInvalidArgumentError
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.orm.agent import Agent as AgentModel
|
||||
from letta.orm.block import Block as BlockModel
|
||||
from letta.orm.blocks_conversations import BlocksConversations
|
||||
@@ -29,6 +30,21 @@ from letta.utils import enforce_types
|
||||
class ConversationManager:
|
||||
"""Manager class to handle business logic related to Conversations."""
|
||||
|
||||
@staticmethod
|
||||
def _serialize_model_settings(model_settings) -> Optional[dict]:
|
||||
"""Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
|
||||
|
||||
Uses model_dump() to preserve all fields (including the provider_type discriminator),
|
||||
but removes max_output_tokens when it wasn't explicitly provided by the caller so we
|
||||
don't persist the Pydantic default (4096) and later overwrite the agent's own value.
|
||||
"""
|
||||
if model_settings is None:
|
||||
return None
|
||||
data = model_settings.model_dump()
|
||||
if "max_output_tokens" not in model_settings.model_fields_set:
|
||||
data.pop("max_output_tokens", None)
|
||||
return data
|
||||
|
||||
@enforce_types
|
||||
@trace_method
|
||||
async def create_conversation(
|
||||
@@ -56,7 +72,7 @@ class ConversationManager:
|
||||
summary=conversation_create.summary,
|
||||
organization_id=actor.organization_id,
|
||||
model=conversation_create.model,
|
||||
model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None,
|
||||
model_settings=self._serialize_model_settings(conversation_create.model_settings),
|
||||
)
|
||||
await conversation.create_async(session, actor=actor)
|
||||
|
||||
@@ -73,8 +89,102 @@ class ConversationManager:
|
||||
|
||||
pydantic_conversation = conversation.to_pydantic()
|
||||
pydantic_conversation.isolated_block_ids = isolated_block_ids
|
||||
|
||||
# Compile and persist the initial system message for this conversation
|
||||
# This ensures the conversation captures the latest memory block state at creation time
|
||||
await self.compile_and_save_system_message_for_conversation(
|
||||
conversation_id=pydantic_conversation.id,
|
||||
agent_id=agent_id,
|
||||
actor=actor,
|
||||
)
|
||||
|
||||
return pydantic_conversation
|
||||
|
||||
@trace_method
async def compile_and_save_system_message_for_conversation(
    self,
    conversation_id: str,
    agent_id: str,
    actor: PydanticUser,
    agent_state: Optional["AgentState"] = None,
    message_manager: Optional[object] = None,
) -> PydanticMessage:
    """Compile and persist the initial system message for a conversation.

    This recompiles the system prompt with the latest memory block values
    and metadata, ensuring the conversation starts with an up-to-date
    system message.

    This is the single source of truth for creating a conversation's system
    message — used both at conversation creation time and as a fallback
    when a conversation has no messages yet.

    Steps performed:
        1. Resolve the MessageManager and agent state (loading the agent with
           its ``memory`` and ``sources`` relationships when not supplied).
        2. Count the agent's messages and archival passages, which are fed
           into the compiled prompt.
        3. Compile the system prompt, wrap it in a system-role message and
           persist it.
        4. Register the persisted message on the conversation at position 0.

    Args:
        conversation_id: The conversation to add the system message to
        agent_id: The agent this conversation belongs to
        actor: The user performing the action
        agent_state: Optional pre-loaded agent state (avoids redundant DB load)
        message_manager: Optional pre-loaded MessageManager instance

    Returns:
        The persisted system message
    """
    # Lazy imports to avoid circular dependencies
    from letta.prompts.prompt_generator import PromptGenerator
    from letta.services.message_manager import MessageManager
    from letta.services.passage_manager import PassageManager

    if message_manager is None:
        message_manager = MessageManager()

    if agent_state is None:
        from letta.services.agent_manager import AgentManager

        agent_state = await AgentManager().get_agent_by_id_async(
            agent_id=agent_id,
            include_relationships=["memory", "sources"],
            actor=actor,
        )

    passage_manager = PassageManager()
    num_messages = await message_manager.size_async(actor=actor, agent_id=agent_id)
    num_archival_memories = await passage_manager.agent_passage_size_async(actor=actor, agent_id=agent_id)

    # Compile the system message with current memory state
    system_message_str = await PromptGenerator.compile_system_message_async(
        system_prompt=agent_state.system,
        in_context_memory=agent_state.memory,
        in_context_memory_last_edit=get_utc_time(),
        timezone=agent_state.timezone,
        user_defined_variables=None,
        append_icm_if_missing=True,
        previous_message_count=num_messages,
        archival_memory_size=num_archival_memories,
        sources=agent_state.sources,
        max_files_open=agent_state.max_files_open,
    )

    system_message = PydanticMessage.dict_to_message(
        agent_id=agent_id,
        model=agent_state.llm_config.model,
        openai_message_dict={"role": "system", "content": system_message_str},
    )

    # Persist the new system message; continue with the stored copy that is returned
    persisted_messages = await message_manager.create_many_messages_async([system_message], actor=actor)
    system_message = persisted_messages[0]

    # Add it to the conversation tracking at position 0
    await self.add_messages_to_conversation(
        conversation_id=conversation_id,
        agent_id=agent_id,
        message_ids=[system_message.id],
        actor=actor,
        starting_position=0,
    )

    return system_message
|
||||
|
||||
@enforce_types
|
||||
@trace_method
|
||||
async def get_conversation_by_id(
|
||||
@@ -133,22 +243,15 @@ class ConversationManager:
|
||||
if sort_by == "last_run_completion":
|
||||
# Subquery to get the latest completed_at for each conversation
|
||||
latest_run_subquery = (
|
||||
select(
|
||||
RunModel.conversation_id,
|
||||
func.max(RunModel.completed_at).label("last_run_completion")
|
||||
)
|
||||
select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
|
||||
.where(RunModel.conversation_id.isnot(None))
|
||||
.group_by(RunModel.conversation_id)
|
||||
.subquery()
|
||||
)
|
||||
|
||||
# Join conversations with the subquery
|
||||
stmt = (
|
||||
select(ConversationModel)
|
||||
.outerjoin(
|
||||
latest_run_subquery,
|
||||
ConversationModel.id == latest_run_subquery.c.conversation_id
|
||||
)
|
||||
stmt = select(ConversationModel).outerjoin(
|
||||
latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
|
||||
)
|
||||
sort_column = latest_run_subquery.c.last_run_completion
|
||||
sort_nulls_last = True
|
||||
@@ -170,10 +273,12 @@ class ConversationManager:
|
||||
|
||||
# Add summary search filter if provided
|
||||
if summary_search:
|
||||
conditions.extend([
|
||||
conditions.extend(
|
||||
[
|
||||
ConversationModel.summary.isnot(None),
|
||||
ConversationModel.summary.contains(summary_search),
|
||||
])
|
||||
]
|
||||
)
|
||||
|
||||
stmt = stmt.where(and_(*conditions))
|
||||
|
||||
@@ -182,10 +287,7 @@ class ConversationManager:
|
||||
# Get the sort value for the cursor conversation
|
||||
if sort_by == "last_run_completion":
|
||||
cursor_query = (
|
||||
select(
|
||||
ConversationModel.id,
|
||||
func.max(RunModel.completed_at).label("last_run_completion")
|
||||
)
|
||||
select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
|
||||
.outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
|
||||
.where(ConversationModel.id == after)
|
||||
.group_by(ConversationModel.id)
|
||||
@@ -198,16 +300,11 @@ class ConversationManager:
|
||||
# Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
|
||||
if ascending:
|
||||
stmt = stmt.where(
|
||||
or_(
|
||||
and_(sort_column.is_(None), ConversationModel.id > after_id),
|
||||
sort_column.isnot(None)
|
||||
)
|
||||
or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
|
||||
)
|
||||
else:
|
||||
# If descending, get NULLs with smaller ID
|
||||
stmt = stmt.where(
|
||||
and_(sort_column.is_(None), ConversationModel.id < after_id)
|
||||
)
|
||||
stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
|
||||
else:
|
||||
# Cursor is at non-NULL
|
||||
if ascending:
|
||||
@@ -217,8 +314,8 @@ class ConversationManager:
|
||||
sort_column.isnot(None),
|
||||
or_(
|
||||
sort_column > after_sort_value,
|
||||
and_(sort_column == after_sort_value, ConversationModel.id > after_id)
|
||||
)
|
||||
and_(sort_column == after_sort_value, ConversationModel.id > after_id),
|
||||
),
|
||||
)
|
||||
)
|
||||
else:
|
||||
@@ -227,7 +324,7 @@ class ConversationManager:
|
||||
or_(
|
||||
sort_column.is_(None),
|
||||
sort_column < after_sort_value,
|
||||
and_(sort_column == after_sort_value, ConversationModel.id < after_id)
|
||||
and_(sort_column == after_sort_value, ConversationModel.id < after_id),
|
||||
)
|
||||
)
|
||||
else:
|
||||
@@ -277,7 +374,11 @@ class ConversationManager:
|
||||
for key, value in update_data.items():
|
||||
# model_settings needs to be serialized to dict for the JSON column
|
||||
if key == "model_settings" and value is not None:
|
||||
setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value)
|
||||
setattr(
|
||||
conversation,
|
||||
key,
|
||||
self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
|
||||
)
|
||||
else:
|
||||
setattr(conversation, key, value)
|
||||
|
||||
|
||||
@@ -604,6 +604,9 @@ def _apply_pagination(
|
||||
if sort_by == "last_run_completion":
|
||||
sort_column = AgentModel.last_run_completion
|
||||
sort_nulls_last = True # TODO: handle this as a query param eventually
|
||||
elif sort_by == "updated_at":
|
||||
sort_column = AgentModel.updated_at
|
||||
sort_nulls_last = False
|
||||
else:
|
||||
sort_column = AgentModel.created_at
|
||||
sort_nulls_last = False
|
||||
@@ -637,6 +640,9 @@ async def _apply_pagination_async(
|
||||
if sort_by == "last_run_completion":
|
||||
sort_column = AgentModel.last_run_completion
|
||||
sort_nulls_last = True # TODO: handle this as a query param eventually
|
||||
elif sort_by == "updated_at":
|
||||
sort_column = AgentModel.updated_at
|
||||
sort_nulls_last = False
|
||||
else:
|
||||
sort_column = AgentModel.created_at
|
||||
sort_nulls_last = False
|
||||
|
||||
@@ -73,7 +73,6 @@ class LLMTraceWriter:
|
||||
def __init__(self):
|
||||
self._client = None
|
||||
self._shutdown = False
|
||||
self._write_lock = asyncio.Lock() # Serialize writes - clickhouse_connect isn't thread-safe
|
||||
|
||||
# Check if ClickHouse is configured - if not, writing is disabled
|
||||
self._enabled = bool(settings.clickhouse_endpoint and settings.clickhouse_password)
|
||||
@@ -82,11 +81,7 @@ class LLMTraceWriter:
|
||||
atexit.register(self._sync_shutdown)
|
||||
|
||||
def _get_client(self):
|
||||
"""Initialize ClickHouse client on first use (lazy loading).
|
||||
|
||||
Configures async_insert with wait_for_async_insert=1 for reliable
|
||||
server-side batching with acknowledgment.
|
||||
"""
|
||||
"""Initialize ClickHouse client on first use (lazy loading)."""
|
||||
if self._client is not None:
|
||||
return self._client
|
||||
|
||||
@@ -108,8 +103,10 @@ class LLMTraceWriter:
|
||||
settings={
|
||||
# Enable server-side batching
|
||||
"async_insert": 1,
|
||||
# Wait for acknowledgment (reliable)
|
||||
"wait_for_async_insert": 1,
|
||||
# Don't wait for server-side flush acknowledgment — fire and forget.
|
||||
# Waiting (value=1) caused each insert to hold an asyncio.Lock for ~1s,
|
||||
# creating unbounded task queues that saturated the event loop under load.
|
||||
"wait_for_async_insert": 0,
|
||||
# Flush after 1 second if batch not full
|
||||
"async_insert_busy_timeout_ms": 1000,
|
||||
},
|
||||
@@ -148,9 +145,9 @@ class LLMTraceWriter:
|
||||
row = trace.to_clickhouse_row()
|
||||
columns = LLMTrace.clickhouse_columns()
|
||||
|
||||
# Serialize writes - clickhouse_connect client isn't thread-safe
|
||||
async with self._write_lock:
|
||||
# Run synchronous insert in thread pool
|
||||
# Run synchronous insert in thread pool. clickhouse-connect supports
|
||||
# multithreaded use via a thread-safe connection pool:
|
||||
# https://clickhouse.com/docs/integrations/language-clients/python/advanced-usage#multithreaded-multiprocess-and-asyncevent-driven-use-cases
|
||||
await asyncio.to_thread(
|
||||
client.insert,
|
||||
"llm_traces",
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
File format:
|
||||
---
|
||||
description: "Who I am and how I approach work"
|
||||
limit: 20000
|
||||
---
|
||||
My name is Memo. I'm a stateful coding assistant...
|
||||
|
||||
- Frontmatter fields are only rendered when they differ from defaults.
|
||||
- ``limit`` is intentionally excluded from frontmatter (deprecated for git-based memory).
|
||||
- Files without frontmatter are treated as value-only (backward compat).
|
||||
"""
|
||||
|
||||
@@ -37,12 +37,12 @@ def serialize_block(
|
||||
This is used for initial file creation. For updates to existing files,
|
||||
prefer `merge_frontmatter_with_body` to preserve user formatting.
|
||||
"""
|
||||
# description and limit are always included in frontmatter.
|
||||
# description is always included in frontmatter.
|
||||
# read_only and metadata are only included when non-default.
|
||||
# limit is intentionally excluded (deprecated for git-based memory).
|
||||
front: Dict[str, Any] = {}
|
||||
|
||||
front["description"] = description
|
||||
front["limit"] = limit if limit is not None else _get_field_default("limit")
|
||||
|
||||
if read_only != _get_field_default("read_only"):
|
||||
front["read_only"] = read_only
|
||||
@@ -111,7 +111,6 @@ def merge_frontmatter_with_body(
|
||||
|
||||
# Desired values
|
||||
desired_description = description
|
||||
desired_limit = limit if limit is not None else _get_field_default("limit")
|
||||
desired_read_only = read_only
|
||||
desired_metadata = metadata if metadata is not None else _get_field_default("metadata")
|
||||
|
||||
@@ -122,8 +121,9 @@ def merge_frontmatter_with_body(
|
||||
parsed["description"] = desired_description
|
||||
changed = True
|
||||
|
||||
if "limit" not in parsed or parsed.get("limit") != desired_limit:
|
||||
parsed["limit"] = desired_limit
|
||||
# Remove limit from frontmatter if it exists (deprecated for git-based memory)
|
||||
if "limit" in parsed:
|
||||
del parsed["limit"]
|
||||
changed = True
|
||||
|
||||
if desired_read_only != _get_field_default("read_only"):
|
||||
|
||||
@@ -21,6 +21,7 @@ from letta.schemas.memory_repo import MemoryCommit
|
||||
from letta.schemas.user import User as PydanticUser
|
||||
from letta.services.memory_repo.block_markdown import parse_block_markdown, serialize_block
|
||||
from letta.services.memory_repo.git_operations import GitOperations
|
||||
from letta.services.memory_repo.path_mapping import memory_block_label_from_markdown_path
|
||||
from letta.services.memory_repo.storage.local import LocalStorageBackend
|
||||
from letta.utils import enforce_types
|
||||
|
||||
@@ -133,11 +134,14 @@ class MemfsClient:
|
||||
except FileNotFoundError:
|
||||
return []
|
||||
|
||||
# Convert block files to PydanticBlock (metadata is in frontmatter)
|
||||
# Convert block files to PydanticBlock (metadata is in frontmatter).
|
||||
# skills/{skill_name}/SKILL.md is mapped to block label skills/{skill_name};
|
||||
# other files under skills/ are intentionally ignored.
|
||||
blocks = []
|
||||
for file_path, content in files.items():
|
||||
if file_path.endswith(".md"):
|
||||
label = file_path[:-3]
|
||||
label = memory_block_label_from_markdown_path(file_path)
|
||||
if label is None:
|
||||
continue
|
||||
|
||||
parsed = parse_block_markdown(content)
|
||||
|
||||
|
||||
29
letta/services/memory_repo/path_mapping.py
Normal file
29
letta/services/memory_repo/path_mapping.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Helpers for mapping memory-repo markdown paths to block labels.
|
||||
|
||||
Special handling for skills:
|
||||
- sync `skills/{skill_name}/SKILL.md` as block label `skills/{skill_name}`
|
||||
- ignore all other markdown files under `skills/`
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def memory_block_label_from_markdown_path(path: str) -> str | None:
    """Return block label for a syncable markdown path, else None.

    Rules:
    - Non-`.md` files are ignored.
    - `skills/{skill_name}/SKILL.md` -> `skills/{skill_name}`
    - Other `skills/**` markdown files are ignored.
    - All other markdown files map to `path[:-3]`.

    Args:
        path: Repo-relative file path using `/` as the separator.

    Returns:
        The block label, or None when the file should not be synced.
    """
    if not path.endswith(".md"):
        return None

    if path.startswith("skills/"):
        # Exactly three segments are required: "skills", a non-empty skill
        # name, and the literal file name "SKILL.md". The first segment is
        # already guaranteed to be "skills" by the startswith check above.
        parts = path.split("/")
        if len(parts) == 3 and parts[1] and parts[2] == "SKILL.md":
            return f"skills/{parts[1]}"
        return None

    # Strip the ".md" suffix to obtain the label.
    return path[:-3]
|
||||
@@ -141,6 +141,9 @@ class ClickhouseProviderTraceBackend(ProviderTraceBackendClient):
|
||||
request_json=request_json_str,
|
||||
response_json=response_json_str,
|
||||
llm_config_json=llm_config_json_str,
|
||||
billing_plan_type=provider_trace.billing_context.plan_type if provider_trace.billing_context else None,
|
||||
billing_cost_source=provider_trace.billing_context.cost_source if provider_trace.billing_context else None,
|
||||
billing_customer_id=provider_trace.billing_context.customer_id if provider_trace.billing_context else None,
|
||||
)
|
||||
|
||||
def _extract_usage(self, response_json: dict, provider: str) -> dict:
|
||||
|
||||
@@ -29,7 +29,7 @@ class PostgresProviderTraceBackend(ProviderTraceBackendClient):
|
||||
) -> ProviderTrace:
|
||||
"""Write full provider trace to provider_traces table."""
|
||||
async with db_registry.async_session() as session:
|
||||
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump())
|
||||
provider_trace_model = ProviderTraceModel(**provider_trace.model_dump(exclude={"billing_context"}))
|
||||
provider_trace_model.organization_id = actor.organization_id
|
||||
|
||||
if provider_trace.request_json:
|
||||
|
||||
@@ -638,7 +638,13 @@ class RunManager:
|
||||
raise NoResultFound(f"Run with id {run_id} not found")
|
||||
agent_id = run.agent_id
|
||||
|
||||
logger.debug(f"Cancelling run {run_id} for agent {agent_id}")
|
||||
logger.info(
|
||||
"[Interrupt] Processing cancellation for run=%s, agent=%s, current_status=%s, current_stop_reason=%s",
|
||||
run_id,
|
||||
agent_id,
|
||||
run.status if run else "unknown",
|
||||
run.stop_reason if run else "unknown",
|
||||
)
|
||||
|
||||
# Cancellation should be idempotent: if a run is already terminated, treat this as a no-op.
|
||||
# This commonly happens when a run finishes between client request and server handling.
|
||||
|
||||
@@ -15,6 +15,7 @@ from letta.errors import (
|
||||
LettaInvalidArgumentError,
|
||||
LettaServiceUnavailableError,
|
||||
LLMAuthenticationError,
|
||||
LLMEmptyResponseError,
|
||||
LLMError,
|
||||
LLMRateLimitError,
|
||||
LLMTimeoutError,
|
||||
@@ -33,6 +34,7 @@ from letta.schemas.letta_request import ClientToolSchema, LettaStreamingRequest
|
||||
from letta.schemas.letta_response import LettaResponse
|
||||
from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
|
||||
from letta.schemas.message import MessageCreate
|
||||
from letta.schemas.provider_trace import BillingContext
|
||||
from letta.schemas.run import Run as PydanticRun, RunUpdate
|
||||
from letta.schemas.usage import LettaUsageStatistics
|
||||
from letta.schemas.user import User
|
||||
@@ -76,6 +78,8 @@ class StreamingService:
|
||||
request: LettaStreamingRequest,
|
||||
run_type: str = "streaming",
|
||||
conversation_id: Optional[str] = None,
|
||||
should_lock: bool = False,
|
||||
billing_context: "BillingContext | None" = None,
|
||||
) -> tuple[Optional[PydanticRun], Union[StreamingResponse, LettaResponse]]:
|
||||
"""
|
||||
Create a streaming response for an agent.
|
||||
@@ -86,6 +90,7 @@ class StreamingService:
|
||||
request: The LettaStreamingRequest containing all request parameters
|
||||
run_type: Type of run for tracking
|
||||
conversation_id: Optional conversation ID for conversation-scoped messaging
|
||||
should_lock: If True and conversation_id is None, use agent_id as lock key
|
||||
|
||||
Returns:
|
||||
Tuple of (run object or None, streaming response)
|
||||
@@ -116,6 +121,10 @@ class StreamingService:
|
||||
)
|
||||
if conversation.model_settings is not None:
|
||||
update_params = conversation.model_settings._to_legacy_config_params()
|
||||
# Don't clobber max_tokens with the Pydantic default when the caller
|
||||
# didn't explicitly provide max_output_tokens.
|
||||
if "max_output_tokens" not in conversation.model_settings.model_fields_set:
|
||||
update_params.pop("max_tokens", None)
|
||||
conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
|
||||
agent = agent.model_copy(update={"llm_config": conversation_llm_config})
|
||||
|
||||
@@ -130,12 +139,15 @@ class StreamingService:
|
||||
|
||||
model_compatible_token_streaming = self._is_token_streaming_compatible(agent)
|
||||
|
||||
# Attempt to acquire conversation lock if conversation_id is provided
|
||||
# This prevents concurrent message processing for the same conversation
|
||||
# Determine lock key: use conversation_id if provided, else agent_id if should_lock
|
||||
lock_key = conversation_id if conversation_id else (agent_id if should_lock else None)
|
||||
|
||||
# Attempt to acquire lock if lock_key is set
|
||||
# This prevents concurrent message processing for the same conversation/agent
|
||||
# Skip locking if Redis is not available (graceful degradation)
|
||||
if conversation_id and not isinstance(redis_client, NoopAsyncRedisClient):
|
||||
if lock_key and not isinstance(redis_client, NoopAsyncRedisClient):
|
||||
await redis_client.acquire_conversation_lock(
|
||||
conversation_id=conversation_id,
|
||||
conversation_id=lock_key,
|
||||
token=str(uuid4()),
|
||||
)
|
||||
|
||||
@@ -163,8 +175,10 @@ class StreamingService:
|
||||
include_return_message_types=request.include_return_message_types,
|
||||
actor=actor,
|
||||
conversation_id=conversation_id,
|
||||
lock_key=lock_key, # For lock release (may differ from conversation_id)
|
||||
client_tools=request.client_tools,
|
||||
include_compaction_messages=request.include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
# handle background streaming if requested
|
||||
@@ -195,7 +209,7 @@ class StreamingService:
|
||||
run_id=run.id,
|
||||
run_manager=self.server.run_manager,
|
||||
actor=actor,
|
||||
conversation_id=conversation_id,
|
||||
conversation_id=lock_key, # Use lock_key for lock release
|
||||
),
|
||||
label=f"background_stream_processor_{run.id}",
|
||||
)
|
||||
@@ -251,7 +265,7 @@ class StreamingService:
|
||||
if settings.track_agent_run and run and run_status:
|
||||
await self.server.run_manager.update_run_by_id_async(
|
||||
run_id=run.id,
|
||||
conversation_id=conversation_id,
|
||||
conversation_id=lock_key, # Use lock_key for lock release
|
||||
update=RunUpdate(status=run_status, metadata=run_update_metadata),
|
||||
actor=actor,
|
||||
)
|
||||
@@ -326,8 +340,10 @@ class StreamingService:
|
||||
include_return_message_types: Optional[list[MessageType]],
|
||||
actor: User,
|
||||
conversation_id: Optional[str] = None,
|
||||
lock_key: Optional[str] = None,
|
||||
client_tools: Optional[list[ClientToolSchema]] = None,
|
||||
include_compaction_messages: bool = False,
|
||||
billing_context: BillingContext | None = None,
|
||||
) -> AsyncIterator:
|
||||
"""
|
||||
Create a stream with unified error handling.
|
||||
@@ -356,6 +372,7 @@ class StreamingService:
|
||||
conversation_id=conversation_id,
|
||||
client_tools=client_tools,
|
||||
include_compaction_messages=include_compaction_messages,
|
||||
billing_context=billing_context,
|
||||
)
|
||||
|
||||
async for chunk in stream:
|
||||
@@ -442,6 +459,21 @@ class StreamingService:
|
||||
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
|
||||
# Send [DONE] marker to properly close the stream
|
||||
yield "data: [DONE]\n\n"
|
||||
except LLMEmptyResponseError as e:
|
||||
run_status = RunStatus.failed
|
||||
stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response)
|
||||
error_message = LettaErrorMessage(
|
||||
run_id=run_id,
|
||||
error_type="llm_empty_response",
|
||||
message="LLM returned an empty response.",
|
||||
detail=str(e),
|
||||
)
|
||||
error_data = {"error": error_message.model_dump()}
|
||||
logger.warning(f"Run {run_id} stopped with LLM empty response: {e}, error_data: {error_message.model_dump()}")
|
||||
yield f"data: {stop_reason.model_dump_json()}\n\n"
|
||||
yield f"event: error\ndata: {error_message.model_dump_json()}\n\n"
|
||||
# Send [DONE] marker to properly close the stream
|
||||
yield "data: [DONE]\n\n"
|
||||
except LLMError as e:
|
||||
run_status = RunStatus.failed
|
||||
stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error)
|
||||
@@ -491,7 +523,7 @@ class StreamingService:
|
||||
stop_reason_value = stop_reason.stop_reason if stop_reason else StopReasonType.error.value
|
||||
await self.runs_manager.update_run_by_id_async(
|
||||
run_id=run_id,
|
||||
conversation_id=conversation_id,
|
||||
conversation_id=lock_key, # Use lock_key for lock release
|
||||
update=RunUpdate(status=run_status, stop_reason=stop_reason_value, metadata=error_data),
|
||||
actor=actor,
|
||||
)
|
||||
|
||||
@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
|
||||
# them just like server.create_agent_async does for agents.
|
||||
if summarizer_config.model_settings is not None:
|
||||
update_params = summarizer_config.model_settings._to_legacy_config_params()
|
||||
# Don't clobber max_tokens with the Pydantic default when the caller
|
||||
# didn't explicitly provide max_output_tokens.
|
||||
if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
|
||||
update_params.pop("max_tokens", None)
|
||||
return base.model_copy(update=update_params)
|
||||
|
||||
return base
|
||||
|
||||
@@ -196,7 +196,7 @@ async def self_summarize_sliding_window(
|
||||
return message.tool_calls is not None and len(message.tool_calls) > 0
|
||||
return False
|
||||
|
||||
post_summarization_buffer = [system_prompt]
|
||||
post_summarization_buffer = []
|
||||
while approx_token_count >= goal_tokens and eviction_percentage < 1.0:
|
||||
# more eviction percentage
|
||||
eviction_percentage += 0.10
|
||||
@@ -217,8 +217,8 @@ async def self_summarize_sliding_window(
|
||||
|
||||
# update token count
|
||||
logger.info(f"Attempting to compact messages to index {assistant_message_index} messages")
|
||||
post_summarization_buffer = [system_prompt, *messages[assistant_message_index:]]
|
||||
approx_token_count = await count_tokens(actor, agent_llm_config, post_summarization_buffer)
|
||||
post_summarization_buffer = list(messages[assistant_message_index:])
|
||||
approx_token_count = await count_tokens(actor, agent_llm_config, [system_prompt, *post_summarization_buffer])
|
||||
logger.info(
|
||||
f"Compacting messages index 1:{assistant_message_index} messages resulted in {approx_token_count} tokens, goal is {goal_tokens}"
|
||||
)
|
||||
|
||||
@@ -11,7 +11,7 @@ from letta.settings import summarizer_settings
|
||||
def get_default_summarizer_model(provider_type: ProviderType) -> str | None:
|
||||
"""Get default model for summarization for given provider type."""
|
||||
summarizer_defaults = {
|
||||
ProviderType.anthropic: "anthropic/claude-haiku-4-5-20251001",
|
||||
ProviderType.anthropic: "anthropic/claude-haiku-4-5",
|
||||
ProviderType.openai: "openai/gpt-5-mini",
|
||||
ProviderType.google_ai: "google_ai/gemini-2.5-flash",
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ class SummarizerSettings(BaseSettings):
|
||||
class ModelSettings(BaseSettings):
|
||||
model_config = SettingsConfigDict(env_file=".env", extra="ignore")
|
||||
|
||||
global_max_context_window_limit: int = 32000
|
||||
global_max_context_window_limit: int = 128000
|
||||
|
||||
inner_thoughts_kwarg: str | None = Field(default=INNER_THOUGHTS_KWARG, description="Key used for passing in inner thoughts.")
|
||||
|
||||
@@ -204,6 +204,7 @@ class ModelSettings(BaseSettings):
|
||||
gemini_base_url: str = "https://generativelanguage.googleapis.com/"
|
||||
gemini_force_minimum_thinking_budget: bool = False
|
||||
gemini_max_retries: int = 5
|
||||
gemini_timeout_seconds: float = 600.0
|
||||
|
||||
# google vertex
|
||||
google_cloud_project: Optional[str] = None
|
||||
|
||||
@@ -45,30 +45,36 @@ PATH_VALIDATORS = {primitive_type.value: _create_path_validator_factory(primitiv
|
||||
|
||||
|
||||
def _create_conversation_id_or_default_path_validator_factory():
    """Conversation IDs with support for 'default' and agent IDs (backwards compatibility).

    Returns:
        A zero-argument factory that builds a ``Path`` validator accepting a
        conversation ID, an agent ID, or the literal value 'default'.
    """

    conversation_primitive = PrimitiveType.CONVERSATION.value
    agent_primitive = PrimitiveType.AGENT.value
    conversation_pattern = PRIMITIVE_ID_PATTERNS[conversation_primitive].pattern
    agent_pattern = PRIMITIVE_ID_PATTERNS[agent_primitive].pattern
    # Make the full regex accept: conversation ID, agent ID, or 'default'.
    # Patterns already contain ^...$ anchors, so strip them for the alternation.
    conversation_or_agent_or_default_pattern = f"^(default|{conversation_pattern[1:-1]}|{agent_pattern[1:-1]})$"

    def factory():
        return Path(
            description=(
                f"The conversation identifier. Can be a conversation ID ('{conversation_primitive}-<uuid4>'), "
                f"'default' for agent-direct mode (with agent_id parameter), "
                f"or an agent ID ('{agent_primitive}-<uuid4>') for backwards compatibility (deprecated)."
            ),
            pattern=conversation_or_agent_or_default_pattern,
            examples=[
                "default",
                f"{conversation_primitive}-123e4567-e89b-42d3-8456-426614174000",
                f"{agent_primitive}-123e4567-e89b-42d3-8456-426614174000",
            ],
            min_length=1,
            # Longest accepted form: "<prefix>-" followed by a 36-character UUID.
            max_length=max(len(conversation_primitive), len(agent_primitive)) + 1 + 36,
        )

    return factory
|
||||
|
||||
|
||||
# Override conversation ID path validation to also allow the special value 'default'.
|
||||
PATH_VALIDATORS[PrimitiveType.CONVERSATION.value] = _create_conversation_id_or_default_path_validator_factory()
|
||||
|
||||
|
||||
# Type aliases for common ID types
|
||||
# These can be used directly in route handler signatures for cleaner code
|
||||
AgentId = Annotated[str, PATH_VALIDATORS[PrimitiveType.AGENT.value]()]
|
||||
@@ -89,6 +95,10 @@ StepId = Annotated[str, PATH_VALIDATORS[PrimitiveType.STEP.value]()]
|
||||
IdentityId = Annotated[str, PATH_VALIDATORS[PrimitiveType.IDENTITY.value]()]
|
||||
ConversationId = Annotated[str, PATH_VALIDATORS[PrimitiveType.CONVERSATION.value]()]
|
||||
|
||||
# Conversation ID with support for 'default' and agent IDs (for agent-direct mode endpoints)
|
||||
# Backwards compatible - agent-* will be deprecated in favor of conversation_id='default' + agent_id param
|
||||
ConversationIdOrDefault = Annotated[str, _create_conversation_id_or_default_path_validator_factory()()]
|
||||
|
||||
# Infrastructure types
|
||||
McpServerId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_SERVER.value]()]
|
||||
McpOAuthId = Annotated[str, PATH_VALIDATORS[PrimitiveType.MCP_OAUTH.value]()]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "letta"
|
||||
version = "0.16.5"
|
||||
version = "0.16.6"
|
||||
description = "Create LLM agents with long-term memory and custom tools"
|
||||
authors = [
|
||||
{name = "Letta Team", email = "contact@letta.com"},
|
||||
|
||||
@@ -2,6 +2,12 @@ import anthropic
|
||||
import httpx
|
||||
import openai
|
||||
import pytest
|
||||
from anthropic.types.beta import (
|
||||
BetaMessage,
|
||||
BetaRawMessageStartEvent,
|
||||
BetaRawMessageStopEvent,
|
||||
BetaUsage,
|
||||
)
|
||||
from google.genai import errors as google_errors
|
||||
|
||||
from letta.adapters.letta_llm_stream_adapter import LettaLLMStreamAdapter
|
||||
@@ -9,6 +15,7 @@ from letta.errors import (
|
||||
ContextWindowExceededError,
|
||||
LLMBadRequestError,
|
||||
LLMConnectionError,
|
||||
LLMEmptyResponseError,
|
||||
LLMInsufficientCreditsError,
|
||||
LLMServerError,
|
||||
)
|
||||
@@ -287,3 +294,70 @@ def test_openai_client_handle_llm_error_non_credit_api_error():
|
||||
result = client.handle_llm_error(error)
|
||||
assert isinstance(result, LLMBadRequestError)
|
||||
assert not isinstance(result, LLMInsufficientCreditsError)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_letta_llm_stream_adapter_raises_empty_response_error_for_anthropic(monkeypatch):
    """LET-7679: Empty streaming responses (no content blocks) should raise LLMEmptyResponseError.

    This tests the case where Opus 4.6 returns a response with:
    - BetaRawMessageStartEvent (with usage tokens)
    - BetaRawMessageStopEvent (end_turn)
    - NO content blocks in between

    This should raise LLMEmptyResponseError, not complete successfully with stop_reason=end_turn.
    """

    class FakeAsyncStream:
        """Mimics anthropic.AsyncStream that returns empty content (no content blocks)."""

        def __init__(self):
            self.events = [
                # Message start with some usage info
                BetaRawMessageStartEvent(
                    type="message_start",
                    message=BetaMessage(
                        id="msg_test_empty",
                        type="message",
                        role="assistant",
                        content=[],  # Empty content
                        model="claude-opus-4-6",
                        stop_reason="end_turn",
                        stop_sequence=None,
                        usage=BetaUsage(input_tokens=1000, output_tokens=26, cache_creation_input_tokens=0, cache_read_input_tokens=0),
                    ),
                ),
                # Message stop immediately after start - no content blocks
                BetaRawMessageStopEvent(type="message_stop"),
            ]
            self.index = 0

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return None

        def __aiter__(self):
            return self

        async def __anext__(self):
            if self.index >= len(self.events):
                raise StopAsyncIteration
            event = self.events[self.index]
            self.index += 1
            return event

    async def fake_stream_async(self, request_data: dict, llm_config):
        return FakeAsyncStream()

    # Replace the client's streaming call so the adapter consumes the fake stream above.
    monkeypatch.setattr(AnthropicClient, "stream_async", fake_stream_async, raising=True)

    llm_client = AnthropicClient()
    llm_config = LLMConfig(model="claude-opus-4-6", model_endpoint_type="anthropic", context_window=200000)
    adapter = LettaLLMStreamAdapter(llm_client=llm_client, llm_config=llm_config, call_type=LLMCallType.agent_step)

    gen = adapter.invoke_llm(request_data={}, messages=[], tools=[], use_assistant_message=True)
    # The error is expected to surface while the generator is being consumed.
    with pytest.raises(LLMEmptyResponseError):
        async for _ in gen:
            pass
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"context_window": 32000,
|
||||
"model": "gpt-5.3-codex",
|
||||
"model_endpoint_type": "openai",
|
||||
"model_endpoint": "https://api.openai.com/v1",
|
||||
"model_wrapper": null,
|
||||
"reasoning_effort": "low"
|
||||
}
|
||||
@@ -141,7 +141,7 @@ async def create_test_agent(name, actor, test_id: Optional[str] = None, model="a
|
||||
model="claude-3-7-sonnet-latest",
|
||||
model_endpoint_type="anthropic",
|
||||
model_endpoint="https://api.anthropic.com/v1",
|
||||
context_window=32000,
|
||||
context_window=128000,
|
||||
handle="anthropic/claude-3-7-sonnet-latest",
|
||||
put_inner_thoughts_in_kwargs=True,
|
||||
max_tokens=4096,
|
||||
@@ -193,7 +193,7 @@ async def create_test_batch_item(server, batch_id, agent_id, default_user):
|
||||
model="claude-3-7-sonnet-latest",
|
||||
model_endpoint_type="anthropic",
|
||||
model_endpoint="https://api.anthropic.com/v1",
|
||||
context_window=32000,
|
||||
context_window=128000,
|
||||
handle="anthropic/claude-3-7-sonnet-latest",
|
||||
put_inner_thoughts_in_kwargs=True,
|
||||
max_tokens=4096,
|
||||
|
||||
@@ -62,12 +62,14 @@ class TestConversationsSDK:
|
||||
# Create a conversation
|
||||
created = client.conversations.create(agent_id=agent.id)
|
||||
|
||||
# Retrieve it (should have empty in_context_message_ids initially)
|
||||
# Retrieve it (should have system message from creation)
|
||||
retrieved = client.conversations.retrieve(conversation_id=created.id)
|
||||
|
||||
assert retrieved.id == created.id
|
||||
assert retrieved.agent_id == created.agent_id
|
||||
assert retrieved.in_context_message_ids == []
|
||||
# Conversation should have 1 system message immediately after creation
|
||||
assert len(retrieved.in_context_message_ids) == 1
|
||||
assert retrieved.in_context_message_ids[0].startswith("message-")
|
||||
|
||||
# Send a message to the conversation
|
||||
list(
|
||||
@@ -566,6 +568,289 @@ class TestConversationsSDK:
|
||||
# Should not contain the cursor message
|
||||
assert first_message_id not in [m.id for m in messages_after]
|
||||
|
||||
def test_agent_direct_messaging_via_conversations_endpoint(self, client: Letta, agent):
    """Send a message with the agent ID standing in for conversation_id (agent-direct mode).

    Clients can then use one endpoint pattern without managing conversation IDs.
    """
    # Passing the agent ID where a conversation ID is expected should route
    # the request to agent-direct mode (with locking).
    response_stream = client.conversations.messages.create(
        conversation_id=agent.id,
        messages=[{"role": "user", "content": "Hello via agent-direct mode!"}],
    )
    received = list(response_stream)

    # Something must come back at all.
    assert len(received) > 0, "Should receive response messages"

    # And at least one of the returned items must be an assistant message.
    assistant_replies = []
    for item in received:
        if hasattr(item, "message_type") and item.message_type == "assistant_message":
            assistant_replies.append(item)
    assert len(assistant_replies) > 0, "Should receive at least one assistant message"
|
||||
|
||||
def test_agent_direct_messaging_with_locking(self, client: Letta, agent):
    """Check that agent-direct mode acquires and releases its lock correctly.

    Two requests are sent one after the other; both should succeed if the
    lock taken by the first is released before the second starts.
    """
    from letta.settings import settings

    # Skip when Redis is not configured.
    redis_configured = settings.redis_host is not None and settings.redis_port is not None
    if not redis_configured:
        pytest.skip("Redis not configured - skipping agent-direct lock test")

    # (message content, assertion message) for each sequential attempt.
    attempts = (
        ("First message", "First message should succeed"),
        ("Second message", "Second message should succeed after lock released"),
    )
    for content, failure_note in attempts:
        replies = list(
            client.conversations.messages.create(
                conversation_id=agent.id,
                messages=[{"role": "user", "content": content}],
            )
        )
        assert len(replies) > 0, failure_note
|
||||
|
||||
def test_agent_direct_concurrent_requests_blocked(self, client: Letta, agent):
    """Test that concurrent requests to agent-direct mode are properly serialized.

    One request should succeed and one should get a 409 CONVERSATION_BUSY error.
    After both finish, a further request should succeed because the lock is free.
    """
    import concurrent.futures

    from letta_client import ConflictError

    from letta.settings import settings

    # Skip if Redis is not configured
    if settings.redis_host is None or settings.redis_port is None:
        pytest.skip("Redis not configured - skipping agent-direct lock test")

    def send_message(msg: str):
        """Send one agent-direct message and classify the outcome as (kind, payload)."""
        try:
            messages = list(
                client.conversations.messages.create(
                    conversation_id=agent.id,  # Agent-direct mode
                    messages=[{"role": "user", "content": msg}],
                )
            )
            return ("success", messages)
        except ConflictError:
            return ("conflict", None)
        except Exception as e:
            # Keep the exception type and text so a failure can be diagnosed.
            return ("other_error", repr(e))

    # Fire off two messages concurrently.
    # NOTE(review): this assumes the two requests overlap in time; if the first
    # completes before the second starts, both would succeed - confirm this is
    # not a source of flakiness.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [
            executor.submit(send_message, "Concurrent message 1"),
            executor.submit(send_message, "Concurrent message 2"),
        ]
        outcomes = [future.result() for future in futures]

    # Count results, remembering the details of any unexpected error.
    results = {"success": 0, "conflict": 0, "other_error": 0}
    unexpected_errors = []
    for result_type, payload in outcomes:
        results[result_type] += 1
        if result_type == "other_error":
            unexpected_errors.append(payload)

    # Check unexpected errors first so their details are reported instead of
    # being masked by a success/conflict count mismatch.
    assert results["other_error"] == 0, f"Unexpected errors: {unexpected_errors}"
    # One should succeed and one should get conflict
    assert results["success"] == 1, f"Expected 1 success, got {results['success']}"
    assert results["conflict"] == 1, f"Expected 1 conflict, got {results['conflict']}"

    # Now send another message - should succeed since lock is released
    messages = list(
        client.conversations.messages.create(
            conversation_id=agent.id,
            messages=[{"role": "user", "content": "Message after concurrent requests"}],
        )
    )
    assert len(messages) > 0, "Should be able to send message after concurrent requests complete"
|
||||
|
||||
def test_agent_direct_list_messages(self, client: Letta, agent):
    """Listing messages works when the agent ID is passed as conversation_id."""
    target_id = agent.id

    # Seed the history through agent-direct mode, draining the response stream.
    for _ in client.conversations.messages.create(
        conversation_id=target_id,
        messages=[{"role": "user", "content": "Test message for listing"}],
    ):
        pass

    # Fetch the history back using the same agent ID.
    listed = list(client.conversations.messages.list(conversation_id=target_id))

    # Expect at least system + user + assistant.
    assert len(listed) >= 3, f"Expected at least 3 messages, got {len(listed)}"

    # The message sent above must be among the user messages.
    found = False
    for entry in listed:
        if not (hasattr(entry, "message_type") and entry.message_type == "user_message"):
            continue
        if "Test message for listing" in str(entry.content):
            found = True
            break
    assert found, "Should find our test message"
|
||||
|
||||
def test_agent_direct_cancel(self, client: Letta, agent):
    """Cancel runs by passing the agent ID as conversation_id."""
    from letta.settings import settings

    # Nothing to cancel when run tracking is turned off.
    if not settings.track_agent_run:
        pytest.skip("Run tracking disabled - skipping cancel test")

    try:
        # Kick off a background request that can be cancelled.
        background_stream = client.conversations.messages.create(
            conversation_id=agent.id,
            messages=[{"role": "user", "content": "Background message to cancel"}],
            background=True,
        )
        # Pull at most one item so the request has started before cancelling.
        next(iter(background_stream), None)

        cancel_result = client.conversations.cancel(conversation_id=agent.id)

        # A dict is expected (it may be empty if the run already completed).
        assert isinstance(cancel_result, dict), "Cancel should return a dict of results"
    except Exception as exc:
        # A "No active runs" failure is tolerated: the run may have finished quickly.
        if "No active runs" in str(exc):
            return
        raise
|
||||
|
||||
def test_backwards_compatibility_old_pattern(self, client: Letta, agent, server_url: str):
    """The old pattern (agent_id used as conversation_id) must keep working."""
    # Raw HTTP is used here because the SDK might not be up to date.
    legacy_endpoint = f"{server_url}/v1/conversations/{agent.id}/messages"

    # Test 1: send a message through the old pattern.
    post_response = requests.post(
        legacy_endpoint,
        json={
            "messages": [{"role": "user", "content": "Testing old pattern still works"}],
            "streaming": False,
        },
    )
    assert post_response.status_code == 200, f"Old pattern should work for sending messages: {post_response.text}"
    post_body = post_response.json()
    assert "messages" in post_body, "Response should contain messages"
    assert len(post_body["messages"]) > 0, "Should receive response messages"

    # Test 2: list messages through the old pattern.
    get_response = requests.get(legacy_endpoint)
    assert get_response.status_code == 200, f"Old pattern should work for listing messages: {get_response.text}"
    listed = get_response.json()
    # The body is the list of messages itself, not a wrapper object.
    assert isinstance(listed, list), "Response should be a list of messages"
    assert len(listed) >= 3, "Should have at least system + user + assistant messages"

    # The message sent in Test 1 must show up among the user messages.
    user_contents = [str(item.get("content", "")) for item in listed if item.get("message_type") == "user_message"]
    assert any("Testing old pattern still works" in text for text in user_contents), "Should find our test message"
|
||||
|
||||
def test_new_pattern_send_message(self, client: Letta, agent, server_url: str):
    """Send a message with the new pattern: conversation_id='default' plus agent_id in the body."""
    payload = {
        "agent_id": agent.id,
        "messages": [{"role": "user", "content": "Testing new pattern send message"}],
        "streaming": False,
    }
    reply = requests.post(f"{server_url}/v1/conversations/default/messages", json=payload)

    assert reply.status_code == 200, f"New pattern should work for sending messages: {reply.text}"
    body = reply.json()
    assert "messages" in body, "Response should contain messages"
    returned = body["messages"]
    assert len(returned) > 0, "Should receive response messages"

    # At least one returned entry must be an assistant message.
    assistant_count = sum(1 for entry in returned if entry.get("message_type") == "assistant_message")
    assert assistant_count > 0, "Should receive at least one assistant message"
|
||||
|
||||
def test_new_pattern_list_messages(self, client: Letta, agent, server_url: str):
    """Test listing messages using the new pattern: conversation_id='default' + agent_id query param."""
    # First send a message to populate the conversation
    setup_response = requests.post(
        f"{server_url}/v1/conversations/{agent.id}/messages",
        json={
            "messages": [{"role": "user", "content": "Setup message for list test"}],
            "streaming": False,
        },
    )
    # Fail here, with the server's explanation, if the setup request was rejected;
    # otherwise the listing assertions below would report a misleading cause.
    assert setup_response.status_code == 200, f"Setup message should be accepted: {setup_response.text}"

    # NEW PATTERN: conversation_id='default' + agent_id as query param
    response = requests.get(
        f"{server_url}/v1/conversations/default/messages",
        params={"agent_id": agent.id},
    )
    assert response.status_code == 200, f"New pattern should work for listing messages: {response.text}"
    data = response.json()
    # Response is a list of messages directly
    assert isinstance(data, list), "Response should be a list of messages"
    assert len(data) >= 3, "Should have at least system + user + assistant messages"
|
||||
|
||||
def test_new_pattern_cancel(self, client: Letta, agent, server_url: str):
    """Cancel runs with the new pattern: conversation_id='default' plus agent_id as a query param."""
    from letta.settings import settings

    if not settings.track_agent_run:
        pytest.skip("Run tracking disabled - skipping cancel test")

    cancel_response = requests.post(
        f"{server_url}/v1/conversations/default/cancel",
        params={"agent_id": agent.id},
    )

    # 200 carries results when runs exist; 409 means there were no active runs.
    assert cancel_response.status_code in [200, 409], f"New pattern should work for cancel: {cancel_response.text}"
    if cancel_response.status_code != 200:
        return
    assert isinstance(cancel_response.json(), dict), "Cancel should return a dict"
|
||||
|
||||
def test_new_pattern_compact(self, client: Letta, agent, server_url: str):
    """Compact a conversation with the new pattern: conversation_id='default' plus agent_id in the body."""
    # Build up some history so there is something to compact.
    seed_url = f"{server_url}/v1/conversations/{agent.id}/messages"
    for i in range(10):
        seed_payload = {
            "messages": [{"role": "user", "content": f"Message {i} for compaction test"}],
            "streaming": False,
        }
        requests.post(seed_url, json=seed_payload)

    compact_response = requests.post(
        f"{server_url}/v1/conversations/default/compact",
        json={"agent_id": agent.id},
    )

    # 200 means compaction ran; 400 means there were not enough messages to compact.
    assert compact_response.status_code in [200, 400], f"New pattern should accept agent_id parameter: {compact_response.text}"
    if compact_response.status_code == 200:
        summary_body = compact_response.json()
        for expected_key in ("summary", "num_messages_before", "num_messages_after"):
            assert expected_key in summary_body, f"Response should contain {expected_key}"
|
||||
|
||||
def test_new_pattern_stream_retrieve(self, client: Letta, agent, server_url: str):
    """Retrieve a stream with the new pattern: conversation_id='default' plus agent_id in the body."""
    stream_url = f"{server_url}/v1/conversations/default/stream"
    stream_response = requests.post(stream_url, json={"agent_id": agent.id})

    # A 400 is the likely outcome when no run is active and is acceptable here,
    # as is a 200 when a run exists.
    acceptable_statuses = [200, 400]
    assert stream_response.status_code in acceptable_statuses, f"Stream retrieve should accept new pattern: {stream_response.text}"
|
||||
|
||||
|
||||
class TestConversationDelete:
|
||||
"""Tests for the conversation delete endpoint."""
|
||||
@@ -834,3 +1119,130 @@ class TestConversationCompact:
|
||||
)
|
||||
|
||||
assert response.status_code == 404
|
||||
|
||||
|
||||
class TestConversationSystemMessageRecompilation:
    """Tests that verify the system message is recompiled with latest memory state on new conversation creation."""

    def test_new_conversation_recompiles_system_message_with_updated_memory(self, client: Letta, server_url: str):
        """Test the full workflow:
        1. Agent is created
        2. Send message to agent (through a conversation)
        3. Modify the memory block -> check system message is NOT updated with the modified value
        4. Create a new conversation
        5. Check new conversation system message DOES have the modified value
        """
        unique_marker = f"UNIQUE_MARKER_{uuid.uuid4().hex[:8]}"

        # Step 1: Create an agent with known memory blocks
        agent = client.agents.create(
            name=f"test_sys_msg_recompile_{uuid.uuid4().hex[:8]}",
            model="openai/gpt-4o-mini",
            embedding="openai/text-embedding-3-small",
            memory_blocks=[
                {"label": "human", "value": "The user is a test user."},
                {"label": "persona", "value": "You are a helpful assistant."},
            ],
        )

        try:
            # Step 2: Create a conversation and send a message to it
            conv1 = client.conversations.create(agent_id=agent.id)

            list(
                client.conversations.messages.create(
                    conversation_id=conv1.id,
                    messages=[{"role": "user", "content": "Hello, just a quick test."}],
                )
            )

            # Verify the conversation has messages including a system message.
            # The listing is materialized with list() before len()/indexing.
            conv1_messages = list(
                client.conversations.messages.list(
                    conversation_id=conv1.id,
                    order="asc",
                )
            )
            assert len(conv1_messages) >= 3  # system + user + assistant
            assert conv1_messages[0].message_type == "system_message"

            # Get the original system message content
            original_system_content = conv1_messages[0].content
            assert unique_marker not in original_system_content, "Marker should not be in original system message"

            # Step 3: Modify the memory block with a unique marker
            client.agents.blocks.update(
                agent_id=agent.id,
                block_label="human",
                value=f"The user is a test user. {unique_marker}",
            )

            # Verify the block was actually updated
            updated_block = client.agents.blocks.retrieve(agent_id=agent.id, block_label="human")
            assert unique_marker in updated_block.value

            # Check that the OLD conversation's system message is NOT updated
            conv1_messages_after_update = list(
                client.conversations.messages.list(
                    conversation_id=conv1.id,
                    order="asc",
                )
            )
            old_system_content = conv1_messages_after_update[0].content
            assert unique_marker not in old_system_content, "Old conversation system message should NOT contain the updated memory value"

            # Step 4: Create a new conversation
            conv2 = client.conversations.create(agent_id=agent.id)

            # Step 5: Check the new conversation's system message has the updated value
            # The system message should be compiled at creation time with the latest memory
            conv2_retrieved = client.conversations.retrieve(conversation_id=conv2.id)
            assert len(conv2_retrieved.in_context_message_ids) == 1, (
                f"New conversation should have exactly 1 system message, got {len(conv2_retrieved.in_context_message_ids)}"
            )

            conv2_messages = list(
                client.conversations.messages.list(
                    conversation_id=conv2.id,
                    order="asc",
                )
            )
            assert len(conv2_messages) >= 1
            assert conv2_messages[0].message_type == "system_message"

            new_system_content = conv2_messages[0].content
            assert unique_marker in new_system_content, (
                f"New conversation system message should contain the updated memory value '{unique_marker}', "
                "but system message content did not include it"
            )

        finally:
            # Always remove the agent created for this test, even on failure.
            client.agents.delete(agent_id=agent.id)

    def test_conversation_creation_initializes_system_message(self, client: Letta, server_url: str):
        """Test that creating a conversation immediately initializes it with a system message."""
        agent = client.agents.create(
            name=f"test_conv_init_{uuid.uuid4().hex[:8]}",
            model="openai/gpt-4o-mini",
            embedding="openai/text-embedding-3-small",
            memory_blocks=[
                {"label": "human", "value": "Test user for system message init."},
                {"label": "persona", "value": "You are a helpful assistant."},
            ],
        )

        try:
            # Create a conversation (without sending any messages)
            conversation = client.conversations.create(agent_id=agent.id)

            # Verify the conversation has a system message immediately
            retrieved = client.conversations.retrieve(conversation_id=conversation.id)
            assert len(retrieved.in_context_message_ids) == 1, (
                f"Expected 1 system message after conversation creation, got {len(retrieved.in_context_message_ids)}"
            )

            # Verify the system message content contains memory block values.
            # The listing is materialized with list() before len()/indexing.
            messages = list(
                client.conversations.messages.list(
                    conversation_id=conversation.id,
                    order="asc",
                )
            )
            assert len(messages) == 1
            assert messages[0].message_type == "system_message"
            assert "Test user for system message init." in messages[0].content

        finally:
            # Always remove the agent created for this test, even on failure.
            client.agents.delete(agent_id=agent.id)
|
||||
|
||||
@@ -93,7 +93,7 @@ def agent_obj(client: Letta) -> AgentState:
|
||||
tool_ids=[send_message_to_agent_tool.id],
|
||||
model="openai/gpt-4o",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
context_window_limit=32000,
|
||||
context_window_limit=128000,
|
||||
)
|
||||
yield agent_state_instance
|
||||
|
||||
@@ -107,7 +107,7 @@ def other_agent_obj(client: Letta) -> AgentState:
|
||||
include_multi_agent_tools=False,
|
||||
model="openai/gpt-4o",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
context_window_limit=32000,
|
||||
context_window_limit=128000,
|
||||
)
|
||||
|
||||
yield agent_state_instance
|
||||
|
||||
@@ -366,6 +366,8 @@ async def test_compaction_settings_model_uses_separate_llm_config_for_summarizat
|
||||
async def test_create_agent_sets_default_compaction_model_anthropic(server: SyncServer, default_user):
|
||||
"""When no compaction_settings provided for Anthropic agent, default haiku model should be set."""
|
||||
from letta.schemas.agent import CreateAgent
|
||||
from letta.schemas.enums import ProviderType
|
||||
from letta.services.summarizer.summarizer_config import get_default_summarizer_model
|
||||
|
||||
await server.init_async(init_with_default_org_and_user=True)
|
||||
|
||||
@@ -384,7 +386,7 @@ async def test_create_agent_sets_default_compaction_model_anthropic(server: Sync
|
||||
|
||||
# Should have default haiku model set
|
||||
assert agent.compaction_settings is not None
|
||||
assert agent.compaction_settings.model == "anthropic/claude-haiku-4-5-20251001"
|
||||
assert agent.compaction_settings.model == get_default_summarizer_model(ProviderType.anthropic)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -808,6 +810,79 @@ async def test_update_agent_compaction_settings(server: SyncServer, comprehensiv
|
||||
assert updated_agent.compaction_settings.prompt_acknowledgement == False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings(server: SyncServer, comprehensive_test_agent_fixture, default_user):
    """A partial compaction_settings update is merged into the agent's existing settings."""
    from letta.services.summarizer.summarizer_config import get_default_prompt_for_mode

    agent, _ = comprehensive_test_agent_fixture

    # Snapshot the settings before the update so untouched fields can be compared.
    baseline = agent.compaction_settings.model_copy()

    # Only mode, prompt_acknowledgement and clip_chars are supplied.
    request = UpdateAgent(
        compaction_settings=CompactionSettings(
            mode="all",
            prompt_acknowledgement=True,
            clip_chars=3000,
        ),
    )
    updated_agent = await server.agent_manager.update_agent_async(agent.id, request, actor=default_user)

    merged = updated_agent.compaction_settings
    assert merged is not None

    # Fields that were not supplied keep their previous values.
    assert merged.model == baseline.model
    assert merged.model_settings == baseline.model_settings
    assert merged.sliding_window_percentage == baseline.sliding_window_percentage

    # Supplied fields take the new values, and the prompt matches the default for the new mode.
    assert merged.mode == "all"
    assert merged.clip_chars == 3000
    assert merged.prompt == get_default_prompt_for_mode("all")
    assert merged.prompt_acknowledgement == True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_update_agent_partial_compaction_settings_same_mode(server: SyncServer, comprehensive_test_agent_fixture, default_user):
    """When the mode is unchanged and no prompt is passed, the existing prompt is kept."""

    agent, _ = comprehensive_test_agent_fixture

    # First update: set a custom prompt under the sliding_window mode.
    first_request = UpdateAgent(
        compaction_settings=CompactionSettings(mode="sliding_window", prompt="This is a fake prompt."),
    )
    updated_agent = await server.agent_manager.update_agent_async(agent.id, first_request, actor=default_user)

    assert updated_agent.compaction_settings is not None
    assert updated_agent.compaction_settings.prompt == "This is a fake prompt."

    # Snapshot the settings that the second update is compared against.
    baseline = updated_agent.compaction_settings.model_copy()

    # Second update: same mode, a different model, and no prompt supplied.
    second_request = UpdateAgent(
        compaction_settings=CompactionSettings(
            mode="sliding_window",
            model="openai/gpt-4o-mini",
        ),
    )
    final_agent = await server.agent_manager.update_agent_async(updated_agent.id, second_request, actor=default_user)

    final_settings = final_agent.compaction_settings
    assert final_settings is not None

    # Everything that was not supplied, including the custom prompt, is unchanged.
    assert final_settings.sliding_window_percentage == baseline.sliding_window_percentage
    assert final_settings.prompt == baseline.prompt
    assert final_settings.clip_chars == baseline.clip_chars
    assert final_settings.prompt_acknowledgement == baseline.prompt_acknowledgement

    # The supplied fields are applied.
    assert final_settings.mode == "sliding_window"
    assert final_settings.model == "openai/gpt-4o-mini"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_file_defaults_based_on_context_window(server: SyncServer, default_user, default_block):
|
||||
"""Test that file-related defaults are set based on the model's context window size"""
|
||||
|
||||
@@ -562,7 +562,9 @@ async def test_update_block(server: SyncServer, default_user):
|
||||
@pytest.mark.asyncio
|
||||
async def test_update_block_limit(server: SyncServer, default_user):
|
||||
block_manager = BlockManager()
|
||||
block = await block_manager.create_or_update_block_async(PydanticBlock(label="persona", value="Original Content"), actor=default_user)
|
||||
block = await block_manager.create_or_update_block_async(
|
||||
PydanticBlock(label="persona", value="Original Content", limit=20000), actor=default_user
|
||||
)
|
||||
|
||||
limit = len("Updated Content") * 2000
|
||||
update_data = BlockUpdate(value="Updated Content" * 2000, description="Updated description")
|
||||
|
||||
@@ -355,8 +355,9 @@ async def test_add_messages_to_conversation(
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
assert len(message_ids) == 1
|
||||
assert message_ids[0] == hello_world_message_fixture.id
|
||||
# create_conversation auto-creates a system message at position 0
|
||||
assert len(message_ids) == 2
|
||||
assert hello_world_message_fixture.id in message_ids
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -385,8 +386,9 @@ async def test_get_messages_for_conversation(
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
assert len(messages) == 1
|
||||
assert messages[0].id == hello_world_message_fixture.id
|
||||
# create_conversation auto-creates a system message at position 0
|
||||
assert len(messages) == 2
|
||||
assert any(m.id == hello_world_message_fixture.id for m in messages)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -430,7 +432,10 @@ async def test_message_ordering_in_conversation(conversation_manager, server: Sy
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
assert retrieved_ids == [m.id for m in messages]
|
||||
# create_conversation auto-creates a system message at position 0,
|
||||
# so the user messages start at index 1
|
||||
assert len(retrieved_ids) == len(messages) + 1
|
||||
assert retrieved_ids[1:] == [m.id for m in messages]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -489,7 +494,7 @@ async def test_update_in_context_messages(conversation_manager, server: SyncServ
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_conversation_message_ids(conversation_manager, server: SyncServer, sarah_agent, default_user):
|
||||
"""Test getting message IDs from an empty conversation."""
|
||||
"""Test getting message IDs from a newly created conversation (has auto-created system message)."""
|
||||
# Create a conversation
|
||||
conversation = await conversation_manager.create_conversation(
|
||||
agent_id=sarah_agent.id,
|
||||
@@ -497,13 +502,14 @@ async def test_empty_conversation_message_ids(conversation_manager, server: Sync
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
# Get message IDs (should be empty)
|
||||
# create_conversation auto-creates a system message at position 0,
|
||||
# so a newly created conversation has exactly one message
|
||||
message_ids = await conversation_manager.get_message_ids_for_conversation(
|
||||
conversation_id=conversation.id,
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
assert message_ids == []
|
||||
assert len(message_ids) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -551,9 +557,11 @@ async def test_list_conversation_messages(conversation_manager, server: SyncServ
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
assert len(letta_messages) == 2
|
||||
# create_conversation auto-creates a system message, so we get 3 total
|
||||
assert len(letta_messages) == 3
|
||||
# Check message types
|
||||
message_types = [m.message_type for m in letta_messages]
|
||||
assert "system_message" in message_types
|
||||
assert "user_message" in message_types
|
||||
assert "assistant_message" in message_types
|
||||
|
||||
@@ -902,9 +910,12 @@ async def test_list_conversation_messages_ascending_order(conversation_manager,
|
||||
reverse=False,
|
||||
)
|
||||
|
||||
# First message should be "Message 0" (oldest)
|
||||
assert len(letta_messages) == 3
|
||||
assert "Message 0" in letta_messages[0].content
|
||||
# create_conversation auto-creates a system message at position 0,
|
||||
# so we get 4 messages total (system + 3 user messages)
|
||||
assert len(letta_messages) == 4
|
||||
# First message is the auto-created system message; "Message 0" is second
|
||||
assert letta_messages[0].message_type == "system_message"
|
||||
assert "Message 0" in letta_messages[1].content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -949,8 +960,9 @@ async def test_list_conversation_messages_descending_order(conversation_manager,
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
# First message should be "Message 2" (newest)
|
||||
assert len(letta_messages) == 3
|
||||
# create_conversation auto-creates a system message, so 4 total
|
||||
# First message should be "Message 2" (newest) in descending order
|
||||
assert len(letta_messages) == 4
|
||||
assert "Message 2" in letta_messages[0].content
|
||||
|
||||
|
||||
@@ -1081,7 +1093,8 @@ async def test_list_conversation_messages_no_group_id_returns_all(conversation_m
|
||||
actor=default_user,
|
||||
)
|
||||
|
||||
assert len(all_messages) == 3
|
||||
# create_conversation auto-creates a system message, so 4 total
|
||||
assert len(all_messages) == 4
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -1137,8 +1150,8 @@ async def test_list_conversation_messages_order_with_pagination(conversation_man
|
||||
|
||||
# The first messages should be different
|
||||
assert page_asc[0].content != page_desc[0].content
|
||||
# In ascending, first should be "Message 0"
|
||||
assert "Message 0" in page_asc[0].content
|
||||
# In ascending, first is the auto-created system message, second is "Message 0"
|
||||
assert page_asc[0].message_type == "system_message"
|
||||
# In descending, first should be "Message 4"
|
||||
assert "Message 4" in page_desc[0].content
|
||||
|
||||
|
||||
@@ -579,8 +579,11 @@ async def test_server_startup_syncs_base_providers(default_user, default_organiz
|
||||
yield item
|
||||
|
||||
# Mock the Anthropic AsyncAnthropic client
|
||||
# NOTE: list() must be a regular (non-async) method that returns an async iterable,
|
||||
# because the real Anthropic SDK's models.list() returns an AsyncPage (which has __aiter__)
|
||||
# directly, and the code uses `async for model in client.models.list()`.
|
||||
class MockAnthropicModels:
|
||||
async def list(self):
|
||||
def list(self):
|
||||
return MockAnthropicAsyncPage(mock_anthropic_models["data"])
|
||||
|
||||
class MockAsyncAnthropic:
|
||||
@@ -877,8 +880,10 @@ async def test_server_startup_handles_api_errors_gracefully(default_user, defaul
|
||||
for item in self._items:
|
||||
yield item
|
||||
|
||||
# NOTE: The real SDK's models.list() is a regular (non-async) method that
|
||||
# returns an AsyncPaginator (which is async-iterable).
|
||||
class MockAnthropicModels:
|
||||
async def list(self):
|
||||
def list(self):
|
||||
return MockAnthropicAsyncPage(mock_anthropic_data)
|
||||
|
||||
class MockAsyncAnthropic:
|
||||
|
||||
11
tests/model_settings/openai-gpt-5.3-chat-latest.json
Normal file
11
tests/model_settings/openai-gpt-5.3-chat-latest.json
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"handle": "openai/gpt-5.3-chat-latest",
|
||||
"model_settings": {
|
||||
"provider_type": "openai",
|
||||
"max_output_tokens": 4096,
|
||||
"parallel_tool_calls": false,
|
||||
"reasoning": {
|
||||
"reasoning_effort": "minimal"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
from conftest import create_test_module
|
||||
from letta_client import UnprocessableEntityError
|
||||
|
||||
from letta.constants import CORE_MEMORY_HUMAN_CHAR_LIMIT, CORE_MEMORY_PERSONA_CHAR_LIMIT
|
||||
from letta.constants import CORE_MEMORY_BLOCK_CHAR_LIMIT
|
||||
|
||||
BLOCKS_CREATE_PARAMS = [
|
||||
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_HUMAN_CHAR_LIMIT}, None),
|
||||
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_PERSONA_CHAR_LIMIT}, None),
|
||||
("human_block", {"label": "human", "value": "test"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
|
||||
("persona_block", {"label": "persona", "value": "test1"}, {"limit": CORE_MEMORY_BLOCK_CHAR_LIMIT}, None),
|
||||
]
|
||||
|
||||
BLOCKS_UPDATE_PARAMS = [
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -44,7 +44,7 @@
|
||||
"provider_name": null,
|
||||
"provider_category": null,
|
||||
"model_wrapper": null,
|
||||
"context_window": 32000,
|
||||
"context_window": 128000,
|
||||
"put_inner_thoughts_in_kwargs": false,
|
||||
"handle": "anthropic/claude-3.5-sonnet",
|
||||
"temperature": 1.0,
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -56,7 +56,7 @@
|
||||
"provider_name": "openai",
|
||||
"provider_category": "base",
|
||||
"model_wrapper": null,
|
||||
"context_window": 32000,
|
||||
"context_window": 128000,
|
||||
"put_inner_thoughts_in_kwargs": true,
|
||||
"handle": "openai/gpt-4o-mini",
|
||||
"temperature": 1.0,
|
||||
|
||||
@@ -55,7 +55,7 @@
|
||||
"provider_name": "openai",
|
||||
"provider_category": "base",
|
||||
"model_wrapper": null,
|
||||
"context_window": 32000,
|
||||
"context_window": 128000,
|
||||
"put_inner_thoughts_in_kwargs": true,
|
||||
"handle": "openai/gpt-4.1-mini",
|
||||
"temperature": 1.0,
|
||||
|
||||
@@ -16,7 +16,7 @@ def llm_config():
|
||||
model="claude-3-7-sonnet-20250219",
|
||||
model_endpoint_type="anthropic",
|
||||
model_endpoint="https://api.anthropic.com/v1",
|
||||
context_window=32000,
|
||||
context_window=128000,
|
||||
handle="anthropic/claude-sonnet-4-20250514",
|
||||
put_inner_thoughts_in_kwargs=False,
|
||||
max_tokens=4096,
|
||||
|
||||
@@ -52,8 +52,17 @@ class TestLogContextMiddleware:
|
||||
async def get_files(self, agent_id, org_id, ref):
|
||||
assert ref == "HEAD"
|
||||
return {
|
||||
"system/human.md": "---\ndescription: human\nlimit: 20000\n---\nname: sarah",
|
||||
"system/persona.md": "---\ndescription: persona\nlimit: 20000\n---\nbe helpful",
|
||||
"system/human.md": "---\ndescription: human\n---\nname: sarah",
|
||||
"system/persona.md": "---\ndescription: persona\n---\nbe helpful",
|
||||
"skills/research-helper/SKILL.md": (
|
||||
"---\n"
|
||||
"name: research-helper\n"
|
||||
"description: Search the web and summarize findings.\n"
|
||||
"---\n"
|
||||
"# Research Helper\n\n"
|
||||
"Use this skill to do deep web research and summarize results.\n"
|
||||
),
|
||||
"skills/research-helper/references/details.md": "---\ndescription: nested\n---\nShould not be synced",
|
||||
}
|
||||
|
||||
class DummyMemoryRepoManager:
|
||||
@@ -95,6 +104,12 @@ class TestLogContextMiddleware:
|
||||
labels = {call["label"] for call in synced_calls}
|
||||
assert "system/human" in labels
|
||||
assert "system/persona" in labels
|
||||
assert "skills/research-helper" in labels
|
||||
assert "skills/research-helper/references/details" not in labels
|
||||
|
||||
by_label = {call["label"]: call for call in synced_calls}
|
||||
assert by_label["skills/research-helper"]["description"] == "Search the web and summarize findings."
|
||||
assert by_label["skills/research-helper"]["value"].startswith("# Research Helper")
|
||||
|
||||
def test_extracts_actor_id_from_headers(self, client):
|
||||
response = client.get("/v1/agents/agent-123e4567-e89b-42d3-8456-426614174000", headers={"user_id": "user-abc123"})
|
||||
|
||||
@@ -25,9 +25,9 @@ def test_chat_memory_init_and_utils(chat_memory: Memory):
|
||||
|
||||
def test_memory_limit_validation(chat_memory: Memory):
|
||||
with pytest.raises(ValueError):
|
||||
ChatMemory(persona="x " * 50000, human="y " * 50000)
|
||||
ChatMemory(persona="x " * 60000, human="y " * 60000)
|
||||
with pytest.raises(ValueError):
|
||||
chat_memory.get_block("persona").value = "x " * 50000
|
||||
chat_memory.get_block("persona").value = "x " * 60000
|
||||
|
||||
|
||||
def test_get_block_not_found(chat_memory: Memory):
|
||||
@@ -253,3 +253,104 @@ def test_compile_git_memory_filesystem_handles_leaf_directory_collisions():
|
||||
assert "system/" in out
|
||||
assert "system.md" in out
|
||||
assert "human.md" in out
|
||||
|
||||
|
||||
def test_compile_git_memory_filesystem_renders_descriptions_for_non_system_files():
    """Only files outside system/ get a parenthesized description in the tree.

    For example, `reference/api.md (Contains API specifications)` is rendered,
    whereas entries under system/ are expected to appear by file name only.
    """
    # (label, value, description) for every block placed in memory.
    block_specs = [
        ("system/human", "human data", "The human block"),
        ("system/persona", "persona data", "The persona block"),
        ("reference/api", "api specs", "Contains API specifications"),
        ("notes", "my notes", "Personal notes and reminders"),
    ]
    memory = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label=label, value=value, limit=100, description=description)
            for label, value, description in block_specs
        ],
    )

    rendered = memory.compile()

    # The compiled output must contain the filesystem tree section.
    assert "<memory_filesystem>" in rendered

    # Entries outside system/ carry their description in parentheses.
    for annotated in (
        "api.md (Contains API specifications)",
        "notes.md (Personal notes and reminders)",
    ):
        assert annotated in rendered

    # Entries under system/ must not be annotated with their description ...
    for annotated in (
        "human.md (The human block)",
        "persona.md (The persona block)",
    ):
        assert annotated not in rendered

    # ... yet they still have to be listed under their bare file names.
    for bare_name in ("human.md", "persona.md"):
        assert bare_name in rendered
|
||||
|
||||
|
||||
def test_compile_git_memory_filesystem_no_description_when_empty():
    """Files outside system/ with no description should render without parentheses.

    Builds a git-enabled memory with one described block (`reference/api`) and
    two blocks without a description, then checks how the described and the
    undescribed non-system entries show up in the compiled output.
    """
    m = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=[
            Block(label="system/human", value="human data", limit=100),
            Block(label="notes", value="my notes", limit=100),
            Block(label="reference/api", value="api specs", limit=100, description="API docs"),
        ],
    )

    out = m.compile()

    # notes.md has no description, so the file name is followed directly by a
    # newline rather than by a parenthesized description.
    assert "notes.md\n" in out
    # reference/api.md has a description, rendered in parentheses.
    assert "api.md (API docs)" in out
|
||||
|
||||
|
||||
def test_compile_git_memory_filesystem_condenses_skills_to_top_level_entries():
    """Each skill under skills/ should collapse to one described top-level entry.

    Files nested below a skill are intentionally kept out of the system prompt
    tree so that the context stays concise.
    """
    # (label, value, description) for the skill blocks; the last one is nested
    # below a skill and is expected to be hidden from the tree.
    skill_specs = [
        ("skills/searching-messages", "# searching messages", "Search past messages to recall context."),
        ("skills/creating-skills", "# creating skills", "Guide for creating effective skills."),
        ("skills/creating-skills/references/workflows", "nested docs", "Nested workflow docs (should not appear)"),
    ]
    blocks = [Block(label="system/human", value="human data", limit=100)]
    blocks.extend(
        Block(label=label, value=value, limit=100, description=description)
        for label, value, description in skill_specs
    )
    memory = Memory(
        agent_type=AgentType.letta_v1_agent,
        git_enabled=True,
        blocks=blocks,
    )

    rendered = memory.compile()

    # Condensed top-level skill entries appear together with their descriptions.
    for entry in (
        "searching-messages (Search past messages to recall context.)",
        "creating-skills (Guide for creating effective skills.)",
    ):
        assert entry in rendered

    # Neither .md suffixes nor nested skill docs may show up in the tree.
    for hidden in (
        "searching-messages.md",
        "creating-skills.md",
        "references/workflows",
    ):
        assert hidden not in rendered
|
||||
|
||||
@@ -24,6 +24,9 @@ def test_get_headers_user_id_allows_none():
|
||||
letta_v1_agent=None,
|
||||
letta_v1_agent_message_async=None,
|
||||
modal_sandbox=None,
|
||||
billing_plan_type=None,
|
||||
billing_cost_source=None,
|
||||
billing_customer_id=None,
|
||||
)
|
||||
assert isinstance(headers, HeaderParams)
|
||||
|
||||
@@ -40,6 +43,9 @@ def test_get_headers_user_id_rejects_invalid_format():
|
||||
letta_v1_agent=None,
|
||||
letta_v1_agent_message_async=None,
|
||||
modal_sandbox=None,
|
||||
billing_plan_type=None,
|
||||
billing_cost_source=None,
|
||||
billing_customer_id=None,
|
||||
)
|
||||
|
||||
|
||||
@@ -54,6 +60,9 @@ def test_get_headers_user_id_accepts_valid_format():
|
||||
letta_v1_agent=None,
|
||||
letta_v1_agent_message_async=None,
|
||||
modal_sandbox=None,
|
||||
billing_plan_type=None,
|
||||
billing_cost_source=None,
|
||||
billing_customer_id=None,
|
||||
)
|
||||
assert headers.actor_id == "user-123e4567-e89b-42d3-8456-426614174000"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user