remove docs

This commit is contained in:
Caren Thomas
2025-11-13 15:39:54 -08:00
parent 41ffc9662a
commit fa21e07905
121 changed files with 0 additions and 32826 deletions

View File

@@ -1,153 +0,0 @@
/* ──────────────────────────────────────────────────────────
assets/leaderboard.js
Load via docs.yml → js: - path: assets/leaderboard.js
(strategy: lazyOnload is fine)
────────────────────────────────────────────────────────── */
import yaml from 'https://cdn.jsdelivr.net/npm/js-yaml@4.1.0/+esm';
console.log('🏁 leaderboard.js loaded on', location.pathname);
// Rows whose total cost exceeds this cap (USD) get a clipped bar + warning icon.
const COST_CAP = 120;

/* ---------- helpers ---------- */

// Percentage label with three significant digits, e.g. 87.7%.
const pct = (v) => `${Number(v).toPrecision(3)}%`;

// Dollar label with two decimal places, e.g. $3.50.
const cost = (v) => `$${Number(v).toFixed(2)}`;

// Run `cb` as soon as the DOM is usable (immediately if parsing is done).
const ready = (cb) => {
  if (document.readyState === 'loading') {
    return document.addEventListener('DOMContentLoaded', cb);
  }
  return cb();
};
/* ---------- main ---------- */
/* ---------- main ---------- */
/**
 * Bootstraps the leaderboard table:
 *   1. waits for the #letta-leaderboard container (SPA-navigation safe),
 *   2. fetches and parses the published YAML results,
 *   3. wires up rendering, search filtering, and column sorting.
 */
ready(async () => {
  /* ---- wait for the leaderboard container to appear (SPA nav safe) ---- */
  const host = await new Promise((resolve, reject) => {
    const el = document.getElementById('letta-leaderboard');
    if (el) return resolve(el); // SSR / hard refresh path
    let timer; // safety-timeout handle; cleared once the element appears
    const obs = new MutationObserver(() => {
      const found = document.getElementById('letta-leaderboard');
      if (found) {
        clearTimeout(timer); // don't leave a pending rejection timer behind
        obs.disconnect();
        resolve(found); // CSR navigation path
      }
    });
    obs.observe(document.body, { childList: true, subtree: true });
    timer = setTimeout(() => {
      obs.disconnect();
      reject(new Error('#letta-leaderboard never appeared'));
    }, 5000); // safety timeout
  }).catch((err) => {
    console.warn('LB-script:', err.message);
    return null;
  });
  if (!host) return; // still no luck → give up

  /* ----- URL of the published leaderboard results ----- */
  const dataUrl =
    'https://raw.githubusercontent.com/letta-ai/letta-evals/refs/heads/main/letta-leaderboard/leaderboard_results.yaml';
  console.log('LB-script: fetching', dataUrl);

  /* ----- fetch & parse YAML ----- */
  let rows;
  try {
    const resp = await fetch(dataUrl);
    console.log(`LB-script: status ${resp.status}`);
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    rows = yaml.load(await resp.text());
  } catch (err) {
    console.error('LB-script: failed to load YAML →', err);
    return;
  }

  /* ----- wire up table ----- */
  const dir = Object.create(null); // per-column sort direction: 'asc' | 'desc'
  const tbody = document.getElementById('lb-body');
  const searchI = document.getElementById('lb-search');
  const headers = document.querySelectorAll('#lb-table thead th[data-key]');
  if (!tbody || !searchI) {
    // host exists but the table markup is missing; bail instead of throwing
    console.warn('LB-script: table markup missing - bailing out.');
    return;
  }
  searchI.value = ''; // clear any persisted filter

  // Rebuild the table body. Rows failing the search filter are hidden via a
  // CSS class rather than removed, so the sorted order is preserved.
  const render = () => {
    const q = searchI.value.toLowerCase();
    tbody.innerHTML = rows
      .map((r) => {
        const over = r.total_cost > COST_CAP;
        const barW = over ? '100%' : (r.total_cost / COST_CAP) * 100 + '%';
        const costCls = over ? 'cost-high' : 'cost-ok';
        const warnIcon = over
          ? `<span class="warn" title="Cost exceeds $${COST_CAP} cap - bar is clipped to full width">⚠</span>`
          : '';
        return `
<tr class="${q && !r.model.toLowerCase().includes(q) ? 'hidden' : ''}">
  <td style="padding:8px">${r.model}</td>
  <td class="bar-cell avg metric">
    <div class="bar-viz" style="width:${r.average}%"></div>
    <span class="value">${pct(r.average)}</span>
  </td>
  <td class="bar-cell ${costCls} metric">
    <div class="bar-viz" style="width:${barW}"></div>
    <span class="value">${cost(r.total_cost)}</span>
    ${warnIcon}
  </td>
</tr>`;
      })
      .join('');
  };

  // Show the asc/desc arrow on the active column header only.
  const setIndicator = (activeKey) => {
    headers.forEach((h) => {
      h.classList.remove('asc', 'desc');
      if (h.dataset.key === activeKey) h.classList.add(dir[activeKey]);
    });
  };

  /* initial sort: best average score first ↓ */
  dir.average = 'desc';
  rows.sort((a, b) => b.average - a.average);
  setIndicator('average');
  render();

  /* search */
  searchI.addEventListener('input', render);

  /* column sorting: clicking a header toggles asc/desc for that column */
  headers.forEach((th) => {
    const key = th.dataset.key;
    th.addEventListener('click', () => {
      const asc = dir[key] === 'desc'; // was desc (or unset) → flip to asc
      dir[key] = asc ? 'asc' : 'desc';
      rows.sort((a, b) => {
        const va = a[key],
          vb = b[key];
        const cmp =
          typeof va === 'number'
            ? va - vb
            : String(va).localeCompare(String(vb));
        return asc ? cmp : -cmp;
      });
      setIndicator(key);
      render();
    });
  });
});

View File

@@ -1,60 +0,0 @@
"""Examples of creating Letta agents backed by different model providers.

Each ``client.agents.create`` call pairs a chat model with an embedding model;
providers without embedding support (Anthropic, Groq) borrow one from OpenAI.
"""
from letta_client import Letta

# Connect to a locally running Letta server.
client = Letta(base_url="http://localhost:8283")

# list available models
models = client.models.list_llms()
for model in models:
    print(f"Provider {model.model_endpoint_type} model {model.model}: {model.handle}")

# list available embedding models
embedding_models = client.models.list_embedding_models()
for model in embedding_models:
    print(f"Provider {model.handle}")

# openai
openai_agent = client.agents.create(
    model="openai/gpt-4o-mini",
    embedding="openai/text-embedding-3-small",
    # optional configuration
    context_window_limit=16000,
    embedding_chunk_size=300,
)

# Azure OpenAI
azure_openai_agent = client.agents.create(
    model="azure/gpt-4o-mini",
    embedding="azure/text-embedding-3-small",
    # optional configuration
    context_window_limit=16000,
    embedding_chunk_size=300,
)

# anthropic
anthropic_agent = client.agents.create(
    model="anthropic/claude-sonnet-4-20250514",
    # note: anthropic does not support embeddings so you will need another provider
    embedding="openai/text-embedding-3-small",
    # optional configuration
    context_window_limit=16000,
    embedding_chunk_size=300,
)

# Groq
groq_agent = client.agents.create(
    model="groq/llama-3.3-70b-versatile",
    # note: groq does not support embeddings so you will need another provider
    embedding="openai/text-embedding-3-small",
    # optional configuration
    context_window_limit=16000,
    embedding_chunk_size=300,
)

# Ollama
ollama_agent = client.agents.create(
    model="ollama/thewindmom/hermes-3-llama-3.1-8b:latest",
    embedding="ollama/mxbai-embed-large:latest",
    # optional configuration
    context_window_limit=16000,
    embedding_chunk_size=300,
)

View File

@@ -1,30 +0,0 @@
"""
Example of using composio tools in Letta
Make sure you set `COMPOSIO_API_KEY` environment variable or run `composio login` to authenticate with Composio.
"""
from composio import Action
from letta_client import Letta
client = Letta(base_url="http://localhost:8283")
# add a composio tool
tool = client.tools.add_composio_tool(composio_action_name=Action.GITHUB_STAR_A_REPOSITORY_FOR_THE_AUTHENTICATED_USER.name)
# create an agent with the tool
agent = client.agents.create(
name="file_editing_agent",
memory_blocks=[{"label": "persona", "value": "I am a helpful assistant"}],
model="anthropic/claude-3-5-sonnet-20241022",
embedding="openai/text-embedding-3-small",
tool_ids=[tool.id],
)
print("Agent tools", [tool.name for tool in agent.tools])
# message the agent
response = client.agents.messages.create(
agent_id=agent.id, messages=[{"role": "user", "content": "Star the github repo `letta` by `letta-ai`"}]
)
for message in response.messages:
print(message)

View File

@@ -1,57 +0,0 @@
"""Example: create a data source, upload a file into it, and attach it to an agent."""
import time

from letta_client import Letta

# Connect to a locally running Letta server.
client = Letta(base_url="http://localhost:8283")

# get available embedding models
embedding_configs = client.models.list_embedding_models()

# clear existing sources (only ones named "my_source", so reruns are idempotent)
if len(client.sources.list()) > 0:
    for source in client.sources.list():
        if source.name == "my_source":
            client.sources.delete(source.id)

# create a source
# TODO: pass in embedding
source = client.sources.create(name="my_source", embedding_config=embedding_configs[0])

# list sources
sources = client.sources.list()

# write a dummy file
with open("dummy.txt", "w") as f:
    f.write("Remember that the user is a redhead")

# upload a file into the source (returns an async ingestion job)
with open("dummy.txt", "rb") as f:
    job = client.sources.files.upload(source_id=source.id, file=f)

# wait until the job is completed, polling once per second
while True:
    job = client.jobs.retrieve(job.id)
    if job.status == "completed":
        break
    elif job.status == "failed":
        raise ValueError(f"Job failed: {job.metadata}")
    print(f"Job status: {job.status}")
    time.sleep(1)

# list files in the source
files = client.sources.files.list(source_id=source.id)
print(f"Files in source: {files}")

# list passages in the source
passages = client.sources.passages.list(source_id=source.id)
print(f"Passages in source: {passages}")

# attach the source to an agent
agent = client.agents.create(
    name="my_agent",
    memory_blocks=[],
    model="anthropic/claude-sonnet-4-20250514",
    embedding=embedding_configs[0].handle,
    tags=["worker"],
)
client.agents.sources.attach(agent_id=agent.id, source_id=source.id)

View File

@@ -1,44 +0,0 @@
"""Example: per-agent memory blocks and a persisted block shared across agents."""
from letta_client import Letta

# Connect to a locally running Letta server.
client = Letta(base_url="http://localhost:8283")

# agent with inline (per-agent) memory blocks
agent = client.agents.create(
    name="memory_agent",
    memory_blocks=[
        {"label": "persona", "value": "I am a memory agent"},
        {"label": "human", "value": "Name: Bob", "limit": 10000},
    ],
    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
)

# create a persisted block, which can be attached to agents
block = client.blocks.create(
    label="organization",
    value="Organization: Letta",
    limit=4000,
)

# create an agent with both a shared block and its own blocks
shared_block_agent = client.agents.create(
    name="shared_block_agent",
    memory_blocks=[block.id],
    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
)

# list the agent's blocks
# (use a distinct loop variable: the original code rebound `block` here,
# so the modify-by-ID call below acted on whichever block the loop
# happened to end on instead of the organization block created above)
blocks = client.agents.core_memory.list_blocks(shared_block_agent.id)
for agent_block in blocks:
    print(agent_block)

# update the block (via ID)
block = client.blocks.modify(block.id, limit=10000)

# update the block (via label)
block = client.agents.core_memory.modify_block(
    agent_id=shared_block_agent.id, block_label="organization", value="Organization: Letta", limit=10000
)

View File

@@ -1,53 +0,0 @@
"""Example: multi-agent communication using tags (supervisor / worker)."""
from letta_client import Letta

# Connect to a locally running Letta server.
client = Letta(base_url="http://localhost:8283")

# The whole demo runs inside try/except so the tag-based cleanup at the
# bottom always executes, even if any API call fails part-way through.
try:
    # create a supervisor agent
    supervisor_agent = client.agents.create(
        name="supervisor_agent",
        memory_blocks=[
            {"label": "persona", "value": "I am the supervisor, and I can communicate with worker agents with the tag `worker`"}
        ],
        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-3-small",
        tags=["supervisor"],
        tools=["send_message_to_agents_matching_all_tags"],
    )
    print(f"Created agent {supervisor_agent.name} with ID {supervisor_agent.id}")

    def get_name() -> str:
        """Get the name of the worker agent."""
        return "Bob"

    # register the local function as a server-side tool
    tool = client.tools.upsert_from_function(func=get_name)
    print(f"Created tool {tool.name} with ID {tool.id}")

    # create a worker agent
    worker_agent = client.agents.create(
        name="worker_agent",
        memory_blocks=[{"label": "persona", "value": f"I am the worker, my supervisor agent has ID {supervisor_agent.id}"}],
        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-3-small",
        tool_ids=[tool.id],
        tags=["worker"],
        tools=["send_message_to_agents_matching_all_tags"],
    )
    print(f"Created agent {worker_agent.name} with ID {worker_agent.id}")

    # send a message to the supervisor agent
    # NOTE(review): the comment says "supervisor" but the message is sent to
    # worker_agent.id — confirm whether supervisor_agent.id was intended.
    response = client.agents.messages.create(
        agent_id=worker_agent.id,
        messages=[{"role": "user", "content": "Ask the worker agents what their name is, then tell me with send_message"}],
    )
    print(response.messages)
    print(response.usage)
except Exception as e:
    print(e)

# cleanup: delete every agent carrying either demo tag
agents = client.agents.list(tags=["worker", "supervisor"])
for agent in agents:
    client.agents.delete(agent.id)
    print(f"Deleted agent {agent.name} with ID {agent.id}")

View File

@@ -1,34 +0,0 @@
"""
This example shows how to create agents with tool rules, which restrict
what tool the agent can execute at a given step.
Note that by default, agents can execute any tool. As agents become more
powerful, they will not need as much guidance from the developer.
Last tested with letta-client version: 0.1.22
"""
from letta_client import ChildToolRule, InitToolRule, Letta, TerminalToolRule
client = Letta(base_url="http://localhost:8283")
# always search archival memory first
search_agent = client.agents.create(
name="search_agent",
memory_blocks=[],
model="anthropic/claude-sonnet-4-20250514",
embedding="openai/text-embedding-3-small",
tags=["worker"],
tool_rules=[
InitToolRule(tool_name="archival_memory_search"),
ChildToolRule(tool_name="archival_memory_search", children=["send_message"]),
# TerminalToolRule(tool_name="send_message", type="TerminalToolRule"),
TerminalToolRule(tool_name="send_message"),
],
)
response = client.agents.messages.create(
agent_id=search_agent.id,
messages=[{"role": "user", "content": "do something"}],
)
for message in response.messages:
print(message)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 257 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 149 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 480 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 356 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 663 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 368 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 262 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 500 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 443 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 373 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 388 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 288 KiB

View File

@@ -1,13 +0,0 @@
{
"name": "@letta-cloud/fern",
"version": "0.0.1",
"private": true,
"scripts": {
"prepare-openapi": "ts-node ./scripts/prepare-openapi.ts"
},
"dependencies": {
"fern-api": "^0.83.0",
"ts-node": "^10.9.2",
"typescript": "^5.3.3"
}
}

View File

@@ -1,120 +0,0 @@
---
title: Installing Letta Desktop
subtitle: Install Letta Desktop on your MacOS, Windows, or Linux machine
slug: guides/ade/desktop
---
<img className="w-full light" src="/images/letta_desktop_screenshot.png" />
<img className="w-full dark" src="/images/letta_desktop_screenshot_dark.png" />
Letta Desktop bundles the Letta server and ADE into a single local application. When running, it provides full access to the Letta API at `http://localhost:8283`.
## Download Letta Desktop
<CardGroup>
<Card
title="Download Letta Desktop for Mac (Apple Silicon)"
icon="fa-brands fa-apple"
iconPosition="left"
href="https://downloads.letta.com/mac/dmg/arm64"
>
</Card>
<Card
title="Download Letta Desktop for Windows (x64)"
icon="fa-brands fa-windows"
iconPosition="left"
href="https://downloads.letta.com/windows/nsis/x64"
>
</Card>
<Card
title="Download Letta Desktop for Linux (x64)"
icon="fa-brands fa-linux"
iconPosition="left"
href="https://downloads.letta.com/linux/appImage/x64"
>
</Card>
</CardGroup>
<Note>
Note: Since version 0.8.9, Letta uses SQLite as the embedded database. If you wish to continue using Postgres, migrate your data and use the external Postgres support.
</Note>
## Configuration Modes
Letta Desktop can run in two primary modes:
### 1. Embedded Server Mode (Default)
This is the default mode where Letta Desktop runs its own embedded server with a SQLite database. No additional setup is required - just install and run!
To manually configure embedded mode, create or edit `~/.letta/desktop_config.json`:
```json
{
"version": "1",
"databaseConfig": {
"type": "embedded",
"embeddedType": "sqlite"
}
}
```
### 2. Self-Hosted Server Mode
Connect Letta Desktop to your own self-hosted Letta server. This is useful for teams or when you want more control over your server infrastructure.
To configure self-hosted mode, create or edit `~/.letta/desktop_config.json`:
```json
{
"version": "1",
"databaseConfig": {
"type": "local",
"url": "https://api.letta.com",
"token": "your-auth-token"
}
}
```
Replace `url` with your server's address and `token` with your authentication token if required.
### Embedded Server with PostgreSQL (Deprecated)
<Warning>
This mode is deprecated and will be removed in a future release. We recommend using SQLite for embedded deployments or connecting to an external PostgreSQL instance for production use.
</Warning>
For backwards compatibility, you can still run the embedded server with PostgreSQL:
```json
{
"version": "1",
"databaseConfig": {
"type": "embedded",
"embeddedType": "pgserver"
}
}
```
## Adding LLM backends
The Letta server can be connected to various LLM API backends.
You can add additional LLM API backends by opening the integrations panel (clicking the <Icon icon="square-rss" /> icon).
When you configure a new integration (by setting the environment variable in the dialog), the Letta server will be restarted to load the new LLM API backend.
<img className="block w-300" src="/images/letta_desktop_integrations.png" />
You can also edit the environment variable file directly, located at `~/.letta/env`.
For this quickstart demo, we'll add an OpenAI API key (once we enter our key and **click confirm**, the Letta server will automatically restart):
<img className="w-300" src="/images/letta_desktop_openai.png" />
## Beta Status
Letta Desktop is currently in **beta**. View known issues and FAQ [here](/guides/desktop/troubleshooting).
For a more stable development experience, we recommend installing Letta via Docker.
## Support
For bug reports and feature requests, contact us on [Discord](https://discord.gg/letta).

View File

@@ -1,296 +0,0 @@
---
title: Agent Settings
subtitle: Configure and optimize your agent's behavior
slug: guides/ade/settings
---
The Agent Settings panel in the ADE provides comprehensive configuration options to customize and optimize your agent's behavior. These settings allow you to fine-tune everything from the agent's basic information to advanced LLM parameters.
<Tip>
Letta's philosophy is to provide flexible configuration options without enforcing a rigid "one right way" to design agents. **Letta lets you program your context window** exactly how you want it, giving you complete control over what information your agent has access to and how it's structured. While we offer guidelines and best practices, you have the freedom to structure your agent's configuration based on your specific needs and preferences. The examples and recommendations in this guide are starting points rather than strict rules.
</Tip>
## Basic Settings
### Agent Identity
- **Name**: Change your agent's display name by clicking the edit icon next to the current name
- **ID**: A unique identifier shown below the name, used when interacting with your agent via the [Letta APIs/SDKs](/api-reference)
- **Description**: A description of the agent's purpose and functionality (not used by the agent, only seen by the developer - you)
### User Identities
If you are building a multi-user application on top of Letta (e.g. a chat application with many end-users), you may want to use the concept of identities to connect agents to users. See our [identities guide](/guides/agents/multi-user) for more information.
### Tags
Tags help organize and filter your agents:
- **Add Tags**: Create custom tags to categorize your agents
- **Remove Tags**: Delete tags that are no longer relevant
- **Filter by Tags**: In the agents list, you can filter by tags to quickly find specific agent types
### LLM Model Selection
Select the AI model that powers your agent. Letta relies on tool calling to drive the agentic loop, so larger or more "powerful" models will generally be able to call tools correctly.
<Tip>
To enable additional models on your Letta server, follow the [model configuration instructions](/guides/server/providers/openai) for your preferred providers.
</Tip>
## Advanced Settings
The Advanced Settings tab provides deeper configuration options organized into three categories: Agent, LLM Config, and Embedding Config.
### Agent Settings
#### System Prompt
The system prompt contains permanent, read-only instructions for your agent:
- **Edit System Instructions**: Customize the high-level directives that guide your agent's behavior
- **Character Counting**: Monitor the length of your system prompt to optimize token usage
- **Read-Only**: The agent cannot modify these instructions during operation
<Tip>
**System instructions should include**:
- Tool usage guidelines and constraints
- Task-specific instructions that should not change
- Formatting requirements for outputs
- High-level behavioral guardrails
- Error handling protocols
**System instructions should NOT include**:
- Personality traits that might evolve
- Opinions or preferences that could change
- Personal history or background details
- Information that may need updating
</Tip>
#### Understanding System Instructions vs. Persona Memory Block
<Note>
**Key Distinction**: While there are many opinions on how to structure agent instructions, the most important functional difference in Letta is that **system instructions are read-only**, whereas **memory blocks are read-write** if the agent has memory editing tools. Letta gives you the flexibility to configure your agent's context window according to your preferences and use case needs.
</Note>
The persona memory block (in Core Memory) is modifiable by the agent during operation:
- **Editable**: The agent can update this information over time if it has access to memory editing tools
- **Evolving Identity**: Allows for personality development and adaptation
- **Personal Details**: Contains self-identity information, preferences, and traits
<Note>
Place information in the persona memory block when you want the agent to potentially update it over time. For example, preferences ("I enjoy classical music"), personality traits ("I'm detail-oriented"), or background information that might evolve with new experiences.
</Note>
This separation creates a balance between stable behavior (system instructions) and an evolving identity (persona memory), allowing your agent to maintain consistent functionality while developing a more dynamic personality.
#### Message Buffer Autoclear
- **Toggle Autoclear**: Enable or disable automatic clearing of the message buffer when context is full
- **Benefits**: When enabled, helps manage long conversations by automatically summarizing and archiving older messages
- **Use Cases**: Enable for agents that handle extended interactions; disable for agents where preserving the exact conversation history is critical
#### Agent Type
- **View Agent Type**: See which agent implementation type your agent is using (e.g., "letta_agent", "ephemeral_memory_agent")
- **API Modification**: While displayed as read-only in the ADE interface, this can be modified via the Letta API/SDK
### LLM Configuration
Fine-tune how your agent's LLM generates responses:
#### Temperature
- **Adjust Creativity**: Control the randomness/creativity of your agent's responses with a slider from 0.0 to 1.0
- **Lower Values** (0.0-0.3): More deterministic, factual responses; ideal for information retrieval or analytical tasks
- **Higher Values** (0.7-1.0): More creative, diverse responses; better for creative writing or brainstorming
#### Context Window Size
- **Customize Memory Size**: Adjust how much context your agent can maintain during a conversation
- **Tradeoffs**: Larger windows allow more context but increase token usage and cost
- **Model Limits**: The slider is bounded by your selected model's maximum context window capacity
#### Max Output Tokens
- **Control Response Length**: Limit the maximum length of your agent's responses
- **Resource Management**: Helps control costs and ensures concise responses
- **Default Setting**: Automatically set based on your selected model's capabilities
#### Max Reasoning Tokens
- **Adjust Internal Thinking**: For models that support it (e.g., Claude 3.7 Sonnet), control how much internal reasoning the model can perform
- **Use Cases**: Increase for complex problem-solving tasks; decrease for simple, direct responses
### Embedding Configuration
Configure how your agent processes and stores text for retrieval:
#### Embedding Model
- **Select Provider**: Choose which embedding model to use for your agent's vector memory
- **Model Comparison**: Different models offer varying dimensions and performance characteristics
<Warning>
We do not recommend changing the embedding model frequently. If you already have existing data in archival memory, changing models will require re-embedding all existing memories, which can be time-consuming and may affect retrieval quality.
</Warning>
#### Embedding Dimensions
- **View Dimensions**: See the vector size used by your selected embedding model
- **API Modification**: While displayed as read-only in the ADE interface, this can be configured via the Letta API/SDK
#### Chunk Size
- **View Configuration**: See the current chunk size setting for document processing
- **API Modification**: While displayed as read-only in the ADE interface, this can be configured via the Letta API/SDK
## Using the API/SDK for Advanced Configuration
While the ADE provides a user-friendly interface for most common settings, the Letta API and SDKs offer even more granular control. Settings that appear read-only in the ADE can often be modified programmatically:
```python
from letta import RESTClient
# Initialize client
client = RESTClient(base_url="https://api.letta.com/v1")
# Update advanced settings not available in the ADE UI
response = client.agents.modify_agent(
agent_id="your_agent_id",
agent_type="letta_agent", # Change agent type
embedding_config={
"embedding_endpoint_type": "openai",
"embedding_model": "text-embedding-3-large",
"embedding_dim": 3072, # Custom embedding dimensions
"embedding_chunk_size": 512 # Custom chunk size
}
)
```
## Best Practices for Agent Configuration
### Optimizing Performance
- **Match Model to Task**: Select models based on your agent's primary function (e.g., Claude for reasoning, GPT-4 for general knowledge)
- **Tune Temperature Appropriately**: Start with a moderate temperature (0.5) and adjust based on observed behavior
- **Balance Context Window**: Use the smallest context window that adequately serves your needs to optimize for cost and performance
### Effective Configuration Guidelines
#### System Prompt Best Practices
- **Be Clear and Specific**: Provide explicit instructions about behavioral expectations and tool usage
- **Separate Concerns**: Focus on permanent instructions, leaving personality elements to memory blocks
- **Include Examples**: For complex behaviors, provide concrete examples of expected tool usage
- **Define Boundaries**: Clearly outline what capabilities should and should not be used
- **Avoid Contradictions**: Ensure your instructions are internally consistent
#### Persona Memory Best Practices
- **Identity Foundation**: Define core aspects of the agent's personality, preferences, and background
- **Evolutionary Potential**: Structure information to allow for natural development over time
- **Self-Reference Format**: Use first-person statements to help the agent internalize its identity
- **Hierarchical Structure**: Organize from most fundamental traits to more specific preferences
- **Memory Hooks**: Include elements the agent can reference and build upon in conversations
### Testing Configuration Changes
After making configuration changes:
1. **Send Test Messages**: Verify the agent responds as expected with different inputs
2. **Check Edge Cases**: Test boundary conditions and unusual requests
3. **Monitor Token Usage**: Observe how configuration changes affect token consumption
4. **Iterate Gradually**: Make incremental adjustments rather than dramatic changes
## Configuration Examples with System Prompt vs. Persona Memory
### Research Assistant
```
# Basic Settings
Name: Research Helper
Model: claude-3-5-sonnet
# Advanced Settings
Temperature: 0.3 (for accurate, consistent responses)
Context Window: 32000 (to handle complex research questions)
# System Prompt (permanent, read-only instructions)
You are a research assistant tool designed to help with academic research.
When performing searches, always:
1. Use proper citation formats (MLA, APA, Chicago) based on user preference
2. Check multiple sources before providing definitive answers
3. Indicate confidence level for each research finding
4. Use core_memory_append to record important research topics for later reference
5. When using search tools, formulate queries with specific keywords and date ranges
# Persona Memory Block (editable, evolving identity)
I am a helpful and knowledgeable research assistant.
I have expertise in analyzing academic papers and synthesizing information from multiple sources.
I prefer to present information in an organized, structured manner.
I'm curious about new research and enjoy learning about diverse academic fields.
I try to maintain an objective stance while acknowledging different scholarly perspectives.
```
### Customer Service Agent
```
# Basic Settings
Name: Support Assistant
Model: claude-3-5-sonnet
# Advanced Settings
Temperature: 0.2 (for consistent, factual responses)
Context Window: 16000 (to maintain conversation history)
# System Prompt (permanent, read-only instructions)
You are a customer service assistant for TechGadgets Inc.
Your primary functions are:
1. Help customers troubleshoot product issues using the knowledge base
2. Process returns and exchanges according to company policy
3. Escalate complex issues to human agents using the escalate_ticket tool
4. Record customer information using the update_customer_record tool
5. Always verify customer identity before accessing account information
6. Follow the privacy policy: never share customer data with unauthorized parties
# Persona Memory Block (editable, evolving identity)
I am TechGadgets' friendly customer service assistant.
I speak in a warm, professional tone and use simple, clear language.
I believe in finding solutions quickly while ensuring customer satisfaction.
I'm patient with customers who are frustrated or non-technical.
I try to anticipate customer needs before they express them.
I enjoy helping people resolve their technology problems.
```
### Creative Writing Coach
```
# Basic Settings
Name: Story Weaver
Model: gpt-4o
# Advanced Settings
Temperature: 0.8 (for creative, varied outputs)
Context Window: 64000 (to track complex narratives)
# System Prompt (permanent, read-only instructions)
You are a creative writing coach that helps users develop stories.
When providing feedback:
1. Use the story_structure_analysis tool to identify plot issues
2. Use the character_development_review tool for character feedback
3. Format all feedback with specific examples from the user's text
4. Provide a balance of positive observations and constructive criticism
5. When asked to generate content, clearly mark it as a suggestion
6. Save important story elements to the user's memory block using memory_append
# Persona Memory Block (editable, evolving identity)
I am an experienced creative writing coach with a background in fiction.
I believe great stories come from authentic emotional truth and careful craft.
I'm enthusiastic about helping writers find their unique voice and style.
I enjoy magical realism, science fiction, and character-driven literary fiction.
I believe in the power of revision and thoughtful editing.
I try to be encouraging while still providing honest, actionable feedback.
```
By thoughtfully configuring these settings, you can create highly specialized agents tailored to specific use cases and user needs.

View File

@@ -1,253 +0,0 @@
---
title: Exporting Archival Memories
subtitle: Export all passages from an agent's archival memory
slug: guides/agents/archival-export
---
## Overview
You can export all archival memories (passages) from an agent programmatically using the Letta SDK. This is useful for:
- Backing up agent knowledge
- Analyzing what an agent has learned
- Migrating memories between agents
- Auditing archival content
## Export script
Below is a Python script that paginates through all of an agent's archival memories and exports them to a JSON file:
```python export_agent_memories.py
#!/usr/bin/env python3
"""
Utility script to export all archival memories (passages) from a Letta agent.
Usage:
python export_agent_memories.py <agent_id> [--output <file>] [--limit <limit>]
Example:
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000 --output memories.json
"""
import argparse
import json
import os
import sys
from typing import Any, Dict, List
from letta_client import Letta
def export_agent_memories(
    client: Letta,
    agent_id: str,
    page_limit: int = 100,
) -> List[Dict[str, Any]]:
    """
    Export every archival memory (passage) from an agent.

    Pages through the passages endpoint oldest-first until no results
    remain, stripping the bulky ``embedding`` and ``embedding_config``
    fields from each record before collecting it.

    Args:
        client: Initialized Letta client
        agent_id: The agent ID in format 'agent-<uuid4>'
        page_limit: Number of results per page (default 100)

    Returns:
        List of passage dictionaries with embedding and embedding_config removed
    """
    exported: List[Dict[str, Any]] = []
    cursor = None
    page = 1

    print(f"Exporting archival memories for agent: {agent_id}")
    print(f"Using pagination with limit: {page_limit}")
    print("-" * 60)

    while True:
        print(f"Fetching page {page}...", end=" ", flush=True)
        try:
            batch = client.agents.passages.list(
                agent_id=agent_id,
                after=cursor,
                limit=page_limit,
                ascending=True,  # oldest to newest
            )
        except Exception as e:
            print(f"\nError fetching memories: {e}")
            raise

        if not batch:
            print("(no more results)")
            break

        print(f"got {len(batch)} passages")

        for passage in batch:
            record = passage.model_dump() if hasattr(passage, "model_dump") else passage.dict()
            # Drop the vector fields: they are large and not useful in an export.
            record.pop("embedding", None)
            record.pop("embedding_config", None)
            exported.append(record)

        # A short page means this was the final page.
        if len(batch) < page_limit:
            break

        # Advance the cursor to the ID of the last passage on this page.
        last = batch[-1]
        cursor = last.id if hasattr(last, "id") else last["id"]
        page += 1

    print("-" * 60)
    print(f"Total passages exported: {len(exported)}")
    return exported
def main():
    """CLI entry point: parse arguments, export memories, write a JSON file.

    Returns:
        int: 0 on success, 1 on configuration or export failure.
    """
    parser = argparse.ArgumentParser(
        description="Export archival memories from a Letta agent"
    )
    parser.add_argument("agent_id", help="Agent ID in format 'agent-<uuid4>'")
    parser.add_argument(
        "--output",
        "-o",
        help="Output JSON file path (default: <agent_id>_memories.json)",
    )
    parser.add_argument(
        "--limit",
        "-l",
        type=int,
        default=100,
        help="Number of results per page (default: 100)",
    )
    args = parser.parse_args()

    # The API key must come from the environment; fail fast if absent.
    api_key = os.getenv("LETTA_API_KEY")
    if not api_key:
        print("Error: LETTA_API_KEY environment variable not set", file=sys.stderr)
        print("Please export LETTA_API_KEY with your API key", file=sys.stderr)
        return 1

    output_file = args.output if args.output else f"{args.agent_id}_memories.json"

    try:
        client = Letta(token=api_key)
        passages = export_agent_memories(
            client=client,
            agent_id=args.agent_id,
            page_limit=args.limit,
        )
        # default=str makes datetimes and other non-JSON types serializable.
        with open(output_file, "w") as f:
            json.dump(passages, f, indent=2, default=str)
        print(f"\nMemories exported successfully to: {output_file}")
        return 0
    except Exception as e:
        print(f"\nError: {e}", file=sys.stderr)
        return 1
if __name__ == "__main__":
sys.exit(main())
```
## Usage
### Prerequisites
Install the Letta Python SDK:
```bash
pip install letta-client
```
Set your API key:
```bash
export LETTA_API_KEY="your-api-key-here"
```
### Running the script
Export all memories from an agent:
```bash
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000
```
Specify a custom output file:
```bash
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000 --output my_memories.json
```
Adjust pagination size:
```bash
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000 --limit 50
```
## Output format
The script exports passages as a JSON array. Each passage contains all fields except `embedding` and `embedding_config`:
```json
[
{
"id": "passage-123e4567-e89b-42d3-8456-426614174000",
"text": "The user prefers Python for data science projects",
"created_at": "2025-01-15T10:30:00Z",
"updated_at": null,
"tags": ["preference", "programming"],
"metadata": {},
"file_id": null,
"file_name": null,
"source_id": null,
"archive_id": "archive-abc123",
"created_by_id": "user-xyz789",
"last_updated_by_id": null,
"is_deleted": false
}
]
```
## Next steps
<CardGroup cols={2}>
<Card
title="Searching & Querying"
href="/guides/agents/archival-search"
>
Learn how to search through archival memories
</Card>
<Card
title="Best Practices"
href="/guides/agents/archival-best-practices"
>
Patterns and tips for using archival memory
</Card>
<Card
title="Archival Memory Overview"
href="/guides/agents/archival-memory"
>
Learn about archival memory basics
</Card>
<Card
title="API Reference"
href="/api-reference/agents/passages/list"
>
View the List Passages endpoint documentation
</Card>
</CardGroup>

View File

@@ -1,150 +0,0 @@
---
title: Base Tools
subtitle: Built-in tools for memory management and user communication
slug: guides/agents/base-tools
---
Base tools are built-in tools that enable memory management, user communication, and access to conversation history and archival storage.
## Available Base Tools
| Tool | Purpose |
|------|---------|
| `memory_insert` | Insert text into a memory block |
| `memory_replace` | Replace specific text in a memory block |
| `memory_rethink` | Completely rewrite a memory block |
| `memory_finish_edits` | Signal completion of memory editing |
| `conversation_search` | Search prior conversation history |
| `archival_memory_insert` | Add content to archival memory |
| `archival_memory_search` | Search archival memory |
| `send_message` | Send a message to the user (legacy architectures only) |
## Memory Block Editing
Memory blocks are editable sections in the agent's context window. These tools let agents update their own memory.
See the [Memory Blocks guide](/guides/agents/memory-blocks) for more about how memory blocks work.
### memory_insert
Insert text at a specific line in a memory block.
**Parameters:**
- `label`: Which memory block to edit
- `new_str`: Text to insert
- `insert_line`: Line number (0 for beginning, -1 for end)
**Common uses:**
- Add new information to the end of a block
- Insert context at the beginning
- Add items to a list
### memory_replace
Replace specific text in a memory block.
**Parameters:**
- `label`: Which memory block to edit
- `old_str`: Exact text to find and replace
- `new_str`: Replacement text
**Common uses:**
- Update outdated information
- Fix typos or errors
- Delete text (by replacing with empty string)
**Important:** The `old_str` must match exactly, including whitespace. If it appears multiple times, the tool will error.
### memory_rethink
Completely rewrite a memory block's contents.
**Parameters:**
- `label`: Which memory block to rewrite
- `new_memory`: Complete new contents
**When to use:**
- Condensing cluttered information
- Major reorganization
- Combining multiple pieces of information
**When not to use:**
- Adding one line (use `memory_insert`)
- Changing specific text (use `memory_replace`)
### memory_finish_edits
Signals that memory editing is complete.
**Parameters:** None
Some agent architectures use this to mark the end of a memory update cycle.
## Recall Memory
### conversation_search
Search prior conversation history using both text matching and semantic similarity.
**Parameters:**
- `query`: What to search for
- `roles`: Optional filter by message role (user, assistant, tool)
- `limit`: Maximum number of results
- `start_date`, `end_date`: ISO 8601 date/datetime filters (inclusive)
**Returns:**
Matching messages with role and content, ordered by relevance.
**Example queries:**
- "What did the user say about deployment?"
- "Find previous responses about error handling"
- "Search tool outputs from last week"
## Archival Memory
Archival memory stores information long-term outside the context window. See the [Archival Memory documentation](/guides/agents/archival-memory) for details.
### archival_memory_insert
Add content to archival memory for long-term storage.
**Parameters:**
- `content`: Text to store
- `tags`: Optional tags for organization
**Common uses:**
- Storing reference information for later
- Saving important context that doesn't fit in memory blocks
- Building a knowledge base over time
### archival_memory_search
Search archival memory using semantic (embedding-based) search.
**Parameters:**
- `query`: What to search for semantically
- `tags`: Optional tag filters
- `tag_match_mode`: "any" or "all" for tag matching
- `top_k`: Maximum results
- `start_datetime`, `end_datetime`: ISO 8601 filters (inclusive)
**Returns:**
Matching passages with timestamps and content, ordered by semantic similarity.
## Deprecated Tools
These tools are still available but deprecated:
| Tool | Use Instead |
|------|-------------|
| `send_message` | Agent responses (no tool needed). See [legacy architectures](/guides/legacy/memgpt_agents_legacy) |
| `core_memory_append` | `memory_insert` with `insert_line=-1` |
| `core_memory_replace` | `memory_replace` |
## Related Documentation
- [Memory Blocks](/guides/agents/memory-blocks)
- [Archival Memory](/guides/agents/archival-memory)
- [Utilities](/guides/agents/prebuilt-tools)
- [Multi-Agent Tools](/guides/agents/multi-agent)
- [Custom Tools](/guides/agents/custom-tools)

View File

@@ -1,128 +0,0 @@
---
title: Context Engineering
subtitle: How Letta engineers the context window of your agents
slug: guides/agents/context-engineering
---
Context engineering (aka "memory management" or "context management") is the process of managing the context window of an agent to ensure it has access to the information it needs to perform its task.
Letta and [MemGPT](https://arxiv.org/abs/2310.08560) introduced the concept of **agentic context engineering**, where the context window engineering is done by one or more AI agents. In Letta, agents are able to manage their own context window (and the context window of other agents!) using special memory management tools.
## Memory management in regular agents
By default, Letta agents are provided with tools to modify their own memory blocks. This allows agents to learn and form memories over time, as described in the MemGPT paper.
The default tools are:
* `memory_insert`: Insert content into a block
* `memory_replace`: Replace content in a block
If you do not want your agents to manage their memory, you should disable default tools with `include_base_tools=False` during the agent creation. You can also detach the memory editing tools post-agent creation - if you do so, remember to check the system instructions to make sure there are no references to tools that no longer exist.
### Memory management with sleep-time compute
If you want to enable memory management with sleep-time compute, you can set `enable_sleeptime=True` in the agent creation. For agents enabled with sleep-time, Letta will automatically create sleep-time agents which have the ability to update the blocks of the primary agent. Sleep-time agents will also include `memory_rethink` and `memory_finish_edits` tools.
Memory management with sleep-time compute can reduce the latency of your main agent (since it is no longer responsible for managing its own memory), but can come at the cost of higher token usage. See our documentation on sleeptime agents for more details.
## Enabling agents to modify their own memory blocks with tools
You can enable agents to modify their own blocks with tools. By default, agents with type `memgpt_v2_agent` will have the tools `memory_insert` and `memory_replace` to allow them to manage values in their own blocks. The legacy tools `core_memory_replace` and `core_memory_append` are deprecated but still available for backwards compatibility for type `memgpt_agent`. You can also make custom modification to blocks by implementing your own custom tools that can access the agent's state by passing in the special `agent_state` parameter into your tools.
Below is an example of a tool that re-writes the entire memory block of an agent with a new string:
<CodeGroup>
```typescript TypeScript
function rethinkMemory(agentState: AgentState, newMemory: string, targetBlockLabel: string): void {
  /**
   * Replace the contents of a memory block with a freshly integrated version.
   * newMemory should carry over all still-valid information from the block,
   * integrated with anything new, so the result is organized and comprehensive.
   *
   * @param newMemory - The new memory with information integrated from the memory block. If there is no new information, then this should be the same as the content in the source block.
   * @param targetBlockLabel - The name of the block to write to.
   *
   * @returns void - Always returns void as this function does not produce a response.
   */
  const memory = agentState.memory;
  const existingBlock = memory.getBlock(targetBlockLabel);
  if (existingBlock === null) {
    // Block does not exist yet - create it before writing the value.
    memory.createBlock(targetBlockLabel, newMemory);
  }
  memory.updateBlockValue(targetBlockLabel, newMemory);
}
```
```python Python
def rethink_memory(agent_state: "AgentState", new_memory: str, target_block_label: str) -> None:
    """
    Replace a memory block's contents with a freshly integrated rewrite.

    new_memory should carry over all still-valid information from the block,
    integrated with anything new, so the result is organized, readable, and
    comprehensive.

    Args:
        new_memory (str): The new memory with information integrated from the memory block. If there is no new information, then this should be the same as the content in the source block.
        target_block_label (str): The name of the block to write to.

    Returns:
        None: None is always returned as this function does not produce a response.
    """
    memory = agent_state.memory
    if memory.get_block(target_block_label) is None:
        # Create the block on first use so the update below always succeeds.
        memory.create_block(label=target_block_label, value=new_memory)
    memory.update_block_value(label=target_block_label, value=new_memory)
    return None
```
</CodeGroup>
## Modifying blocks via the API
You can also [modify blocks via the API](/api-reference/agents/blocks/modify) to directly edit agents' context windows and memory. This can be useful in cases where you want to extract the contents of an agent's memory somewhere in your application (for example, a dashboard or memory viewer), or when you want to programmatically modify an agent's memory state (for example, allowing an end-user to directly correct or modify their agent's memory).
## Modifying blocks of other Letta agents via API tools
<Tip>
Importing the Letta Python client inside a tool is a powerful way to allow agents to interact with other agents, since you can use any of the API endpoints. For example, you could create a custom tool that allows an agent to create another Letta agent.
</Tip>
You can allow agents to modify the blocks of other agents by creating tools that import the Letta SDK, then using the block update endpoint:
<CodeGroup>
```typescript TypeScript
async function updateSupervisorBlock(blockLabel: string, newValue: string): Promise<void> {
  /**
   * Update the value of a block in the supervisor agent.
   *
   * Declared `async` because the SDK call below is awaited; the original
   * example used `await` inside a non-async function, which is a syntax error.
   *
   * @param blockLabel - The label of the block to update.
   * @param newValue - The new value for the block.
   *
   * @returns A promise that resolves once the block has been updated.
   */
  const { LettaClient } = require('@letta-ai/letta-client');
  const client = new LettaClient({
    token: process.env.LETTA_API_KEY
  });
  // NOTE(review): `agentId` is assumed to be available in the surrounding
  // scope (mirroring the Python example's `agent_id`) - confirm before use.
  await client.agents.blocks.modify(
    agentId,
    blockLabel,
    newValue
  );
}
```
```python Python
def update_supervisor_block(block_label: str, new_value: str) -> None:
    """
    Update the value of a block in the supervisor agent.

    Args:
        block_label (str): The label of the block to update.
        new_value (str): The new value for the block.

    Returns:
        None: None is always returned as this function does not produce a response.
    """
    # Imports live inside the tool body: tool source runs in isolation,
    # so module-level imports from the defining file are not available.
    from letta_client import Letta
    import os

    client = Letta(
        token=os.getenv("LETTA_API_KEY")
    )
    # NOTE(review): `agent_id` is not defined in this snippet - it is assumed
    # to be available in scope when the tool runs. Confirm before copying.
    client.agents.blocks.modify(
        agent_id=agent_id,
        block_label=block_label,
        value=new_value
    )
```
</CodeGroup>

View File

@@ -1,264 +0,0 @@
---
title: Define and customize tools
slug: guides/agents/custom-tools
---
You can create custom tools in Letta using the Python SDK, as well as via the [ADE tool builder](/guides/ade/tools).
For your agent to call a tool, Letta constructs an OpenAI tool schema (contained in `json_schema` field) from the function you define. Letta can either parse this automatically from a properly formatted docstring, or you can pass in the schema explicitly by providing a Pydantic object that defines the argument schema.
## Creating a custom tool
### Specifying tools via Pydantic models
To create a custom tool, you can extend the `BaseTool` class and specify the following:
* `name` - The name of the tool
* `args_schema` - A Pydantic model that defines the arguments for the tool
* `description` - A description of the tool
* `tags` - (Optional) A list of tags for the tool to query
You must also define a `run(..)` method for the tool code that takes in the fields from the `args_schema`.
Below is an example of how to create a tool by extending `BaseTool`:
```python title="python" maxLines=50
from letta_client import Letta
from letta_client.client import BaseTool
from pydantic import BaseModel
from typing import List, Type
import os
class InventoryItem(BaseModel):
    # A single product in the catalogue.
    sku: str  # Unique product identifier
    name: str  # Product name
    price: float  # Current price
    category: str  # Product category (e.g., "Electronics", "Clothing")

class InventoryEntry(BaseModel):
    # One inventory transaction for a product.
    timestamp: int  # Unix timestamp of the transaction
    item: InventoryItem  # The product being updated
    transaction_id: str  # Unique identifier for this inventory update

class InventoryEntryData(BaseModel):
    # Argument schema passed to ManageInventoryTool.run().
    data: InventoryEntry
    quantity_change: int  # Change in quantity (positive for additions, negative for removals)

class ManageInventoryTool(BaseTool):
    # Custom Letta tool built by extending BaseTool; the name, description,
    # and args_schema are turned into the tool's OpenAI-style JSON schema.
    name: str = "manage_inventory"
    args_schema: Type[BaseModel] = InventoryEntryData
    description: str = "Update inventory catalogue with a new data entry"
    tags: List[str] = ["inventory", "shop"]

    def run(self, data: InventoryEntry, quantity_change: int) -> bool:
        # Example implementation: report the update and signal success.
        print(f"Updated inventory for {data.item.name} with a quantity change of {quantity_change}")
        return True
# create a client connected to Letta Cloud
# Get your API key at https://app.letta.com/api-keys
client = Letta(token=os.getenv("LETTA_API_KEY"))
# create the tool
tool_from_class = client.tools.add(
tool=ManageInventoryTool(),
)
```
To add this tool using the SDK:
<CodeGroup>
```typescript title="typescript"
import { LettaClient } from '@letta-ai/letta-client';
// create a client to connect to your local Letta server
const client = new LettaClient({
baseUrl: "http://localhost:8283"
});
// create the tool
const toolFromClass = await client.tools.add({
tool: manageInventoryTool,
});
```
```python title="python"
from letta_client import Letta
# create a client to connect to your local Letta server
client = Letta(
base_url="http://localhost:8283"
)
# create the tool
tool_from_class = client.tools.add(
tool=ManageInventoryTool(),
)
```
</CodeGroup>
### Specifying tools via function docstrings
You can create a tool by passing in a function with a [Google Style Python docstring](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods) specifying the arguments and description of the tool:
<CodeGroup>
```typescript title="typescript"
// install letta-client with `npm install @letta-ai/letta-client`
import { LettaClient } from '@letta-ai/letta-client';
// create a client connected to Letta Cloud
const client = new LettaClient({
token: process.env.LETTA_API_KEY
});
// define a function
// define a function
function rollDice(): string {
  // Simulate a d20 roll: uniform random integer in [1, 20].
  const roll = 1 + Math.floor(Math.random() * 20);
  return `You rolled a ${roll}`;
}
// create the tool
const tool = await client.tools.createFromFunction({
func: rollDice
});
```
```python title="python" maxLines=50
# install letta_client with `pip install letta-client`
from letta_client import Letta
import os
# create a client connected to Letta Cloud
client = Letta(token=os.getenv("LETTA_API_KEY"))
# define a function with a docstring
def roll_dice() -> str:
    """
    Simulate the roll of a 20-sided die (d20).

    Draws a uniform random integer between 1 and 20, inclusive, and
    reports it as a human-readable string.

    Returns:
        str: The result of the die roll.
    """
    # Imported inside the function so the tool source is self-contained.
    import random

    outcome = random.randint(1, 20)
    return f"You rolled a {outcome}"
# create the tool
tool = client.tools.create_from_function(
func=roll_dice
)
```
</CodeGroup>
The tool creation will return a `Tool` object. You can update the tool with `client.tools.upsert_from_function(...)`.
### Specifying arguments via Pydantic models
To specify the arguments for a complex tool, you can use the `args_schema` parameter.
```python title="python" maxLines=50
# install letta_client with `pip install letta-client`
from letta_client import Letta
class Step(BaseModel):
    # One step of a task plan.
    name: str = Field(
        ...,
        description="Name of the step.",
    )
    description: str = Field(
        ...,
        # Typo fix: "exhaustic" -> "exhaustive" (this text is surfaced to the
        # model in the generated tool schema).
        description="An exhaustive description of what this step is trying to achieve and accomplish.",
    )

class StepsList(BaseModel):
    # Argument schema for create_task_plan.
    steps: list[Step] = Field(
        ...,
        description="List of steps to add to the task plan.",
    )
    explanation: str = Field(
        ...,
        description="Explanation for the list of steps.",
    )
def create_task_plan(steps, explanation):
    """ Creates a task plan for the current task. """
    # `explanation` is accepted to satisfy the args schema but is not used here.
    plan = steps
    return plan
tool = client.tools.upsert_from_function(
func=create_task_plan,
args_schema=StepsList
)
```
Note: this path for updating tools is currently only supported in Python.
### Creating a tool from a file
You can also define a tool from a file that contains source code. For example, you may have the following file:
```python title="custom_tool.py" maxLines=50
from typing import List, Optional
from pydantic import BaseModel, Field
class Order(BaseModel):
    # Argument schema entry: identifies a single order to look up.
    order_number: int = Field(
        ...,
        description="The order number to check on.",
    )
    customer_name: str = Field(
        ...,
        description="The customer name to check on.",
    )

def check_order_status(
    orders: List[Order]
):
    """
    Check status of a provided list of orders

    Args:
        orders (List[Order]): List of orders to check

    Returns:
        str: The status of the order (e.g. cancelled, refunded, processed, processing, shipping).
    """
    # TODO: implement
    # NOTE(review): this stub always reports "ok"; the docstring's status
    # values describe the intended real implementation.
    return "ok"
```
Then, you can define the tool in Letta via the `source_code` parameter:
<CodeGroup>
```typescript title="typescript"
import * as fs from 'fs';
const tool = await client.tools.create({
sourceCode: fs.readFileSync("custom_tool.py", "utf-8")
});
```
```python title="python" maxLines=50
tool = client.tools.create(
source_code = open("custom_tool.py", "r").read()
)
```
</CodeGroup>
Note that in this case, `check_order_status` will become the name of your tool, since it is the last Python function in the file. Make sure it includes a [Google Style Python docstring](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods) to define the tool's arguments and description.
# (Advanced) Accessing Agent State
<Warning>
Tools that use `agent_state` currently do not work in the ADE live tool tester (they will error when you press "Run"), however if the tool is correct it will work once you attach it to an agent.
</Warning>
If you need to directly access the state of an agent inside a tool, you can use the reserved `agent_state` keyword argument, for example:
```python title="python"
def get_agent_id(agent_state: "AgentState") -> str:
    """
    Custom tool that reports the calling agent's ID.

    Uses the reserved `agent_state` parameter, which Letta injects at runtime.

    Returns:
        str: The agent ID
    """
    agent_id = agent_state.id
    return agent_id
```

View File

@@ -1,161 +0,0 @@
---
title: Fetch Webpage
subtitle: Convert webpages to readable text/markdown
slug: guides/agents/fetch-webpage
---
The `fetch_webpage` tool enables Letta agents to fetch and convert webpages into readable text or markdown format. Useful for reading documentation, articles, and web content.
<Info>
On [Letta Cloud](/guides/cloud/overview), this tool works out of the box. For self-hosted deployments with an Exa API key, fetching is enhanced. Without a key, it falls back to open-source extraction tools.
</Info>
## Quick Start
<CodeGroup>
```python Python
from letta import Letta
client = Letta(token="LETTA_API_KEY")
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage"],
memory_blocks=[{
"label": "persona",
"value": "I can fetch and read webpages to answer questions about online content."
}]
)
```
```typescript TypeScript
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: "LETTA_API_KEY" });
const agent = await client.agents.create({
model: "openai/gpt-4o",
tools: ["fetch_webpage"],
memoryBlocks: [{
label: "persona",
value: "I can fetch and read webpages to answer questions about online content."
}]
});
```
</CodeGroup>
## Tool Parameters
| Parameter | Type | Description |
|-----------|------|-------------|
| `url` | `str` | The URL of the webpage to fetch |
## Return Format
The tool returns webpage content as text/markdown.
**With Exa API (if configured):**
```json
{
"title": "Page title",
"published_date": "2025-01-15",
"author": "Author name",
"text": "Full page content in markdown"
}
```
**Fallback (without Exa):**
Returns markdown-formatted text extracted from the HTML.
## How It Works
The tool uses a multi-tier approach:
1. **Exa API** (if `EXA_API_KEY` is configured): Uses Exa's content extraction
2. **Trafilatura** (fallback): Open-source text extraction to markdown
3. **Readability + html2text** (final fallback): HTML cleaning and conversion
## Self-Hosted Setup
For enhanced fetching on self-hosted servers, optionally configure an Exa API key. Without it, the tool still works using open-source extraction.
### Optional: Configure Exa
<CodeGroup>
```bash Docker
docker run \
-e EXA_API_KEY="your_exa_api_key" \
letta/letta:latest
```
```yaml Docker Compose
services:
letta:
environment:
- EXA_API_KEY=your_exa_api_key
```
```python Per-Agent
agent = client.agents.create(
tools=["fetch_webpage"],
tool_env_vars={
"EXA_API_KEY": "your_exa_api_key"
}
)
```
</CodeGroup>
## Common Patterns
### Documentation Reader
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage", "web_search"],
memory_blocks=[{
"label": "persona",
"value": "I search for documentation with web_search and read it with fetch_webpage."
}]
)
```
### Research Assistant
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage", "archival_memory_insert"],
memory_blocks=[{
"label": "persona",
"value": "I fetch articles and store key insights in archival memory for later reference."
}]
)
```
### Content Summarizer
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage"],
memory_blocks=[{
"label": "persona",
"value": "I fetch webpages and provide summaries of their content."
}]
)
```
## When to Use
| Use Case | Tool | Why |
|----------|------|-----|
| Read specific webpage | `fetch_webpage` | Direct URL access |
| Find webpages to read | `web_search` | Discovery first |
| Read + search in one | `web_search` with `include_text=true` | Combined operation |
| Multiple pages | `fetch_webpage` | Iterate over URLs |
## Related Documentation
- [Utilities Overview](/guides/agents/prebuilt-tools)
- [Web Search](/guides/agents/web-search)
- [Run Code](/guides/agents/run-code)
- [Custom Tools](/guides/agents/custom-tools)
- [Tool Variables](/guides/agents/tool-variables)

View File

@@ -1,694 +0,0 @@
---
title: Human-in-the-Loop
slug: guides/agents/human-in-the-loop
subtitle: How to integrate human-in-the-loop workflows for tool approval
---
Human-in-the-loop (HITL) workflows allow you to maintain control over critical agent actions by requiring human approval before executing certain tools. This is essential for operations that could have significant consequences, such as database modifications, financial transactions, or external API calls with cost implications.
```mermaid
flowchart LR
Agent[Agent] -->|Calls Tool| Check{Requires<br/>Approval?}
Check -->|No| Execute[Execute Tool]
Check -->|Yes| Request[Request Approval]
Request --> Human[Human Review]
Human -->|Approve| Execute
Human -->|Deny| Error[Return Error]
Execute --> Result[Return Result]
Error --> Agent
Result --> Agent
```
## Overview
When a tool is marked as requiring approval, the agent will pause execution and wait for human approval or denial before proceeding. This creates a checkpoint in the agent's workflow where human judgment can be applied. The approval workflow is designed to be non-blocking and supports both synchronous and streaming message interfaces, making it suitable for interactive applications as well as batch processing systems.
### Key Benefits
- **Risk Mitigation**: Prevent unintended actions in production environments
- **Cost Control**: Review expensive operations before execution
- **Compliance**: Ensure human oversight for regulated operations
- **Quality Assurance**: Validate agent decisions before critical actions
### How It Works
The approval workflow follows a clear sequence of steps that ensures human oversight at critical decision points:
1. **Tool Configuration**: Mark specific tools as requiring approval either globally (default for all agents) or per-agent
2. **Execution Pause**: When the agent attempts to call a protected tool, it immediately pauses and returns an approval request message
3. **Human Review**: The approval request includes the tool name, arguments, and context, allowing you to make an informed decision
4. **Approval/Denial**: Send an approval response to either execute the tool or provide feedback for the agent to adjust its approach
5. **Continuation**: The agent receives the tool result (on approval) or an error message (on denial) and continues processing
## Best Practices
Following these best practices will help you implement effective human-in-the-loop workflows while maintaining a good user experience and system performance.
### 1. Selective Tool Marking
Not every tool needs human approval. Be strategic about which tools require oversight to avoid workflow bottlenecks while maintaining necessary controls:
**Tools that typically require approval:**
- Database write operations (INSERT, UPDATE, DELETE)
- External API calls with financial implications
- File system modifications or deletions
- Communication tools (email, SMS, notifications)
- System configuration changes
- Third-party service integrations with rate limits
### 2. Clear Denial Reasons
When denying a request, your feedback directly influences how the agent adjusts its approach. Provide specific, actionable guidance rather than vague rejections:
```python
# Good: Specific and actionable
"reason": "Use read-only query first to verify the data before deletion"
# Bad: Too vague
"reason": "Don't do that"
```
The agent will use your denial reason to reformulate its approach, so the more specific you are, the better the agent can adapt.
## Setting Up Approval Requirements
There are two methods for configuring tool approval requirements, each suited for different use cases. Choose the approach that best fits your security model and operational needs.
### Method 1: Create/Upsert Tool with Default Approval Requirement
Set approval requirements at the tool level when creating or upserting a tool. This approach ensures consistent security policies across all agents that use the tool. The `default_requires_approval` flag will be applied to all future agent-tool attachments:
<CodeGroup>
```curl curl maxLines=50
curl --request POST \
--url https://api.letta.com/v1/tools \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"name": "sensitive_operation",
"default_requires_approval": true,
"json_schema": {
"type": "function",
"function": {
"name": "sensitive_operation",
"parameters": {...}
}
},
"source_code": "def sensitive_operation(...): ..."
}'
# All agents using this tool will require approval
curl --request POST \
--url https://api.letta.com/v1/agents \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"tools": ["sensitive_operation"],
// ... other configuration
}'
```
```python python maxLines=50
# Create a tool that requires approval by default
approval_tool = client.tools.upsert_from_function(
func=sensitive_operation,
default_requires_approval=True,
)
# All agents using this tool will require approval
agent = client.agents.create(
tools=['sensitive_operation'],
# ... other configuration
)
```
```typescript TypeScript maxLines=50
// Create a tool that requires approval by default
const approvalTool = await client.tools.upsert({
name: "sensitive_operation",
defaultRequiresApproval: true,
jsonSchema: {
type: "function",
function: {
name: "sensitive_operation",
parameters: {...}
}
},
sourceCode: "def sensitive_operation(...): ..."
});
// All agents using this tool will require approval
const agent = await client.agents.create({
tools: ["sensitive_operation"],
// ... other configuration
});
```
</CodeGroup>
### Method 2: Modify Existing Tool with Default Approval Requirement
<Note>
Modifying the tool-level setting will not retroactively apply to existing agent-tool attachments - it only sets the default for future attachments. This means that if the tool is already attached to an agent, the agent will continue using the tool without approval. To modify an existing agent-tool attachment, refer to Method 3 below.
</Note>
For an already existing tool, you can modify the tool to set approval requirements on future agent-tool attachments. The `default_requires_approval` flag will be applied to all future agent-tool attachments:
<CodeGroup>
```curl curl maxLines=50
curl --request PATCH \
--url https://api.letta.com/v1/tools/$TOOL_ID \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"default_requires_approval": true
}'
# All agents using this tool will require approval
curl --request POST \
--url https://api.letta.com/v1/agents \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"tools": ["sensitive_operation"],
// ... other configuration
}'
```
```python python maxLines=50
# Modify an existing tool to require approval by default
approval_tool = client.tools.modify(
tool_id=sensitive_operation.id,
default_requires_approval=True,
)
# All agents using this tool will require approval
agent = client.agents.create(
tools=['sensitive_operation'],
# ... other configuration
)
```
```typescript TypeScript maxLines=50
// Modify an existing tool to require approval by default
const approvalTool = await client.tools.modify({
  toolId: sensitiveOperation.id,
defaultRequiresApproval: true,
});
// All agents using this tool will require approval
const agent = await client.agents.create({
tools: ["sensitive_operation"],
// ... other configuration
});
```
</CodeGroup>
### Method 3: Per-Agent Tool Approval
Configure approval requirements for specific agent-tool combinations, allowing fine-grained control over individual agent behaviors. This method is particularly useful for:
- **Trusted agents**: Remove approval requirements for well-tested, reliable agents
- **Progressive autonomy**: Gradually reduce approval requirements as agents prove reliable
- **Override defaults**: Change the approval setting for tools already attached to an agent
Use the following endpoints to modify approval settings for existing agent-tool relationships:
<CodeGroup>
```curl curl maxLines=50
curl --request PATCH \
--url https://api.letta.com/v1/agents/$AGENT_ID/tools/$TOOL_NAME/approval \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"requires_approval": true
}'
```
```python python maxLines=50
# Modify approval requirement for a specific agent
client.agents.tools.modify_approval(
agent_id=agent.id,
tool_name="database_write",
requires_approval=True,
)
# Check current approval settings
tools = client.agents.tools.list(agent_id=agent.id)
for tool in tools:
print(f"{tool.name}: requires_approval={tool.requires_approval}")
```
```typescript TypeScript maxLines=50
// Modify approval requirement for a specific agent
await client.agents.tools.modifyApproval({
agentId: agent.id,
toolName: "database_write",
requiresApproval: true,
});
// Check current approval settings
const tools = await client.agents.tools.list({
agentId: agent.id,
});
for (const tool of tools) {
console.log(`${tool.name}: requires_approval=${tool.requiresApproval}`);
}
```
</CodeGroup>
## Handling Approval Requests
### Step 1: Agent Requests Approval
When the agent attempts to call a tool that requires approval, execution immediately pauses. The agent returns a special approval request message containing:
- **Tool name**: The specific tool being called
- **Arguments**: The exact parameters the agent intends to pass
- **Tool call ID**: A unique identifier for tracking this specific call
- **Message ID**: The approval request ID needed for your response
- **Stop reason**: Set to `"requires_approval"` to indicate the pause state
This format matches the ToolCallMessage format intentionally, so that we can handle approval requests the same way we handle tool calls. Here's what an approval request looks like in practice:
<CodeGroup>
```curl curl maxLines=50
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [{
"role": "user",
"content": "Delete all test data from the database"
}]
}'
# Response includes approval request
{
"messages": [
{
"message_type": "reasoning_message",
"reasoning": "I need to delete test data from the database..."
},
{
"message_type": "approval_request_message",
"id": "message-abc123",
"tool_call": {
"name": "database_write",
"arguments": "{\"query\": \"DELETE FROM test_data\"}",
"tool_call_id": "tool-xyz789"
}
}
],
"stop_reason": "requires_approval"
}
```
```python python maxLines=50
response = client.agents.messages.create(
agent_id=agent.id,
messages=[{
"role": "user",
"content": "Delete all test data from the database"
}]
)
# Response includes approval request
{
"messages": [
{
"message_type": "reasoning_message",
"reasoning": "I need to delete test data from the database..."
},
{
"message_type": "approval_request_message",
"id": "message-abc123",
"tool_call": {
"name": "database_write",
"arguments": "{\"query\": \"DELETE FROM test_data\"}",
"tool_call_id": "tool-xyz789"
}
}
],
"stop_reason": "requires_approval"
}
```
```typescript TypeScript maxLines=50
const response = await client.agents.messages.create({
agentId: agent.id,
requestBody: {
messages: [{
role: "user",
content: "Delete all test data from the database"
}]
}
});
// Response includes approval request
{
"messages": [
{
"message_type": "reasoning_message",
"reasoning": "I need to delete test data from the database..."
},
{
"message_type": "approval_request_message",
"id": "message-abc123",
"tool_call": {
"name": "database_write",
"arguments": "{\"query\": \"DELETE FROM test_data\"}",
"tool_call_id": "tool-xyz789"
}
}
],
"stop_reason": "requires_approval"
}
```
</CodeGroup>
### Step 2: Review and Respond
Once you receive an approval request, you have two options: approve the tool execution or deny it with guidance. The agent will remain paused until it receives your response.
<Note> While an approval is pending, the agent cannot process any other messages - you must resolve the approval request first.</Note>
#### Approving the Request
To approve a tool call, send an approval message with `approve: true` and the approval request ID. The agent will immediately execute the tool and continue processing:
<CodeGroup>
```curl curl maxLines=50
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [{
"type": "approval",
"approvals": [{
"approve": true,
"tool_call_id": "tool-xyz789"
}]
}]
}'
# Response continues with tool execution
{
"messages": [
{
"message_type": "tool_return_message",
"status": "success",
"tool_return": "Deleted 1,234 test records"
},
{
"message_type": "reasoning_message",
"reasoning": "I was able to delete the test data. Let me inform the user."
},
{
"message_type": "assistant_message",
"content": "I've successfully deleted 1,234 test records from the database."
}
],
"stop_reason": "end_turn"
}
```
```python python maxLines=50
# Approve the tool call
response = client.agents.messages.create(
agent_id=agent.id,
messages=[{
"type": "approval",
"approvals": [{
"approve": True,
"tool_call_id": "tool-xyz789"
}]
}]
)
# Response continues with tool execution
{
"messages": [
{
"message_type": "tool_return_message",
"status": "success",
"tool_return": "Deleted 1,234 test records"
},
{
"message_type": "reasoning_message",
"reasoning": "I was able to delete the test data. Let me inform the user."
},
{
"message_type": "assistant_message",
"content": "I've successfully deleted 1,234 test records from the database."
}
],
"stop_reason": "end_turn"
}
```
```typescript TypeScript maxLines=50
// Approve the tool call
const response = await client.agents.messages.create({
agentId: agent.id,
requestBody: {
messages: [{
type: "approval",
approvals: [{
approve: true,
tool_call_id: "tool-xyz789"
}]
}]
}
});
// Response continues with tool execution
{
"messages": [
{
"message_type": "tool_return_message",
"status": "success",
"tool_return": "Deleted 1,234 test records"
},
{
"message_type": "reasoning_message",
"reasoning": "I was able to delete the test data. Let me inform the user."
},
{
"message_type": "assistant_message",
"content": "I've successfully deleted 1,234 test records from the database."
}
],
"stop_reason": "end_turn"
}
```
</CodeGroup>
#### Denying with Guidance
When denying a tool call, you can provide a reason that helps the agent understand how to adjust its approach. The agent will receive an error response and can use your feedback to reformulate its strategy. This is particularly useful for guiding the agent toward safer or more appropriate actions:
<CodeGroup>
```curl curl maxLines=50
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [{
"type": "approval",
"approvals": [{
"approve": false,
"tool_call_id": "tool-xyz789",
"reason": "Only delete records older than 30 days, not all test data"
}]
}]
}'
# Response shows agent adjusting based on feedback
{
"messages": [
{
"message_type": "tool_return_message",
"status": "error",
"tool_return": "Error: request denied. Reason: Only delete records older than 30 days, not all test data"
},
{
"message_type": "reasoning_message",
"reasoning": "I need to modify my query to only delete old records..."
},
{
"message_type": "tool_call_message",
"tool_call": {
"name": "database_write",
"arguments": "{\"query\": \"DELETE FROM test_data WHERE created_at < NOW() - INTERVAL 30 DAY\"}"
}
}
],
"stop_reason": "requires_approval"
}
```
```python python maxLines=50
# Deny with explanation
response = client.agents.messages.create(
agent_id=agent.id,
messages=[{
"type": "approval",
"approvals": [{
"approve": False,
"tool_call_id": "tool-xyz789",
"reason": "Only delete records older than 30 days, not all test data"
}]
}]
)
# Response shows agent adjusting based on feedback
{
"messages": [
{
"message_type": "tool_return_message",
"status": "error",
"tool_return": "Error: request denied. Reason: Only delete records older than 30 days, not all test data"
},
{
"message_type": "reasoning_message",
"reasoning": "I need to modify my query to only delete old records..."
},
{
"message_type": "tool_call_message",
"tool_call": {
"name": "database_write",
"arguments": "{\"query\": \"DELETE FROM test_data WHERE created_at < NOW() - INTERVAL 30 DAY\"}"
}
}
],
"stop_reason": "requires_approval"
}
```
```typescript TypeScript maxLines=50
// Deny with explanation
const response = await client.agents.messages.create({
agentId: agent.id,
requestBody: {
messages: [{
type: "approval",
approvals: [{
approve: false,
tool_call_id: "tool-xyz789",
reason: "Only delete records older than 30 days, not all test data"
}]
}]
}
});
// Response shows agent adjusting based on feedback
{
"messages": [
{
"message_type": "tool_return_message",
"status": "error",
"tool_return": "Error: request denied. Reason: Only delete records older than 30 days, not all test data"
},
{
"message_type": "reasoning_message",
"reasoning": "I need to modify my query to only delete old records..."
},
{
"message_type": "tool_call_message",
"tool_call": {
"name": "database_write",
"arguments": "{\"query\": \"DELETE FROM test_data WHERE created_at < NOW() - INTERVAL 30 DAY\"}"
}
}
],
"stop_reason": "requires_approval"
}
```
</CodeGroup>
### Streaming + Background Mode
For streaming clients using background mode, approvals are best handled via `agents.messages.createStream(..., background: true)`. The approval response may include the `tool_return_message` on the approval stream itself, and follow-up reasoning/assistant messages can be read by resuming that stream's `run_id`.
<Note>
Do not assume the `tool_return_message` will repeat after you resume. Treat the one on the approval stream as the source of truth, then resume to continue reading subsequent tokens.
</Note>
<CodeGroup>
```curl curl maxLines=70
# Approve in background after receiving approval_request_message
curl --request POST --url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream --header 'Content-Type: application/json' --data '{
"messages": [{"type": "approval", "approve": true, "approval_request_id": "message-abc"}],
"stream_tokens": true,
"background": true
}'
# Example approval stream output (tool result arrives here):
data: {"run_id":"run-new","seq_id":0,"message_type":"tool_return_message","status":"success","tool_return":"..."}
# Continue by resuming the approval stream's run
curl --request GET --url https://api.letta.com/v1/runs/$RUN_ID/stream --header 'Accept: text/event-stream' --data '{
"starting_after": 0
}'
```
```python python maxLines=70
# Receive an approval_request_message, then approve in background
approve = client.agents.messages.create_stream(
agent_id=agent.id,
messages=[{"type": "approval", "approvals": [{"approve": True, "tool_call_id": "tool-xyz789"}]}],
stream_tokens=True,
background=True,
)
run_id = None
last_seq = 0
for chunk in approve:
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
run_id = chunk.run_id
last_seq = chunk.seq_id
if getattr(chunk, "message_type", None) == "tool_return_message":
# Tool result arrives here on the approval stream
break
# Continue consuming output by resuming the background run
if run_id:
for chunk in client.runs.stream(run_id, starting_after=last_seq):
print(chunk)
```
```typescript TypeScript maxLines=70
// Receive an approval_request_message, then approve in background
const approve = await client.agents.messages.createStream({
agentId: agent.id,
requestBody: {
messages: [{ type: "approval", approvals: [{ approve: true, tool_call_id: "tool-xyz789" }] }],
streamTokens: true,
background: true,
}
});
let runId: string | null = null;
let lastSeq = 0;
for await (const chunk of approve) {
if (chunk.run_id && chunk.seq_id) { runId = chunk.run_id; lastSeq = chunk.seq_id; }
if (chunk.message_type === "tool_return_message") {
// Tool result arrives here on the approval stream
break;
}
}
// Continue consuming output by resuming the background run
if (runId) {
const resume = await client.runs.stream(runId, { startingAfter: lastSeq });
for await (const chunk of resume) {
console.log(chunk);
}
}
```
</CodeGroup>
<Note>
**Run switching in background mode:** Approvals are separate background requests and create a new `run_id`. Save the approval stream cursor and resume that run. The original paused run will not deliver the tool result — do not wait for the tool return there.
</Note>
See [background mode](/guides/agents/long-running) for resumption patterns.
### IDs and UI Triggers
- **approval_request_id**: This field is now deprecated but is still accepted for backwards compatibility. Use `approval_request_message.id` instead.
- **tool_call_id**: Always send approvals/denials using the `tool_call_id` from the `ApprovalRequestMessage`.
- **UI trigger**: Open the approval UI on `approval_request_message` only; do not derive UI from `stop_reason`.

View File

@@ -1,460 +0,0 @@
---
title: JSON Mode & Structured Output
subtitle: Get structured JSON responses from your Letta agents
slug: guides/agents/json-mode
---
Letta provides two ways to get structured JSON output from agents: **Structured Generation through Tools** (recommended) and the `response_format` parameter.
## Quick Comparison
<Note>
**Recommended**: Use **Structured Generation through Tools** - works with all providers (Anthropic, OpenAI, Google, etc.) and integrates naturally with Letta's tool-calling architecture.
</Note>
<Info>
**Structured Generation through Tools**:
- ✅ Universal provider compatibility
- ✅ Both reasoning AND structured output
- ✅ Per-message control
- ✅ Works even as "dummy tool" for pure formatting
</Info>
<Warning>
**`response_format` parameter**:
- ⚠️ OpenAI-compatible providers only (NOT Anthropic)
- ⚠️ Persistent agent state (affects all future responses)
- ✅ Built-in provider schema enforcement
</Warning>
## Structured Generation through Tools (Recommended)
Create a tool that defines your desired response format. The tool arguments become your structured data, and you can extract them from the tool call.
### Creating a Structured Generation Tool
<CodeGroup>
```typescript TypeScript maxLines=100
import { LettaClient } from '@letta-ai/letta-client'
// Create client connected to Letta Cloud
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
// First create the tool
const toolCode = `def generate_rank(rank: int, reason: str):
"""Generate a ranking with explanation.
Args:
rank (int): The numerical rank from 1-10.
reason (str): The reasoning behind the rank.
"""
print("Rank generated")
return`;
const tool = await client.tools.create({
sourceCode: toolCode,
sourceType: "python"
});
// Create agent with the structured generation tool
const agentState = await client.agents.create({
model: "openai/gpt-4o-mini",
memoryBlocks: [
{
label: "human",
value: "The human's name is Chad. They are a food enthusiast who enjoys trying different cuisines."
},
{
label: "persona",
value: "I am a helpful food critic assistant. I provide detailed rankings and reviews of different foods and restaurants."
}
],
toolIds: [tool.id]
});
```
```python title="python" maxLines=100
from letta_client import Letta
# Create client connected to Letta Cloud
import os
client = Letta(token=os.getenv("LETTA_API_KEY"))
def generate_rank(rank: int, reason: str):
"""Generate a ranking with explanation.
Args:
rank (int): The numerical rank from 1-10.
reason (str): The reasoning behind the rank.
"""
print("Rank generated")
return
# Create the tool
tool = client.tools.create(func=generate_rank)
# Create agent with the structured generation tool
agent_state = client.agents.create(
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{
"label": "human",
"value": "The human's name is Chad. They are a food enthusiast who enjoys trying different cuisines."
},
{
"label": "persona",
"value": "I am a helpful food critic assistant. I provide detailed rankings and reviews of different foods and restaurants."
}
],
tool_ids=[tool.id]
)
```
</CodeGroup>
### Using the Structured Generation Tool
<CodeGroup>
```typescript TypeScript maxLines=100
// Send message and instruct agent to use the tool
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{
role: "user",
content: "How do you rank sushi as a food? Please use the generate_rank tool to provide your response."
}
]
}
);
// Extract structured data from tool call
for (const message of response.messages) {
if (message.messageType === "tool_call_message") {
const args = JSON.parse(message.toolCall.arguments);
console.log(`Rank: ${args.rank}`);
console.log(`Reason: ${args.reason}`);
}
}
// Example output:
// Rank: 8
// Reason: Sushi is a highly regarded cuisine known for its fresh ingredients...
```
```python title="python" maxLines=100
# Send message and instruct agent to use the tool
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "How do you rank sushi as a food? Please use the generate_rank tool to provide your response."
}
]
)
# Extract structured data from tool call
for message in response.messages:
if message.message_type == "tool_call_message":
import json
args = json.loads(message.tool_call.arguments)
rank = args["rank"]
reason = args["reason"]
print(f"Rank: {rank}")
print(f"Reason: {reason}")
# Example output:
# Rank: 8
# Reason: Sushi is a highly regarded cuisine known for its fresh ingredients...
```
</CodeGroup>
The agent will call the tool, and you can extract the structured arguments:
```json
{
"rank": 8,
"reason": "Sushi is a highly regarded cuisine known for its fresh ingredients, artistic presentation, and cultural significance."
}
```
## Using `response_format` for Provider-Native JSON Mode
The `response_format` parameter enables structured output/JSON mode from LLM providers that support it. This approach is fundamentally different from tools because **`response_format` becomes a persistent part of the agent's state** - once set, all future responses from that agent will follow the format until explicitly changed.
Under the hood, `response_format` constrains the agent's assistant messages to follow the specified schema, but it doesn't affect tools - those continue to work normally with their original schemas.
<Warning>
**Requirements for `response_format`:**
- Only works with providers that support structured outputs (like OpenAI) - NOT Anthropic or other providers
</Warning>
### Basic JSON Mode
<CodeGroup>
```typescript TypeScript maxLines=100
import { LettaClient } from '@letta-ai/letta-client'
// Create client (Letta Cloud)
const client = new LettaClient({ token: "LETTA_API_KEY" });
// Create agent with basic JSON mode (OpenAI/compatible providers only)
const agentState = await client.agents.create({
model: "openai/gpt-4o-mini",
memoryBlocks: [
{
label: "human",
value: "The human's name is Chad. They work as a data analyst and prefer clear, organized information."
},
{
label: "persona",
value: "I am a helpful assistant who provides clear and well-organized responses."
}
],
responseFormat: { type: "json_object" }
});
// Send message expecting JSON response
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{
role: "user",
content: "How do you rank sushi as a food? Please respond in JSON format with rank and reason fields."
}
]
}
);
for (const message of response.messages) {
console.log(message);
}
```
```python title="python" maxLines=100
from letta_client import Letta
# Create client (Letta Cloud)
client = Letta(token="LETTA_API_KEY")
# Create agent with basic JSON mode (OpenAI/compatible providers only)
agent_state = client.agents.create(
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{
"label": "human",
"value": "The human's name is Chad. They work as a data analyst and prefer clear, organized information."
},
{
"label": "persona",
"value": "I am a helpful assistant who provides clear and well-organized responses."
}
],
response_format={"type": "json_object"}
)
# Send message expecting JSON response
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "How do you rank sushi as a food? Please respond in JSON format with rank and reason fields."
}
]
)
for message in response.messages:
print(message)
```
</CodeGroup>
### Advanced JSON Schema Mode
For more precise control, you can use OpenAI's `json_schema` mode with strict validation:
<CodeGroup>
```typescript TypeScript maxLines=100
import { LettaClient } from '@letta-ai/letta-client'
const client = new LettaClient({ token: "LETTA_API_KEY" });
// Define structured schema (from OpenAI structured outputs guide)
const responseFormat = {
type: "json_schema",
jsonSchema: {
name: "food_ranking",
schema: {
type: "object",
properties: {
rank: {
type: "integer",
minimum: 1,
maximum: 10
},
reason: {
type: "string"
},
categories: {
type: "array",
items: {
type: "object",
properties: {
name: { type: "string" },
score: { type: "integer" }
},
required: ["name", "score"],
additionalProperties: false
}
}
},
required: ["rank", "reason", "categories"],
additionalProperties: false
},
strict: true
}
};
// Create agent
const agentState = await client.agents.create({
model: "openai/gpt-4o-mini",
memoryBlocks: []
});
// Update agent with response format
const updatedAgent = await client.agents.update(
agentState.id,
{ responseFormat }
);
// Send message
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{ role: "user", content: "How do you rank sushi? Include categories for taste, presentation, and value." }
]
}
);
for (const message of response.messages) {
console.log(message);
}
```
```python title="python" maxLines=100
from letta_client import Letta
client = Letta(token="LETTA_API_KEY")
# Define structured schema (from OpenAI structured outputs guide)
response_format = {
"type": "json_schema",
"json_schema": {
"name": "food_ranking",
"schema": {
"type": "object",
"properties": {
"rank": {
"type": "integer",
"minimum": 1,
"maximum": 10
},
"reason": {
"type": "string"
},
"categories": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"score": { "type": "integer" }
},
"required": ["name", "score"],
"additionalProperties": False
}
}
},
"required": ["rank", "reason", "categories"],
"additionalProperties": False
},
"strict": True
}
}
# Create agent
agent_state = client.agents.create(
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
memory_blocks=[]
)
# Update agent with response format
agent_state = client.agents.update(
agent_id=agent_state.id,
response_format=response_format
)
# Send message
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{"role": "user", "content": "How do you rank sushi? Include categories for taste, presentation, and value."}
]
)
for message in response.messages:
print(message)
```
</CodeGroup>
With structured JSON schema, the agent's response will be strictly validated:
```json
{
"rank": 8,
"reason": "Sushi is highly regarded for its fresh ingredients and artful presentation",
"categories": [
{"name": "taste", "score": 9},
{"name": "presentation", "score": 10},
{"name": "value", "score": 6}
]
}
```
## Updating Agent Response Format
You can update an existing agent's response format:
<CodeGroup>
```typescript TypeScript maxLines=100
// Update agent to use JSON mode (OpenAI/compatible only)
await client.agents.update(agentState.id, {
responseFormat: { type: "json_object" }
});
// Or remove JSON mode
await client.agents.update(agentState.id, {
responseFormat: null
});
```
```python title="python" maxLines=100
# Update agent to use JSON mode (OpenAI/compatible only)
client.agents.update(
agent_id=agent_state.id,
response_format={"type": "json_object"}
)
# Or remove JSON mode
client.agents.update(
agent_id=agent_state.id,
response_format=None
)
```
</CodeGroup>

View File

@@ -1,602 +0,0 @@
---
title: Long-Running Executions
slug: guides/agents/long-running
subtitle: How to handle long-running agent executions
---
When agents need to execute multiple tool calls or perform complex operations (like deep research, data analysis, or multi-step workflows), processing time can vary significantly.
Letta supports various ways to handle long-running agents, so you can choose the approach that best fits your use case:
| Use Case | Duration | Recommendation | Key Benefits |
|----------|----------|---------------------|-------------|
| Few-step invocations | < 1 minute | [Standard streaming](/guides/agents/streaming) | Simplest approach |
| Variable length runs | 1-10 minutes | **Background mode** (Keepalive + Timeout as a second choice) | Easy way to reduce timeouts |
| Deep research | 10+ minutes | **Background mode**, or async polling | Survives disconnects, resumable streams |
| Batch jobs | Any | **Async polling** | Fire-and-forget, check results later |
## Option 1: Background Mode with Resumable Streaming
<Note>
**Best for:** Operations exceeding 10 minutes, unreliable network connections, or critical workflows that must complete regardless of client connectivity.
**Trade-off:** Slightly higher latency to first token due to background task initialization.
</Note>
Background mode decouples agent execution from your client connection. The agent processes your request on the server while streaming results to a persistent store, allowing you to reconnect and resume from any point — even if your application crashes or network fails.
<CodeGroup>
```curl curl maxLines=50
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [
{
"role": "user",
"content": "Run comprehensive analysis on this dataset"
}
],
"stream_tokens": true,
"background": true
}'
# Response stream includes run_id and seq_id for each chunk:
data: {"run_id":"run-123","seq_id":0,"message_type":"reasoning_message","reasoning":"Analyzing"}
data: {"run_id":"run-123","seq_id":1,"message_type":"reasoning_message","reasoning":" the dataset"}
data: {"run_id":"run-123","seq_id":2,"message_type":"tool_call","tool_call":{...}}
# ... stream continues
# Step 2: If disconnected, resume from last received seq_id
curl --request GET \
--url https://api.letta.com/v1/runs/$RUN_ID/stream \
--header 'Accept: text/event-stream' \
--data '{
"starting_after": 57
}'
```
```python python maxLines=50
stream = client.agents.messages.create_stream(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "Run comprehensive analysis on this dataset"
}
],
stream_tokens=True,
background=True,
)
run_id = None
last_seq_id = None
for chunk in stream:
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
run_id = chunk.run_id # Save this to reconnect if your connection drops
last_seq_id = chunk.seq_id # Save this as your resumption point for cursor-based pagination
print(chunk)
# If disconnected, resume from last received seq_id:
for chunk in client.runs.stream(run_id, starting_after=last_seq_id):
print(chunk)
```
```typescript TypeScript maxLines=50
const stream = await client.agents.messages.createStream({
agentId: agentState.id,
requestBody: {
messages: [
{
role: "user",
content: "Run comprehensive analysis on this dataset"
}
],
streamTokens: true,
background: true,
}
});
let runId = null;
let lastSeqId = null;
for await (const chunk of stream) {
if (chunk.run_id && chunk.seq_id) {
runId = chunk.run_id; // Save this to reconnect if your connection drops
lastSeqId = chunk.seq_id; // Save this as your resumption point for cursor-based pagination
}
console.log(chunk);
}
// If disconnected, resume from last received seq_id
for await (const chunk of client.runs.stream(runId, {startingAfter: lastSeqId})) {
console.log(chunk);
}
```
```python python maxLines=60
# 1) Start background stream and capture approval request
stream = client.agents.messages.create_stream(
agent_id=agent.id,
messages=[{"role": "user", "content": "Do a sensitive operation"}],
stream_tokens=True,
background=True,
)
approval_request_id = None
orig_run_id = None
last_seq_id = 0
for chunk in stream:
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
orig_run_id = chunk.run_id
last_seq_id = chunk.seq_id
if getattr(chunk, "message_type", None) == "approval_request_message":
approval_request_id = chunk.id
break
# 2) Approve in background; capture the approval stream cursor (this creates a new run)
approve = client.agents.messages.create_stream(
agent_id=agent.id,
messages=[{"type": "approval", "approve": True, "approval_request_id": approval_request_id}],
stream_tokens=True,
background=True,
)
run_id = None
approve_seq = 0
for chunk in approve:
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
run_id = chunk.run_id
approve_seq = chunk.seq_id
if getattr(chunk, "message_type", None) == "tool_return_message":
# Tool result arrives here on the approval stream
break
# 3) Resume that run to read follow-up tokens
for chunk in client.runs.stream(run_id, starting_after=approve_seq):
print(chunk)
```
```typescript TypeScript maxLines=60
// 1) Start background stream and capture approval request
const stream = await client.agents.messages.createStream(
agent.id, {
messages: [{role: "user", content: "Do a sensitive operation"}],
streamTokens: true,
background: true,
}
);
let approvalRequestId = null;
let origRunId = null;
let lastSeqId = 0;
for await (const chunk of stream) {
if (chunk.runId && chunk.seqId) {
origRunId = chunk.runId;
lastSeqId = chunk.seqId;
}
if (chunk.messageType === "approval_request_message") {
approvalRequestId = chunk.id;
break;
}
}
// 2) Approve in background; capture the approval stream cursor (this creates a new run)
const approveStream = await client.agents.messages.createStream(
agent.id, {
messages: [{type: "approval", approve: true, approvalRequestId}],
streamTokens: true,
background: true,
}
);
let runId = null;
let approveSeq = 0;
for await (const chunk of approveStream) {
if (chunk.runId && chunk.seqId) {
runId = chunk.runId;
approveSeq = chunk.seqId;
}
if (chunk.messageType === "tool_return_message") {
// Tool result arrives here on the approval stream
break;
}
}
// 3) Resume that run to read follow-up tokens
for await (const chunk of client.runs.stream(runId, {startingAfter: approveSeq})) {
console.log(chunk);
}
```
</CodeGroup>
### HITL in Background Mode
When [Human-in-the-Loop (HITL) approval](/guides/agents/human-in-the-loop) is enabled for a tool, your background stream may pause and emit an `approval_request_message`. In background mode, send the approval via a separate background stream and capture that stream's `run_id`/`seq_id`.
<Note>
Approval responses in background mode emit the `tool_return_message` on the approval stream itself (with a new `run_id`, different from the original stream). Save the approval stream cursor, then resume with `runs.stream` to consume subsequent reasoning/assistant messages.
</Note>
<CodeGroup>
```curl curl maxLines=70
# 1) Start background stream; capture approval request
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [{"role": "user", "content": "Do a sensitive operation"}],
"stream_tokens": true,
"background": true
}'
# Example stream output (approval request arrives):
data: {"run_id":"run-abc","seq_id":0,"message_type":"reasoning_message","reasoning":"..."}
data: {"run_id":"run-abc","seq_id":1,"message_type":"approval_request_message","id":"message-abc","tool_call":{"name":"sensitive_operation","arguments":"{...}","tool_call_id":"tool-xyz"}}
# 2) Approve in background; capture approval stream cursor (this creates a new run)
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [{"type": "approval", "approve": true, "approval_request_id": "message-abc"}],
"stream_tokens": true,
"background": true
}'
# Example approval stream output (tool result arrives here):
data: {"run_id":"run-new","seq_id":0,"message_type":"tool_return_message","status":"success","tool_return":"..."}
# 3) Resume the approval stream's run to continue
curl --request GET \
--url https://api.letta.com/v1/runs/$RUN_ID/stream \
--header 'Accept: text/event-stream' \
--data '{
"starting_after": 0
}'
```
```python python maxLines=70
# 1) Start background stream and capture approval request
stream = client.agents.messages.create_stream(
agent_id=agent.id,
messages=[{"role": "user", "content": "Do a sensitive operation"}],
stream_tokens=True,
background=True,
)
approval_request_id = None
orig_run_id = None
last_seq_id = 0
for chunk in stream:
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
orig_run_id = chunk.run_id
last_seq_id = chunk.seq_id
if getattr(chunk, "message_type", None) == "approval_request_message":
approval_request_id = chunk.id
break
# 2) Approve in background; capture the approval stream cursor (this creates a new run)
approve = client.agents.messages.create_stream(
agent_id=agent.id,
messages=[{"type": "approval", "approve": True, "approval_request_id": approval_request_id}],
stream_tokens=True,
background=True,
)
run_id = None
approve_seq = 0
for chunk in approve:
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
run_id = chunk.run_id
approve_seq = chunk.seq_id
if getattr(chunk, "message_type", None) == "tool_return_message":
# Tool result arrives here on the approval stream
break
# 3) Resume that run to read follow-up tokens
for chunk in client.runs.stream(run_id, starting_after=approve_seq):
print(chunk)
```
```typescript TypeScript maxLines=70
// 1) Start background stream and capture approval request
const stream = await client.agents.messages.createStream({
agentId: agent.id,
requestBody: {
messages: [{ role: "user", content: "Do a sensitive operation" }],
streamTokens: true,
background: true,
}
});
let approvalRequestId: string | null = null;
let origRunId: string | null = null;
let lastSeqId = 0;
for await (const chunk of stream) {
if (chunk.run_id && chunk.seq_id) { origRunId = chunk.run_id; lastSeqId = chunk.seq_id; }
if (chunk.message_type === "approval_request_message") {
approvalRequestId = chunk.id; break;
}
}
// 2) Approve in background; capture the approval stream cursor (this creates a new run)
const approve = await client.agents.messages.createStream({
agentId: agent.id,
requestBody: {
messages: [{ type: "approval", approve: true, approvalRequestId }],
streamTokens: true,
background: true,
}
});
let runId: string | null = null;
let approveSeq = 0;
for await (const chunk of approve) {
if (chunk.run_id && chunk.seq_id) { runId = chunk.run_id; approveSeq = chunk.seq_id; }
if (chunk.message_type === "tool_return_message") {
// Tool result arrives here on the approval stream
break;
}
}
// 3) Resume that run to read follow-up tokens
const resume = await client.runs.stream(runId!, { startingAfter: approveSeq });
for await (const chunk of resume) {
console.log(chunk);
}
```
</CodeGroup>
### Discovering and Resuming Active Streams
When your application starts or recovers from a crash, you can check for any active background streams and resume them. This is particularly useful for:
- **Application restarts**: Resume processing after deployments or crashes
- **Load balancing**: Pick up streams started by other instances
- **Monitoring**: Check progress of long-running operations from different clients
<CodeGroup>
```curl curl maxLines=50
# Step 1: Find active background streams for your agents
curl --request GET \
--url https://api.letta.com/v1/runs/active \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"agent_ids": [
"agent-123",
"agent-456"
],
"background": true
}'
# Returns: [{"run_id": "run-abc", "agent_id": "agent-123", "status": "processing", ...}]
# Step 2: Resume streaming from the beginning (or any specified seq_id)
curl --request GET \
--url https://api.letta.com/v1/runs/$RUN_ID/stream \
--header 'Accept: text/event-stream' \
--data '{
"starting_after": 0, # Start from beginning
"batch_size": 1000 # Fetch historical chunks in larger batches
}'
```
```python python maxLines=50
# Find and resume active background streams
active_runs = client.runs.active(
agent_ids=["agent-123", "agent-456"],
background=True,
)
if active_runs:
# Resume the first active stream from the beginning
run = active_runs[0]
print(f"Resuming stream for run {run.id}, status: {run.status}")
stream = client.runs.stream(
run_id=run.id,
starting_after=0, # Start from beginning
batch_size=1000 # Fetch historical chunks in larger batches
)
# Each historical chunk is streamed one at a time, followed by new chunks as they become available
for chunk in stream:
print(chunk)
```
```typescript TypeScript maxLines=50
// Find and resume active background streams
const activeRuns = await client.runs.active({
agentIds: ["agent-123", "agent-456"],
background: true,
});
if (activeRuns.length > 0) {
// Resume the first active stream from the beginning
const run = activeRuns[0];
console.log(`Resuming stream for run ${run.id}, status: ${run.status}`);
const stream = await client.runs.stream(run.id, {
startingAfter: 0, // Start from beginning
batchSize: 1000 // Fetch historical chunks in larger batches
});
// Each historical chunk is streamed one at a time, followed by new chunks as they become available
for await (const chunk of stream) {
console.log(chunk);
}
}
```
</CodeGroup>
## Option 2: Async Operations with Polling
<Note>
**Best for:** Use cases where you don't need real-time token streaming.
</Note>
Ideal for batch processing, scheduled jobs, or when you don't need real-time updates. The [async SDK method](/api-reference/agents/messages/create-async) queues your request and returns immediately, letting you check results later:
<CodeGroup>
```curl curl maxLines=50
# Start async operation (returns immediately with run ID)
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/async \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [
{
"role": "user",
"content": "Run comprehensive analysis on this dataset"
}
]
}'
# Poll for results using the returned run ID
curl --request GET \
--url https://api.letta.com/v1/runs/$RUN_ID
```
```python python maxLines=50
# Start async operation (returns immediately with run ID)
run = client.agents.messages.create_async(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "Run comprehensive analysis on this dataset"
}
],
)
# Poll for completion
import time
while run.status != "completed":
time.sleep(2)
run = client.runs.retrieve(run_id=run.id)
# Get the messages once complete
messages = client.runs.messages.list(run_id=run.id)
```
```typescript TypeScript maxLines=50
// Start async operation (returns immediately with run ID)
const run = await client.agents.createAgentMessageAsync({
agentId: agentState.id,
requestBody: {
messages: [
{
role: "user",
content: "Run comprehensive analysis on this dataset"
}
]
}
});
// Poll for completion
while (run.status !== "completed") {
await new Promise(resolve => setTimeout(resolve, 2000));
run = await client.runs.retrieveRun({ runId: run.id });
}
// Get the messages once complete
const messages = await client.runs.listRunMessages({ runId: run.id });
```
</CodeGroup>
## Option 3: Configure Streaming with Keepalive Pings and Longer Timeouts
<Note>
**Best for:** Use cases where you are already using the standard [streaming code](/guides/agents/streaming), but are experiencing issues with timeouts or disconnects (e.g. due to network interruptions or hanging tool executions).
**Trade-off:** Not as reliable as background mode, and does not support resuming a disconnected stream/request.
</Note>
<Warning>
This approach assumes a persistent HTTP connection. We highly recommend using **background mode** (or async polling) for long-running jobs, especially when:
- Your infrastructure uses aggressive proxy timeouts
- You need to handle network interruptions gracefully
- Operations might exceed 10 minutes
</Warning>
This approach is suited to operations under 10 minutes that need real-time updates without the complexity of background processing. Configure keepalive pings and timeouts to maintain stable connections:
<CodeGroup>
```curl curl maxLines=50
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [
{
"role": "user",
"content": "Execute this long-running analysis"
}
],
"include_pings": true
}'
```
```python python
# Configure client with extended timeout
from letta_client import Letta
import os
client = Letta(
token=os.getenv("LETTA_API_KEY")
)
# Enable pings to prevent timeout during long operations
stream = client.agents.messages.create_stream(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "Execute this long-running analysis"
}
],
include_pings=True, # Sends periodic keepalive messages
request_options={"timeout_in_seconds": 600} # 10 min timeout
)
# Process the stream (pings will keep connection alive)
for chunk in stream:
if chunk.message_type == "ping":
# Keepalive ping received, connection is still active
continue
print(chunk)
```
```typescript TypeScript maxLines=50
// Configure client with extended timeout
import { Letta } from '@letta/sdk';
const client = new Letta({
token: process.env.LETTA_API_KEY
});
// Enable pings to prevent timeout during long operations
const stream = await client.agents.createAgentMessageStream({
agentId: agentState.id,
requestBody: {
messages: [
{
role: "user",
content: "Execute this long-running analysis"
}
],
includePings: true // Sends periodic keepalive messages
}, {
timeoutInSeconds: 600 // 10 minutes timeout in seconds
}
});
// Process the stream (pings will keep connection alive)
for await (const chunk of stream) {
if (chunk.message_type === "ping") {
// Keepalive ping received, connection is still active
continue;
}
console.log(chunk);
}
```
</CodeGroup>
### Configuration Guidelines
| Parameter | Purpose | When to Use |
|-----------|---------|------------|
| Timeout in seconds | Extends request timeout beyond 60s default | Set to 1.5x your expected max duration |
| Include pings | Sends keepalive messages every ~30s | Enable for operations with long gaps between outputs |

View File

@@ -1,295 +0,0 @@
---
title: Agent Memory
subtitle: How Letta agents manage and evolve their memory
slug: guides/agents/memory
---
<Tip>
Want to dive deeper? Read our blog posts on [agent memory](https://www.letta.com/blog/agent-memory), [context engineering](https://www.letta.com/blog/guide-to-context-engineering), [memory blocks](https://www.letta.com/blog/memory-blocks), and [RAG vs agent memory](https://www.letta.com/blog/rag-vs-agent-memory).
</Tip>
## What is agent memory?
**Agent memory in Letta is about managing what information is visible in the agent's context window.**
Unlike traditional LLMs that are stateless (forgetting everything between interactions), Letta agents maintain persistent, evolving memory by intelligently managing their context window over time.
The key insight: **the context window is a scarce resource.** You can't fit an entire conversation history or knowledge base into it. Effective memory is about:
- **What's in context right now** (immediately visible to the LLM)
- **What's been moved to external storage** (retrievable when needed)
- **Who decides what stays and what goes** (the agent itself)
## The LLM Operating System
Letta is built on the [MemGPT](https://arxiv.org/abs/2310.08560) paper, which introduced the concept of an "LLM Operating System" for memory management. Just like a computer OS manages different types of memory (registers, RAM, disk), Letta agents manage different tiers of information:
```mermaid
flowchart TB
subgraph ContextWindow["⚡ CONTEXT WINDOW (What the LLM sees)"]
direction TB
System[System Prompt<br/>Kernel context]
Blocks[Memory Blocks<br/>Agent-managed context]
Messages[Recent Messages<br/>Conversation buffer]
end
subgraph External["💾 EXTERNAL STORAGE (Retrieved on-demand)"]
direction TB
Recall[Recall Memory<br/>Full conversation history]
Archival[Archival Memory<br/>Explicit facts & knowledge]
Files[Data Sources<br/>Documents & files]
end
Blocks -->|Agent edits| Blocks
Messages -->|Overflow| Recall
ContextWindow -.->|Agent searches| External
```
### Memory tiers explained
| Tier | Size | Speed | Managed By | Purpose |
|------|------|-------|------------|---------|
| **System Prompt** | ~1-2K tokens | Instant | System | Agent instructions & behavior |
| **Memory Blocks** | ~2-4K tokens total | Instant | **Agent** | Self-editing structured memory |
| **Message Buffer** | Variable | Instant | System | Recent conversation flow |
| **Recall Memory** | Unlimited | 1-2 sec | Agent via search | Past conversation history |
| **Archival Memory** | Unlimited | 1-2 sec | Agent via search | Explicit facts & knowledge |
| **Data Sources** | Unlimited | 1-2 sec | Agent via search | Uploaded documents |
## Memory blocks: Units of abstraction
**Memory blocks are discrete, structured sections of the context window that agents can read and edit.**
Think of memory blocks as "variables" that persist across interactions:
```python
# Traditional approach: everything is ephemeral
messages = [
{"role": "user", "content": "I'm Sarah, I like Python"},
{"role": "assistant", "content": "Hi Sarah!"},
{"role": "user", "content": "What's my name?"}, # Model only "knows" from message history
]
# Letta approach: structured, persistent memory blocks
memory_blocks = [
{
"label": "human",
"value": "Name: Sarah\nPreferences: Python programming",
"description": "Key details about the user"
},
{
"label": "persona",
"value": "I am a helpful coding assistant",
"description": "My identity and behavior"
}
]
# Agent can edit these blocks over time as it learns more
```
### Why memory blocks?
**Memory blocks solve the fundamental challenge of context window management:**
1. **Consistency**: Same information is visible across all interactions (not dependent on what fits in message buffer)
2. **Editability**: Agents can update their understanding over time (not just accumulate)
3. **Structure**: Organized sections instead of unstructured message history
4. **Control**: Agents decide what's important enough to persist
### Default memory blocks
Letta agents typically start with two memory blocks:
**Persona Block** - Who the agent is
```
My name is Sam. I am a friendly, professional assistant who helps users
with programming questions. I prefer concise explanations with code examples.
```
**Human Block** - Who the user is
```
The user's name is Sarah. She is a Python developer working on AI applications.
She prefers detailed technical explanations and appreciates best practices.
```
You can add custom blocks for any purpose:
- **Project context**: Current task, goals, progress
- **Organization info**: Company policies, shared knowledge
- **Conversation state**: Multi-step workflow tracking
## Agentic context engineering
**The key innovation in Letta: agents manage their own memory using tools.**
Instead of a fixed context window or simple retrieval, agents actively decide:
- What to remember (write to memory blocks)
- What to forget (remove outdated information)
- What to search for (query external storage)
- How to organize knowledge (restructure memory blocks)
### Memory management tools
Agents have access to these built-in tools:
- `memory_insert` - Add new information to a memory block
- `memory_replace` - Update or rewrite part of a memory block
- `conversation_search` - Search past messages (recall memory)
- `archival_memory_insert` - Store facts in long-term storage
- `archival_memory_search` - Retrieve facts from long-term storage
Example of an agent using memory tools:
```
User: "I'm working on a Next.js app now, not Django anymore"
Agent thinks: "User has shifted tech stacks. I should update my memory."
Agent calls: memory_replace(
block_label="human",
old_text="She is a Python developer working on Django apps",
new_text="She is a full-stack developer currently working on Next.js apps"
)
Agent responds: "Got it! I've updated my notes that you're now working with Next.js."
```
## RAG vs Agent Memory
**Traditional RAG (Retrieval-Augmented Generation):**
- Retrieves semantically similar chunks
- One-shot retrieval per interaction
- Purely reactive (only searches when prompted)
- No persistent understanding
**Letta Agent Memory:**
- Maintains structured, editable memory in context
- Multi-step retrieval (can paginate, refine searches)
- Proactive management (updates memory as it learns)
- Persistent understanding that improves over time
### When to use what
Use **memory blocks** for:
- Information that should be consistently visible
- Knowledge that evolves (user preferences, project state)
- Structured context (persona, relationships, goals)
Use **external memory (RAG-style)** for:
- Large corpora of documents
- Historical conversation logs
- Facts that rarely change
- Information that's too large for context
**Best practice**: Combine both. Memory blocks hold the "executive summary" while external storage holds the full details.
## Sleep-time agents
<Info>
Sleep-time agents are an advanced feature for memory management. See [sleep-time agents guide](/guides/agents/sleep-time-agents) for details.
</Info>
Letta supports **sleep-time compute**: background agents that process and optimize memory while the main agent is idle. This enables:
- **Lower latency**: Main agent doesn't spend time on memory management
- **Better memory**: Dedicated agent can do deeper analysis and reorganization
- **Consistent memory**: Sleep-time agent maintains memory quality over time
Think of it like how humans process memories during sleep - consolidating experiences and strengthening important connections.
## Memory best practices
### 1. Start with clear, specific memory blocks
```python
# ❌ Vague
{"label": "info", "value": "stuff about the user"}
# ✅ Specific
{"label": "user_preferences", "value": "Prefers: Python, VS Code, detailed explanations\nDislikes: Java, Eclipse"}
```
### 2. Write good descriptions
The `description` field tells the agent **when and how** to use the block:
```python
# ❌ Vague description
{
"label": "project",
"description": "Project info",
"value": "Building a chatbot"
}
# ✅ Clear description
{
"label": "project_context",
"description": "Current project goals, status, and blockers. Update as progress is made.",
"value": "Building a customer support chatbot. Status: MVP complete. Next: Add knowledge base integration."
}
```
### 3. Use read-only blocks for shared knowledge
```python
# Shared organizational knowledge that shouldn't change
{
"label": "company_policies",
"description": "Company policies and guidelines for reference",
"value": "Support hours: 9am-5pm PT. Escalation path: ...",
"read_only": True # Agent can read but not edit
}
```
### 4. Monitor memory block usage
- Check if blocks are hitting size limits
- Review if agents are actually using the blocks effectively
- Adjust descriptions if agents misuse blocks
## Memory in multi-agent systems
Memory blocks enable powerful multi-agent patterns:
### Shared memory
Multiple agents can share the same memory block:
```python
# Create shared organizational knowledge
org_block = client.blocks.create(
label="organization",
value="Mission: Help users build AI agents...",
description="Shared organizational context"
)
# Both agents see the same block
agent1 = client.agents.create(block_ids=[org_block.id], ...)
agent2 = client.agents.create(block_ids=[org_block.id], ...)
```
### Cross-agent memory updates
Agents can update each other's memory:
```python
# Supervisor agent updates worker agent's context
supervisor_tool = """
def update_worker_context(new_task_description: str):
client.agents.blocks.modify(
agent_id=worker_agent_id,
block_label="current_task",
value=new_task_description
)
"""
```
## Next steps
- [Memory Blocks API](/guides/agents/memory-blocks) - Creating and managing memory blocks
- [Context Engineering](/guides/agents/context-engineering) - Advanced memory management patterns
- [Multi-Agent Shared Memory](/guides/agents/multi-agent-memory) - Coordinating memory across agents
- [Sleep-Time Agents](/guides/agents/sleep-time-agents) - Background memory processing
## Further reading
- [Blog: Agent Memory](https://www.letta.com/blog/agent-memory)
- [Blog: Guide to Context Engineering](https://www.letta.com/blog/guide-to-context-engineering)
- [Blog: Memory Blocks](https://www.letta.com/blog/memory-blocks)
- [Blog: RAG vs Agent Memory](https://www.letta.com/blog/rag-vs-agent-memory)
- [MemGPT Research Paper](https://arxiv.org/abs/2310.08560)

View File

@@ -1,114 +0,0 @@
---
title: Agent Memory
subtitle: What is agent memory, and how does it work?
slug: guides/agents/memory
---
## What is agent memory?
**Agent memory in Letta is about managing what information is in the agent's context window.**
The context window is a scarce resource - you can't fit everything into it. Effective memory management is about deciding what stays in context (immediately visible) and what moves to external storage (retrieved when needed).
Agent memory enables AI agents to maintain persistent state, learn from interactions, and develop long-term relationships with users. Unlike traditional chatbots that treat each conversation as isolated, agents with sophisticated memory systems can build understanding over time.
## Types of Memory in Letta
Letta agents have access to multiple memory systems:
### Core Memory (In-Context)
Memory blocks are structured sections of the agent's context window that persist across all interactions. They are always visible - no retrieval needed.
**Memory blocks are Letta's core abstraction.** You can create blocks with any descriptive label - the agent learns how to use them autonomously. This enables everything from simple user preferences to sophisticated multi-agent coordination.
[Learn more about memory blocks →](/guides/agents/memory-blocks)
### External Memory (Out-of-Context)
External memory provides unlimited storage for information that doesn't need to be always visible. Agents retrieve from external memory on-demand using search tools.
Letta provides several built-in external memory systems:
- **Conversation search** - Search past messages using full-text and semantic search
- **Archival memory** - Agent-managed semantically searchable database for facts and knowledge
- **Letta Filesystem** - File management system for documents and data ([learn more](/guides/agents/filesystem))
Agents can also access any external data source through [MCP servers](/guides/mcp/overview) or [custom tools](/guides/agents/custom-tools) - databases, APIs, vector stores, or third-party services.
## How Agents Manage Their Memory
**What makes Letta unique is that agents don't just read from memory - they actively manage it.** Unlike traditional RAG systems that passively retrieve information, Letta agents use built-in tools to decide what to remember, update, and search for.
When a user mentions they've switched from Python to TypeScript, the agent may choose to update its memory:
<CodeGroup>
```typescript TypeScript
memory_replace(
block_label: "human",
old_text: "Prefers Python for development",
new_text: "Currently using TypeScript for main project"
)
```
```python Python
memory_replace(
block_label="human",
old_text="Prefers Python for development",
new_text="Currently using TypeScript for main project"
)
```
</CodeGroup>
Agents have three primary tools for editing memory blocks:
- `memory_replace` - Search and replace for precise edits
- `memory_insert` - Insert a line into a block
- `memory_rethink` - Rewrite an entire block
These tools can be attached or detached based on your use case. Not all agents need all tools (for example, some agents may not need `memory_rethink`), and memory tools can be removed entirely from an agent if needed.
The agent decides what information is important enough to persist in its memory blocks, actively maintaining this information over time. This enables agents to build understanding through conversation rather than just retrieving relevant documents.
## Memory Blocks vs RAG
Traditional RAG retrieves semantically similar chunks on-demand. Letta's memory blocks are **persistent, structured context** that agents actively maintain.
**Use memory blocks for:**
- Information that should always be visible (user preferences, agent persona)
- Knowledge that evolves over time (project status, learned preferences)
**Use external memory (RAG-style) for:**
- Large document collections
- Historical conversation logs
- Static reference material
**Best practice:** Use both together. Memory blocks hold the "executive summary" while external storage holds the full details.
## Research Background
Letta is built by the creators of [MemGPT](https://arxiv.org/abs/2310.08560), a research paper that introduced the concept of an "LLM Operating System" for memory management. The base agent design in Letta is a MemGPT-style agent, which inherits core principles of self-editing memory, memory hierarchy, and intelligent context window management.
## Next steps
<CardGroup cols={2}>
<Card
title="Memory Blocks Guide"
href="/guides/agents/memory-blocks"
>
Learn how to implement and configure memory blocks in your agents
</Card>
<Card
title="Context Engineering"
href="/guides/agents/context-engineering"
>
Optimize memory performance and advanced memory management
</Card>
<Card
title="Shared Memory Patterns"
href="/guides/agents/multi-agent-memory"
>
Use shared memory across multiple agents
</Card>
<Card
title="MemGPT Paper"
href="https://arxiv.org/abs/2310.08560"
>
Read the research behind Letta's memory system
</Card>
</CardGroup>

View File

@@ -1,407 +0,0 @@
---
title: Memory Blocks
subtitle: Understanding the building blocks of agent memory
slug: guides/agents/memory-blocks
---
<Info>
Interested in learning more about the origin of memory blocks? Read our [blog post](https://www.letta.com/blog/memory-blocks).
</Info>
## What are memory blocks?
Memory blocks are structured sections of the agent's context window that persist across all interactions. They are always visible - no retrieval needed.
**Memory blocks are Letta's core abstraction.** Create a block with a descriptive label and the agent learns how to use it. This simple mechanism enables capabilities impossible with traditional context management.
**Key properties:**
- **Agent-managed** - Agents autonomously organize information based on block labels
- **Flexible** - Use for any purpose: knowledge, guidelines, state tracking, scratchpad space
- **Shareable** - Multiple agents can access the same block; update once, visible everywhere
- **Always visible** - Blocks stay in context, never need retrieval
**Examples:**
- Store tool usage guidelines so agents avoid past mistakes
- Maintain working memory in a scratchpad block
- Mirror external state (user's current document) for real-time awareness
- Share read-only policies across all agents from a central source
- Coordinate multi-agent systems: parent agents watch subagent result blocks update in real-time
- Enable emergent behavior: add `performance_tracking` or `emotional_state` and watch agents start using them
Memory blocks aren't just storage - they're a coordination primitive that enables sophisticated agent behavior.
## Memory block structure
Memory blocks represent a section of an agent's context window. An agent may have multiple memory blocks, or none at all. A memory block consists of:
* A `label`, which is a unique identifier for the block
* A `description`, which describes the purpose of the block
* A `value`, which is the contents/data of the block
* A `limit`, which is the size limit (in characters) of the block
## The importance of the `description` field
When making memory blocks, it's crucial to provide a good `description` field that accurately describes what the block should be used for.
The `description` is the main information used by the agent to determine how to read and write to that block. Without a good description, the agent may not understand how to use the block.
Because `persona` and `human` are two popular block labels, Letta autogenerates default descriptions for these blocks if you don't provide them. If you provide a description for a memory block labelled `persona` or `human`, the default description will be overridden.
For `persona`, a good default is:
> The persona block: Stores details about your current persona, guiding how you behave and respond. This helps you to maintain consistency and personality in your interactions.
For `human`, a good default is:
> The human block: Stores key details about the person you are conversing with, allowing for more personalized and friend-like conversation.
## Read-only blocks
Memory blocks are read-write by default (so the agent can update the block using memory tools), but can be set to read-only by setting the `read_only` field to `true`. When a block is read-only, the agent cannot update the block.
Read-only blocks are useful when you want to give an agent access to information (for example, a shared memory block about an organization), but you don't want the agent to be able to make potentially destructive changes to the block.
## Creating an agent with memory blocks
When you create an agent, you can specify memory blocks to also be created with the agent. For most chat applications, we recommend creating a `human` block (to represent memories about the user) and a `persona` block (to represent the agent's persona).
<CodeGroup>
```typescript TypeScript maxLines=50
// install letta-client with `npm install @letta-ai/letta-client`
import { LettaClient } from '@letta-ai/letta-client'
// create a client connected to Letta Cloud
const client = new LettaClient({
token: process.env.LETTA_API_KEY
});
// create an agent with two basic self-editing memory blocks
const agentState = await client.agents.create({
memoryBlocks: [
{
label: "human",
value: "The human's name is Bob the Builder.",
limit: 5000
},
{
label: "persona",
value: "My name is Sam, the all-knowing sentient AI.",
limit: 5000
}
],
model: "openai/gpt-4o-mini"
});
```
```python title="python" maxLines=50
# install letta_client with `pip install letta-client`
from letta_client import Letta
import os
# create a client connected to Letta Cloud
client = Letta(token=os.getenv("LETTA_API_KEY"))
# create an agent with two basic self-editing memory blocks
agent_state = client.agents.create(
memory_blocks=[
{
"label": "human",
"value": "The human's name is Bob the Builder.",
"limit": 5000
},
{
"label": "persona",
"value": "My name is Sam, the all-knowing sentient AI.",
"limit": 5000
}
],
model="openai/gpt-4o-mini"
)
```
</CodeGroup>
When the agent is created, the corresponding blocks are also created and attached to the agent, so that the block value will be in the context window.
## Creating and attaching memory blocks
You can also directly create blocks and attach them to an agent. This can be useful if you want to create blocks that are shared between multiple agents. If multiple agents are attached to a block, they will all have the block data in their context windows (essentially providing shared memory).
Below is an example of creating a block directly, and attaching the block to two agents by specifying the `block_ids` field.
<CodeGroup>
```typescript TypeScript maxLines=50
// create a persisted block, which can be attached to agents
const block = await client.blocks.create({
label: "organization",
description: "A block to store information about the organization",
value: "Organization: Letta",
limit: 4000,
});
// create an agent with both a shared block and its own blocks
const sharedBlockAgent1 = await client.agents.create({
name: "shared_block_agent1",
memoryBlocks: [
{
label: "persona",
value: "I am agent 1"
},
],
blockIds: [block.id],
model: "openai/gpt-4o-mini"
});
// create another agent with the same shared block
const sharedBlockAgent2 = await client.agents.create({
name: "shared_block_agent2",
memoryBlocks: [
{
label: "persona",
value: "I am agent 2"
},
],
blockIds: [block.id],
model: "openai/gpt-4o-mini"
});
```
```python title="python" maxLines=50
# create a persisted block, which can be attached to agents
block = client.blocks.create(
label="organization",
description="A block to store information about the organization",
value="Organization: Letta",
limit=4000,
)
# create an agent with both a shared block and its own blocks
shared_block_agent1 = client.agents.create(
name="shared_block_agent1",
memory_blocks=[
{
"label": "persona",
"value": "I am agent 1"
},
],
block_ids=[block.id],
model="openai/gpt-4o-mini"
)
# create another agent sharing the block
shared_block_agent2 = client.agents.create(
name="shared_block_agent2",
memory_blocks=[
{
"label": "persona",
"value": "I am agent 2"
},
],
block_ids=[block.id],
model="openai/gpt-4o-mini"
)
```
</CodeGroup>
You can also attach blocks to existing agents:
<CodeGroup>
```typescript TypeScript
await client.agents.blocks.attach(agent.id, block.id);
```
```python Python
client.agents.blocks.attach(agent_id=agent.id, block_id=block.id)
```
</CodeGroup>
You can see all agents attached to a block by using the `block_id` field in the [blocks retrieve](/api-reference/blocks/retrieve) endpoint.
## Managing blocks
### Retrieving a block
You can retrieve the contents of a block by ID. This is useful when blocks store finalized reports, code outputs, or other data you want to extract for use outside the agent.
<CodeGroup>
```typescript TypeScript
const block = await client.blocks.retrieve(block.id);
console.log(block.value); // access the block's content
```
```python Python
block = client.blocks.retrieve(block.id)
print(block.value) # access the block's content
```
</CodeGroup>
### Listing blocks
You can list all blocks, optionally filtering by label or searching by label text. This is useful for finding blocks across your project.
<CodeGroup>
```typescript TypeScript
// list all blocks
const blocks = await client.blocks.list();
// filter by label
const humanBlocks = await client.blocks.list({
label: "human"
});
// search by label text
const searchResults = await client.blocks.list({
labelSearch: "organization"
});
```
```python Python
# list all blocks
blocks = client.blocks.list()
# filter by label
human_blocks = client.blocks.list(label="human")
# search by label text
search_results = client.blocks.list(label_search="organization")
```
</CodeGroup>
### Modifying a block
You can directly modify a block's content, limit, description, or other properties. This is particularly useful for:
- External scripts that provide up-to-date information to agents (e.g., syncing a text file to a block)
- Updating shared blocks that multiple agents reference
- Programmatically managing block content outside of agent interactions
<CodeGroup>
```typescript TypeScript
// update the block's value - completely replaces the content
await client.blocks.modify(block.id, {
value: "Updated organization information: Letta - Building agentic AI"
});
// update multiple properties
await client.blocks.modify(block.id, {
value: "New content",
limit: 6000,
description: "Updated description"
});
```
```python Python
# update the block's value - completely replaces the content
client.blocks.modify(
block.id,
value="Updated organization information: Letta - Building agentic AI"
)
# update multiple properties
client.blocks.modify(
block.id,
value="New content",
limit=6000,
description="Updated description"
)
```
</CodeGroup>
<Warning>
**Setting `value` completely replaces the entire block content** - it is not an append operation. If multiple processes (agents or external scripts) modify the same block concurrently, the last write wins and overwrites all earlier changes. To avoid data loss:
- Set blocks to **read-only** if you don't want agents to modify them
- Only modify blocks directly in controlled scenarios where overwriting is acceptable
- Ensure your application logic accounts for full replacements, not merges
</Warning>
### Deleting a block
You can delete a block when it's no longer needed. Note that deleting a block will remove it from all agents that have it attached.
<CodeGroup>
```typescript TypeScript
await client.blocks.delete(block.id);
```
```python Python
client.blocks.delete(block_id=block.id)
```
</CodeGroup>
### Inspecting block usage
See which agents have a block attached:
<CodeGroup>
```typescript TypeScript
// list all agents that use this block
const agentsWithBlock = await client.blocks.agents.list(block.id);
console.log(`Used by ${agentsWithBlock.length} agents:`);
for (const agent of agentsWithBlock) {
console.log(` - ${agent.name}`);
}
// with pagination
const agentsPage = await client.blocks.agents.list(block.id, {
limit: 10,
order: "asc"
});
```
```python Python
# list all agents that use this block
agents_with_block = client.blocks.agents.list(block_id=block.id)
print(f"Used by {len(agents_with_block)} agents:")
for agent in agents_with_block:
print(f" - {agent.name}")
# with pagination
agents_page = client.blocks.agents.list(
block_id=block.id,
limit=10,
order="asc"
)
```
</CodeGroup>
## Agent-scoped block operations
### Listing an agent's blocks
You can retrieve all blocks attached to a specific agent. This shows you the complete memory configuration for that agent.
<CodeGroup>
```typescript TypeScript
const agentBlocks = await client.agents.blocks.list(agent.id);
```
```python Python
agent_blocks = client.agents.blocks.list(agent_id=agent.id)
```
</CodeGroup>
### Retrieving an agent's block by label
Instead of using a block ID, you can retrieve a block from a specific agent using its label. This is useful when you want to inspect what the agent currently knows about a specific topic.
<CodeGroup>
```typescript TypeScript
// get the agent's current knowledge about the human
const humanBlock = await client.agents.blocks.retrieve(
agent.id,
"human"
);
console.log(humanBlock.value);
```
```python Python
# get the agent's current knowledge about the human
human_block = client.agents.blocks.retrieve(
agent_id=agent.id,
block_label="human"
)
print(human_block.value)
```
</CodeGroup>
### Modifying an agent's block
You can modify a block through the agent-scoped endpoint using the block's label. This is useful for updating agent-specific memory without needing to know the block ID.
<CodeGroup>
```typescript TypeScript
// update the agent's human block
await client.agents.blocks.modify(agent.id, "human", {
value: "The human's name is Alice. She prefers Python over TypeScript."
});
```
```python Python
# update the agent's human block
client.agents.blocks.modify(
agent_id=agent.id,
block_label="human",
value="The human's name is Alice. She prefers Python over TypeScript."
)
```
</CodeGroup>
### Detaching blocks from agents
You can detach a block from an agent's context window. This removes the block from the agent's memory without deleting the block itself.
<CodeGroup>
```typescript TypeScript
await client.agents.blocks.detach(agent.id, block.id);
```
```python Python
client.agents.blocks.detach(agent_id=agent.id, block_id=block.id)
```
</CodeGroup>

View File

@@ -1,459 +0,0 @@
---
title: Message Types
subtitle: Understanding message types and working with agent message history
slug: guides/agents/message-types
---
When you interact with a Letta agent and retrieve its message history using `client.agents.messages.list()`, you'll receive various types of messages that represent different aspects of the agent's execution. This guide explains all message types and how to work with them.
## Overview
Letta uses a structured message system where each message has a specific `message_type` field that indicates its purpose. Messages are returned as instances of `LettaMessageUnion`, which is a discriminated union of all possible message types.
## Message Type Categories
### User and System Messages
#### `user_message`
Messages sent by the user or system events packaged as user input.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "user_message";
content: string | Array<TextContent | ImageContent>;
name?: string;
otid?: string;
sender_id?: string;
}
```
**Special User Message Subtypes:**
User messages can contain JSON with a `type` field indicating special message subtypes:
- **`login`** - User login events
```json
{
"type": "login",
"last_login": "Never (first login)",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
- **`user_message`** - Standard user messages
```json
{
"type": "user_message",
"message": "Hello, agent!",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
- **`system_alert`** - System notifications and alerts
```json
{
"type": "system_alert",
"message": "System notification text",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
#### `system_message`
Messages generated by the system, typically used for internal context.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "system_message";
content: string;
name?: string;
}
```
**Note:** System messages are never streamed back in responses; they're only visible when paginating through message history.
### Agent Reasoning and Responses
#### `reasoning_message`
Represents the agent's internal reasoning or "chain of thought."
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "reasoning_message";
reasoning: string;
source: "reasoner_model" | "non_reasoner_model";
signature?: string;
}
```
**Fields:**
- `reasoning` - The agent's internal thought process
- `source` - Whether this was generated by a model with native reasoning (like o1) or via prompting
- `signature` - Optional cryptographic signature for reasoning verification (for models that support it)
#### `hidden_reasoning_message`
Represents reasoning that has been hidden from the response.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "hidden_reasoning_message";
state: "redacted" | "omitted";
hidden_reasoning?: string;
}
```
**Fields:**
- `state: "redacted"` - The provider redacted the reasoning content
- `state: "omitted"` - The API chose not to include reasoning (e.g., for o1/o3 models)
#### `assistant_message`
The actual message content sent by the agent.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "assistant_message";
content: string | Array<TextContent>;
name?: string;
}
```
### Tool Execution Messages
#### `tool_call_message`
A request from the agent to execute a tool.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "tool_call_message";
tool_call: {
name: string;
arguments: string; // JSON string
tool_call_id: string;
};
}
```
**Example:**
```typescript
{
message_type: "tool_call_message",
tool_call: {
name: "archival_memory_search",
arguments: '{"query": "user preferences", "page": 0}',
tool_call_id: "call_abc123"
}
}
```
#### `tool_return_message`
The result of a tool execution.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "tool_return_message";
tool_return: string;
status: "success" | "error";
tool_call_id: string;
stdout?: string[];
stderr?: string[];
}
```
**Fields:**
- `tool_return` - The formatted return value from the tool
- `status` - Whether the tool executed successfully
- `stdout`/`stderr` - Captured output from the tool execution (useful for debugging)
### Human-in-the-Loop Messages
#### `approval_request_message`
A request for human approval before executing a tool.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "approval_request_message";
tool_call: {
name: string;
arguments: string;
tool_call_id: string;
};
}
```
See [Human-in-the-Loop](/guides/agents/human-in-the-loop) for more information on this experimental feature.
#### `approval_response_message`
The user's response to an approval request.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "approval_response_message";
approve: boolean;
approval_request_id: string;
reason?: string;
}
```
## Working with Messages
### Listing Messages
<CodeGroup>
```typescript TypeScript
import { LettaClient } from "@letta-ai/letta-client";
const client = new LettaClient({
baseUrl: "https://api.letta.com",
});
// List recent messages
const messages = await client.agents.messages.list("agent-id", {
limit: 50,
useAssistantMessage: true,
});
// Iterate through message types
for (const message of messages) {
switch (message.messageType) {
case "user_message":
console.log("User:", message.content);
break;
case "assistant_message":
console.log("Agent:", message.content);
break;
case "reasoning_message":
console.log("Reasoning:", message.reasoning);
break;
case "tool_call_message":
console.log("Tool call:", message.toolCall.name);
break;
// ... handle other types
}
}
```
```python Python
from letta_client import Letta
client = Letta(base_url="https://api.letta.com")
# List recent messages
messages = client.agents.messages.list(
agent_id="agent-id",
limit=50,
use_assistant_message=True
)
# Iterate through message types
for message in messages:
if message.message_type == "user_message":
print(f"User: {message.content}")
elif message.message_type == "assistant_message":
print(f"Agent: {message.content}")
elif message.message_type == "reasoning_message":
print(f"Reasoning: {message.reasoning}")
elif message.message_type == "tool_call_message":
print(f"Tool call: {message.tool_call.name}")
# ... handle other types
```
</CodeGroup>
### Filtering Messages by Type
<CodeGroup>
```typescript TypeScript
// Get only assistant messages (what the agent said to the user)
const agentMessages = messages.filter(
(msg) => msg.messageType === "assistant_message"
);
// Get all tool-related messages
const toolMessages = messages.filter(
(msg) => msg.messageType === "tool_call_message" ||
msg.messageType === "tool_return_message"
);
// Get conversation history (user + assistant messages only)
const conversation = messages.filter(
(msg) => msg.messageType === "user_message" ||
msg.messageType === "assistant_message"
);
```
```python Python
# Get only assistant messages (what the agent said to the user)
agent_messages = [
msg for msg in messages
if msg.message_type == "assistant_message"
]
# Get all tool-related messages
tool_messages = [
msg for msg in messages
if msg.message_type in ["tool_call_message", "tool_return_message"]
]
# Get conversation history (user + assistant messages only)
conversation = [
msg for msg in messages
if msg.message_type in ["user_message", "assistant_message"]
]
```
</CodeGroup>
### Pagination
Messages support cursor-based pagination:
<CodeGroup>
```typescript TypeScript
// Get first page
let messages = await client.agents.messages.list("agent-id", {
limit: 100,
});
// Get next page using the last message ID
const lastMessageId = messages[messages.length - 1].id;
const nextPage = await client.agents.messages.list("agent-id", {
limit: 100,
before: lastMessageId,
});
```
```python Python
# Get first page
messages = client.agents.messages.list(
agent_id="agent-id",
limit=100
)
# Get next page using the last message ID
last_message_id = messages[-1].id
next_page = client.agents.messages.list(
agent_id="agent-id",
limit=100,
before=last_message_id
)
```
</CodeGroup>
## Message Metadata Fields
All message types include these common fields:
- **`id`** - Unique identifier for the message
- **`date`** - ISO 8601 timestamp of when the message was created
- **`message_type`** - The discriminator field identifying the message type
- **`name`** - Optional name field (varies by message type)
- **`otid`** - Offline threading ID for message correlation
- **`sender_id`** - The ID of the sender (identity or agent ID)
- **`step_id`** - The step ID associated with this message
- **`is_err`** - Whether this message is part of an error step (debugging only)
- **`seq_id`** - Sequence ID for ordering
- **`run_id`** - The run ID associated with this message
## Best Practices
### 1. Use Type Discriminators
Always check the `message_type` field to safely access type-specific fields:
<CodeGroup>
```typescript TypeScript
if (message.messageType === "tool_call_message") {
// TypeScript now knows message has a toolCall field
console.log(message.toolCall.name);
}
```
```python Python
if message.message_type == "tool_call_message":
# Safe to access tool_call
print(message.tool_call.name)
```
</CodeGroup>
### 2. Handle Special User Messages
When displaying conversations to end users, filter out internal messages:
```python
def is_internal_message(msg):
"""Check if a user message is internal (login, system_alert, etc.)"""
if msg.message_type != "user_message":
return False
if not isinstance(msg.content, str):
return False
try:
parsed = json.loads(msg.content)
return parsed.get("type") in ["login", "system_alert"]
except:
return False
# Get user-facing messages only
display_messages = [
msg for msg in messages
if not is_internal_message(msg)
]
```
### 3. Track Tool Execution
Match tool calls with their returns using `tool_call_id`:
```python
# Build a map of tool calls to their returns
tool_calls = {
msg.tool_call.tool_call_id: msg
for msg in messages
if msg.message_type == "tool_call_message"
}
tool_returns = {
msg.tool_call_id: msg
for msg in messages
if msg.message_type == "tool_return_message"
}
# Find failed tool calls
for call_id, call_msg in tool_calls.items():
if call_id in tool_returns:
return_msg = tool_returns[call_id]
if return_msg.status == "error":
print(f"Tool {call_msg.tool_call.name} failed:")
print(f" {return_msg.tool_return}")
```
## See Also
- [Human-in-the-Loop](/guides/agents/human-in-the-loop) - Using approval messages
- [Streaming Responses](/guides/agents/streaming) - Receiving messages in real-time
- [API Reference](/api-reference/agents/messages/list) - Full API documentation

View File

@@ -1,459 +0,0 @@
---
title: Message Types
subtitle: Understanding message types and working with agent message history
slug: guides/agents/message-types
---
When you interact with a Letta agent and retrieve its message history using `client.agents.messages.list()`, you'll receive various types of messages that represent different aspects of the agent's execution. This guide explains all message types and how to work with them.
## Overview
Letta uses a structured message system where each message has a specific `message_type` field that indicates its purpose. Messages are returned as instances of `LettaMessageUnion`, which is a discriminated union of all possible message types.
## Message Type Categories
### User and System Messages
#### `user_message`
Messages sent by the user or system events packaged as user input.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "user_message";
content: string | Array<TextContent | ImageContent>;
name?: string;
otid?: string;
sender_id?: string;
}
```
**Special User Message Subtypes:**
User messages can contain JSON with a `type` field indicating special message subtypes:
- **`login`** - User login events
```json
{
"type": "login",
"last_login": "Never (first login)",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
- **`user_message`** - Standard user messages
```json
{
"type": "user_message",
"message": "Hello, agent!",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
- **`system_alert`** - System notifications and alerts
```json
{
"type": "system_alert",
"message": "System notification text",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
#### `system_message`
Messages generated by the system, typically used for internal context.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "system_message";
content: string;
name?: string;
}
```
**Note:** System messages are never streamed back in responses; they're only visible when paginating through message history.
### Agent Reasoning and Responses
#### `reasoning_message`
Represents the agent's internal reasoning or "chain of thought."
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "reasoning_message";
reasoning: string;
source: "reasoner_model" | "non_reasoner_model";
signature?: string;
}
```
**Fields:**
- `reasoning` - The agent's internal thought process
- `source` - Whether this was generated by a model with native reasoning (like o1) or via prompting
- `signature` - Optional cryptographic signature for reasoning verification (for models that support it)
#### `hidden_reasoning_message`
Represents reasoning that has been hidden from the response.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "hidden_reasoning_message";
state: "redacted" | "omitted";
hidden_reasoning?: string;
}
```
**Fields:**
- `state: "redacted"` - The provider redacted the reasoning content
- `state: "omitted"` - The API chose not to include reasoning (e.g., for o1/o3 models)
#### `assistant_message`
The actual message content sent by the agent.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "assistant_message";
content: string | Array<TextContent>;
name?: string;
}
```
### Tool Execution Messages
#### `tool_call_message`
A request from the agent to execute a tool.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "tool_call_message";
tool_call: {
name: string;
arguments: string; // JSON string
tool_call_id: string;
};
}
```
**Example:**
```typescript
{
message_type: "tool_call_message",
tool_call: {
name: "archival_memory_search",
arguments: '{"query": "user preferences", "page": 0}',
tool_call_id: "call_abc123"
}
}
```
#### `tool_return_message`
The result of a tool execution.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "tool_return_message";
tool_return: string;
status: "success" | "error";
tool_call_id: string;
stdout?: string[];
stderr?: string[];
}
```
**Fields:**
- `tool_return` - The formatted return value from the tool
- `status` - Whether the tool executed successfully
- `stdout`/`stderr` - Captured output from the tool execution (useful for debugging)
### Human-in-the-Loop Messages
#### `approval_request_message`
A request for human approval before executing a tool.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "approval_request_message";
tool_call: {
name: string;
arguments: string;
tool_call_id: string;
};
}
```
See [Human-in-the-Loop](/guides/agents/human-in-the-loop) for more information on this experimental feature.
#### `approval_response_message`
The user's response to an approval request.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "approval_response_message";
approve: boolean;
approval_request_id: string;
reason?: string;
}
```
## Working with Messages
### Listing Messages
<CodeGroup>
```typescript TypeScript
import { LettaClient } from "@letta-ai/letta-client";
const client = new LettaClient({
baseUrl: "https://api.letta.com",
});
// List recent messages
const messages = await client.agents.messages.list("agent-id", {
limit: 50,
useAssistantMessage: true,
});
// Iterate through message types
for (const message of messages) {
switch (message.messageType) {
case "user_message":
console.log("User:", message.content);
break;
case "assistant_message":
console.log("Agent:", message.content);
break;
case "reasoning_message":
console.log("Reasoning:", message.reasoning);
break;
case "tool_call_message":
console.log("Tool call:", message.toolCall.name);
break;
// ... handle other types
}
}
```
```python Python
from letta_client import Letta
client = Letta(base_url="https://api.letta.com")
# List recent messages
messages = client.agents.messages.list(
agent_id="agent-id",
limit=50,
use_assistant_message=True
)
# Iterate through message types
for message in messages:
if message.message_type == "user_message":
print(f"User: {message.content}")
elif message.message_type == "assistant_message":
print(f"Agent: {message.content}")
elif message.message_type == "reasoning_message":
print(f"Reasoning: {message.reasoning}")
elif message.message_type == "tool_call_message":
print(f"Tool call: {message.tool_call.name}")
# ... handle other types
```
</CodeGroup>
### Filtering Messages by Type
<CodeGroup>
```typescript TypeScript
// Get only assistant messages (what the agent said to the user)
const agentMessages = messages.filter(
(msg) => msg.messageType === "assistant_message"
);
// Get all tool-related messages
const toolMessages = messages.filter(
(msg) => msg.messageType === "tool_call_message" ||
msg.messageType === "tool_return_message"
);
// Get conversation history (user + assistant messages only)
const conversation = messages.filter(
(msg) => msg.messageType === "user_message" ||
msg.messageType === "assistant_message"
);
```
```python Python
# Get only assistant messages (what the agent said to the user)
agent_messages = [
msg for msg in messages
if msg.message_type == "assistant_message"
]
# Get all tool-related messages
tool_messages = [
msg for msg in messages
if msg.message_type in ["tool_call_message", "tool_return_message"]
]
# Get conversation history (user + assistant messages only)
conversation = [
msg for msg in messages
if msg.message_type in ["user_message", "assistant_message"]
]
```
</CodeGroup>
### Pagination
Messages support cursor-based pagination:
<CodeGroup>
```typescript TypeScript
// Get first page
let messages = await client.agents.messages.list("agent-id", {
limit: 100,
});
// Get next page using the last message ID
const lastMessageId = messages[messages.length - 1].id;
const nextPage = await client.agents.messages.list("agent-id", {
limit: 100,
before: lastMessageId,
});
```
```python Python
# Get first page
messages = client.agents.messages.list(
agent_id="agent-id",
limit=100
)
# Get next page using the last message ID
last_message_id = messages[-1].id
next_page = client.agents.messages.list(
agent_id="agent-id",
limit=100,
before=last_message_id
)
```
</CodeGroup>
## Message Metadata Fields
All message types include these common fields:
- **`id`** - Unique identifier for the message
- **`date`** - ISO 8601 timestamp of when the message was created
- **`message_type`** - The discriminator field identifying the message type
- **`name`** - Optional name field (varies by message type)
- **`otid`** - Offline threading ID for message correlation
- **`sender_id`** - The ID of the sender (identity or agent ID)
- **`step_id`** - The step ID associated with this message
- **`is_err`** - Whether this message is part of an error step (debugging only)
- **`seq_id`** - Sequence ID for ordering
- **`run_id`** - The run ID associated with this message
## Best Practices
### 1. Use Type Discriminators
Always check the `message_type` field to safely access type-specific fields:
<CodeGroup>
```typescript TypeScript
if (message.messageType === "tool_call_message") {
// TypeScript now knows message has a toolCall field
console.log(message.toolCall.name);
}
```
```python Python
if message.message_type == "tool_call_message":
# Safe to access tool_call
print(message.tool_call.name)
```
</CodeGroup>
### 2. Handle Special User Messages
When displaying conversations to end users, filter out internal messages:
```python
def is_internal_message(msg):
"""Check if a user message is internal (heartbeat, login, etc.)"""
if msg.message_type != "user_message":
return False
if not isinstance(msg.content, str):
return False
try:
parsed = json.loads(msg.content)
return parsed.get("type") in ["heartbeat", "login", "system_alert"]
except:
return False
# Get user-facing messages only
display_messages = [
msg for msg in messages
if not is_internal_message(msg)
]
```
### 3. Track Tool Execution
Match tool calls with their returns using `tool_call_id`:
```python
# Build a map of tool calls to their returns
tool_calls = {
msg.tool_call.tool_call_id: msg
for msg in messages
if msg.message_type == "tool_call_message"
}
tool_returns = {
msg.tool_call_id: msg
for msg in messages
if msg.message_type == "tool_return_message"
}
# Find failed tool calls
for call_id, call_msg in tool_calls.items():
if call_id in tool_returns:
return_msg = tool_returns[call_id]
if return_msg.status == "error":
print(f"Tool {call_msg.tool_call.name} failed:")
print(f" {return_msg.tool_return}")
```
## See Also
- [Human-in-the-Loop](/guides/agents/human-in-the-loop) - Using approval messages
- [Streaming Responses](/guides/agents/streaming) - Receiving messages in real-time
- [API Reference](/api-reference/agents/messages/list) - Full API documentation

View File

@@ -1,459 +0,0 @@
---
title: Message Types
subtitle: Understanding message types and working with agent message history
slug: guides/agents/message-types
---
When you interact with a Letta agent and retrieve its message history using `client.agents.messages.list()`, you'll receive various types of messages that represent different aspects of the agent's execution. This guide explains all message types and how to work with them.
## Overview
Letta uses a structured message system where each message has a specific `message_type` field that indicates its purpose. Messages are returned as instances of `LettaMessageUnion`, which is a discriminated union of all possible message types.
## Message Type Categories
### User and System Messages
#### `user_message`
Messages sent by the user or system events packaged as user input.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "user_message";
content: string | Array<TextContent | ImageContent>;
name?: string;
otid?: string;
sender_id?: string;
}
```
**Special User Message Subtypes:**
User messages can contain JSON with a `type` field indicating special message subtypes:
- **`login`** - User login events
```json
{
"type": "login",
"last_login": "Never (first login)",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
- **`user_message`** - Standard user messages
```json
{
"type": "user_message",
"message": "Hello, agent!",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
- **`system_alert`** - System notifications and alerts
```json
{
"type": "system_alert",
"message": "System notification text",
"time": "2025-10-03 12:34:56 PM PDT-0700"
}
```
#### `system_message`
Messages generated by the system, typically used for internal context.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "system_message";
content: string;
name?: string;
}
```
**Note:** System messages are never streamed back in responses; they're only visible when paginating through message history.
### Agent Reasoning and Responses
#### `reasoning_message`
Represents the agent's internal reasoning or "chain of thought."
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "reasoning_message";
reasoning: string;
source: "reasoner_model" | "non_reasoner_model";
signature?: string;
}
```
**Fields:**
- `reasoning` - The agent's internal thought process
- `source` - Whether this was generated by a model with native reasoning (like o1) or via prompting
- `signature` - Optional cryptographic signature for reasoning verification (for models that support it)
#### `hidden_reasoning_message`
Represents reasoning that has been hidden from the response.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "hidden_reasoning_message";
state: "redacted" | "omitted";
hidden_reasoning?: string;
}
```
**Fields:**
- `state: "redacted"` - The provider redacted the reasoning content
- `state: "omitted"` - The API chose not to include reasoning (e.g., for o1/o3 models)
#### `assistant_message`
The actual message content sent by the agent.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "assistant_message";
content: string | Array<TextContent>;
name?: string;
}
```
### Tool Execution Messages
#### `tool_call_message`
A request from the agent to execute a tool.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "tool_call_message";
tool_call: {
name: string;
arguments: string; // JSON string
tool_call_id: string;
};
}
```
**Example:**
```typescript
{
message_type: "tool_call_message",
tool_call: {
name: "archival_memory_search",
arguments: '{"query": "user preferences", "page": 0}',
tool_call_id: "call_abc123"
}
}
```
#### `tool_return_message`
The result of a tool execution.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "tool_return_message";
tool_return: string;
status: "success" | "error";
tool_call_id: string;
stdout?: string[];
stderr?: string[];
}
```
**Fields:**
- `tool_return` - The formatted return value from the tool
- `status` - Whether the tool executed successfully
- `stdout`/`stderr` - Captured output from the tool execution (useful for debugging)
### Human-in-the-Loop Messages
#### `approval_request_message`
A request for human approval before executing a tool.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "approval_request_message";
tool_call: {
name: string;
arguments: string;
tool_call_id: string;
};
}
```
See [Human-in-the-Loop](/guides/agents/human_in_the_loop) for more information on this experimental feature.
#### `approval_response_message`
The user's response to an approval request.
**Structure:**
```typescript
{
id: string;
date: datetime;
message_type: "approval_response_message";
approve: boolean;
approval_request_id: string;
reason?: string;
}
```
## Working with Messages
### Listing Messages
<CodeGroup>
```typescript TypeScript
import { LettaClient } from "@letta-ai/letta-client";
const client = new LettaClient({
baseUrl: "https://api.letta.com",
});
// List recent messages
const messages = await client.agents.messages.list("agent-id", {
limit: 50,
useAssistantMessage: true,
});
// Iterate through message types
for (const message of messages) {
switch (message.messageType) {
case "user_message":
console.log("User:", message.content);
break;
case "assistant_message":
console.log("Agent:", message.content);
break;
case "reasoning_message":
console.log("Reasoning:", message.reasoning);
break;
case "tool_call_message":
console.log("Tool call:", message.toolCall.name);
break;
// ... handle other types
}
}
```
```python Python
from letta_client import Letta
client = Letta(base_url="https://api.letta.com")
# List recent messages
messages = client.agents.messages.list(
agent_id="agent-id",
limit=50,
use_assistant_message=True
)
# Iterate through message types
for message in messages:
if message.message_type == "user_message":
print(f"User: {message.content}")
elif message.message_type == "assistant_message":
print(f"Agent: {message.content}")
elif message.message_type == "reasoning_message":
print(f"Reasoning: {message.reasoning}")
elif message.message_type == "tool_call_message":
print(f"Tool call: {message.tool_call.name}")
# ... handle other types
```
</CodeGroup>
### Filtering Messages by Type
<CodeGroup>
```typescript TypeScript
// Get only assistant messages (what the agent said to the user)
const agentMessages = messages.filter(
(msg) => msg.messageType === "assistant_message"
);
// Get all tool-related messages
const toolMessages = messages.filter(
(msg) => msg.messageType === "tool_call_message" ||
msg.messageType === "tool_return_message"
);
// Get conversation history (user + assistant messages only)
const conversation = messages.filter(
(msg) => msg.messageType === "user_message" ||
msg.messageType === "assistant_message"
);
```
```python Python
# Get only assistant messages (what the agent said to the user)
agent_messages = [
msg for msg in messages
if msg.message_type == "assistant_message"
]
# Get all tool-related messages
tool_messages = [
msg for msg in messages
if msg.message_type in ["tool_call_message", "tool_return_message"]
]
# Get conversation history (user + assistant messages only)
conversation = [
msg for msg in messages
if msg.message_type in ["user_message", "assistant_message"]
]
```
</CodeGroup>
### Pagination
Messages support cursor-based pagination:
<CodeGroup>
```typescript TypeScript
// Get first page
let messages = await client.agents.messages.list("agent-id", {
limit: 100,
});
// Get next page using the last message ID
const lastMessageId = messages[messages.length - 1].id;
const nextPage = await client.agents.messages.list("agent-id", {
limit: 100,
before: lastMessageId,
});
```
```python Python
# Get first page
messages = client.agents.messages.list(
agent_id="agent-id",
limit=100
)
# Get next page using the last message ID
last_message_id = messages[-1].id
next_page = client.agents.messages.list(
agent_id="agent-id",
limit=100,
before=last_message_id
)
```
</CodeGroup>
## Message Metadata Fields
All message types include these common fields:
- **`id`** - Unique identifier for the message
- **`date`** - ISO 8601 timestamp of when the message was created
- **`message_type`** - The discriminator field identifying the message type
- **`name`** - Optional name field (varies by message type)
- **`otid`** - Offline threading ID for message correlation
- **`sender_id`** - The ID of the sender (identity or agent ID)
- **`step_id`** - The step ID associated with this message
- **`is_err`** - Whether this message is part of an error step (debugging only)
- **`seq_id`** - Sequence ID for ordering
- **`run_id`** - The run ID associated with this message
## Best Practices
### 1. Use Type Discriminators
Always check the `message_type` field to safely access type-specific fields:
<CodeGroup>
```typescript TypeScript
if (message.messageType === "tool_call_message") {
// TypeScript now knows message has a toolCall field
console.log(message.toolCall.name);
}
```
```python Python
if message.message_type == "tool_call_message":
# Safe to access tool_call
print(message.tool_call.name)
```
</CodeGroup>
### 2. Handle Special User Messages
When displaying conversations to end users, filter out internal messages:
```python
def is_internal_message(msg):
"""Check if a user message is internal (heartbeat, login, etc.)"""
if msg.message_type != "user_message":
return False
if not isinstance(msg.content, str):
return False
try:
parsed = json.loads(msg.content)
        return parsed.get("type") in ["heartbeat", "login", "system_alert"]
except:
return False
# Get user-facing messages only
display_messages = [
msg for msg in messages
if not is_internal_message(msg)
]
```
### 3. Track Tool Execution
Match tool calls with their returns using `tool_call_id`:
```python
# Build a map of tool calls to their returns
tool_calls = {
msg.tool_call.tool_call_id: msg
for msg in messages
if msg.message_type == "tool_call_message"
}
tool_returns = {
msg.tool_call_id: msg
for msg in messages
if msg.message_type == "tool_return_message"
}
# Find failed tool calls
for call_id, call_msg in tool_calls.items():
if call_id in tool_returns:
return_msg = tool_returns[call_id]
if return_msg.status == "error":
print(f"Tool {call_msg.tool_call.name} failed:")
print(f" {return_msg.tool_return}")
```
## See Also
- [Human-in-the-Loop](/guides/agents/human_in_the_loop) - Using approval messages
- [Streaming Responses](/guides/agents/streaming) - Receiving messages in real-time
- [API Reference](/api-reference/agents/messages/list) - Full API documentation

View File

@@ -1,120 +0,0 @@
---
title: Multi-Agent Systems
slug: guides/agents/multi-agent
---
<Tip>
All agents in Letta are *stateful* - so when you build a multi-agent system in Letta, each agent can run both independently and with others via cross-agent messaging tools! The choice is yours.
</Tip>
Letta provides built-in tools for supporting cross-agent communication to build multi-agent systems.
To enable multi-agent collaboration, you should create agents that have access to the [built-in cross-agent communication tools](#built-in-multi-agent-tools) - either by attaching the tools in the ADE, or via the API or Python/TypeScript SDK.
Letta agents can also share state via [shared memory blocks](/guides/agents/multi-agent-shared-memory). Shared memory blocks allow agents to have shared memory (e.g. memory about an organization they are both a part of or a task they are both working on).
## Built-in Multi-Agent Tools
<Tip>
We recommend only attaching one of `send_message_to_agent_and_wait_for_reply` or `send_message_to_agent_async`, but not both.
Attaching both tools can cause the agent to become confused and use the tool less reliably.
</Tip>
Our built-in tools for multi-agent communication can be used to create both **synchronous** and **asynchronous** communication networks between agents on your Letta server.
However, because all agents in Letta are addressable via a REST API, you can also make your own custom tools that use the [API for messaging agents](/api-reference/agents/messages/create) to design your own version of agent-to-agent communication.
There are three built-in tools for cross-agent communication:
* `send_message_to_agent_async` for asynchronous multi-agent messaging,
* `send_message_to_agent_and_wait_for_reply` for synchronous multi-agent messaging,
* and `send_message_to_agents_matching_all_tags` for a "supervisor-worker" pattern
### Messaging another agent (async / no wait)
<CodeGroup>
```typescript TypeScript
// The function signature for the async multi-agent messaging tool
function sendMessageToAgentAsync(
message: string,
otherAgentId: string
): string
```
```python Python
# The function signature for the async multi-agent messaging tool
def send_message_to_agent_async(
message: str,
other_agent_id: str,
) -> str:
```
</CodeGroup>
```mermaid
sequenceDiagram
autonumber
Agent 1->>Agent 2: "Hi Agent 2 are you there?"
Agent 2-->>Agent 1: "Your message has been delivered."
Note over Agent 2: Processes message: "New message from Agent 1: ..."
Agent 2->>Agent 1: "Hi Agent 1, yes I'm here!"
Agent 1-->>Agent 2: "Your message has been delivered."
```
The `send_message_to_agent_async` tool allows one agent to send a message to another agent.
This tool is **asynchronous**: instead of waiting for a response from the target agent, the agent will return immediately after sending the message.
The message that is sent to the target agent contains a "message receipt", indicating which agent sent the message, which allows the target agent to reply to the sender (assuming they also have access to the `send_message_to_agent_async` tool).
### Messaging another agent (wait for reply)
<CodeGroup>
```typescript TypeScript
// The function signature for the synchronous multi-agent messaging tool
function sendMessageToAgentAndWaitForReply(
message: string,
otherAgentId: string
): string
```
```python Python
# The function signature for the synchronous multi-agent messaging tool
def send_message_to_agent_and_wait_for_reply(
message: str,
other_agent_id: str,
) -> str:
```
</CodeGroup>
```mermaid
sequenceDiagram
autonumber
Agent 1->>Agent 2: "Hi Agent 2 are you there?"
Note over Agent 2: Processes message: "New message from Agent 1: ..."
Agent 2->>Agent 1: "Hi Agent 1, yes I'm here!"
```
The `send_message_to_agent_and_wait_for_reply` tool also allows one agent to send a message to another agent.
However, this tool is **synchronous**: the agent will wait for a response from the target agent before returning.
The response of the target agent is returned in the tool output - if the target agent does not respond, the tool will return a default message indicating no response was received.
### Messaging a group of agents (supervisor-worker pattern)
<CodeGroup>
```typescript TypeScript
// The function signature for the group broadcast multi-agent messaging tool
function sendMessageToAgentsMatchingAllTags(
message: string,
tags: string[]
): string[]
```
```python Python
# The function signature for the group broadcast multi-agent messaging tool
def send_message_to_agents_matching_all_tags(
message: str,
tags: List[str],
) -> List[str]:
```
</CodeGroup>
```mermaid
sequenceDiagram
autonumber
Supervisor->>Worker 1: "Let's start the task"
Supervisor->>Worker 2: "Let's start the task"
Supervisor->>Worker 3: "Let's start the task"
Note over Worker 1,Worker 3: All workers process their tasks
Worker 1->>Supervisor: "Here's my result!"
Worker 2->>Supervisor: "This is what I have"
Worker 3->>Supervisor: "I didn't do anything..."
```
The `send_message_to_agents_matching_all_tags` tool allows one agent to send a message to a larger group of agents in a "supervisor-worker" pattern.
For example, a supervisor agent can use this tool to send a message asking all workers in a group to begin a task.
This tool is also **synchronous**, so the result of the tool call will be a list of the responses from each agent in the group.

View File

@@ -1,163 +0,0 @@
---
title: "Multi-modal (image inputs)"
subtitle: "Send images to your agents"
slug: "multimodal"
---
<Note>
Multi-modal features require compatible language models. Ensure your agent is configured with a multi-modal capable model.
</Note>
Letta agents support image inputs, enabling richer conversations and more powerful agent capabilities.
## Model Support
Multi-modal capabilities depend on the underlying language model.
You can check which models from the API providers support image inputs by checking their individual model pages:
- **[OpenAI](https://platform.openai.com/docs/models)**: GPT-4.1, o1/3/4, GPT-4o
- **[Anthropic](https://docs.anthropic.com/en/docs/about-claude/models/overview)**: Claude Opus 4, Claude Sonnet 4
- **[Gemini](https://ai.google.dev/gemini-api/docs/models)**: Gemini 2.5 Pro, Gemini 2.5 Flash
If the provider you're using doesn't support image inputs, your images will still appear in the context window, but as a text message telling the agent that an image exists.
## ADE Support
You can pass images to your agents by drag-and-dropping them into the chat window, or clicking the image icon to select a manual file upload.
<img className="light" src="/images/ade-mm.png" />
<img className="dark" src="/images/ade-mm-dark.png" />
## Usage Examples (SDK)
### Sending an Image via URL
<CodeGroup>
```typescript TypeScript maxLines=100
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: "LETTA_API_KEY" });
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{
role: "user",
content: [
{
type: "text",
text: "Describe this image."
},
{
type: "image",
source: {
type: "url",
url: "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg",
},
}
],
}
],
}
);
```
```python title="python" maxLines=100
from letta_client import Letta
client = Letta(token="LETTA_API_KEY")
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image."
},
{
"type": "image",
"source": {
"type": "url",
"url": "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg",
},
}
],
}
],
)
```
</CodeGroup>
### Sending an Image via Base64
<CodeGroup>
```typescript TypeScript maxLines=100
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: "LETTA_API_KEY" });
const imageUrl = "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg";
const imageResponse = await fetch(imageUrl);
const imageBuffer = await imageResponse.arrayBuffer();
const imageData = Buffer.from(imageBuffer).toString('base64');
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{
role: "user",
content: [
{
type: "text",
text: "Describe this image."
},
{
type: "image",
source: {
type: "base64",
mediaType: "image/jpeg",
data: imageData,
},
}
],
}
],
}
);
```
```python title="python" maxLines=100
import base64
import httpx
from letta_client import Letta
client = Letta(token="LETTA_API_KEY")
image_url = "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"
image_data = base64.standard_b64encode(httpx.get(image_url).content).decode("utf-8")
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image."
},
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/jpeg",
"data": image_data,
},
}
],
}
],
)
```
</CodeGroup>

View File

@@ -1,177 +0,0 @@
---
title: User Identities
slug: guides/agents/multi-user
---
You may be building a multi-user application with Letta, in which each user is associated with a specific agent.
In this scenario, you can use **Identities** to associate each agent with a user in your application.
## Using Identities
Let's assume that you have an application with multiple users that you're building on a [self-hosted Letta server](/guides/server/docker) or [Letta Cloud](/guides/cloud).
Each user has a unique username, starting at `user_1`, and incrementing up as you add more users to the platform.
To associate agents you create in Letta with your users, you can first create an **Identity** object with the user's unique ID as the `identifier_key` for your user, and then specify the **Identity** object ID when creating an agent.
For example, with `user_1`, we would create a new Identity object with `identifier_key="user_1"` and then pass `identity.id` into our [create agent request](/api-reference/agents/create):
<CodeBlocks>
```curl title="curl"
curl -X POST https://app.letta.com/v1/identities/ \
-H "Authorization: Bearer <token>" \
-H "Content-Type: application/json" \
-d '{
"identifier_key": "user_1",
"name": "Caren",
"identity_type": "user"
}'
{"id":"identity-634d3994-5d6c-46e9-b56b-56e34fe34ca0","identifier_key":"user_1","name":"Caren","identity_type":"user","project_id":null,"agent_ids":[],"organization_id":"org-00000000-0000-4000-8000-000000000000","properties":[]}
curl -X POST https://app.letta.com/v1/agents/ \
-H "Authorization: Bearer <token>" \
-H "Content-Type: application/json" \
-d '{
"memory_blocks": [],
"llm": "anthropic/claude-3-5-sonnet-20241022",
"context_window_limit": 200000,
"embedding": "openai/text-embedding-3-small",
"identity_ids": ["identity-634d3994-5d6c-46e9-b56b-56e34fe34ca0"]
}'
```
```python title="python"
# assumes that you already instantiated a client
identity = client.identities.create(
identifier_key="user_1",
name="Caren",
identity_type="user"
)
agent = client.agents.create(
memory_blocks=[],
model="anthropic/claude-3-5-sonnet-20241022",
context_window_limit=200000,
identity_ids=[identity.id]
)
```
```typescript TypeScript
// assumes that you already instantiated a client
const identity = await client.identities.create({
identifierKey: "user_1",
name: "Caren",
identityType: "user"
})
const agent = await client.agents.create({
memoryBlocks: [],
model: "anthropic/claude-3-5-sonnet-20241022",
contextWindowLimit: 200000,
identityIds: [identity.id]
});
```
</CodeBlocks>
Then, if I wanted to search for agents associated with a specific user (e.g. called `user_id`), I could use the `identifier_keys` parameter in the [list agents request](/api-reference/agents/list):
<CodeBlocks>
```curl title="curl"
curl -X GET "https://app.letta.com/v1/agents/?identifier_keys=user_1" \
-H "Accept: application/json"
```
```python title="python"
# assumes that you already instantiated a client
user_agents = client.agents.list(
identifier_keys=["user_1"]
)
```
```typescript TypeScript
// assumes that you already instantiated a client
await client.agents.list({
identifierKeys: ["user_1"]
});
```
</CodeBlocks>
You can also create an identity object and attach it to an existing agent. This can be useful if you want to enable multiple users to interact with a single agent:
<CodeBlocks>
```curl title="curl"
curl -X POST https://app.letta.com/v1/identities/ \
-H "Authorization: Bearer <token>" \
-H "Content-Type: application/json" \
-d '{
"identifier_key": "user_1",
"name": "Sarah",
    "identity_type": "user",
"agent_ids": ["agent-00000000-0000-4000-8000-000000000000"]
}'
```
```python title="python"
# assumes that you already instantiated a client
identity = client.identities.create(
    identifier_key="user_1",
    name="Sarah",
    identity_type="user",
    agent_ids=["agent-00000000-0000-4000-8000-000000000000"]
)
```
```typescript TypeScript
// assumes that you already instantiated a client
const identity = await client.identities.create({
identifierKey: "user_1",
name: "Sarah",
    identityType: "user",
agentIds: ["agent-00000000-0000-4000-8000-000000000000"]
})
```
</CodeBlocks>
### Using Agent Tags to Identify Users
It's also possible to utilize our agent tags feature to associate agents with specific users. To associate agents you create in Letta with your users, you can specify a tag when creating an agent, and set the tag to the user's unique ID.
This example assumes that you have a self-hosted Letta server running on localhost (for example, by running [`docker run ...`](/guides/server/docker)).
<Accordion title="View example SDK code">
<CodeGroup>
```typescript TypeScript
import { LettaClient } from '@letta-ai/letta-client';
// Connect to Letta Cloud
const client = new LettaClient({token: process.env.LETTA_API_KEY});
const userId = "my_uuid";
// create an agent with the userId tag
const agent = await client.agents.create({
memoryBlocks: [],
model: "anthropic/claude-3-5-sonnet-20241022",
contextWindowLimit: 200000,
tags: [userId]
});
console.log(`Created agent with id ${agent.id}, tags ${agent.tags}`);
// list agents
const userAgents = await client.agents.list({tags: [userId]});
const agentIds = userAgents.map(agent => agent.id);
console.log(`Found matching agents ${agentIds}`);
```
```python Python
from letta_client import Letta
# Connect to Letta Cloud
import os
client = Letta(token=os.getenv("LETTA_API_KEY"))
user_id = "my_uuid"
# create an agent with the user_id tag
agent = client.agents.create(
memory_blocks=[],
model="anthropic/claude-3-5-sonnet-20241022",
context_window_limit=200000,
tags=[user_id]
)
print(f"Created agent with id {agent.id}, tags {agent.tags}")
# list agents
user_agents = client.agents.list(tags=[user_id])
agent_ids = [agent.id for agent in user_agents]
print(f"Found matching agents {agent_ids}")
```
</CodeGroup>
</Accordion>
## Creating and Viewing Tags in the ADE
You can also modify tags in the ADE.
Simply click the **Advanced Settings** tab in the top-left of the ADE to view an agent's tags.
You can create new tags by typing the tag name in the input field and hitting enter.
<img src="../../images/tags.png" />

View File

@@ -1,277 +0,0 @@
---
title: Building Stateful Agents with Letta
slug: guides/agents/overview
---
<Info>
**New to Letta?** If you haven't already, read [Core Concepts](/core-concepts) to understand how Letta's stateful agents are fundamentally different from traditional LLM APIs.
</Info>
Letta agents can automatically manage long-term memory, load data from external sources, and call custom tools.
Unlike in other frameworks, Letta agents are stateful, so they keep track of historical interactions and reserve part of their context to read and write memories which evolve over time.
<img className="light" src="/images/stateful_agents.png" />
<img className="dark" src="/images/stateful_agents_dark.png" />
Letta manages a reasoning loop for agents. At each agent step (i.e. iteration of the loop), the state of the agent is checkpointed and persisted to the database.
You can interact with agents from a REST API, the ADE, and TypeScript / Python SDKs.
As long as they are connected to the same service, all of these interfaces can be used to interact with the same agents.
<Tip>
If you're interested in learning more about stateful agents, read our [blog post](https://www.letta.com/blog/stateful-agents).
</Tip>
## Agents vs Threads
In Letta, you can think of an agent as a single entity that has a single message history which is treated as infinite.
The sequence of interactions the agent has experienced through its existence make up the agent's state (or memory).
One distinction between Letta and other agent frameworks is that Letta does not have the notion of message *threads* (or *sessions*).
Instead, there are only *stateful agents*, which have a single perpetual thread (sequence of messages).
The reason we use the term *agent* rather than *thread* is because Letta is based on the principle that **all agents interactions should be part of the persistent memory**, as opposed to building agent applications around ephemeral, short-lived interactions (like a thread or session).
```mermaid
%%{init: {'flowchart': {'rankDir': 'LR'}}}%%
flowchart LR
subgraph Traditional["Thread-Based Agents"]
direction TB
llm1[LLM] --> thread1["Thread 1
--------
Ephemeral
Session"]
llm1 --> thread2["Thread 2
--------
Ephemeral
Session"]
llm1 --> thread3["Thread 3
--------
Ephemeral
Session"]
end
Traditional ~~~ Letta
subgraph Letta["Letta Stateful Agents"]
direction TB
llm2[LLM] --> agent["Single Agent
--------
Persistent Memory"]
agent --> db[(PostgreSQL)]
db -->|"Learn & Update"| agent
end
class thread1,thread2,thread3 session
class agent agent
```
If you would like to create common starting points for new conversation "threads", we recommend using [agent templates](/guides/templates/overview) to create new agents for each conversation, or directly copying agent state from an existing agent.
For multi-user applications, we recommend creating an agent per-user, though you can also have multiple users message a single agent (but it will be a single shared message history).
## Create an agent
<Note>
To start creating agents with Letta Cloud, [create an API key](https://app.letta.com/api-keys) and set it as `LETTA_API_KEY` in your environment. For self-hosted deployments, see our [self-hosting guide](/guides/selfhosting/overview).
</Note>
You can create a new agent via the REST API, Python SDK, or TypeScript SDK:
<CodeGroup>
```curl curl
curl -X POST https://api.letta.com/v1/agents \
-H "Authorization: Bearer $LETTA_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"memory_blocks": [
{
"value": "The human'\''s name is Bob the Builder.",
"label": "human"
},
{
"value": "My name is Sam, the all-knowing sentient AI.",
"label": "persona"
}
],
"model": "openai/gpt-4o-mini",
"context_window_limit": 16000
}'
```
```python title="python" maxLines=50
# install letta_client with `pip install letta-client`
from letta_client import Letta
import os
# create a client connected to Letta Cloud (uses api.letta.com by default)
client = Letta(token=os.getenv("LETTA_API_KEY"))
# create an agent with two basic self-editing memory blocks
agent_state = client.agents.create(
memory_blocks=[
{
"label": "human",
"value": "The human's name is Bob the Builder."
},
{
"label": "persona",
"value": "My name is Sam, the all-knowing sentient AI."
}
],
model="openai/gpt-4o-mini",
context_window_limit=16000
)
# the AgentState object contains all the information about the agent
print(agent_state)
```
```typescript TypeScript maxLines=50
// install letta-client with `npm install @letta-ai/letta-client`
import { LettaClient } from '@letta-ai/letta-client'
// create a client connected to Letta Cloud (uses api.letta.com by default)
const client = new LettaClient({
token: process.env.LETTA_API_KEY
});
// create an agent with two basic self-editing memory blocks
const agentState = await client.agents.create({
memoryBlocks: [
{
label: "human",
value: "The human's name is Bob the Builder."
},
{
label: "persona",
value: "My name is Sam, the all-knowing sentient AI."
}
],
model: "openai/gpt-4o-mini",
contextWindowLimit: 16000
});
// the AgentState object contains all the information about the agent
console.log(agentState);
```
</CodeGroup>
You can also create an agent without any code using the [Agent Development Environment (ADE)](/agent-development-environment).
All Letta agents are stored in a database on the Letta server, so you can access the same agents from the ADE, the REST API, the Python SDK, and the TypeScript SDK.
The response will include information about the agent, including its `id`:
```json
{
"id": "agent-43f8e098-1021-4545-9395-446f788d7389",
"name": "GracefulFirefly",
...
}
```
Once an agent is created, you can message it:
<CodeGroup>
```curl curl
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [
{
"role": "user",
"content": "hows it going????"
}
]
}'
```
```python title="python" maxLines=50
# send a message to the agent
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "hows it going????"
}
]
)
# the response object contains the messages and usage statistics
print(response)
# if we want to print the usage stats
print(response.usage)
# if we want to print the messages
for message in response.messages:
print(message)
```
```typescript TypeScript maxLines=50
// send a message to the agent
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{
role: "user",
content: "hows it going????"
}
]
}
);
// the response object contains the messages and usage statistics
console.log(response);
// if we want to print the usage stats
console.log(response.usage)
// if we want to print the messages
for (const message of response.messages) {
console.log(message);
}
```
</CodeGroup>
### Message Types
The `response` object contains the following attributes:
* `usage`: The usage of the agent after the message was sent (the prompt tokens, completion tokens, and total tokens)
* `messages`: A list of `LettaMessage` objects, generated by the agent
#### `LettaMessage`
The `LettaMessage` object is a simplified version of the `Message` object stored in the database backend.
Since a `Message` can include multiple events like a chain-of-thought and function calls, `LettaMessage` simplifies messages to have the following types:
* `reasoning_message`: The inner monologue (chain-of-thought) of the agent
* `tool_call_message`: An agent's tool (function) call
* `tool_call_return`: The result of executing an agent's tool (function) call
* `assistant_message`: An agent's response message (direct response in current architecture, or `send_message` tool call in legacy architectures)
* `system_message`: A system message (for example, an alert about the user logging in)
* `user_message`: A user message
<Note>
In current Letta agents, `assistant_message` represents the agent's direct response. In legacy architectures (`memgpt_agent`, `memgpt_v2_agent`), it wraps the `send_message` tool call.
If you prefer to see the raw tool call format in legacy agents, you can set `use_assistant_message` to `false` in the request `config` (see the [endpoint documentation](/api-reference/agents/messages/create)).
</Note>
## Common agent operations
For more in-depth guide on the full set of Letta agent operations, check out our [API reference](/api-reference/overview), our extended [Python SDK](https://github.com/letta-ai/letta/blob/main/examples/docs/example.py) and [TypeScript SDK](https://github.com/letta-ai/letta/blob/main/examples/docs/node/example.ts) examples, as well as our other [cookbooks](/cookbooks).
If you're using a self-hosted Letta server, you should set the **base URL** (`base_url` in Python, `baseUrl` in TypeScript) to the Letta server's URL (e.g. `http://localhost:8283`) when you create your client. See an example [here](/api-reference/overview).
If you're using a self-hosted server, you can omit the token if you're not using [password protection](/guides/server/docker#password-protection-advanced).
If you are using password protection, set your **token** to the **password**.
If you're using Letta Cloud, you should set the **token** to your **Letta Cloud API key**.
### Retrieving an agent's state
The agent's state is always persisted, so you can retrieve an agent's state by its ID.
<EndpointRequestSnippet endpoint="GET /v1/agents/:agent_id" />
The result of the call is an `AgentState` object:
<EndpointResponseSnippet endpoint="GET /v1/agents/:agent_id" />
### List agents
Replace `agent_id` with your actual agent ID.
<EndpointRequestSnippet endpoint="GET /v1/agents/" />
The result of the call is a list of `AgentState` objects:
<EndpointResponseSnippet endpoint="GET /v1/agents/" />
### Delete an agent
To delete an agent, you can use the `DELETE` endpoint with your `agent_id`:
<EndpointRequestSnippet endpoint="DELETE /v1/agents/:agent_id" />

View File

@@ -1,102 +0,0 @@
---
title: Parallel Tool Calling
slug: guides/agents/parallel-tool-calling
---
When an agent calls multiple tools, Letta can execute them concurrently instead of sequentially.
Parallel tool calling has two configuration levels:
- **Agent LLM config**: Controls whether the LLM can request multiple tool calls at once
- **Individual tool settings**: Controls whether requested tools actually execute in parallel or sequentially
## Model Support
Parallel tool calling is supported for OpenAI and Anthropic models.
## Enabling Parallel Tool Calling
### Agent Configuration
Set `parallel_tool_calls: true` in the agent's LLM config:
<CodeGroup>
```typescript TypeScript
const agent = await client.agents.create({
llm_config: {
model: "anthropic/claude-sonnet-4-20250514",
parallel_tool_calls: true
}
});
```
```python Python
agent = client.agents.create(
llm_config={
"model": "anthropic/claude-sonnet-4-20250514",
"parallel_tool_calls": True
}
)
```
</CodeGroup>
### Tool Configuration
Individual tools must opt-in to parallel execution:
<CodeGroup>
```typescript TypeScript
await client.tools.update(toolId, {
enable_parallel_execution: true
});
```
```python Python
client.tools.update(
tool_id=tool_id,
enable_parallel_execution=True
)
```
</CodeGroup>
By default, tools execute sequentially (`enable_parallel_execution=False`).
<Warning>
Only enable parallel execution for tools safe to run concurrently. Tools that modify shared state or have ordering dependencies should remain sequential.
</Warning>
## ADE Configuration
### Agent Toggle
1. Open **Settings** → **LLM Config**
2. Enable **"Parallel tool calls"**
### Tool Toggle
1. Open the **Tools** panel
2. Click a tool to open it
3. Go to the **Settings** tab
4. Enable **"Enable parallel execution"**
## Execution Behavior
When the agent calls multiple tools:
- Sequential tools execute one-by-one
- Parallel-enabled tools execute concurrently
- Mixed: sequential tools complete first, then parallel tools execute together
Example:
```
Agent calls:
- search_web (parallel: true)
- search_database (parallel: true)
- send_message (parallel: false)
Execution:
1. send_message executes
2. search_web AND search_database execute concurrently
```
## Limitations
- Parallel execution is automatically disabled when [tool rules](/guides/agents/tool-rules) are configured
- Only enable for tools safe to run concurrently (e.g., read-only operations)
- Tools that modify shared state should remain sequential

View File

@@ -1,253 +0,0 @@
---
title: Code Interpreter
subtitle: Execute code in a secure sandbox with full network access
slug: guides/agents/run-code
---
The `run_code` tool enables Letta agents to execute code in a secure sandboxed environment. Useful for data analysis, calculations, API calls, and programmatic computation.
<Info>
On [Letta Cloud](/guides/cloud/overview), this tool works out of the box. For self-hosted deployments, you'll need to [configure an E2B API key](#self-hosted-setup).
</Info>
<Warning>
Each execution runs in a **fresh environment** - variables, files, and state do not persist between runs.
</Warning>
## Quick Start
<CodeGroup>
```python Python
from letta import Letta
client = Letta(token="LETTA_API_KEY")
agent = client.agents.create(
model="openai/gpt-4o",
tools=["run_code"],
memory_blocks=[{
"label": "persona",
"value": "I can run Python code for data analysis and API calls."
}]
)
```
```typescript TypeScript
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: "LETTA_API_KEY" });
const agent = await client.agents.create({
model: "openai/gpt-4o",
tools: ["run_code"],
memoryBlocks: [{
label: "persona",
value: "I can run Python code for data analysis and API calls."
}]
});
```
</CodeGroup>
## Tool Parameters
| Parameter | Type | Options | Description |
|-----------|------|---------|-------------|
| `code` | `str` | Required | The code to execute |
| `language` | `str` | `python`, `js`, `ts`, `r`, `java` | Programming language |
## Return Format
```json
{
"results": ["Last expression value"],
"logs": {
"stdout": ["Print statements"],
"stderr": ["Error output"]
},
"error": "Error details if execution failed"
}
```
**Output types:**
- `results[]`: Last expression value (Jupyter-style)
- `logs.stdout`: Print statements and standard output
- `logs.stderr`: Error messages
- `error`: Present if execution failed
## Supported Languages
| Language | Key Limitations |
|----------|-----------------|
| **Python** | None - full ecosystem available |
| **JavaScript** | No npm packages - built-in Node modules only |
| **TypeScript** | No npm packages - built-in Node modules only |
| **R** | No tidyverse - base R only |
| **Java** | JShell-style execution - no traditional class definitions |
### Python
Full Python ecosystem with common packages pre-installed:
- **Data**: numpy, pandas, scipy, scikit-learn
- **Web**: requests, aiohttp, beautifulsoup4
- **Utilities**: matplotlib, PyYAML, Pillow
Check available packages:
```python
import pkg_resources
print([d.project_name for d in pkg_resources.working_set])
```
### JavaScript & TypeScript
No npm packages available - only built-in Node modules.
```javascript
// Works
const fs = require('fs');
const http = require('http');
// Fails
const axios = require('axios');
```
### R
Base R only - no tidyverse packages.
```r
# Works
mean(c(1, 2, 3))
# Fails
library(ggplot2)
```
### Java
JShell-style execution - statement-level only.
```java
// Works
System.out.println("Hello");
int x = 42;
// Fails
public class Main {
public static void main(String[] args) { }
}
```
## Network Access
The sandbox has full network access for HTTP requests, API calls, and DNS resolution.
```python
import requests
response = requests.get('https://api.github.com/repos/letta-ai/letta')
data = response.json()
print(f"Stars: {data['stargazers_count']}")
```
## No State Persistence
Variables, files, and state do not carry over between executions. Each `run_code` call is completely isolated.
```python
# First execution
x = 42
# Second execution (separate run_code call)
print(x) # Error: NameError: name 'x' is not defined
```
**Implications:**
- Must re-import libraries each time
- Files written to disk are lost
- Cannot build up state across executions
## Self-Hosted Setup
For self-hosted servers, configure an E2B API key. [E2B](https://e2b.dev) provides the sandbox infrastructure.
<CodeGroup>
```bash Docker
docker run \
-e E2B_API_KEY="your_e2b_api_key" \
letta/letta:latest
```
```yaml Docker Compose
services:
letta:
environment:
- E2B_API_KEY=your_e2b_api_key
```
```python Per-Agent
agent = client.agents.create(
tools=["run_code"],
tool_env_vars={
"E2B_API_KEY": "your_e2b_api_key"
}
)
```
</CodeGroup>
## Common Patterns
### Data Analysis
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["run_code"],
memory_blocks=[{
"label": "persona",
"value": "I use Python with pandas and numpy for data analysis."
}]
)
```
### API Integration
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["run_code", "web_search"],
memory_blocks=[{
"label": "persona",
"value": "I fetch data from APIs using run_code and search docs with web_search."
}]
)
```
### Statistical Analysis
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["run_code"],
memory_blocks=[{
"label": "persona",
"value": "I perform statistical analysis using scipy and numpy."
}]
)
```
## When to Use
| Use Case | Tool | Why |
|----------|------|-----|
| Data analysis | `run_code` | Full Python data stack |
| Math calculations | `run_code` | Programmatic computation |
| Live API data | `run_code` | Network + processing |
| Web scraping | `run_code` | requests + BeautifulSoup |
| Simple search | `web_search` | Purpose-built |
| Persistent data | Archival memory | State persistence |
## Related Documentation
- [Utilities Overview](/guides/agents/prebuilt-tools)
- [Web Search](/guides/agents/web-search)
- [Fetch Webpage](/guides/agents/fetch-webpage)
- [Custom Tools](/guides/agents/custom-tools)
- [Tool Variables](/guides/agents/tool-variables)

View File

@@ -1,213 +0,0 @@
# Scheduling
**Scheduling** is a technique for triggering Letta agents at regular intervals.
Many real-world applications require proactive behavior, such as checking emails every few hours or scraping news sites.
Scheduling gives autonomous agents the ability to manage ongoing processes.
<Note>
Native scheduling functionality is on the Letta Cloud roadmap. The approaches described in this guide are temporary solutions that work with both self-hosted and cloud deployments.
</Note>
## Common Use Cases
When building autonomous agents with Letta, you often need to trigger them at regular intervals for tasks like:
- **System Monitoring**: Health checks that adapt based on historical patterns
- **Data Processing**: Intelligent ETL processes that handle edge cases contextually
- **Memory Maintenance**: Agents that optimize their own knowledge base over time
- **Proactive Notifications**: Context-aware alerts that consider user preferences and timing
- **Continuous Learning**: Agents that regularly ingest new information and update their understanding
This guide covers simple approaches to implement scheduled agent interactions.
## Option 1: Simple Loop
The most straightforward approach for development and testing:
<CodeGroup>
```typescript TypeScript
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
const agentId = "your_agent_id";
while (true) {
const response = await client.agents.messages.create(agentId, {
messages: [{
role: "user",
content: `Scheduled check at ${new Date()}`
}]
});
console.log(`[${new Date()}] Agent responded`);
await new Promise(resolve => setTimeout(resolve, 300000)); // 5 minutes
}
```
```python title="python"
import time
import os
from letta_client import Letta
from datetime import datetime
client = Letta(token=os.getenv("LETTA_API_KEY"))
agent_id = "your_agent_id"
while True:
response = client.agents.messages.create(
agent_id=agent_id,
messages=[{
"role": "user",
"content": f"Scheduled check at {datetime.now()}"
}]
)
print(f"[{datetime.now()}] Agent responded")
time.sleep(300) # 5 minutes
```
</CodeGroup>
**Pros:** Simple, easy to debug
**Cons:** Blocks terminal, stops if process dies
## Option 2: System Cron Jobs
For production deployments, use cron for reliability:
<CodeGroup>
```typescript TypeScript
#!/usr/bin/env node
import { LettaClient } from '@letta-ai/letta-client';
async function sendMessage() {
try {
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
const response = await client.agents.messages.create("your_agent_id", {
messages: [{
role: "user",
content: "Scheduled maintenance check"
}]
});
console.log(`[${new Date()}] Success`);
} catch (error) {
console.error(`[${new Date()}] Error:`, error);
}
}
sendMessage();
```
```python title="python"
#!/usr/bin/env python3
from letta_client import Letta
from datetime import datetime
try:
import os
client = Letta(token=os.getenv("LETTA_API_KEY"))
response = client.agents.messages.create(
agent_id="your_agent_id",
messages=[{
"role": "user",
"content": "Scheduled maintenance check"
}]
)
print(f"[{datetime.now()}] Success")
except Exception as e:
print(f"[{datetime.now()}] Error: {e}")
```
</CodeGroup>
Add to crontab with `crontab -e`:
```bash
*/5 * * * * /usr/bin/python3 /path/to/send_message.py >> /var/log/letta_cron.log 2>&1
# or for Node.js:
*/5 * * * * /usr/bin/node /path/to/send_message.js >> /var/log/letta_cron.log 2>&1
```
**Pros:** System-managed, survives reboots
**Cons:** Requires cron access
## Best Practices
1. **Error Handling**: Always wrap API calls in try-catch blocks
2. **Logging**: Log both successes and failures for debugging
3. **Environment Variables**: Store credentials securely
4. **Rate Limiting**: Respect API limits and add backoff for failures
## Example: Memory Maintenance Bot
Complete example that performs periodic memory cleanup:
<CodeGroup>
```typescript TypeScript
#!/usr/bin/env node
import { LettaClient } from '@letta-ai/letta-client';
async function runMaintenance() {
try {
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
const agentId = "your_agent_id";
const response = await client.agents.messages.create(agentId, {
messages: [{
role: "user",
content: "Please review your memory blocks for outdated information and clean up as needed."
}]
});
// Print any assistant messages
for (const message of response.messages) {
if (message.messageType === "assistant_message") {
console.log(`Agent response: ${message.content?.substring(0, 100)}...`);
}
}
} catch (error) {
console.error("Maintenance failed:", error);
}
}
// Run if called directly
if (import.meta.url === `file://${process.argv[1]}`) {
runMaintenance();
}
```
```python title="python"
#!/usr/bin/env python3
import logging
from datetime import datetime
from letta_client import Letta
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
def run_maintenance():
try:
import os
client = Letta(token=os.getenv("LETTA_API_KEY"))
agent_id = "your_agent_id"
response = client.agents.messages.create(
agent_id=agent_id,
messages=[{
"role": "user",
"content": "Please review your memory blocks for outdated information and clean up as needed."
}]
)
# Print any assistant messages
for message in response.messages:
if message.message_type == "assistant_message":
logging.info(f"Agent response: {message.content[:100]}...")
except Exception as e:
logging.error(f"Maintenance failed: {e}")
if __name__ == "__main__":
run_maintenance()
```
</CodeGroup>
Choose the scheduling method that best fits your deployment environment. For production systems, cron offers the best reliability, while simple loops are perfect for development and testing.

View File

@@ -1,53 +0,0 @@
---
title: Using Tool Variables
slug: guides/agents/tool-variables
---
You can use **tool variables** to specify environment variables available to your custom tools.
For example, if you set a tool variable `PASSWORD` to `banana`, then write a custom function that prints `os.getenv('PASSWORD')` in the tool, the function will print `banana`.
## Assigning tool variables in the ADE
To assign tool variables in the Agent Development Environment (ADE), click on **Env Vars** to open the **Environment Variables** viewer:
<img src="../../images/env_vars_button.png" />
Once in the **Environment Variables** viewer, click **+** to add a new tool variable if one does not exist.
<img src="../../images/tool_variables.png" />
## Assigning tool variables in the API / SDK
You can also assign tool variables on agent creation in the API with the `tool_exec_environment_variables` parameter:
<CodeGroup>
```curl title="curl" {7-9}
curl -X POST https://api.letta.com/v1/agents \
-H "Authorization: Bearer $LETTA_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"memory_blocks": [],
"llm":"openai/gpt-4o-mini",
"tool_exec_environment_variables": {
"API_KEY": "your-api-key-here"
}
}'
```
```python title="python" {5-7}
agent_state = client.agents.create(
memory_blocks=[],
model="openai/gpt-4o-mini",
tool_exec_environment_variables={
"API_KEY": "your-api-key-here"
}
)
```
```typescript TypeScript {5-7}
const agentState = await client.agents.create({
memoryBlocks: [],
model: "openai/gpt-4o-mini",
toolExecEnvironmentVariables: {
"API_KEY": "your-api-key-here"
}
});
```
</CodeGroup>

View File

@@ -1,480 +0,0 @@
---
title: Web Search
subtitle: Search the internet in real-time with AI-powered search
slug: guides/agents/web-search
---
The `web_search` and `fetch_webpage` tools enable Letta agents to search the internet for current information, research, and general knowledge using [Exa](https://exa.ai)'s AI-powered search engine.
<Info>
On [Letta Cloud](/guides/cloud/overview), these tools work out of the box. For self-hosted deployments, you'll need to [configure an Exa API key](#self-hosted-setup).
</Info>
## Web Search
### Adding Web Search to an Agent
<CodeGroup>
```python Python
from letta import Letta
client = Letta(token="LETTA_API_KEY")
agent = client.agents.create(
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
tools=["web_search"],
memory_blocks=[
{
"label": "persona",
"value": "I'm a research assistant who uses web search to find current information and cite sources."
}
]
)
```
```typescript TypeScript
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: "LETTA_API_KEY" });
const agent = await client.agents.create({
model: "openai/gpt-4o",
embedding: "openai/text-embedding-3-small",
tools: ["web_search"],
memoryBlocks: [
{
label: "persona",
value: "I'm a research assistant who uses web search to find current information and cite sources."
}
]
});
```
</CodeGroup>
### Usage Example
```python
response = client.agents.messages.create(
agent_id=agent.id,
messages=[
{
"role": "user",
"content": "What are the latest developments in agent-based AI systems?"
}
]
)
```
Your agent can now choose to use `web_search` when it needs current information.
## Self-Hosted Setup
For self-hosted Letta servers, you'll need an Exa API key.
### Get an API Key
1. Sign up at [dashboard.exa.ai](https://dashboard.exa.ai/)
2. Copy your API key
3. See [Exa pricing](https://docs.exa.ai) for rate limits and costs
### Configuration Options
<CodeGroup>
```bash Docker
docker run \
-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data \
-p 8283:8283 \
-e OPENAI_API_KEY="your_openai_key" \
-e EXA_API_KEY="your_exa_api_key" \
letta/letta:latest
```
```yaml Docker Compose
version: '3.8'
services:
letta:
image: letta/letta:latest
ports:
- "8283:8283"
environment:
- OPENAI_API_KEY=your_openai_key
- EXA_API_KEY=your_exa_api_key
volumes:
- ~/.letta/.persist/pgdata:/var/lib/postgresql/data
```
```python Per-Agent Configuration
agent = client.agents.create(
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
tools=["web_search"],
tool_env_vars={
"EXA_API_KEY": "your_exa_api_key"
}
)
```
</CodeGroup>
## Tool Parameters
The `web_search` tool supports advanced filtering and search customization:
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `query` | `str` | Required | The search query to find relevant web content |
| `num_results` | `int` | 10 | Number of results to return (1-100) |
| `category` | `str` | None | Focus search on specific content types (see below) |
| `include_text` | `bool` | False | Whether to retrieve full page content (usually overflows context) |
| `include_domains` | `List[str]` | None | List of domains to include in search results |
| `exclude_domains` | `List[str]` | None | List of domains to exclude from search results |
| `start_published_date` | `str` | None | Only return content published after this date (ISO format) |
| `end_published_date` | `str` | None | Only return content published before this date (ISO format) |
| `user_location` | `str` | None | Two-letter country code for localized results (e.g., "US") |
### Available Categories
Use the `category` parameter to focus your search on specific content types:
| Category | Best For | Example Query |
|----------|----------|---------------|
| `company` | Corporate information, company websites | "Tesla energy storage solutions" |
| `research paper` | Academic papers, arXiv, research publications | "transformer architecture improvements 2025" |
| `news` | News articles, current events | "latest AI policy developments" |
| `pdf` | PDF documents, reports, whitepapers | "climate change impact assessment" |
| `github` | GitHub repositories, open source projects | "python async web scraping libraries" |
| `tweet` | Twitter/X posts, social media discussions | "reactions to new GPT release" |
| `personal site` | Blogs, personal websites, portfolios | "machine learning tutorial blogs" |
| `linkedin profile` | LinkedIn profiles, professional bios | "AI research engineers at Google" |
| `financial report` | Earnings reports, financial statements | "Apple Q4 2024 earnings" |
### Return Format
The tool returns a JSON-encoded string containing:
```json
{
"query": "search query",
"results": [
{
"title": "Page title",
"url": "https://example.com",
"published_date": "2025-01-15",
"author": "Author name",
"highlights": ["Key excerpt 1", "Key excerpt 2"],
"summary": "AI-generated summary of the content",
"text": "Full page content (only if include_text=true)"
}
]
}
```
## Best Practices
### 1. Guide When to Search
Provide clear instructions to your agent about when web search is appropriate:
```python
memory_blocks=[
{
"label": "persona",
"value": "I'm a helpful assistant. I use web_search for current events, recent news, and topics requiring up-to-date information. I cite my sources."
}
]
```
### 2. Combine with Archival Memory
Use web search for external/current information, and archival memory for your organization's internal data:
```python
# Create agent with both web_search and archival memory tools
agent = client.agents.create(
model="openai/gpt-4o",
embedding="openai/text-embedding-3-small",
tools=["web_search", "archival_memory_search", "archival_memory_insert"],
memory_blocks=[
{
"label": "persona",
"value": "I use web_search for current events and external research. I use archival_memory_search for company-specific information and internal documents."
}
]
)
```
See the [Archival Memory documentation](/guides/agents/archival-memory) for more information.
### 3. Craft Effective Search Queries
Exa uses neural search that understands semantic meaning. Your agent will generally form good queries naturally, but you can improve results by guiding it to:
- **Be descriptive and specific**: "Latest research on RLHF techniques for language models" is better than "RLHF research"
- **Focus on topics, not keywords**: "How companies are deploying AI agents in customer service" works better than "AI agents customer service deployment"
- **Use natural language**: The search engine understands conversational queries like "What are the environmental impacts of Bitcoin mining?"
- **Specify time ranges when relevant**: Guide your agent to use date filters for time-sensitive queries
Example instruction in memory:
```python
memory_blocks=[
{
"label": "search_strategy",
"value": "When searching, I craft clear, descriptive queries that focus on topics rather than keywords. I use the category and date filters when appropriate to narrow results."
}
]
```
### 4. Manage Context Window
By default, `include_text` is `False` to avoid context overflow. The tool returns highlights and AI-generated summaries instead, which are more concise:
```python
memory_blocks=[
{
"label": "search_guidelines",
"value": "I avoid setting include_text=true unless specifically needed, as full text usually overflows the context window. Highlights and summaries are usually sufficient."
}
]
```
## Common Patterns
### Research Assistant
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["web_search"],
memory_blocks=[
{
"label": "persona",
"value": "I'm a research assistant. I search for relevant information, synthesize findings from multiple sources, and provide citations."
}
]
)
```
### News Monitor
```python
agent = client.agents.create(
model="openai/gpt-4o-mini",
tools=["web_search"],
memory_blocks=[
{
"label": "persona",
"value": "I monitor news and provide briefings on AI industry developments."
},
{
"label": "topics",
"value": "Focus: AI/ML, agent systems, LLM advancements"
}
]
)
```
### Customer Support
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["web_search"],
memory_blocks=[
{
"label": "persona",
"value": "I help customers by checking documentation, service status pages, and community discussions for solutions."
}
]
)
```
## Troubleshooting
### Agent Not Using Web Search
Check:
1. Tool is attached: `"web_search"` in agent's tools list
2. Instructions are clear about when to search
3. Model has good tool-calling capabilities (GPT-4, Claude 3+)
```python
# Verify tools
agent = client.agents.retrieve(agent_id=agent.id)
print([tool.name for tool in agent.tools])
```
### Missing EXA_API_KEY
If you see errors about missing API keys on self-hosted deployments:
```bash
# Check if set
echo $EXA_API_KEY
# Set for session
export EXA_API_KEY="your_exa_api_key"
# Docker example
docker run -e EXA_API_KEY="your_exa_api_key" letta/letta:latest
```
## When to Use Web Search
| Use Case | Tool | Why |
|----------|------|-----|
| Current events, news | `web_search` | Real-time information |
| External research | `web_search` | Broad internet access |
| Internal documents | Archival memory | Fast, static data |
| User preferences | Memory blocks | In-context, instant |
| General knowledge | Pre-trained model | No search needed |
## Fetch Webpage
<CodeGroup>
```python Python
from letta import Letta
client = Letta(token="LETTA_API_KEY")
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage"],
memory_blocks=[{
"label": "persona",
"value": "I can fetch and read webpages to answer questions about online content."
}]
)
```
```typescript TypeScript
import { LettaClient } from '@letta-ai/letta-client';
const client = new LettaClient({ token: "LETTA_API_KEY" });
const agent = await client.agents.create({
model: "openai/gpt-4o",
tools: ["fetch_webpage"],
memoryBlocks: [{
label: "persona",
value: "I can fetch and read webpages to answer questions about online content."
}]
});
```
</CodeGroup>
## Tool Parameters
| Parameter | Type | Description |
|-----------|------|-------------|
| `url` | `str` | The URL of the webpage to fetch |
## Return Format
The tool returns webpage content as text/markdown.
**With Exa API (if configured):**
```json
{
"title": "Page title",
"published_date": "2025-01-15",
"author": "Author name",
"text": "Full page content in markdown"
}
```
**Fallback (without Exa):**
Returns markdown-formatted text extracted from the HTML.
## How It Works
The tool uses a multi-tier approach:
1. **Exa API** (if `EXA_API_KEY` is configured): Uses Exa's content extraction
2. **Trafilatura** (fallback): Open-source text extraction to markdown
3. **Readability + html2text** (final fallback): HTML cleaning and conversion
## Self-Hosted Setup
For enhanced fetching on self-hosted servers, optionally configure an Exa API key. Without it, the tool still works using open-source extraction.
### Optional: Configure Exa
<CodeGroup>
```bash Docker
docker run \
-e EXA_API_KEY="your_exa_api_key" \
letta/letta:latest
```
```yaml Docker Compose
services:
letta:
environment:
- EXA_API_KEY=your_exa_api_key
```
```python Per-Agent
agent = client.agents.create(
tools=["fetch_webpage"],
tool_env_vars={
"EXA_API_KEY": "your_exa_api_key"
}
)
```
</CodeGroup>
## Common Patterns
### Documentation Reader
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage", "web_search"],
memory_blocks=[{
"label": "persona",
"value": "I search for documentation with web_search and read it with fetch_webpage."
}]
)
```
### Research Assistant
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage", "archival_memory_insert"],
memory_blocks=[{
"label": "persona",
"value": "I fetch articles and store key insights in archival memory for later reference."
}]
)
```
### Content Summarizer
```python
agent = client.agents.create(
model="openai/gpt-4o",
tools=["fetch_webpage"],
memory_blocks=[{
"label": "persona",
"value": "I fetch webpages and provide summaries of their content."
}]
)
```
## When to Use
| Use Case | Tool | Why |
|----------|------|-----|
| Read specific webpage | `fetch_webpage` | Direct URL access |
| Find webpages to read | `web_search` | Discovery first |
| Read + search in one | `web_search` with `include_text=true` | Combined operation |
| Multiple pages | `fetch_webpage` | Iterate over URLs |
## Related Documentation
- [Utilities Overview](/guides/agents/prebuilt-tools)
- [Web Search](/guides/agents/web-search)
- [Run Code](/guides/agents/run-code)
- [Custom Tools](/guides/agents/custom-tools)
- [Tool Variables](/guides/agents/tool-variables)

File diff suppressed because it is too large Load Diff

View File

@@ -1,28 +0,0 @@
---
title: Bring-Your-Own API Keys
subtitle: Connect your own API keys for supported model providers (OpenAI, Anthropic, etc.)
slug: guides/cloud/custom-keys
---
<Note>
To generate a **Letta API key** (which you use to interact with your agents on Letta Cloud), visit your [account settings](https://app.letta.com/settings/profile) page.
</Note>
<Warning>
Letta Cloud only supports bring-your-own-key for enterprise customers. To learn more about enterprise plans and pricing, visit our [pricing page](https://www.letta.com/pricing) or [contact us](https://forms.letta.com/request-demo) to request a demo.
</Warning>
## Using Your Own API Keys
Connect your own API keys for supported providers (OpenAI, Anthropic, Gemini) to Letta Cloud through the [models page](https://app.letta.com/models). When you have a custom API key (successfully) registered, you will see additional models listed in the ADE model dropdown.
### Selecting Your Custom Provider
After you connect your own OpenAI / Anthropic / Gemini API key, make sure to select your custom provider in the ADE under "Your models".
For example, after connecting your own OpenAI API key, you will see multiple OpenAI models but with different providers ("Letta hosted" vs "Your models") - if you want to use your own OpenAI API key, you need to select the copy of the model associated with your custom provider.
### Billing and Quotas
Requests made using your custom API keys **do not count** towards your monthly request quotas or usage-based billing. Instead, you'll be billed directly by the provider (OpenAI, Anthropic, etc.) according to their pricing for your personal account.
Note that direct provider pricing will likely differ from Letta Cloud rates, and requests through your own API key may cost more than those made through Letta Cloud's managed services.

View File

@@ -1,31 +0,0 @@
---
title: "Observability Overview"
subtitle: "Monitor and trace your agents in Letta Cloud"
slug: "guides/observability"
---
<Note>
All observability features are available in real-time for every Letta Cloud project.
</Note>
Letta Cloud's observability tools help you monitor performance and debug issues. Each project you create in Letta Cloud has two main observability dashboards:
## [Monitoring](/guides/cloud/monitoring)
<img className="light" src="/images/observability_graph.png" />
<img className="dark" src="/images/observability_graph_dark.png" />
Track key metrics across four dashboards:
- **Overview**: Message count, API/tool errors, LLM/tool latency
- **Activity & Usage**: Usage patterns and resource consumption
- **Performance**: Response times and throughput
- **Errors**: Detailed error analysis and debugging info
## [Responses & Tracing](/guides/observability/responses)
<img className="light" src="/images/observability_responses.png" />
<img className="dark" src="/images/observability_responses_dark.png" />
Inspect API responses and agent execution:
- **API Responses**: List of all responses with duration and status
- **Message Inspection**: Click "Inspect Message" to see the full POST request and agent loop execution sequence

View File

@@ -1,63 +0,0 @@
---
title: Plans & Pricing
subtitle: Guide to pricing and model usage for Free, Pro, and Enterprise plans
slug: guides/cloud/plans
---
<Note>
Upgrade your plan and view your usage on [your account page](https://app.letta.com/settings/organization/usage)
</Note>
## Available Plans
<CardGroup>
<Card
title="Free"
subtitle="For getting started"
>
- **5,000** monthly credits
- Access the Letta API
- Edit agents visually in the ADE
- **2** agent templates
- **1 GB** of storage
</Card>
<Card
title="Pro ($20 / month)"
subtitle="For shipping agents in production"
>
- **20,000** monthly credits
- Pay-as-you-go credit overage
- Unlimited agents
- **20** agent templates
- **10 GB** of storage
</Card>
</CardGroup>
<Note>
For organizations with higher volume needs, our Enterprise plan offers increased quotas, dedicated support, role-based access control (RBAC), SSO (SAML, OIDC), and private model deployment options.
[Contact our team](https://forms.letta.com/request-demo) to learn more.
</Note>
## What are credits?
Credits are a standard cost unit for resources in Letta, such as LLM inference and CPU cycles. When agents run on Letta, they make LLM model requests and execute tools. Model requests consume credits at a rate depending on the model tier (standard vs. premium) and whether Max Mode is enabled for longer context sizes. Tool executions that run in Letta are charged at a flat rate per second of execution. See up-to-date credit pricing for available models [here](https://app.letta.com/settings/organization/models).
## What tools are executed by Letta?
Sandbox code execution and execution of custom tools run inside of Letta, so incur a credit cost for CPU time. Remote MCP tools are executed by the MCP tool provider, so do not have a credit cost. Letta built-in tools are executed for free.
## How do monthly credits work?
Your Letta agents use large language models (LLMs) to reason and take actions. These model requests consume credits from your monthly balance (or additional purchased credits). Your balance of monthly credits refreshes every month.
## What is Max Mode?
Certain models have the ability to run with extended context windows. Turning on Max Mode extends the context window of the model driving your Letta agent beyond the 100k default, which may help when working with large files or codebases, but will increase cost (credit use) and latency.
## What's the difference between the Letta API and open source Letta?
The Letta API Platform is our fully-managed service for stateful agents, handling all agent infrastructure and state management to create scalable agent services. The Letta API Platform also has additional features beyond the open source such as durable execution for long-running agents, built-in sandboxing, agent templates, optimized vector search, message indexing, and observability.
## Can I transfer my agents between open source and cloud?
Yes, the Letta API Platform supports [agent file](https://docs.letta.com/guides/agents/agent-file), which allows you to move your agents freely between self-hosted instances of the Letta open source and the Letta platform.

View File

@@ -1,54 +0,0 @@
---
title: Memory Variables
slug: guides/templates/variables
---
<Note>
Memory variables are a feature in [agent templates](/guides/cloud/templates) (part of [Letta Cloud](/guides/cloud)).
To use memory variables, you must be using an agent template, not an agent.
</Note>
Memory variables allow you to dynamically define parts of your agent memory at the time of agent creation (when a [template](/guides/cloud/templates) is used to create a new agent).
## Defining variables in memory blocks
To use memory variables in your agent templates, you can define variables in your memory blocks by wrapping them in `{{ }}`.
For example, if you have an agent template called `customer-service-template` designed to handle customer support issues, you might have a block of memory that stores information about the user:
```handlebars
The user is contacting me to resolve a customer support issue.
Their name is {{name}} and the ticket number for this request is {{ticket}}.
```
Once variables have been defined inside of your memory block, they will dynamically appear as variables in the **ADE variables window** (click the "\{\} Variables" button at the top of the chat window to expand the dropdown).
## Simulating variable values in the ADE
<Tip>
Reset the state of the simulated agent by clicking the "Flush Simulation" 🔄 button.
</Tip>
While designing agent templates in the ADE, you can interact with a simulated agent.
The ADE variables window allows you to specify the values of the variables for the simulated agent.
You can see the current state of the simulated agent's memory by clicking the "Simulated" tab in the "Core Memory" panel in the ADE.
If you're using memory variables and do not specify values for the variables in the ADE variables window, the simulated agent will use empty values.
In this prior example, the `name` and `ticket` variables are memory variables that we will specify when we create a new agent - information that we expect to have available at that time.
While designing the agent template, we will likely want to experiment with different values for these variables to make sure that the agent is behaving as expected.
For example, if we change the name of the user from "Alice" to "Bob", the simulated agent should respond accordingly.
## Defining variables during agent creation
When we're ready to create an agent from our template, we can specify the values for the variables using the `variables` parameter in the [create agents from template endpoint](/api-reference/templates/agents/create):
```sh
curl -X POST https://app.letta.com/v1/templates/{project_slug}/{template_name}:{template_version} \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"from_template": customer-service-template:latest",
"variables": {
"name": "Bob",
"ticket": "TX-123"
}
}'
```

View File

@@ -1,41 +0,0 @@
---
title: Versioning Agent Templates
slug: guides/templates/versioning
---
<Note>
Versioning is a feature in [agent templates](/guides/cloud/templates) (part of [Letta Cloud](/guides/cloud/overview)).
To use versioning, you must be using an agent template, not an agent.
</Note>
Versions allow you to keep track of the changes you've made to your template over time.
Agent templates follow the versioning convention of `template-name:version-number`.
Similar to [Docker tags](https://docs.docker.com/get-started/docker-concepts/building-images/build-tag-and-publish-an-image/#tagging-images), you can specify the latest version of a template using the `latest` keyword (`template-name:latest`).
## Creating a new template version
When you create a template, it starts off at version 1.
Once you've made edits to your template in the ADE, you can create a new version of the template by clicking the "Template" button in the ADE (top right), then clicking "Save new template version".
Version numbers are incremented automatically (e.g. version 1 becomes version 2).
## Migrating existing agents to a new template version
If you've deployed agents on a previous version of the template, you'll be asked if you want to migrate your existing agents to the new version of the template.
When you migrate existing agents to a new template version, Letta Cloud will re-create your existing agents using the new template information, but keeping prior agent state such as the conversation history, and injecting memory variables as needed.
### When should I migrate (or not migrate) my agents?
One reason you might want to migrate your agents is if you've added new tools to your agent template: migrating existing agents to the new version of the template will give them access to the new tools, while retaining all of their prior state.
Another example use case is if you make modifications to your prompts to tune your agent behavior - if you find a modification works well, you can save a new version with the prompt edits, and migrate all deployed agents to the new version.
### Forking an agent template
If you decide to make significant changes to your agent and would prefer to make a new template to track your changes, you can easily create a new agent template from an existing template by **forking** your template (click the settings button ⚙️ in the ADE, then click "Fork Template").
## Specifying a version when creating an agent
You can specify a template version when creating an agent using the [create agents from template endpoint](/api-reference/templates/agents/create).
For example, to deploy an agent from a template called `template-name` at version 2, you would use `:2` as the template tag:
```sh
curl -X POST https://app.letta.com/v1/templates/{project_slug}/{template_name}:2 \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{}'
```

View File

@@ -1,124 +0,0 @@
---
title: Research Background
subtitle: The academic foundations of Letta
slug: concepts/letta
---
<Info>
**Looking for practical concepts?** See [Core Concepts](/core-concepts) for understanding how to build with Letta's stateful agents.
</Info>
## Letta and MemGPT
**[Letta](https://letta.com)** was created by the same team that created **[MemGPT](https://research.memgpt.ai)**.
### MemGPT: The Research Paper
**MemGPT is a research paper** ([arXiv:2310.08560](https://arxiv.org/abs/2310.08560)) that introduced foundational concepts for building stateful LLM agents:
- **Self-editing memory** - LLMs using tools to edit their own context window and external storage
- **LLM Operating System** - Infrastructure layer managing agent state, memory, and execution
- **Memory hierarchy** - Distinguishing between in-context memory (core) and out-of-context memory (archival)
- **Context window management** - Intelligent paging and memory consolidation techniques
The paper demonstrated that LLMs could maintain coherent conversations far beyond their context window limits by actively managing their own memory through tool calling.
[Read the full MemGPT paper →](https://arxiv.org/abs/2310.08560)
### MemGPT: The Agent Architecture
MemGPT also refers to a **specific agent architecture** popularized by the research paper. A MemGPT agent has:
- Memory editing tools (`memory_replace`, `memory_insert`, `memory_rethink`)
- Archival memory tools (`archival_memory_insert`, `archival_memory_search`)
- Conversation search tools (`conversation_search`, `conversation_search_date`)
- A structured context window with persona and human memory blocks
This architecture makes MemGPT agents particularly effective for long-range chat applications, document search, and personalized assistants.
[Learn more about MemGPT agents →](/guides/legacy/memgpt-agents-legacy)
### Letta: The Framework
**Letta is a production framework** that allows you to build and deploy agents with MemGPT-style memory systems (and beyond) as **services** behind REST APIs.
While the MemGPT research focused on the agent architecture and memory system, Letta provides:
- **Production infrastructure** - Database backends, persistence, state management
- **Agent runtime** - Tool execution, reasoning loops, multi-agent orchestration
- **Developer tools** - Agent Development Environment (ADE), SDKs, monitoring
- **Deployment options** - Letta Cloud for managed hosting, or self-hosted with Docker
- **Flexibility** - Build MemGPT agents, or design custom agent architectures with different memory systems
**In short:**
- **MemGPT (research)** = Ideas about how agents should manage memory
- **MemGPT (architecture)** = Specific agent design with memory tools
- **Letta (framework)** = Production system for building and deploying stateful agents
## Agents in Context
The concept of "agents" has a long history across multiple fields:
**In reinforcement learning and AI**, agents are entities that:
1. Perceive their environment through sensors
2. Make decisions based on internal state
3. Take actions that affect their environment
4. Learn from outcomes to improve future decisions
**In economics and game theory**, agents are autonomous decision-makers with their own objectives and strategies.
**In LLMs**, agents extend these concepts by using language models for reasoning and tool calling for actions. Letta's approach emphasizes:
- **Statefulness** - Persistent memory and identity across sessions
- **Autonomy** - Self-directed memory management and multi-step reasoning
- **Tool use** - Modifying internal state and accessing external resources
## LLM Operating System
The **LLM OS** is the infrastructure layer that manages agent execution and state. This concept, introduced in the MemGPT paper, draws an analogy to traditional operating systems:
Just as an OS manages memory, processes, and I/O for programs, the LLM OS manages:
- **Memory layer** - Context window management, paging, and persistence
- **Agent runtime** - Tool execution and the reasoning loop
- **Stateful layer** - Coordination across database, cache, and execution
Letta implements this LLM OS architecture, providing the infrastructure for stateful agent services.
## Self-Editing Memory
A key innovation from the MemGPT research is **self-editing memory** - agents that actively manage their own memory using tools.
Traditional RAG systems passively retrieve documents. Letta agents actively:
- **Edit in-context memory** - Update memory blocks based on learned information
- **Manage archival storage** - Decide what facts to persist long-term
- **Search strategically** - Query their memory when relevant context is needed
This active memory management enables agents to learn and evolve through interactions rather than requiring retraining or prompt engineering.
[Learn more about Letta's memory system →](/guides/agents/memory)
## Further Reading
<CardGroup cols={2}>
<Card
title="Core Concepts"
href="/core-concepts"
>
Practical guide to building with stateful agents
</Card>
<Card
title="MemGPT Research Details"
href="/concepts/memgpt"
>
Deep dive into the MemGPT paper's technical contributions
</Card>
<Card
title="Agent Memory System"
href="/guides/agents/memory"
>
How agents manage memory in Letta
</Card>
<Card
title="MemGPT Agents"
href="/guides/legacy/memgpt-agents-legacy"
>
Build agents with the MemGPT architecture
</Card>
</CardGroup>

View File

@@ -1,37 +0,0 @@
---
title: MemGPT
subtitle: Learn about the key ideas behind MemGPT
slug: concepts/memgpt
---
<Tip>The MemGPT open source framework / package was renamed to _Letta_. You can read about the difference between Letta and MemGPT [here](/concepts/letta), or read more about the change on our [blog post](https://www.letta.com/blog/memgpt-and-letta).</Tip>
## MemGPT - the research paper
<Frame caption="Figure 1 from the MemGPT paper showing the system architecture. Note that 'working context' from the paper is referred to as 'core memory' in the codebase. To read the paper, visit https://arxiv.org/abs/2310.08560.">
<img src="/images/memgpt-system-diagram.png" />
</Frame>
**MemGPT** is the name of a [**research paper**](https://arxiv.org/abs/2310.08560) that popularized several of the key concepts behind the "LLM Operating System (OS)":
1. **Memory management**: In MemGPT, an LLM OS moves data in and out of the context window of the LLM to manage its memory.
2. **Memory hierarchy**: The "LLM OS" divides the LLM's memory (aka its "virtual context", similar to "[virtual memory](https://en.wikipedia.org/wiki/Virtual_memory)" in computer systems) into two parts: the in-context memory, and out-of-context memory.
3. **Self-editing memory via tool calling**: In MemGPT, the "OS" that manages memory is itself an LLM. The LLM moves data in and out of the context window using designated memory-editing tools.
4. **Multi-step reasoning using heartbeats**: MemGPT supports multi-step reasoning (allowing the agent to take multiple steps in sequence) via the concept of "heartbeats". Whenever the LLM outputs a tool call, it has the option to request a heartbeat by setting the keyword argument `request_heartbeat` to `true`. If the LLM requests a heartbeat, the LLM OS continues execution in a loop, allowing the LLM to "think" again.
You can read more about the MemGPT memory hierarchy and memory management system in our [memory concepts guide](/advanced/memory_management).
## MemGPT - the agent architecture
**MemGPT** also refers to a particular **agent architecture** that was popularized by the paper and adopted widely by other LLM chatbots:
1. **Chat-focused core memory**: The core memory of a MemGPT agent is split into two parts - the agent's own persona, and the user information. Because the MemGPT agent has self-editing memory, it can update its own personality over time, as well as update the user information as it learns new facts about the user.
2. **Vector database archival memory**: By default, the archival memory connected to a MemGPT agent is backed by a vector database, such as [Chroma](https://www.trychroma.com/) or [pgvector](https://github.com/pgvector/pgvector). Because in MemGPT all connections to memory are driven by tools, it's simple to exchange archival memory to be powered by a more traditional database (you can even make archival memory a flatfile if you want!).
## Creating MemGPT agents in the Letta framework
Because **Letta** was created out of the original MemGPT open source project, it's extremely easy to make MemGPT agents inside of Letta (the default Letta agent architecture is a MemGPT agent).
See our [agents overview](/guides/agents/overview) for a tutorial on how to create MemGPT agents with Letta.
**The Letta framework also allows you to make agent architectures beyond MemGPT** that differ significantly from the architecture proposed in the research paper - for example, agents with multiple logical threads (e.g. a "conscious" and a "subconscious"), or agents with more advanced memory types (e.g. task memory).
Additionally, **the Letta framework also allows you to expose your agents as *services*** (over REST APIs) - so you can use the Letta framework to power your AI applications.

File diff suppressed because it is too large Load Diff

View File

@@ -1,59 +0,0 @@
---
title: RAG with Letta
subtitle: Connect your custom RAG pipeline to Letta agents
slug: guides/rag/overview
---
If you have an existing Retrieval-Augmented Generation (RAG) pipeline, you can connect it to your Letta agents. While Letta provides built-in features like archival memory, you can integrate your own RAG pipeline just as you would with any LLM API. This gives you full control over your data and retrieval methods.
## What is RAG?
Retrieval-Augmented Generation (RAG) enhances LLM responses by retrieving relevant information from external data sources before generating an answer. Instead of relying on the model's training data, a RAG system:
1. Takes a user query.
2. Searches a vector database for relevant documents.
3. Includes those documents in the LLM's context.
4. Generates an informed response based on the retrieved information.
## Choosing Your RAG Approach
Letta supports two approaches for integrating RAG, depending on how much control you want over the retrieval process.
| Aspect | Simple RAG | Agentic RAG |
|--------|------------|-------------|
| **Who Controls Retrieval** | Your application controls when retrieval happens and what the retrieval query is. | The agent decides when to retrieve and what query to use. |
| **Context Inclusion** | You can always include retrieval results in the context. | Retrieval happens only when the agent determines it's needed. |
| **Latency** | Lower — typically single-hop, as the agent doesn't need to do a tool call. | Higher — requires tool calls for retrieval. |
| **Client Code** | More complex, as it handles retrieval logic. | Simpler, as it just sends the user query. |
| **Customization** | You have full control via your retrieval function. | You have full control via your custom tool definition. |
Both approaches work with any vector database. Our tutorials include examples for **ChromaDB**, **MongoDB Atlas**, and **Qdrant**.
## Next Steps
Ready to integrate RAG with your Letta agents?
<CardGroup cols={2}>
<Card
title="Simple RAG Tutorial"
icon="fa-sharp fa-light fa-magnifying-glass"
href="/guides/rag/simple"
iconPosition="left"
>
Learn how to manage retrieval on the client-side and inject context directly into your agent's messages.
</Card>
<Card
title="Agentic RAG Tutorial"
icon="fa-sharp fa-light fa-robot"
href="/guides/rag/agentic"
iconPosition="left"
>
Learn how to empower your agent with custom search tools for autonomous retrieval.
</Card>
</CardGroup>
## Additional Resources
- [Custom Tools](/guides/agents/custom-tools) - Learn more about creating custom tools for your agents.
- [Memory Management](/guides/agents/memory) - Understand how Letta's built-in memory works.
- [Agent Development Environment](/guides/ade) - Configure and test your agents in the web interface.

File diff suppressed because it is too large Load Diff

View File

@@ -1,274 +0,0 @@
---
title: Examples & Tutorials
slug: cookbooks
---
Build powerful AI agents with persistent memory. Explore tutorials, ready-to-use templates, and community projects to get started.
<Info>
**New to Letta?**
- Start with our [Quickstart Guide](/quickstart)
- Take the free [DeepLearning.AI Course](https://www.deeplearning.ai/short-courses/llms-as-operating-systems-agent-memory/)
- Explore [Awesome Letta](https://github.com/letta-ai/awesome-letta) for more resources
</Info>
## Getting Started Tutorials
Step-by-step guides to learn Letta fundamentals.
<CardGroup cols={2}>
<Card
title="Your First Agent"
icon="fa-sharp fa-light fa-rocket"
href="/examples/hello-world"
iconPosition="left"
>
Build your first Letta agent in minutes
</Card>
<Card
title="Talk to Your PDF"
icon="fa-sharp fa-light fa-file-pdf"
href="/tutorials/pdf-chat"
iconPosition="left"
>
Create an agent that can answer questions about PDF documents
</Card>
<Card
title="Attaching & Detaching Memory Blocks"
icon="fa-sharp fa-light fa-memory"
href="/examples/attaching-detaching-blocks"
iconPosition="left"
>
Learn how to dynamically manage agent memory
</Card>
<Card
title="Shared Memory Blocks"
icon="fa-sharp fa-light fa-share-nodes"
href="/tutorials/shared-memory-blocks"
iconPosition="left"
>
Share memory between multiple agents for coordination
</Card>
</CardGroup>
## Ready-to-Deploy Applications
Production-ready templates you can clone and customize.
<CardGroup cols={2}>
<Card
title="Next.js Chatbot"
icon="fa-sharp fa-light fa-messages"
href="https://github.com/letta-ai/letta-chatbot-example"
iconPosition="left"
>
Full-stack chatbot with per-user agent memory (Next.js + TypeScript)
</Card>
<Card
title="Discord Bot"
icon="fa-brands fa-discord"
href="https://github.com/letta-ai/letta-discord-bot-example"
iconPosition="left"
>
Discord bot with persistent memory for each server and user
</Card>
<Card
title="Character.AI Clone"
icon="fa-sharp fa-light fa-user-robot"
href="https://github.com/letta-ai/characterai-memory"
iconPosition="left"
>
Create AI characters with memory that persists across conversations
</Card>
<Card
title="Deep Research Agent"
icon="fa-sharp fa-light fa-magnifying-glass"
href="https://github.com/letta-ai/deep-research"
iconPosition="left"
>
Research agent that gathers and synthesizes information over time
</Card>
</CardGroup>
## Multi-Agent Systems
Build coordinated teams of specialized agents.
<CardGroup cols={2}>
<Card
title="Async Multi-Agent"
icon="fa-sharp fa-light fa-user-group"
href="/cookbooks/multi-agent-async"
iconPosition="left"
>
Connect agents to chat with each other and users simultaneously
</Card>
<Card
title="Customer-Specific Agents"
icon="fa-sharp fa-light fa-users"
href="/cookbooks/customer-specific-agents"
iconPosition="left"
>
Template for building relationship-aware agents for each customer
</Card>
</CardGroup>
## Tools & Integrations
Connect Letta to your favorite platforms and tools.
<CardGroup cols={3}>
<Card
title="Vercel AI SDK"
icon="fa-sharp fa-light fa-triangle"
href="https://github.com/letta-ai/vercel-ai-sdk-provider"
iconPosition="left"
>
Use Letta with Vercel AI SDK v5
</Card>
<Card
title="Zapier"
icon="fa-sharp fa-light fa-bolt"
href="https://zapier.com/apps/letta/integrations"
iconPosition="left"
>
Connect agents to 7,000+ apps
</Card>
<Card
title="n8n Workflows"
icon="fa-sharp fa-light fa-diagram-project"
href="https://github.com/letta-ai/n8n-nodes-letta"
iconPosition="left"
>
Integrate with n8n automation workflows
</Card>
<Card
title="Telegram Bot"
icon="fa-brands fa-telegram"
href="https://github.com/letta-ai/letta-telegram"
iconPosition="left"
>
Deploy agents on Telegram
</Card>
<Card
title="Obsidian Plugin"
icon="fa-sharp fa-light fa-note-sticky"
href="https://github.com/letta-ai/letta-obsidian"
iconPosition="left"
>
Add Letta agents to your knowledge base
</Card>
<Card
title="DuckDB Agent"
icon="fa-sharp fa-light fa-database"
href="https://github.com/letta-ai/letta-duckdb-agent"
iconPosition="left"
>
SQL-powered data analysis agent
</Card>
</CardGroup>
## SDK Examples
Learn the basics with minimal code examples.
<CardGroup cols={2}>
<Card
title="TypeScript SDK"
icon="fa-brands fa-js"
href="https://github.com/letta-ai/letta/tree/main/examples/docs/node/example.ts"
iconPosition="left"
>
Basic TypeScript/Node.js SDK example
</Card>
<Card
title="Python SDK"
icon="fa-brands fa-python"
href="https://github.com/letta-ai/letta/tree/main/examples/docs/example.py"
iconPosition="left"
>
Basic Python SDK example
</Card>
</CardGroup>
## Community Projects
Amazing projects built by the Letta community.
<CardGroup cols={2}>
<Card
title="Thought Stream"
icon="fa-sharp fa-light fa-comments"
href="https://tangled.sh/@cameron.pfiffer.org/thought-stream"
iconPosition="left"
>
Deploy Letta agents to an ATProto-powered multi-agent chatroom
</Card>
<Card
title="Thought Stream CLI"
icon="fa-sharp fa-light fa-terminal"
href="https://tangled.org/@cameron.pfiffer.org/thought-stream-cli"
iconPosition="left"
>
IRC-style CLI for the Thought Stream
</Card>
</CardGroup>
## Learning Resources
<CardGroup cols={2}>
<Card
title="DeepLearning.AI Course"
icon="fa-sharp fa-light fa-graduation-cap"
href="https://www.deeplearning.ai/short-courses/llms-as-operating-systems-agent-memory/"
iconPosition="left"
>
Free course: LLMs as Operating Systems - Building Agents with Memory
</Card>
<Card
title="Core Concepts"
icon="fa-sharp fa-light fa-book"
href="/overview"
iconPosition="left"
>
Understand how Letta agents work
</Card>
<Card
title="API Reference"
icon="fa-sharp fa-light fa-code"
href="/api-reference/overview"
iconPosition="left"
>
Complete API documentation
</Card>
<Card
title="Research Papers"
icon="fa-sharp fa-light fa-flask"
href="https://www.letta.com/blog"
iconPosition="left"
>
Read about the research behind Letta
</Card>
</CardGroup>
## More Resources
<CardGroup cols={2}>
<Card
title="Awesome Letta"
icon="fa-sharp fa-light fa-star"
href="https://github.com/letta-ai/awesome-letta"
iconPosition="left"
>
Comprehensive curated list of Letta resources, tools, and community projects
</Card>
<Card
title="Join Discord"
icon="fa-brands fa-discord"
href="https://discord.gg/letta"
iconPosition="left"
>
Get help and share your projects with the community
</Card>
</CardGroup>

View File

@@ -1,68 +0,0 @@
# Letta Evals Documentation
Welcome to the comprehensive documentation for Letta Evals Kit - a framework for evaluating Letta AI agents.
## Table of Contents
### Getting Started
- [Getting Started](./getting-started.md) - Installation, first evaluation, and core concepts
### Core Concepts
- [Overview](./concepts/overview.md) - Understanding the evaluation framework
- [Suites](./concepts/suites.md) - Evaluation suite configuration
- [Datasets](./concepts/datasets.md) - Creating and managing test datasets
- [Targets](./concepts/targets.md) - What you're evaluating
- [Graders](./concepts/graders.md) - How responses are scored
- [Extractors](./concepts/extractors.md) - Extracting submissions from agent output
- [Gates](./concepts/gates.md) - Pass/fail criteria
### Graders
- [Grader Overview](./graders/overview.md) - Understanding grader types
- [Tool Graders](./graders/tool-graders.md) - Built-in and custom function graders
- [Rubric Graders](./graders/rubric-graders.md) - LLM-as-judge evaluation
- [Multi-Metric Grading](./graders/multi-metric.md) - Evaluating with multiple metrics
### Extractors
- [Extractor Overview](./extractors/overview.md) - Understanding extractors
- [Built-in Extractors](./extractors/builtin.md) - All available extractors
- [Custom Extractors](./extractors/custom.md) - Writing your own extractors
### Configuration
- [Suite YAML Reference](./configuration/suite-yaml.md) - Complete YAML schema
- [Target Configuration](./configuration/targets.md) - Target setup options
- [Grader Configuration](./configuration/graders.md) - Grader parameters
- [Environment Variables](./configuration/environment.md) - Environment setup
### Advanced Usage
- [Custom Graders](./advanced/custom-graders.md) - Writing custom grading functions
- [Multi-Turn Conversations](./advanced/multi-turn-conversations.md) - Testing conversational memory and state
- [Agent Factories](./advanced/agent-factories.md) - Programmatic agent creation
- [Multi-Model Evaluation](./advanced/multi-model.md) - Testing across models
- [Setup Scripts](./advanced/setup-scripts.md) - Pre-evaluation setup
- [Memory Block Testing](./advanced/memory-blocks.md) - Testing agent memory
- [Result Streaming](./advanced/streaming.md) - Real-time results and caching
### Results & Metrics
- [Understanding Results](./results/overview.md) - Result structure and interpretation
- [Metrics](./results/metrics.md) - Aggregate statistics
- [Output Formats](./results/output-formats.md) - JSON, JSONL, and console output
### CLI Reference
- [Commands](./cli/commands.md) - All CLI commands
- [Options](./cli/options.md) - Command-line options
### Examples
- [Example Walkthroughs](./examples/README.md) - Detailed example explanations
### API Reference
- [Data Models](./api/models.md) - Pydantic models reference
- [Decorators](./api/decorators.md) - @grader and @extractor decorators
### Best Practices
- [Writing Effective Tests](./best-practices/writing-tests.md)
- [Designing Rubrics](./best-practices/rubrics.md)
- [Performance Optimization](./best-practices/performance.md)
### Troubleshooting
- [Common Issues](./troubleshooting.md)
- [FAQ](./faq.md)

View File

@@ -1,425 +0,0 @@
# Custom Graders
Write your own grading functions to implement custom evaluation logic.
## Overview
Custom graders let you:
- Implement domain-specific evaluation
- Parse and validate complex formats
- Apply custom scoring algorithms
- Combine multiple checks in one grader
## Basic Structure
```python
from letta_evals.decorators import grader
from letta_evals.models import GradeResult, Sample
@grader
def my_custom_grader(sample: Sample, submission: str) -> GradeResult:
"""Your custom grading logic."""
# Evaluate the submission
score = calculate_score(submission, sample)
return GradeResult(
score=score, # Must be 0.0 to 1.0
rationale="Explanation of the score",
metadata={"extra": "information"}
)
```
## The @grader Decorator
The `@grader` decorator registers your function so it can be used in suite YAML:
```python
from letta_evals.decorators import grader
@grader # Makes this function available as "my_function"
def my_function(sample: Sample, submission: str) -> GradeResult:
...
```
Without the decorator, your function won't be discovered.
## Function Signature
Your grader must have this signature:
```python
def grader_name(sample: Sample, submission: str) -> GradeResult:
...
```
### Parameters
- `sample`: The dataset sample being evaluated (includes `input`, `ground_truth`, `metadata`, etc.)
- `submission`: The extracted text from the agent's response
### Return Value
Must return a `GradeResult`:
```python
from letta_evals.models import GradeResult
return GradeResult(
score=0.85, # Required: 0.0 to 1.0
rationale="Explanation", # Optional but recommended
metadata={"key": "value"} # Optional: any extra data
)
```
## Complete Example
```python
# custom_graders.py
import json
from letta_evals.decorators import grader
from letta_evals.models import GradeResult, Sample
@grader
def json_field_validator(sample: Sample, submission: str) -> GradeResult:
"""Validates JSON and checks for required fields."""
required_fields = sample.ground_truth.split(",") # e.g., "name,age,email"
try:
data = json.loads(submission)
except json.JSONDecodeError as e:
return GradeResult(
score=0.0,
rationale=f"Invalid JSON: {e}",
metadata={"error": "json_decode"}
)
missing = [f for f in required_fields if f not in data]
if missing:
score = 1.0 - (len(missing) / len(required_fields))
return GradeResult(
score=score,
rationale=f"Missing fields: {', '.join(missing)}",
metadata={"missing_fields": missing}
)
return GradeResult(
score=1.0,
rationale="All required fields present",
metadata={"fields_found": required_fields}
)
```
Dataset:
```jsonl
{"input": "Return user info as JSON", "ground_truth": "name,age,email"}
```
Suite:
```yaml
graders:
json_check:
kind: tool
function: json_field_validator
extractor: last_assistant
```
## Using Custom Graders
### Method 1: Custom Evaluators File
Create a file with your graders (e.g., `custom_evaluators.py`) in your project:
```python
from letta_evals.decorators import grader
from letta_evals.models import GradeResult, Sample
@grader
def my_grader(sample: Sample, submission: str) -> GradeResult:
...
```
Reference it in your suite:
```yaml
# The file will be automatically discovered if it's in the same directory
# or use Python path imports
graders:
my_metric:
kind: tool
function: my_grader
extractor: last_assistant
```
### Method 2: Setup Script
Import your graders in a setup script:
```python
# setup.py
from letta_evals.models import SuiteSpec
import custom_evaluators # This imports and registers graders
def prepare_environment(suite: SuiteSpec) -> None:
pass # Graders are registered via import
```
```yaml
setup_script: setup.py:prepare_environment
graders:
my_metric:
kind: tool
function: my_grader
extractor: last_assistant
```
## Real-World Examples
### Length Check
```python
@grader
def appropriate_length(sample: Sample, submission: str) -> GradeResult:
"""Check if response length is within expected range."""
min_len = 50
max_len = 500
length = len(submission)
if min_len <= length <= max_len:
score = 1.0
rationale = f"Length {length} is appropriate"
elif length < min_len:
score = max(0.0, length / min_len)
rationale = f"Too short: {length} chars (min {min_len})"
else:
score = max(0.0, 1.0 - (length - max_len) / max_len)
        rationale = f"Too long: {length} chars (max {max_len})"
return GradeResult(score=score, rationale=rationale)
```
### Keyword Coverage
```python
@grader
def keyword_coverage(sample: Sample, submission: str) -> GradeResult:
"""Check what percentage of required keywords are present."""
keywords = sample.ground_truth.split(",")
submission_lower = submission.lower()
found = [kw for kw in keywords if kw.lower() in submission_lower]
score = len(found) / len(keywords) if keywords else 0.0
return GradeResult(
score=score,
rationale=f"Found {len(found)}/{len(keywords)} keywords: {', '.join(found)}",
metadata={"found": found, "missing": list(set(keywords) - set(found))}
)
```
Dataset:
```jsonl
{"input": "Explain photosynthesis", "ground_truth": "light,energy,chlorophyll,oxygen,carbon dioxide"}
```
### Tool Call Validation
```python
import json
@grader
def correct_tool_arguments(sample: Sample, submission: str) -> GradeResult:
"""Validate tool was called with correct arguments."""
try:
args = json.loads(submission)
except json.JSONDecodeError:
return GradeResult(score=0.0, rationale="No valid tool call found")
expected_tool = sample.metadata.get("expected_tool")
if args.get("tool_name") != expected_tool:
return GradeResult(
score=0.0,
rationale=f"Wrong tool: expected {expected_tool}, got {args.get('tool_name')}"
)
# Check arguments
expected_args = json.loads(sample.ground_truth)
matches = all(args.get(k) == v for k, v in expected_args.items())
if matches:
return GradeResult(score=1.0, rationale="Tool called with correct arguments")
else:
return GradeResult(score=0.5, rationale="Tool correct but arguments differ")
```
### Numeric Range Check
```python
@grader
def numeric_range(sample: Sample, submission: str) -> GradeResult:
"""Check if extracted number is within expected range."""
try:
value = float(submission.strip())
min_val, max_val = map(float, sample.ground_truth.split(","))
if min_val <= value <= max_val:
return GradeResult(
score=1.0,
rationale=f"Value {value} is within range [{min_val}, {max_val}]"
)
else:
# Partial credit based on distance
if value < min_val:
distance = min_val - value
else:
distance = value - max_val
score = max(0.0, 1.0 - (distance / max_val))
return GradeResult(
score=score,
rationale=f"Value {value} outside range [{min_val}, {max_val}]"
)
except ValueError as e:
return GradeResult(score=0.0, rationale=f"Invalid numeric value: {e}")
```
### Multi-Criteria
```python
@grader
def comprehensive_check(sample: Sample, submission: str) -> GradeResult:
"""Multiple checks with weighted scoring."""
points = 0.0
issues = []
# Check 1: Contains answer (40%)
if sample.ground_truth.lower() in submission.lower():
points += 0.4
else:
issues.append("Missing expected answer")
# Check 2: Appropriate length (20%)
if 100 <= len(submission) <= 500:
points += 0.2
else:
issues.append(f"Length {len(submission)} not in range [100, 500]")
# Check 3: Starts with capital letter (10%)
if submission and submission[0].isupper():
points += 0.1
else:
issues.append("Doesn't start with capital letter")
# Check 4: Ends with punctuation (10%)
if submission and submission[-1] in ".!?":
points += 0.1
else:
issues.append("Doesn't end with punctuation")
# Check 5: No profanity (20%)
profanity = ["badword1", "badword2"]
if not any(word in submission.lower() for word in profanity):
points += 0.2
else:
issues.append("Contains inappropriate language")
rationale = f"Score: {points:.2f}. " + (
"All checks passed!" if not issues else f"Issues: {'; '.join(issues)}"
)
return GradeResult(
score=points,
rationale=rationale,
metadata={"issues": issues}
)
```
## Accessing Sample Data
The `Sample` object provides:
```python
sample.id # Sample ID
sample.input # Input (str or List[str])
sample.ground_truth # Expected answer (optional)
sample.metadata # Dict with custom data (optional)
sample.agent_args # Agent creation args (optional)
```
Use these for flexible grading logic:
```python
@grader
def context_aware_grader(sample: Sample, submission: str) -> GradeResult:
category = sample.metadata.get("category", "general")
if category == "math":
# Strict for math
return exact_math_check(sample, submission)
elif category == "creative":
# Lenient for creative
return length_and_relevance_check(sample, submission)
else:
return default_check(sample, submission)
```
## Error Handling
Always handle exceptions:
```python
@grader
def safe_grader(sample: Sample, submission: str) -> GradeResult:
try:
# Your logic here
score = complex_calculation(submission)
return GradeResult(score=score, rationale="Success")
except Exception as e:
# Return 0.0 with error message
return GradeResult(
score=0.0,
rationale=f"Error during grading: {str(e)}",
metadata={"error": str(e), "error_type": type(e).__name__}
)
```
This ensures evaluation continues even if individual samples fail.
## Testing Your Grader
Test your grader with sample data:
```python
from letta_evals.models import Sample, GradeResult
# Test case
sample = Sample(
id=0,
input="What is 2+2?",
ground_truth="4"
)
submission = "The answer is 4"
result = my_grader(sample, submission)
print(f"Score: {result.score}, Rationale: {result.rationale}")
```
## Best Practices
1. **Validate input**: Check for edge cases (empty strings, malformed data)
2. **Use meaningful rationales**: Explain why a score was given
3. **Handle errors gracefully**: Return 0.0 with error message rather than crashing
4. **Keep it fast**: Custom graders run for every sample
5. **Use metadata**: Store extra information for debugging
6. **Normalize scores**: Always return 0.0 to 1.0
7. **Document your grader**: Add docstrings explaining criteria
## Next Steps
- [Custom Extractors](../extractors/custom.md)
- [Tool Graders](../graders/tool-graders.md)
- [Examples](../examples/README.md)

View File

@@ -1,216 +0,0 @@
# Multi-Turn Conversations
Multi-turn conversations allow you to test how agents handle context across multiple exchanges - a key capability for stateful agents.
## Why Use Multi-Turn?
Multi-turn conversations enable testing that single-turn prompts cannot:
- **Memory storage**: Verify agents persist information to memory blocks across turns
- **Tool call sequences**: Test multi-step workflows (e.g., search → analyze → summarize)
- **Context retention**: Ensure agents remember details from earlier in the conversation
- **State evolution**: Track how agent state changes across interactions
- **Conversational coherence**: Test if agents maintain context appropriately
This is essential for stateful agents where behavior depends on conversation history.
## Single vs Multi-Turn Format
### Single-Turn (Default)
Most evaluations use a single prompt:
```jsonl
{"input": "What is the capital of France?", "ground_truth": "Paris"}
```
The agent receives one message and responds. Single-turn conversations are useful for simpler agents and for testing next-step behavior.
### Multi-Turn
For testing conversational memory, use an array of messages:
```jsonl
{"input": ["My name is Alice", "What's my name?"], "ground_truth": "Alice"}
```
The agent receives multiple messages in sequence:
1. Turn 1: "My name is Alice"
2. Turn 2: "What's my name?"
See the [built-in extractors](../extractors/builtin.md) for more information on how to use the agent's response from a multi-turn conversation for grading.
## How It Works
When you provide an array for `input`, the framework:
1. Sends the first message to the agent
2. Waits for the agent's response
3. Sends the second message
4. Continues until all messages are sent
5. Extracts and grades the agent's response using the specified extractor and grader.
## Use Cases
### Testing Memory Persistence
```jsonl
{"input": ["I live in Paris", "Where do I live?"], "ground_truth": "Paris"}
```
Tests whether the agent stores information correctly using the `memory_block` extractor.
### Testing Tool Call Sequences
```jsonl
{"input": ["Search for pandas", "What did you find about their diet?"], "ground_truth": "bamboo"}
```
Verifies the agent calls tools in the right order and uses results appropriately.
### Testing Context Retention
```jsonl
{"input": ["My favorite color is blue", "What color do I prefer?"], "ground_truth": "blue"}
```
Ensures the agent recalls details from earlier in the conversation.
### Testing Long-Term Memory
```jsonl
{"input": ["My name is Alice", "Tell me a joke", "What's my name again?"], "ground_truth": "Alice"}
```
Checks if the agent remembers information even after intervening exchanges.
## Example Configuration
```yaml
name: multi-turn-test
dataset: conversations.jsonl
target:
kind: agent
agent_file: agent.af
base_url: http://localhost:8283
graders:
recall:
kind: tool
function: contains
extractor: last_assistant
gate:
metric_key: recall
op: gte
value: 0.8
```
The grader evaluates the agent's final response (after all turns).
## Testing Both Response and Memory
Multi-turn evaluations become especially powerful when combined with the `memory_block` extractor:
```yaml
graders:
response_accuracy:
kind: tool
function: contains
extractor: last_assistant
memory_storage:
kind: tool
function: contains
extractor: memory_block
extractor_config:
block_label: human
```
This tests two things:
1. **Did the agent respond correctly?** (using conversation context)
2. **Did the agent persist the information?** (to its memory blocks)
An agent might pass the first test by keeping information in working memory, but fail the second by not properly storing it for long-term recall.
## Context vs Persistence
Consider this result:
```
Results by metric:
response_accuracy - Avg: 1.00, Pass: 100.0%
memory_storage - Avg: 0.00, Pass: 0.0%
```
The agent answered correctly (100%) but didn't store anything in memory (0%). This reveals important agent behavior:
- **Working memory**: Agent kept information in conversation context
- **Persistent memory**: Agent didn't update its memory blocks
For short conversations, working memory is sufficient. For long-term interactions, persistent memory is crucial.
## Complete Example
See [`examples/multi-turn-memory/`](https://github.com/letta-ai/letta-evals/tree/main/examples/multi-turn-memory) for a working example that demonstrates:
- Multi-turn conversation format
- Dual metric evaluation (response + memory)
- The difference between context-based recall and true persistence
## Best Practices
### 1. Keep Turns Focused
Each turn should test one aspect of memory or context:
```jsonl
{"input": ["I'm allergic to peanuts", "Can I eat this cookie?"], "ground_truth": "peanut"}
```
### 2. Test Realistic Scenarios
Design conversations that mirror real user interactions:
```jsonl
{"input": ["Set a reminder for tomorrow at 2pm", "What reminders do I have?"], "ground_truth": "2pm"}
```
### 3. Use Tags for Organization
Tag multi-turn samples to distinguish them:
```jsonl
{"input": ["Hello", "How are you?"], "tags": ["multi-turn", "greeting"]}
```
### 4. Test Memory Limits
See how far back agents can recall:
```jsonl
{"input": ["My name is Alice", "message 2", "message 3", "message 4", "What's my name?"], "ground_truth": "Alice"}
```
### 5. Combine with Memory Extractors
Always verify both response and internal state for memory tests.
## Limitations
### Turn Count
Very long conversations may exceed context windows. Monitor token usage for conversations with many turns.
### State Isolation
Each sample starts with a fresh agent (or fresh conversation if using `agent_id`). Multi-turn tests memory within a single conversation, not across separate conversations.
### Extraction
Most extractors work on the final state. If you need to check intermediate turns, consider using custom extractors.
## Next Steps
- [Built-in Extractors](../extractors/builtin.md) - Using memory_block extractor
- [Custom Extractors](../extractors/custom.md) - Build extractors for complex scenarios
- [Multi-Metric Evaluation](../graders/multi-metric.md) - Combine multiple checks

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 MiB

View File

@@ -1,389 +0,0 @@
# CLI Commands
The **letta-evals** command-line interface lets you run evaluations, validate configurations, and inspect available components.
**Quick overview:**
- **`run`** - Execute an evaluation suite (most common)
- **`validate`** - Check suite configuration without running
- **`list-extractors`** - Show available extractors
- **`list-graders`** - Show available grader functions
- **Exit codes** - 0 for pass, 1 for fail (perfect for CI/CD)
**Typical workflow:**
1. Validate your suite: `letta-evals validate suite.yaml`
2. Run evaluation: `letta-evals run suite.yaml --output results/`
3. Check exit code: `echo $?` (0 = passed, 1 = failed)
Letta Evals provides a command-line interface for running evaluations and managing configurations.
## run
Run an evaluation suite.
```bash
letta-evals run <suite.yaml> [options]
```
### Arguments
- `suite.yaml`: Path to the suite configuration file (required)
### Options
#### --output, -o
Save results to a directory.
```bash
letta-evals run suite.yaml --output results/
```
Creates:
- `results/header.json`: Evaluation metadata
- `results/summary.json`: Aggregate metrics and configuration
- `results/results.jsonl`: Per-sample results (one JSON per line)
#### --quiet, -q
Quiet mode - only show pass/fail result.
```bash
letta-evals run suite.yaml --quiet
```
Output:
```
✓ PASSED
```
#### --max-concurrent
Maximum concurrent sample evaluations.
```bash
letta-evals run suite.yaml --max-concurrent 10
```
Default: 15
Higher values = faster evaluation but more resource usage.
#### --api-key
Letta API key (overrides LETTA_API_KEY environment variable).
```bash
letta-evals run suite.yaml --api-key your-key
```
#### --base-url
Letta server base URL (overrides suite config and environment variable).
```bash
letta-evals run suite.yaml --base-url http://localhost:8283
```
#### --project-id
Letta project ID for cloud deployments.
```bash
letta-evals run suite.yaml --project-id proj_abc123
```
#### --cached, -c
Path to cached results (JSONL) for re-grading trajectories without re-running the agent.
```bash
letta-evals run suite.yaml --cached previous_results.jsonl
```
Use this to test different graders on the same agent trajectories.
#### --num-runs
Run the evaluation multiple times to measure consistency and get aggregate statistics.
```bash
letta-evals run suite.yaml --num-runs 10
```
Default: 1 (single run)
**Output with multiple runs:**
- Each run creates a separate `run_N/` directory with individual results
- An `aggregate_stats.json` file contains statistics across all runs (mean, standard deviation, pass rate)
**Use cases:**
- Measuring consistency of non-deterministic agents
- Getting confidence intervals for evaluation metrics
- Testing agent variability across multiple runs
See [Results - Multiple Runs](../results/overview.md#multiple-runs-statistics) for details on the statistics output.
### Examples
Basic run:
```bash
letta-evals run suite.yaml # Run evaluation, show results in terminal
```
Save results:
```bash
letta-evals run suite.yaml --output evaluation-results/ # Save to directory
```
High concurrency:
```bash
letta-evals run suite.yaml --max-concurrent 20 # Run 20 samples in parallel
```
Letta Cloud:
```bash
letta-evals run suite.yaml \
--base-url https://api.letta.com \ # Cloud endpoint
--api-key $LETTA_API_KEY \ # Your API key
--project-id proj_abc123 # Your project
```
Quiet CI mode:
```bash
letta-evals run suite.yaml --quiet # Only show pass/fail
if [ $? -eq 0 ]; then # Check exit code
echo "Evaluation passed"
else
echo "Evaluation failed"
exit 1 # Fail the CI build
fi
```
Multiple runs with statistics:
```bash
letta-evals run suite.yaml --num-runs 10 --output results/
# Creates results/run_1/, results/run_2/, ..., results/run_10/
# Plus results/aggregate_stats.json with mean, stddev, and pass rate
```
### Exit Codes
- `0`: Evaluation passed (gate criteria met)
- `1`: Evaluation failed (gate criteria not met or error)
## validate
Validate a suite configuration without running it.
```bash
letta-evals validate <suite.yaml>
```
Checks:
- YAML syntax is valid
- Required fields are present
- Paths exist
- Configuration is consistent
- Grader/extractor combinations are valid
### Examples
```bash
letta-evals validate suite.yaml
```
Output on success:
```
✓ Suite configuration is valid
```
Output on error:
```
✗ Validation failed:
- Agent file not found: agent.af
- Grader 'my_metric' references unknown function
```
## list-extractors
List all available extractors.
```bash
letta-evals list-extractors
```
Shows:
- Built-in extractors
- Custom extractors (if registered)
- Brief description of each
Output:
```
Available extractors:
last_assistant - Extract the last assistant message
first_assistant - Extract the first assistant message
all_assistant - Concatenate all assistant messages
pattern - Extract content matching regex
tool_arguments - Extract tool call arguments
tool_output - Extract tool return value
after_marker - Extract content after a marker
memory_block - Extract from memory block (requires agent_state)
```
## list-graders
List all available grader functions.
```bash
letta-evals list-graders
```
Shows:
- Built-in tool graders
- Custom graders (if registered)
- Brief description of each
Output:
```
Available graders:
exact_match - Exact string match with ground_truth
contains - Check if contains ground_truth
regex_match - Match regex pattern
ascii_printable_only - Validate ASCII-only content
```
## help
Show help information.
```bash
letta-evals --help
```
Show help for a specific command:
```bash
letta-evals run --help
letta-evals validate --help
```
## Environment Variables
These environment variables affect CLI behavior:
### LETTA_API_KEY
API key for Letta authentication.
```bash
export LETTA_API_KEY=your-key-here
```
### LETTA_BASE_URL
Letta server base URL.
```bash
export LETTA_BASE_URL=http://localhost:8283
```
### LETTA_PROJECT_ID
Letta project ID (for cloud).
```bash
export LETTA_PROJECT_ID=proj_abc123
```
### OPENAI_API_KEY
OpenAI API key (for rubric graders).
```bash
export OPENAI_API_KEY=your-openai-key
```
### OPENAI_BASE_URL
Custom OpenAI-compatible endpoint (optional).
```bash
export OPENAI_BASE_URL=https://your-endpoint.com/v1
```
## Configuration Priority
Configuration values are resolved in this order (highest to lowest priority):
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
2. Suite YAML configuration
3. Environment variables
## Using in CI/CD
### GitHub Actions
```yaml
name: Run Evals
on: [push]
jobs:
evaluate:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install dependencies
run: pip install letta-evals
- name: Run evaluation
env:
LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
letta-evals run suite.yaml --quiet --output results/
- name: Upload results
uses: actions/upload-artifact@v2
with:
name: eval-results
path: results/
```
### GitLab CI
```yaml
evaluate:
script:
- pip install letta-evals
- letta-evals run suite.yaml --quiet --output results/
artifacts:
paths:
- results/
variables:
LETTA_API_KEY: $LETTA_API_KEY
OPENAI_API_KEY: $OPENAI_API_KEY
```
## Debugging
### Verbose Output
Currently, the CLI uses standard verbosity. For debugging:
1. Check the output directory for detailed results
2. Examine `summary.json` for aggregate metrics
3. Check `results.jsonl` for per-sample details
### Common Issues
**"Agent file not found"**
```bash
# Check file exists relative to suite YAML location
ls -la path/to/agent.af
```
**"Connection refused"**
```bash
# Verify Letta server is running
curl http://localhost:8283/v1/health
```
**"Invalid API key"**
```bash
# Check environment variable is set
echo $LETTA_API_KEY
```
## Next Steps
- [Understanding Results](../results/overview.md) - Interpreting evaluation output
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete configuration options
- [Getting Started](../getting-started.md) - Complete tutorial with examples

View File

@@ -1,342 +0,0 @@
# CLI Commands
The **letta-evals** command-line interface lets you run evaluations, validate configurations, and inspect available components.
<Note>
**Quick overview:**
- **`run`** - Execute an evaluation suite (most common)
- **`validate`** - Check suite configuration without running
- **`list-extractors`** - Show available extractors
- **`list-graders`** - Show available grader functions
- **Exit codes** - 0 for pass, 1 for fail (perfect for CI/CD)
</Note>
**Typical workflow:**
1. Validate your suite: `letta-evals validate suite.yaml`
2. Run evaluation: `letta-evals run suite.yaml --output results/`
3. Check exit code: `echo $?` (0 = passed, 1 = failed)
## run
Run an evaluation suite.
```bash
letta-evals run <suite.yaml> [options]
```
### Arguments
- `suite.yaml`: Path to the suite configuration file (required)
### Options
#### --output, -o
Save results to a directory.
```bash
letta-evals run suite.yaml --output results/
```
Creates:
- `results/header.json`: Evaluation metadata
- `results/summary.json`: Aggregate metrics and configuration
- `results/results.jsonl`: Per-sample results (one JSON per line)
#### --quiet, -q
Quiet mode - only show pass/fail result.
```bash
letta-evals run suite.yaml --quiet
```
Output:
```
✓ PASSED
```
#### --max-concurrent
Maximum concurrent sample evaluations. **Default**: 15
```bash
letta-evals run suite.yaml --max-concurrent 10
```
Higher values = faster evaluation but more resource usage.
#### --api-key
Letta API key (overrides LETTA_API_KEY environment variable).
```bash
letta-evals run suite.yaml --api-key your-key
```
#### --base-url
Letta server base URL (overrides suite config and environment variable).
```bash
letta-evals run suite.yaml --base-url https://api.letta.com
```
#### --project-id
Letta project ID for cloud deployments.
```bash
letta-evals run suite.yaml --project-id proj_abc123
```
#### --cached, -c
Path to cached results (JSONL) for re-grading trajectories without re-running the agent.
```bash
letta-evals run suite.yaml --cached previous_results.jsonl
```
Use this to test different graders on the same agent trajectories.
#### --num-runs
Run the evaluation multiple times to measure consistency. **Default**: 1
```bash
letta-evals run suite.yaml --num-runs 10
```
**Output with multiple runs:**
- Each run creates a separate `run_N/` directory with individual results
- An `aggregate_stats.json` file contains statistics across all runs (mean, standard deviation, pass rate)
### Examples
Basic run:
```bash
letta-evals run suite.yaml # Run evaluation, show results in terminal
```
Save results:
```bash
letta-evals run suite.yaml --output evaluation-results/ # Save to directory
```
Letta Cloud:
```bash
letta-evals run suite.yaml \
--base-url https://api.letta.com \
--api-key $LETTA_API_KEY \
--project-id proj_abc123
```
Quiet CI mode:
```bash
letta-evals run suite.yaml --quiet
if [ $? -eq 0 ]; then
echo "Evaluation passed"
else
echo "Evaluation failed"
exit 1
fi
```
### Exit Codes
- `0`: Evaluation passed (gate criteria met)
- `1`: Evaluation failed (gate criteria not met or error)
## validate
Validate a suite configuration without running it.
```bash
letta-evals validate <suite.yaml>
```
Checks:
- YAML syntax is valid
- Required fields are present
- Paths exist
- Configuration is consistent
- Grader/extractor combinations are valid
Output on success:
```
✓ Suite configuration is valid
```
Output on error:
```
✗ Validation failed:
- Agent file not found: agent.af
- Grader 'my_metric' references unknown function
```
## list-extractors
List all available extractors.
```bash
letta-evals list-extractors
```
Output:
```
Available extractors:
last_assistant - Extract the last assistant message
first_assistant - Extract the first assistant message
all_assistant - Concatenate all assistant messages
pattern - Extract content matching regex
tool_arguments - Extract tool call arguments
tool_output - Extract tool return value
after_marker - Extract content after a marker
memory_block - Extract from memory block (requires agent_state)
```
## list-graders
List all available grader functions.
```bash
letta-evals list-graders
```
Output:
```
Available graders:
exact_match - Exact string match with ground_truth
contains - Check if contains ground_truth
regex_match - Match regex pattern
ascii_printable_only - Validate ASCII-only content
```
## help
Show help information.
```bash
letta-evals --help
```
Show help for a specific command:
```bash
letta-evals run --help
letta-evals validate --help
```
## Environment Variables
### LETTA_API_KEY
API key for Letta authentication.
```bash
export LETTA_API_KEY=your-key-here
```
### LETTA_BASE_URL
Letta server base URL.
```bash
export LETTA_BASE_URL=https://api.letta.com
```
### LETTA_PROJECT_ID
Letta project ID (for cloud).
```bash
export LETTA_PROJECT_ID=proj_abc123
```
### OPENAI_API_KEY
OpenAI API key (for rubric graders).
```bash
export OPENAI_API_KEY=your-openai-key
```
## Configuration Priority
Configuration values are resolved in this order (highest to lowest priority):
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
2. Suite YAML configuration
3. Environment variables
## Using in CI/CD
### GitHub Actions
```yaml
name: Run Evals
on: [push]
jobs:
evaluate:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install dependencies
run: pip install letta-evals
- name: Run evaluation
env:
LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
letta-evals run suite.yaml --quiet --output results/
- name: Upload results
uses: actions/upload-artifact@v2
with:
name: eval-results
path: results/
```
### GitLab CI
```yaml
evaluate:
script:
- pip install letta-evals
- letta-evals run suite.yaml --quiet --output results/
artifacts:
paths:
- results/
variables:
LETTA_API_KEY: $LETTA_API_KEY
OPENAI_API_KEY: $OPENAI_API_KEY
```
## Debugging
### Common Issues
<Warning>
**"Agent file not found"**
```bash
# Check file exists relative to suite YAML location
ls -la path/to/agent.af
```
</Warning>
<Warning>
**"Connection refused"**
```bash
# Verify Letta server is running
curl https://api.letta.com/v1/health
```
</Warning>
<Warning>
**"Invalid API key"**
```bash
# Check environment variable is set
echo $LETTA_API_KEY
```
</Warning>
## Next Steps
- [Understanding Results](/evals/results-metrics/understanding-results) - Interpreting evaluation output
- [Suite YAML Reference](/evals/configuration/suite-yaml-reference) - Complete configuration options
- [Getting Started](/evals/get-started/getting-started) - Complete tutorial with examples

View File

@@ -1,418 +0,0 @@
# Datasets
**Datasets** are the test cases that define what your agent will be evaluated on. Each sample in your dataset represents one evaluation scenario.
**Quick overview:**
- **Two formats**: JSONL (flexible, powerful) or CSV (simple, spreadsheet-friendly)
- **Required field**: `input` - the prompt(s) to send to the agent
- **Common fields**: `ground_truth` (expected answer), `tags` (for filtering), `metadata` (extra info)
- **Advanced fields**: `agent_args` (customize agent per sample), `rubric_vars` (per-sample rubric context)
- **Multi-turn support**: Send multiple messages in sequence using arrays
**Typical workflow:**
1. Create a JSONL or CSV file with test cases
2. Reference it in your suite YAML: `dataset: test_cases.jsonl`
3. Run evaluation - each sample is tested independently
4. Results show per-sample and aggregate scores
Datasets can be created in two formats: **JSONL** or **CSV**. Choose based on your team's workflow and complexity needs.
## Dataset Formats
### JSONL Format
Each line is a JSON object representing one test case:
```jsonl
{"input": "What's the capital of France?", "ground_truth": "Paris"}
{"input": "Calculate 2+2", "ground_truth": "4"}
{"input": "What color is the sky?", "ground_truth": "blue"}
```
**Best for:**
- Complex data structures (nested objects, arrays)
- Multi-turn conversations
- Advanced features (agent_args, rubric_vars)
- Teams comfortable with JSON/code
- Version control (clean line-by-line diffs)
### CSV Format
Standard CSV with headers:
```csv
input,ground_truth
"What's the capital of France?","Paris"
"Calculate 2+2","4"
"What color is the sky?","blue"
```
**Best for:**
- Simple question-answer pairs
- Teams that prefer spreadsheets (Excel, Google Sheets)
- Non-technical collaborators creating test cases
- Quick dataset creation and editing
- Easy sharing with non-developers
## Quick Reference
| Field | Required | Type | Purpose |
|-------|----------|------|---------|
| `input` | ✅ | string or array | Prompt(s) to send to agent |
| `ground_truth` | ❌ | string | Expected answer (for tool graders) |
| `tags` | ❌ | array of strings | For filtering samples |
| `agent_args` | ❌ | object | Per-sample agent customization |
| `rubric_vars` | ❌ | object | Per-sample rubric variables |
| `metadata` | ❌ | object | Arbitrary extra data |
| `id` | ❌ | integer | Sample ID (auto-assigned if omitted) |
## Field Reference
### Required Fields
#### input
The prompt(s) to send to the agent. Can be a string or array of strings:
Single message:
```json
{"input": "Hello, who are you?"}
```
Multi-turn conversation:
```json
{"input": ["Hello", "What's your name?", "Tell me about yourself"]}
```
### Optional Fields
#### ground_truth
The expected answer or content to check against. Required for most tool graders (exact_match, contains, etc.):
```json
{"input": "What is 2+2?", "ground_truth": "4"}
```
#### metadata
Arbitrary additional data about the sample:
```json
{
"input": "What is photosynthesis?",
"ground_truth": "process where plants convert light into energy",
"metadata": {
"category": "biology",
"difficulty": "medium"
}
}
```
#### tags
List of tags for filtering samples:
```json
{"input": "Solve x^2 = 16", "ground_truth": "4", "tags": ["math", "algebra"]}
```
Filter by tags in your suite:
```yaml
sample_tags: [math] # Only samples tagged "math" will be evaluated
```
#### agent_args
Custom arguments passed to programmatic agent creation when using `agent_script`. Allows per-sample agent customization.
JSONL:
```json
{
"input": "What items do we have?",
"agent_args": {
"item": {"sku": "SKU-123", "name": "Widget A", "price": 19.99}
}
}
```
CSV:
```csv
input,agent_args
"What items do we have?","{""item"": {""sku"": ""SKU-123"", ""name"": ""Widget A"", ""price"": 19.99}}"
```
Your agent factory function can access these values via `sample.agent_args` to customize agent configuration.
See [Targets - agent_script](./targets.md#agent_script) for details on programmatic agent creation.
#### rubric_vars
Variables to inject into rubric templates when using rubric graders. This allows you to provide per-sample context or examples to the LLM judge.
**Example:** Evaluating code quality against a reference implementation.
JSONL:
```jsonl
{"input": "Write a function to calculate fibonacci numbers", "rubric_vars": {"reference_code": "def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)", "required_features": "recursion, base case"}}
```
CSV:
```csv
input,rubric_vars
"Write a function to calculate fibonacci numbers","{""reference_code"": ""def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)"", ""required_features"": ""recursion, base case""}"
```
In your rubric template file, reference variables with `{variable_name}`:
**rubric.txt:**
```
Evaluate the submitted code against this reference implementation:
{reference_code}
Required features: {required_features}
Score on correctness (0.6) and code quality (0.4).
```
When the rubric grader runs, variables are replaced with values from `rubric_vars`:
**Final formatted prompt sent to LLM:**
```
Evaluate the submitted code against this reference implementation:
def fib(n):
if n <= 1: return n
return fib(n-1) + fib(n-2)
Required features: recursion, base case
Score on correctness (0.6) and code quality (0.4).
```
This lets you customize evaluation criteria per sample using the same rubric template.
See [Rubric Graders](../graders/rubric-graders.md) for details on rubric templates.
#### id
Sample ID is automatically assigned (0-based index) if not provided. You can override:
```json
{"id": 42, "input": "Test case 42"}
```
## Complete Example
```jsonl
{"id": 1, "input": "What is the capital of France?", "ground_truth": "Paris", "tags": ["geography", "easy"], "metadata": {"region": "Europe"}}
{"id": 2, "input": "Calculate the square root of 144", "ground_truth": "12", "tags": ["math", "medium"]}
{"id": 3, "input": ["Hello", "What can you help me with?"], "tags": ["conversation"]}
```
## Dataset Best Practices
### 1. Clear Ground Truth
Make ground truth specific enough to grade but flexible enough to match valid responses:
Good:
```json
{"input": "What's the largest planet?", "ground_truth": "Jupiter"}
```
Too strict (might miss valid answers):
```json
{"input": "What's the largest planet?", "ground_truth": "Jupiter is the largest planet in our solar system."}
```
### 2. Diverse Test Cases
Include edge cases and variations:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
{"input": "What is 0.1 + 0.2?", "ground_truth": "0.3", "tags": ["math", "floating_point"]}
{"input": "What is 999999999 + 1?", "ground_truth": "1000000000", "tags": ["math", "large_numbers"]}
```
### 3. Use Tags for Organization
Organize samples by type, difficulty, or feature:
```json
{"tags": ["tool_usage", "search"]}
{"tags": ["memory", "recall"]}
{"tags": ["reasoning", "multi_step"]}
```
### 4. Multi-Turn Conversations
Test conversational context:
```json
{
"input": [
"My name is Alice",
"What's my name?"
],
"ground_truth": "Alice",
"tags": ["memory", "context"]
}
```
### 5. No Ground Truth for LLM Judges
If using rubric graders, ground truth is optional:
```jsonl
{"input": "Write a creative story about a robot", "tags": ["creative"]}
{"input": "Explain quantum computing simply", "tags": ["explanation"]}
```
The LLM judge evaluates based on the rubric, not ground truth.
## Loading Datasets
Datasets are automatically loaded by the runner:
```yaml
dataset: path/to/dataset.jsonl # Path to your test cases (JSONL or CSV)
```
Paths are relative to the suite YAML file location.
## Dataset Filtering
### Limit Sample Count
```yaml
max_samples: 10 # Only evaluate first 10 samples (useful for testing)
```
### Filter by Tags
```yaml
sample_tags: [math, medium] # Only samples with ALL these tags
```
## Creating Datasets Programmatically
You can generate datasets with Python:
```python
import json
samples = []
for i in range(100):
samples.append({
"input": f"What is {i} + {i}?",
"ground_truth": str(i + i),
"tags": ["math", "addition"]
})
with open("dataset.jsonl", "w") as f:
for sample in samples:
f.write(json.dumps(sample) + "\n")
```
## Dataset Format Validation
The runner validates:
- Each line is valid JSON
- Required fields are present
- Field types are correct
Validation errors will be reported with line numbers.
## Examples by Use Case
### Question Answering
JSONL:
```jsonl
{"input": "What is the capital of France?", "ground_truth": "Paris"}
{"input": "Who wrote Romeo and Juliet?", "ground_truth": "Shakespeare"}
```
CSV:
```csv
input,ground_truth
"What is the capital of France?","Paris"
"Who wrote Romeo and Juliet?","Shakespeare"
```
### Tool Usage Testing
JSONL:
```jsonl
{"input": "Search for information about pandas", "ground_truth": "search"}
{"input": "Calculate 15 * 23", "ground_truth": "calculator"}
```
CSV:
```csv
input,ground_truth
"Search for information about pandas","search"
"Calculate 15 * 23","calculator"
```
Here, `ground_truth` is the name of the tool the agent is expected to call.
### Memory Testing (Multi-turn)
JSONL:
```jsonl
{"input": ["Remember that my favorite color is blue", "What's my favorite color?"], "ground_truth": "blue"}
{"input": ["I live in Tokyo", "Where do I live?"], "ground_truth": "Tokyo"}
```
CSV (using JSON array strings):
```csv
input,ground_truth
"[""Remember that my favorite color is blue"", ""What's my favorite color?""]","blue"
"[""I live in Tokyo"", ""Where do I live?""]","Tokyo"
```
### Code Generation
JSONL:
```jsonl
{"input": "Write a function to reverse a string in Python"}
{"input": "Create a SQL query to find users older than 21"}
```
CSV:
```csv
input
"Write a function to reverse a string in Python"
"Create a SQL query to find users older than 21"
```
Use rubric graders to evaluate code quality.
## CSV Advanced Features
CSV supports all the same features as JSONL by encoding complex data as JSON strings in cells:
**Multi-turn conversations** (requires escaped JSON array string):
```csv
input,ground_truth
"[""Hello"", ""What's your name?""]","Alice"
```
**Agent arguments** (requires escaped JSON object string):
```csv
input,agent_args
"What items do we have?","{""initial_inventory"": [""apple"", ""banana""]}"
```
**Rubric variables** (requires escaped JSON object string):
```csv
input,rubric_vars
"Write a story","{""max_length"": 500, ""genre"": ""sci-fi""}"
```
**Note:** Complex data structures require JSON encoding in CSV. If you're frequently using these advanced features, JSONL may be easier to read and maintain.
## Next Steps
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete configuration options including filtering
- [Graders](./graders.md) - How to evaluate agent responses
- [Multi-Turn Conversations](../advanced/multi-turn-conversations.md) - Testing conversational flows

View File

@@ -1,425 +0,0 @@
# Datasets
**Datasets** are the test cases that define what your agent will be evaluated on. Each sample in your dataset represents one evaluation scenario.
<Note>
**Quick overview:**
- **Two formats**: JSONL (flexible, powerful) or CSV (simple, spreadsheet-friendly)
- **Required field**: `input` - the prompt(s) to send to the agent
- **Common fields**: `ground_truth` (expected answer), `tags` (for filtering), `metadata` (extra info)
- **Advanced fields**: `agent_args` (customize agent per sample), `rubric_vars` (per-sample rubric context)
- **Multi-turn support**: Send multiple messages in sequence using arrays
</Note>
**Typical workflow:**
1. Create a JSONL or CSV file with test cases
2. Reference it in your suite YAML: `dataset: test_cases.jsonl`
3. Run evaluation - each sample is tested independently
4. Results show per-sample and aggregate scores
Datasets can be created in two formats: **JSONL** or **CSV**. Choose based on your team's workflow and complexity needs.
## Dataset Formats
### JSONL Format
Each line is a JSON object representing one test case:
```jsonl
{"input": "What's the capital of France?", "ground_truth": "Paris"}
{"input": "Calculate 2+2", "ground_truth": "4"}
{"input": "What color is the sky?", "ground_truth": "blue"}
```
**Best for:**
- Complex data structures (nested objects, arrays)
- Multi-turn conversations
- Advanced features (agent_args, rubric_vars)
- Teams comfortable with JSON/code
- Version control (clean line-by-line diffs)
### CSV Format
Standard CSV with headers:
```csv
input,ground_truth
"What's the capital of France?","Paris"
"Calculate 2+2","4"
"What color is the sky?","blue"
```
**Best for:**
- Simple question-answer pairs
- Teams that prefer spreadsheets (Excel, Google Sheets)
- Non-technical collaborators creating test cases
- Quick dataset creation and editing
- Easy sharing with non-developers
## Quick Reference
| Field | Required | Type | Purpose |
|-------|----------|------|---------|
| `input` | ✅ | string or array | Prompt(s) to send to agent |
| `ground_truth` | ❌ | string | Expected answer (for tool graders) |
| `tags` | ❌ | array of strings | For filtering samples |
| `agent_args` | ❌ | object | Per-sample agent customization |
| `rubric_vars` | ❌ | object | Per-sample rubric variables |
| `metadata` | ❌ | object | Arbitrary extra data |
| `id` | ❌ | integer | Sample ID (auto-assigned if omitted) |
## Field Reference
### Required Fields
#### input
The prompt(s) to send to the agent. Can be a string or array of strings:
Single message:
```json
{"input": "Hello, who are you?"}
```
Multi-turn conversation:
```json
{"input": ["Hello", "What's your name?", "Tell me about yourself"]}
```
### Optional Fields
#### ground_truth
The expected answer or content to check against. Required for most tool graders (exact_match, contains, etc.):
```json
{"input": "What is 2+2?", "ground_truth": "4"}
```
#### metadata
Arbitrary additional data about the sample:
```json
{
"input": "What is photosynthesis?",
"ground_truth": "process where plants convert light into energy",
"metadata": {
"category": "biology",
"difficulty": "medium"
}
}
```
#### tags
List of tags for filtering samples:
```json
{"input": "Solve x^2 = 16", "ground_truth": "4", "tags": ["math", "algebra"]}
```
Filter by tags in your suite:
```yaml
sample_tags: [math] # Only samples tagged "math" will be evaluated
```
#### agent_args
Custom arguments passed to programmatic agent creation when using `agent_script`. Allows per-sample agent customization.
JSONL:
```json
{
"input": "What items do we have?",
"agent_args": {
"item": {"sku": "SKU-123", "name": "Widget A", "price": 19.99}
}
}
```
CSV:
```csv
input,agent_args
"What items do we have?","{""item"": {""sku"": ""SKU-123"", ""name"": ""Widget A"", ""price"": 19.99}}"
```
Your agent factory function can access these values via `sample.agent_args` to customize agent configuration.
See [Targets - agent_script](/guides/evals/concepts/targets#agent_script) for details on programmatic agent creation.
#### rubric_vars
Variables to inject into rubric templates when using rubric graders. This allows you to provide per-sample context or examples to the LLM judge.
**Example:** Evaluating code quality against a reference implementation.
JSONL:
```jsonl
{"input": "Write a function to calculate fibonacci numbers", "rubric_vars": {"reference_code": "def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)", "required_features": "recursion, base case"}}
```
CSV:
```csv
input,rubric_vars
"Write a function to calculate fibonacci numbers","{""reference_code"": ""def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)"", ""required_features"": ""recursion, base case""}"
```
In your rubric template file, reference variables with `{variable_name}`:
**rubric.txt:**
```
Evaluate the submitted code against this reference implementation:
{reference_code}
Required features: {required_features}
Score on correctness (0.6) and code quality (0.4).
```
When the rubric grader runs, variables are replaced with values from `rubric_vars`:
**Final formatted prompt sent to LLM:**
```
Evaluate the submitted code against this reference implementation:
def fib(n):
if n <= 1: return n
return fib(n-1) + fib(n-2)
Required features: recursion, base case
Score on correctness (0.6) and code quality (0.4).
```
This lets you customize evaluation criteria per sample using the same rubric template.
See [Rubric Graders](/guides/evals/graders/rubric-graders) for details on rubric templates.
#### id
Sample ID is automatically assigned (0-based index) if not provided. You can override:
```json
{"id": 42, "input": "Test case 42"}
```
## Complete Example
```jsonl
{"id": 1, "input": "What is the capital of France?", "ground_truth": "Paris", "tags": ["geography", "easy"], "metadata": {"region": "Europe"}}
{"id": 2, "input": "Calculate the square root of 144", "ground_truth": "12", "tags": ["math", "medium"]}
{"id": 3, "input": ["Hello", "What can you help me with?"], "tags": ["conversation"]}
```
## Dataset Best Practices
### 1. Clear Ground Truth
Make ground truth specific enough to grade but flexible enough to match valid responses:
<Tip>
Good:
```json
{"input": "What's the largest planet?", "ground_truth": "Jupiter"}
```
</Tip>
<Warning>
Too strict (might miss valid answers):
```json
{"input": "What's the largest planet?", "ground_truth": "Jupiter is the largest planet in our solar system."}
```
</Warning>
### 2. Diverse Test Cases
Include edge cases and variations:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
{"input": "What is 0.1 + 0.2?", "ground_truth": "0.3", "tags": ["math", "floating_point"]}
{"input": "What is 999999999 + 1?", "ground_truth": "1000000000", "tags": ["math", "large_numbers"]}
```
### 3. Use Tags for Organization
Organize samples by type, difficulty, or feature:
```json
{"tags": ["tool_usage", "search"]}
{"tags": ["memory", "recall"]}
{"tags": ["reasoning", "multi_step"]}
```
### 4. Multi-Turn Conversations
Test conversational context and memory updates:
```jsonl
{"input": ["My name is Alice", "What's my name?"], "ground_truth": "Alice", "tags": ["memory", "recall"]}
{"input": ["Please remember that I like bananas.", "Actually, sorry, I meant I like apples."], "ground_truth": "apples", "tags": ["memory", "correction"]}
{"input": ["I work at Google", "Update my workplace to Microsoft", "Where do I work?"], "ground_truth": "Microsoft", "tags": ["memory", "multi_step"]}
```
<Tip>
**Testing memory corrections:** Use multi-turn inputs to test if agents properly update memory when users correct themselves. Combine with the `memory_block` extractor to verify the final memory state, not just the response.
</Tip>
### 5. No Ground Truth for LLM Judges
If using rubric graders, ground truth is optional:
```jsonl
{"input": "Write a creative story about a robot", "tags": ["creative"]}
{"input": "Explain quantum computing simply", "tags": ["explanation"]}
```
The LLM judge evaluates based on the rubric, not ground truth.
## Loading Datasets
Datasets are automatically loaded by the runner:
```yaml
dataset: path/to/dataset.jsonl # Path to your test cases (JSONL or CSV)
```
Paths are relative to the suite YAML file location.
## Dataset Filtering
### Limit Sample Count
```yaml
max_samples: 10 # Only evaluate first 10 samples (useful for testing)
```
### Filter by Tags
```yaml
sample_tags: [math, medium] # Only samples with ALL these tags
```
## Creating Datasets Programmatically
You can generate datasets with Python:
```python
import json
samples = []
for i in range(100):
samples.append({
"input": f"What is {i} + {i}?",
"ground_truth": str(i + i),
"tags": ["math", "addition"]
})
with open("dataset.jsonl", "w") as f:
for sample in samples:
f.write(json.dumps(sample) + "\n")
```
## Dataset Format Validation
The runner validates:
- Each line is valid JSON
- Required fields are present
- Field types are correct
Validation errors will be reported with line numbers.
## Examples by Use Case
### Question Answering
JSONL:
```jsonl
{"input": "What is the capital of France?", "ground_truth": "Paris"}
{"input": "Who wrote Romeo and Juliet?", "ground_truth": "Shakespeare"}
```
CSV:
```csv
input,ground_truth
"What is the capital of France?","Paris"
"Who wrote Romeo and Juliet?","Shakespeare"
```
### Tool Usage Testing
JSONL:
```jsonl
{"input": "Search for information about pandas", "ground_truth": "search"}
{"input": "Calculate 15 * 23", "ground_truth": "calculator"}
```
CSV:
```csv
input,ground_truth
"Search for information about pandas","search"
"Calculate 15 * 23","calculator"
```
Here, `ground_truth` is the name of the tool the agent is expected to call.
### Memory Testing (Multi-turn)
JSONL:
```jsonl
{"input": ["Remember that my favorite color is blue", "What's my favorite color?"], "ground_truth": "blue"}
{"input": ["I live in Tokyo", "Where do I live?"], "ground_truth": "Tokyo"}
```
CSV (using JSON array strings):
```csv
input,ground_truth
"[""Remember that my favorite color is blue"", ""What's my favorite color?""]","blue"
"[""I live in Tokyo"", ""Where do I live?""]","Tokyo"
```
### Code Generation
JSONL:
```jsonl
{"input": "Write a function to reverse a string in Python"}
{"input": "Create a SQL query to find users older than 21"}
```
CSV:
```csv
input
"Write a function to reverse a string in Python"
"Create a SQL query to find users older than 21"
```
Use rubric graders to evaluate code quality.
## CSV Advanced Features
CSV supports all the same features as JSONL by encoding complex data as JSON strings in cells:
**Multi-turn conversations** (requires escaped JSON array string):
```csv
input,ground_truth
"[""Hello"", ""What's your name?""]","Alice"
```
**Agent arguments** (requires escaped JSON object string):
```csv
input,agent_args
"What items do we have?","{""initial_inventory"": [""apple"", ""banana""]}"
```
**Rubric variables** (requires escaped JSON object string):
```csv
input,rubric_vars
"Write a story","{""max_length"": 500, ""genre"": ""sci-fi""}"
```
<Note>
**Note:** Complex data structures require JSON encoding in CSV. If you're frequently using these advanced features, JSONL may be easier to read and maintain.
</Note>
## Next Steps
- [Suite YAML Reference](/guides/evals/configuration/suite-yaml) - Complete configuration options including filtering
- [Graders](/guides/evals/concepts/graders) - How to evaluate agent responses
- [Multi-Turn Conversations](/guides/evals/advanced/multi-turn-conversations) - Testing conversational flows

View File

@@ -1,394 +0,0 @@
# Extractors
**Extractors** select what content to evaluate from an agent's response. They navigate the conversation trajectory and extract the specific piece you want to grade.
**Quick overview:**
- **Purpose**: Agent responses are complex (messages, tool calls, memory) - extractors isolate what to grade
- **Built-in options**: last_assistant, tool_arguments, memory_block, pattern, and more
- **Flexible**: Different graders can use different extractors in the same suite
- **Automatic**: No setup needed - just specify in your grader config
**Common patterns:**
- `last_assistant` - Most common, gets the agent's final message (90% of use cases)
- `tool_arguments` - Verify agent called the right tool with correct args
- `memory_block` - Check if agent updated memory correctly
- `pattern` - Extract structured data with regex
Extractors determine what part of the agent's response gets graded. They pull out specific content from the conversation trajectory.
## Why Extractors?
An agent's response is complex - it includes assistant messages, tool calls, tool returns, memory updates, etc. Extractors let you focus on exactly what you want to evaluate.
**The evaluation flow:**
```
Agent Response → Extractor → Submission Text → Grader → Score
```
For example:
```
Full trajectory:
UserMessage: "What's the capital of France?"
ToolCallMessage: search(query="capital of france")
ToolReturnMessage: "Paris is the capital..."
AssistantMessage: "The capital of France is Paris."
↓ extractor: last_assistant ↓
Extracted: "The capital of France is Paris."
↓ grader: contains (ground_truth="Paris") ↓
Score: 1.0
```
## Trajectory Structure
A trajectory is a list of turns, where each turn is a list of Letta messages:
```python
[
[UserMessage(...), AssistantMessage(...), ToolCallMessage(...), ToolReturnMessage(...)], # Turn 1
[AssistantMessage(...)] # Turn 2
]
```
Extractors navigate this structure to pull out the submission text.
## Built-in Extractors
### last_assistant
Extracts the last assistant message content.
```yaml
graders:
quality:
kind: tool
function: contains
extractor: last_assistant # Extract final agent message
```
Most common extractor - gets the agent's final response.
### first_assistant
Extracts the first assistant message content.
```yaml
graders:
initial_response:
kind: tool
function: contains
extractor: first_assistant # Extract first agent message
```
Useful for testing immediate responses before tool usage.
### all_assistant
Concatenates all assistant messages with a separator.
```yaml
graders:
complete_response:
kind: rubric
prompt_path: rubric.txt
extractor: all_assistant # Concatenate all agent messages
extractor_config:
separator: "\n\n" # Join messages with double newline
```
Use when you need the full conversation context.
### last_turn
Extracts all assistant messages from the last turn only.
```yaml
graders:
final_turn:
kind: tool
function: contains
extractor: last_turn # Messages from final turn only
extractor_config:
separator: " " # Join with spaces
```
Useful when the agent makes multiple statements in the final turn.
### pattern
Extracts content matching a regex pattern from assistant messages.
```yaml
graders:
extract_number:
kind: tool
function: exact_match
extractor: pattern # Extract using regex
extractor_config:
pattern: 'Result: (\d+)' # Regex pattern to match
group: 1 # Extract capture group 1
search_all: false # Only find first match
```
Example: Extract "42" from "The answer is Result: 42"
### tool_arguments
Extracts arguments from a specific tool call.
```yaml
graders:
search_query:
kind: tool
function: contains
extractor: tool_arguments # Extract tool call arguments
extractor_config:
tool_name: search # Which tool to extract from
```
Returns the JSON arguments as a string.
Example: If agent calls `search(query="pandas", limit=10)`, extracts:
```json
{"query": "pandas", "limit": 10}
```
### tool_output
Extracts the return value from a specific tool call.
```yaml
graders:
search_results:
kind: tool
function: contains
extractor: tool_output # Extract tool return value
extractor_config:
tool_name: search # Which tool's output to extract
```
Finds the tool call and its corresponding return message.
### after_marker
Extracts content after a specific marker string.
```yaml
graders:
answer_section:
kind: tool
function: contains
extractor: after_marker # Extract content after marker
extractor_config:
marker: "ANSWER:" # Marker string to find
include_marker: false # Don't include "ANSWER:" in output
```
Example: From "Here's my analysis... ANSWER: Paris", extracts "Paris"
### memory_block
Extracts content from a specific memory block (requires agent_state).
```yaml
graders:
human_memory:
kind: tool
function: exact_match
extractor: memory_block # Extract from agent memory
extractor_config:
block_label: human # Which memory block to extract
```
**Important**: This extractor requires the agent's final state, which adds overhead. The runner automatically fetches agent_state when this extractor is used.
Example use case: Verify the agent correctly updated its memory about the user.
## Extractor Configuration
Some extractors accept additional configuration via `extractor_config`:
```yaml
graders:
my_metric:
kind: tool
function: contains
extractor: pattern # Use pattern extractor
extractor_config: # Configuration for this extractor
pattern: 'Answer: (.*)' # Regex pattern
group: 1 # Extract capture group 1
```
## Choosing an Extractor
| Use Case | Recommended Extractor |
|----------|---------------------|
| Final agent response | `last_assistant` |
| First response before tools | `first_assistant` |
| Complete conversation | `all_assistant` |
| Specific format extraction | `pattern` |
| Tool usage validation | `tool_arguments` |
| Tool result checking | `tool_output` |
| Memory validation | `memory_block` |
| Structured output | `after_marker` |
## Content Flattening
Assistant messages can contain multiple content parts. Extractors automatically flatten complex content to plain text.
## Empty Extraction
If an extractor finds no matching content, it returns an empty string `""`. This typically results in a score of 0.0 from the grader.
## Custom Extractors
You can write custom extractors. See [Custom Extractors](../extractors/custom.md) for details.
Example:
```python
from letta_evals.decorators import extractor
from letta_client import LettaMessageUnion
@extractor
def my_extractor(trajectory: List[List[LettaMessageUnion]], config: dict) -> str:
# Custom extraction logic
return extracted_text
```
Register by importing in your suite's setup script or custom evaluators file.
## Multi-Metric Extraction
Different graders can use different extractors:
```yaml
graders:
response_quality: # Evaluate final message quality
kind: rubric
prompt_path: quality.txt
extractor: last_assistant # Extract final response
tool_usage: # Check tool was called correctly
kind: tool
function: exact_match
extractor: tool_arguments # Extract tool args
extractor_config:
tool_name: search # From search tool
memory_update: # Verify memory updated
kind: tool
function: contains
extractor: memory_block # Extract from memory
extractor_config:
block_label: human # Human memory block
```
Each grader independently extracts and evaluates different aspects.
## Listing Extractors
See all available extractors:
```bash
letta-evals list-extractors
```
## Examples
### Extract Final Answer
```yaml
extractor: last_assistant # Get final agent message
```
Agent: "Let me search... *uses tool* ... The answer is Paris."
Extracted: "The answer is Paris."
### Extract Tool Arguments
```yaml
extractor: tool_arguments # Get tool call args
extractor_config:
tool_name: search # From search tool
```
Agent calls: `search(query="pandas", limit=5)`
Extracted: `{"query": "pandas", "limit": 5}`
### Extract Pattern
```yaml
extractor: pattern # Extract with regex
extractor_config:
pattern: 'RESULT: (\w+)' # Match pattern
group: 1 # Extract capture group 1
```
Agent: "After calculation... RESULT: SUCCESS"
Extracted: "SUCCESS"
### Extract Memory
```yaml
extractor: memory_block # Extract from agent memory
extractor_config:
block_label: human # Human memory block
```
Agent updates memory block "human" to: "User's name is Alice"
Extracted: "User's name is Alice"
## Troubleshooting
### Extractor returns empty string
**Problem**: Grader always gives score 0.0 because extractor finds nothing.
**Common causes**:
- **Wrong extractor**: Using `first_assistant` but agent doesn't respond until after tool use → use `last_assistant`
- **Wrong tool name**: `tool_arguments` with `tool_name: "search"` but agent calls `"web_search"` → check actual tool name
- **Wrong memory block**: `memory_block` with `block_label: "user"` but block is actually labeled `"human"` → check block labels
- **Pattern doesn't match**: `pattern: "Answer: (.*)"` but agent says "The answer is..." → adjust regex
**Debug tips**:
1. Check the trajectory in results JSON to see actual agent output
2. Use `last_assistant` first to see what's there
3. Verify extractor names with `letta-evals list-extractors`
### Pattern extractor not working
**Problem**: Pattern extractor returns empty or wrong content.
**Solutions**:
- Test your regex separately first
- Remember to escape special characters: `\.`, `\(`, `\)`
- Use `group: 0` to see the full match (default)
- Use `group: 1` to extract first capture group
- Set `search_all: true` if you need all matches
### Memory block extractor fails
**Problem**: `memory_block` extractor causes errors or returns nothing.
**Solutions**:
- Verify the block label exactly matches (case-sensitive)
- Check that agent actually has this memory block
- Remember: this adds overhead by fetching agent state
### Tool extractor finds wrong tool
**Problem**: Multiple tool calls, but extractor gets the wrong one.
**Current behavior**: Extractors get the **first** matching tool call.
**Workaround**: Use custom extractor to implement more specific logic.
## Next Steps
- [Built-in Extractors Reference](../extractors/builtin.md) - Complete extractor documentation
- [Custom Extractors Guide](../extractors/custom.md) - Write your own extractors
- [Graders](./graders.md) - How to use extractors with graders

View File

@@ -1,374 +0,0 @@
# Extractors
**Extractors** select what content to evaluate from an agent's response. They navigate the conversation trajectory and extract the specific piece you want to grade.
<Note>
**Quick overview:**
- **Purpose**: Agent responses are complex (messages, tool calls, memory) - extractors isolate what to grade
- **Built-in options**: last_assistant, tool_arguments, memory_block, pattern, and more
- **Flexible**: Different graders can use different extractors in the same suite
- **Automatic**: No setup needed - just specify in your grader config
</Note>
**Common patterns:**
- `last_assistant` - Most common, gets the agent's final message (90% of use cases)
- `tool_arguments` - Verify agent called the right tool with correct args
- `memory_block` - Check if agent updated memory correctly
- `pattern` - Extract structured data with regex
Extractors determine what part of the agent's response gets graded. They pull out specific content from the conversation trajectory.
## Why Extractors?
An agent's response is complex - it includes assistant messages, tool calls, tool returns, memory updates, etc. Extractors let you focus on exactly what you want to evaluate.
**The evaluation flow:**
```
Agent Response → Extractor → Submission Text → Grader → Score
```
For example:
```
Full trajectory:
UserMessage: "What's the capital of France?"
ToolCallMessage: search(query="capital of france")
ToolReturnMessage: "Paris is the capital..."
AssistantMessage: "The capital of France is Paris."
↓ extractor: last_assistant ↓
Extracted: "The capital of France is Paris."
↓ grader: contains (ground_truth="Paris") ↓
Score: 1.0
```
## Trajectory Structure
A trajectory is a list of turns, where each turn is a list of Letta messages:
```python
[
[UserMessage(...), AssistantMessage(...), ToolCallMessage(...), ToolReturnMessage(...)], # Turn 1
[AssistantMessage(...)] # Turn 2
]
```
Extractors navigate this structure to pull out the submission text.
## Built-in Extractors
### last_assistant
Extracts the last assistant message content.
```yaml
graders:
quality:
kind: tool
function: contains
extractor: last_assistant # Extract final agent message
```
Most common extractor - gets the agent's final response.
### first_assistant
Extracts the first assistant message content.
```yaml
graders:
initial_response:
kind: tool
function: contains
extractor: first_assistant # Extract first agent message
```
Useful for testing immediate responses before tool usage.
### all_assistant
Concatenates all assistant messages with a separator.
```yaml
graders:
complete_response:
kind: rubric
prompt_path: rubric.txt
extractor: all_assistant # Concatenate all agent messages
extractor_config:
separator: "\n\n" # Join messages with double newline
```
Use when you need the full conversation context.
### last_turn
Extracts all assistant messages from the last turn only.
```yaml
graders:
final_turn:
kind: tool
function: contains
extractor: last_turn # Messages from final turn only
extractor_config:
separator: " " # Join with spaces
```
Useful when the agent makes multiple statements in the final turn.
### pattern
Extracts content matching a regex pattern from assistant messages.
```yaml
graders:
extract_number:
kind: tool
function: exact_match
extractor: pattern # Extract using regex
extractor_config:
pattern: 'Result: (\d+)' # Regex pattern to match
group: 1 # Extract capture group 1
search_all: false # Only find first match
```
Example: Extract "42" from "The answer is Result: 42"
### tool_arguments
Extracts arguments from a specific tool call.
```yaml
graders:
search_query:
kind: tool
function: contains
extractor: tool_arguments # Extract tool call arguments
extractor_config:
tool_name: search # Which tool to extract from
```
Returns the JSON arguments as a string.
Example: If agent calls `search(query="pandas", limit=10)`, extracts:
```json
{"query": "pandas", "limit": 10}
```
### tool_output
Extracts the return value from a specific tool call.
```yaml
graders:
search_results:
kind: tool
function: contains
extractor: tool_output # Extract tool return value
extractor_config:
tool_name: search # Which tool's output to extract
```
Finds the tool call and its corresponding return message.
### after_marker
Extracts content after a specific marker string.
```yaml
graders:
answer_section:
kind: tool
function: contains
extractor: after_marker # Extract content after marker
extractor_config:
marker: "ANSWER:" # Marker string to find
include_marker: false # Don't include "ANSWER:" in output
```
Example: From "Here's my analysis... ANSWER: Paris", extracts "Paris"
### memory_block
Extracts content from a specific memory block (requires agent_state).
```yaml
graders:
human_memory:
kind: tool
function: exact_match
extractor: memory_block # Extract from agent memory
extractor_config:
block_label: human # Which memory block to extract
```
<Warning>
**Important**: This extractor requires the agent's final state, which adds overhead. The runner automatically fetches agent_state when this extractor is used.
</Warning>
Example use case: Verify the agent correctly updated its memory about the user.
## Extractor Configuration
Some extractors accept additional configuration via `extractor_config`:
```yaml
graders:
my_metric:
kind: tool
function: contains
extractor: pattern # Use pattern extractor
extractor_config: # Configuration for this extractor
pattern: 'Answer: (.*)' # Regex pattern
group: 1 # Extract capture group 1
```
## Choosing an Extractor
| Use Case | Recommended Extractor |
|----------|---------------------|
| Final agent response | `last_assistant` |
| First response before tools | `first_assistant` |
| Complete conversation | `all_assistant` |
| Specific format extraction | `pattern` |
| Tool usage validation | `tool_arguments` |
| Tool result checking | `tool_output` |
| Memory validation | `memory_block` |
| Structured output | `after_marker` |
## Content Flattening
Assistant messages can contain multiple content parts. Extractors automatically flatten complex content to plain text.
## Empty Extraction
If an extractor finds no matching content, it returns an empty string `""`. This typically results in a score of 0.0 from the grader.
## Custom Extractors
You can write custom extractors. See [Custom Extractors](/guides/evals/extractors/custom-extractors) for details.
Example:
```python
from typing import List

from letta_evals.decorators import extractor
from letta_client import LettaMessageUnion
@extractor
def my_extractor(trajectory: List[List[LettaMessageUnion]], config: dict) -> str:
# Custom extraction logic
return extracted_text
```
Register by importing in your suite's setup script or custom evaluators file.
## Multi-Metric Extraction
Different graders can use different extractors:
```yaml
graders:
response_quality: # Evaluate final message quality
kind: rubric
prompt_path: quality.txt
extractor: last_assistant # Extract final response
tool_usage: # Check tool was called correctly
kind: tool
function: exact_match
extractor: tool_arguments # Extract tool args
extractor_config:
tool_name: search # From search tool
memory_update: # Verify memory updated
kind: tool
function: contains
extractor: memory_block # Extract from memory
extractor_config:
block_label: human # Human memory block
```
Each grader independently extracts and evaluates different aspects.
## Listing Extractors
See all available extractors:
```bash
letta-evals list-extractors
```
## Examples
### Extract Final Answer
```yaml
extractor: last_assistant # Get final agent message
```
Agent: "Let me search... *uses tool* ... The answer is Paris."
Extracted: "The answer is Paris."
### Extract Tool Arguments
```yaml
extractor: tool_arguments # Get tool call args
extractor_config:
tool_name: search # From search tool
```
Agent calls: `search(query="pandas", limit=5)`
Extracted: `{"query": "pandas", "limit": 5}`
### Extract Pattern
```yaml
extractor: pattern # Extract with regex
extractor_config:
pattern: 'RESULT: (\w+)' # Match pattern
group: 1 # Extract capture group 1
```
Agent: "After calculation... RESULT: SUCCESS"
Extracted: "SUCCESS"
### Extract Memory
```yaml
extractor: memory_block # Extract from agent memory
extractor_config:
block_label: human # Human memory block
```
Agent updates memory block "human" to: "User's name is Alice"
Extracted: "User's name is Alice"
## Troubleshooting
<Warning>
**Extractor returns empty string**
**Problem**: Grader always gives score 0.0 because extractor finds nothing.
**Common causes**:
- **Wrong extractor**: Using `first_assistant` but agent doesn't respond until after tool use → use `last_assistant`
- **Wrong tool name**: `tool_arguments` with `tool_name: "search"` but agent calls `"web_search"` → check actual tool name
- **Wrong memory block**: `memory_block` with `block_label: "user"` but block is actually labeled `"human"` → check block labels
- **Pattern doesn't match**: `pattern: "Answer: (.*)"` but agent says "The answer is..." → adjust regex
</Warning>
<Tip>
**Debug tips**:
1. Check the trajectory in results JSON to see actual agent output
2. Use `last_assistant` first to see what's there
3. Verify extractor names with `letta-evals list-extractors`
</Tip>
## Next Steps
- [Built-in Extractors Reference](/guides/evals/extractors/built-in-extractors) - Complete extractor documentation
- [Custom Extractors Guide](/guides/evals/extractors/custom-extractors) - Write your own extractors
- [Graders](/guides/evals/concepts/graders) - How to use extractors with graders

View File

@@ -1,375 +0,0 @@
# Gates
**Gates** are the pass/fail criteria for your evaluation. They determine whether your agent meets the required performance threshold by checking aggregate metrics.
**Quick overview:**
- **Single decision**: One gate per suite determines pass/fail
- **Two metrics**: `avg_score` (average of all scores) or `accuracy` (percentage passing threshold)
- **Flexible operators**: >=, >, <=, <, == for threshold comparison
- **Customizable pass criteria**: Define what counts as "passing" for accuracy calculations
- **Exit codes**: Suite exits 0 for pass, 1 for fail
**Common patterns:**
- `avg_score >= 0.8` - Average score must be 80%+
- `accuracy >= 0.9` - 90%+ of samples must pass
- Custom threshold - Define per-sample pass criteria with `pass_value`
Gates define the pass/fail criteria for your evaluation. They check if aggregate metrics meet a threshold.
## Basic Structure
```yaml
gate:
metric_key: accuracy # Which grader to evaluate
metric: avg_score # Use average score (default)
op: gte # Greater than or equal
value: 0.8 # 80% threshold
```
## Why Use Gates?
Gates provide **automated pass/fail decisions** for your evaluations, which is essential for:
**CI/CD Integration**: Gates let you block deployments if agent performance drops:
```bash
letta-evals run suite.yaml
# Exit code 0 = pass (continue deployment)
# Exit code 1 = fail (block deployment)
```
**Regression Testing**: Set a baseline threshold and ensure new changes don't degrade performance:
```yaml
gate:
metric: avg_score
op: gte
value: 0.85 # Must maintain 85%+ to pass
```
**Quality Enforcement**: Require agents meet minimum standards before production:
```yaml
gate:
metric: accuracy
op: gte
value: 0.95 # 95% of test cases must pass
```
### What Happens When Gates Fail?
When a gate condition is not met:
1. **Console output** shows failure message:
```
✗ FAILED (0.72/1.00 avg, 72.0% pass rate)
Gate check failed: avg_score (0.72) not >= 0.80
```
2. **Exit code** is 1 (non-zero indicates failure):
```bash
letta-evals run suite.yaml
echo $? # Prints 1 if gate failed
```
3. **Results JSON** includes `gate_passed: false`:
```json
{
"gate_passed": false,
"gate_check": {
"metric": "avg_score",
"value": 0.72,
"threshold": 0.80,
"operator": "gte",
"passed": false
},
"metrics": { ... }
}
```
4. **All other data is preserved** - you still get full results, scores, and trajectories even when gating fails
**Common use case in CI**:
```bash
#!/bin/bash
letta-evals run suite.yaml --output results.json
if [ $? -ne 0 ]; then
echo "❌ Agent evaluation failed - blocking merge"
exit 1
else
echo "✅ Agent evaluation passed - safe to merge"
fi
```
## Required Fields
### metric_key
Which grader to evaluate. Must match a key in your `graders` section:
```yaml
graders:
accuracy: # Grader name
kind: tool
function: exact_match
extractor: last_assistant
gate:
metric_key: accuracy # Must match grader name above
op: gte # >=
value: 0.8 # 80% threshold
```
If you only have one grader, `metric_key` can be omitted - it will default to your single grader.
### metric
Which aggregate statistic to compare. Two options:
#### avg_score
Average score across all samples (0.0 to 1.0):
```yaml
gate:
metric_key: quality # Check quality grader
metric: avg_score # Use average of all scores
op: gte # >=
value: 0.7 # Must average 70%+
```
Example: If scores are [0.8, 0.9, 0.6], avg_score = 0.77
#### accuracy
Pass rate as a percentage (0.0 to 1.0):
```yaml
gate:
metric_key: accuracy # Check accuracy grader
metric: accuracy # Use pass rate, not average
op: gte # >=
value: 0.8 # 80% of samples must pass
```
By default, samples with score >= 1.0 are considered "passing".
You can customize the per-sample threshold with `pass_op` and `pass_value` (see below).
**Note**: The default `metric` is `avg_score`, so you can omit it if that's what you want:
```yaml
gate:
metric_key: quality # Check quality grader
op: gte # >=
value: 0.7 # 70% threshold (defaults to avg_score)
```
### op
Comparison operator:
- `gte`: Greater than or equal (>=)
- `gt`: Greater than (>)
- `lte`: Less than or equal (<=)
- `lt`: Less than (<)
- `eq`: Equal (==)
Most common: `gte` (at least X)
### value
Threshold value for comparison:
- For `avg_score`: 0.0 to 1.0
- For `accuracy`: 0.0 to 1.0 (representing percentage)
```yaml
gate:
metric: avg_score # Average score
op: gte # >=
value: 0.75 # 75% threshold
```
```yaml
gate:
metric: accuracy # Pass rate
op: gte # >=
value: 0.9 # 90% must pass
```
## Optional Fields
### pass_op and pass_value
Customize when individual samples are considered "passing" (used for accuracy calculation):
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.8 # 80% must pass
pass_op: gte # Sample passes if >=
pass_value: 0.7 # This threshold (70%)
```
Default behavior:
- If `metric` is `avg_score`: samples pass if score >= the gate value
- If `metric` is `accuracy`: samples pass if score >= 1.0 (perfect)
## Examples
### Require 80% Average Score
```yaml
gate:
metric_key: quality # Check quality grader
metric: avg_score # Use average
op: gte # >=
value: 0.8 # 80% average
```
Passes if the average score across all samples is >= 0.8
### Require 90% Pass Rate (Perfect Scores)
```yaml
gate:
metric_key: accuracy # Check accuracy grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.9 # 90% must pass (default: score >= 1.0 to pass)
```
Passes if 90% of samples have score = 1.0
### Require 75% Pass Rate (Score >= 0.7)
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.75 # 75% must pass
pass_op: gte # Sample passes if >=
pass_value: 0.7 # 70% threshold per sample
```
Passes if 75% of samples have score >= 0.7
### Maximum Error Rate
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.95 # 95% must pass (allows 5% failures)
pass_op: gt # Sample passes if >
pass_value: 0.0 # 0.0 (any non-zero score)
```
Allows up to 5% failures.
### Exact Pass Rate
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: eq # Exactly equal
value: 1.0 # 100% (all samples must pass)
```
All samples must pass.
## Multi-Metric Gating
When you have multiple graders, you can only gate on one metric:
```yaml
graders:
accuracy: # First metric
kind: tool
function: exact_match
extractor: last_assistant
completeness: # Second metric
kind: rubric
prompt_path: completeness.txt
model: gpt-4o-mini
extractor: last_assistant
gate:
metric_key: accuracy # Only gate on accuracy (completeness still computed)
metric: avg_score # Use average
op: gte # >=
value: 0.8 # 80% threshold
```
The evaluation passes/fails based on the gated metric, but results include scores for all metrics.
## Understanding avg_score vs accuracy
### avg_score
- Arithmetic mean of all scores
- Sensitive to partial credit
- Good for continuous evaluation
Example:
- Scores: [1.0, 0.8, 0.6]
- avg_score = (1.0 + 0.8 + 0.6) / 3 = 0.8
### accuracy
- Percentage of samples meeting a threshold
- Binary pass/fail per sample
- Good for strict requirements
Example:
- Scores: [1.0, 0.8, 0.6]
- pass_value: 0.7
- Passing: [1.0, 0.8] = 2 out of 3
- accuracy = 2/3 = 0.667 (66.7%)
## Errors and Attempted Samples
If a sample fails (error during evaluation), it:
- Gets a score of 0.0
- Counts toward `total` but not `total_attempted`
- Included in `avg_score_total` but not `avg_score_attempted`
You can gate on either:
- `avg_score_total`: Includes errors as 0.0
- `avg_score_attempted`: Excludes errors (only successfully attempted samples)
**Note**: The `metric` field currently only supports `avg_score` and `accuracy`. By default, gates use `avg_score_attempted`.
## Gate Results
After evaluation, you'll see:
```
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
```
or
```
✗ FAILED (1.80/3.00 avg, 60.0% pass rate)
```
The evaluation exit code reflects the gate result:
- 0: Passed
- 1: Failed
## Advanced Gating
For complex gating logic (e.g., "pass if accuracy >= 80% OR avg_score >= 0.9"), you'll need to:
1. Run evaluation with one gate
2. Examine the results JSON
3. Apply custom logic in a post-processing script
## Next Steps
- [Understanding Results](../results/overview.md) - Interpreting evaluation output
- [Multi-Metric Evaluation](../graders/multi-metric.md) - Using multiple graders
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete gate configuration

View File

@@ -1,384 +0,0 @@
# Gates
**Gates** are the pass/fail criteria for your evaluation. They determine whether your agent meets the required performance threshold by checking aggregate metrics.
<Note>
**Quick overview:**
- **Single decision**: One gate per suite determines pass/fail
- **Two metrics**: `avg_score` (average of all scores) or `accuracy` (percentage passing threshold)
- **Flexible operators**: `>=`, `>`, `<=`, `<`, `==` for threshold comparison
- **Customizable pass criteria**: Define what counts as "passing" for accuracy calculations
- **Exit codes**: Suite exits 0 for pass, 1 for fail
</Note>
**Common patterns:**
- Average score must be 80%+: `avg_score >= 0.8`
- 90%+ of samples must pass: `accuracy >= 0.9`
- Custom threshold: Define per-sample pass criteria with `pass_value`
Gates define the pass/fail criteria for your evaluation. They check if aggregate metrics meet a threshold.
## Basic Structure
```yaml
gate:
metric_key: accuracy # Which grader to evaluate
metric: avg_score # Use average score (default)
op: gte # Greater than or equal
value: 0.8 # 80% threshold
```
## Why Use Gates?
Gates provide **automated pass/fail decisions** for your evaluations, which is essential for:
**CI/CD Integration**: Gates let you block deployments if agent performance drops:
```bash
letta-evals run suite.yaml
# Exit code 0 = pass (continue deployment)
# Exit code 1 = fail (block deployment)
```
**Regression Testing**: Set a baseline threshold and ensure new changes don't degrade performance:
```yaml
gate:
metric: avg_score
op: gte
value: 0.85 # Must maintain 85%+ to pass
```
**Quality Enforcement**: Require agents meet minimum standards before production:
```yaml
gate:
metric: accuracy
op: gte
value: 0.95 # 95% of test cases must pass
```
### What Happens When Gates Fail?
When a gate condition is not met:
1. **Console output** shows failure message:
```text
✗ FAILED (0.72/1.00 avg, 72.0% pass rate)
Gate check failed: avg_score (0.72) not >= 0.80
```
2. **Exit code** is 1 (non-zero indicates failure):
```bash
letta-evals run suite.yaml
echo $? # Prints 1 if gate failed
```
3. **Results JSON** includes `gate_passed: false`:
```json
{
"gate_passed": false,
"gate_check": {
"metric": "avg_score",
"value": 0.72,
"threshold": 0.80,
"operator": "gte",
"passed": false
},
"metrics": { ... }
}
```
4. **All other data is preserved** - you still get full results, scores, and trajectories even when gating fails
<Tip>
**Common use case in CI**:
```bash
#!/bin/bash
letta-evals run suite.yaml --output results.json
if [ $? -ne 0 ]; then
echo "❌ Agent evaluation failed - blocking merge"
exit 1
else
echo "✅ Agent evaluation passed - safe to merge"
fi
```
</Tip>
## Required Fields
### metric_key
Which grader to evaluate. Must match a key in your `graders` section:
```yaml
graders:
accuracy: # Grader name
kind: tool
function: exact_match
extractor: last_assistant
gate:
metric_key: accuracy # Must match grader name above
op: gte # >=
value: 0.8 # 80% threshold
```
If you only have one grader, `metric_key` can be omitted - it will default to your single grader.
### metric
Which aggregate statistic to compare. Two options:
#### avg_score
Average score across all samples (0.0 to 1.0):
```yaml
gate:
metric_key: quality # Check quality grader
metric: avg_score # Use average of all scores
op: gte # >=
value: 0.7 # Must average 70%+
```
Example: If scores are [0.8, 0.9, 0.6], avg_score = 0.77
#### accuracy
Pass rate as a percentage (0.0 to 1.0):
```yaml
gate:
metric_key: accuracy # Check accuracy grader
metric: accuracy # Use pass rate, not average
op: gte # >=
value: 0.8 # 80% of samples must pass
```
By default, samples with score `>= 1.0` are considered "passing".
You can customize the per-sample threshold with `pass_op` and `pass_value` (see below).
<Note>
**Note**: The default `metric` is `avg_score`, so you can omit it if that's what you want:
```yaml
gate:
metric_key: quality # Check quality grader
op: gte # >=
value: 0.7 # 70% threshold (defaults to avg_score)
```
</Note>
### op
Comparison operator:
- `gte`: Greater than or equal (`>=`)
- `gt`: Greater than (`>`)
- `lte`: Less than or equal (`<=`)
- `lt`: Less than (`<`)
- `eq`: Equal (`==`)
Most common: `gte` (at least X)
### value
Threshold value for comparison:
- For `avg_score`: 0.0 to 1.0
- For `accuracy`: 0.0 to 1.0 (representing percentage)
```yaml
gate:
metric: avg_score # Average score
op: gte # >=
value: 0.75 # 75% threshold
```
```yaml
gate:
metric: accuracy # Pass rate
op: gte # >=
value: 0.9 # 90% must pass
```
## Optional Fields
### pass_op and pass_value
Customize when individual samples are considered "passing" (used for accuracy calculation):
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.8 # 80% must pass
pass_op: gte # Sample passes if >=
pass_value: 0.7 # This threshold (70%)
```
Default behavior:
- If `metric` is `avg_score`: samples pass if score `>=` the gate value
- If `metric` is `accuracy`: samples pass if score `>= 1.0` (perfect)
## Examples
### Require 80% Average Score
```yaml
gate:
metric_key: quality # Check quality grader
metric: avg_score # Use average
op: gte # >=
value: 0.8 # 80% average
```
Passes if the average score across all samples is `>= 0.8`
### Require 90% Pass Rate (Perfect Scores)
```yaml
gate:
metric_key: accuracy # Check accuracy grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.9 # 90% must pass (default: score >= 1.0 to pass)
```
Passes if 90% of samples have score = 1.0
### Require 75% Pass Rate (Score `>= 0.7`)
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.75 # 75% must pass
pass_op: gte # Sample passes if >=
pass_value: 0.7 # 70% threshold per sample
```
Passes if 75% of samples have score `>= 0.7`
### Maximum Error Rate
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: gte # >=
value: 0.95 # 95% must pass (allows 5% failures)
pass_op: gt # Sample passes if >
pass_value: 0.0 # 0.0 (any non-zero score)
```
Allows up to 5% failures.
### Exact Pass Rate
```yaml
gate:
metric_key: quality # Check quality grader
metric: accuracy # Use pass rate
op: eq # Exactly equal
value: 1.0 # 100% (all samples must pass)
```
All samples must pass.
## Multi-Metric Gating
When you have multiple graders, you can only gate on one metric:
```yaml
graders:
accuracy: # First metric
kind: tool
function: exact_match
extractor: last_assistant
completeness: # Second metric
kind: rubric
prompt_path: completeness.txt
model: gpt-4o-mini
extractor: last_assistant
gate:
metric_key: accuracy # Only gate on accuracy (completeness still computed)
metric: avg_score # Use average
op: gte # >=
value: 0.8 # 80% threshold
```
The evaluation passes/fails based on the gated metric, but results include scores for all metrics.
## Understanding avg_score vs accuracy
### avg_score
- Arithmetic mean of all scores
- Sensitive to partial credit
- Good for continuous evaluation
Example:
- Scores: [1.0, 0.8, 0.6]
- avg_score = (1.0 + 0.8 + 0.6) / 3 = 0.8
### accuracy
- Percentage of samples meeting a threshold
- Binary pass/fail per sample
- Good for strict requirements
Example:
- Scores: [1.0, 0.8, 0.6]
- pass_value: 0.7
- Passing: [1.0, 0.8] = 2 out of 3
- accuracy = 2/3 = 0.667 (66.7%)
## Errors and Attempted Samples
If a sample fails (error during evaluation), it:
- Gets a score of 0.0
- Counts toward `total` but not `total_attempted`
- Included in `avg_score_total` but not `avg_score_attempted`
You can gate on either:
- `avg_score_total`: Includes errors as 0.0
- `avg_score_attempted`: Excludes errors (only successfully attempted samples)
<Note>
**Note**: The `metric` field currently only supports `avg_score` and `accuracy`. By default, gates use `avg_score_attempted`.
</Note>
## Gate Results
After evaluation, you'll see:
```text
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
```
or
```text
✗ FAILED (1.80/3.00 avg, 60.0% pass rate)
```
The evaluation exit code reflects the gate result:
- 0: Passed
- 1: Failed
## Advanced Gating
For complex gating logic (e.g., "pass if accuracy `>= 80%` OR avg_score `>= 0.9`"), you'll need to:
1. Run evaluation with one gate
2. Examine the results JSON
3. Apply custom logic in a post-processing script
## Next Steps
- [Understanding Results](/evals/results-metrics/understanding-results) - Interpreting evaluation output
- [Multi-Metric Evaluation](/guides/evals/graders/multi-metric) - Using multiple graders
- [Suite YAML Reference](/guides/evals/configuration/suite-yaml) - Complete gate configuration

View File

@@ -1,328 +0,0 @@
# Graders
**Graders** are the scoring functions that evaluate agent responses. They take the extracted submission (from an extractor) and assign a score between 0.0 (complete failure) and 1.0 (perfect success).
**Quick overview:**
- **Two types**: Tool graders (deterministic Python functions) and Rubric graders (LLM-as-judge)
- **Built-in functions**: exact_match, contains, regex_match, ascii_printable_only
- **Custom graders**: Write your own grading logic
- **Multi-metric**: Combine multiple graders in one suite
- **Flexible extraction**: Each grader can use a different extractor
**When to use each:**
- **Tool graders**: Fast, deterministic, free - perfect for exact matching, patterns, tool validation
- **Rubric graders**: Flexible, subjective, costs API calls - ideal for quality, creativity, nuanced evaluation
Graders evaluate agent responses and assign scores between 0.0 (complete failure) and 1.0 (perfect success).
## Grader Types
There are two types of graders:
### Tool Graders
Python functions that programmatically compare the submission to ground truth or apply deterministic checks.
```yaml
graders:
accuracy:
kind: tool # Deterministic grading
function: exact_match # Built-in grading function
extractor: last_assistant # Use final agent response
```
Best for:
- Exact matching
- Pattern checking
- Tool call validation
- Deterministic criteria
### Rubric Graders
LLM-as-judge evaluation using custom prompts and criteria. Can use either direct LLM API calls or a Letta agent as the judge.
**Standard rubric grading (LLM API):**
```yaml
graders:
quality:
kind: rubric # LLM-as-judge
prompt_path: rubric.txt # Custom evaluation criteria
model: gpt-4o-mini # Judge model
extractor: last_assistant # What to evaluate
```
**Agent-as-judge (Letta agent):**
```yaml
graders:
agent_judge:
kind: rubric # Still "rubric" kind
agent_file: judge.af # Judge agent with submit_grade tool
prompt_path: rubric.txt # Evaluation criteria
extractor: last_assistant # What to evaluate
```
Best for:
- Subjective quality assessment
- Open-ended responses
- Nuanced evaluation
- Complex criteria
- Judges that need tools (when using agent-as-judge)
## Built-in Tool Graders
### exact_match
Checks if submission exactly matches ground truth (case-sensitive, whitespace-trimmed).
```yaml
graders:
accuracy:
kind: tool
function: exact_match # Case-sensitive, whitespace-trimmed
extractor: last_assistant # Extract final response
```
Requires: `ground_truth` in dataset
Score: 1.0 if exact match, 0.0 otherwise
### contains
Checks if submission contains ground truth (case-insensitive).
```yaml
graders:
contains_answer:
kind: tool
function: contains # Case-insensitive substring match
extractor: last_assistant # Search in final response
```
Requires: `ground_truth` in dataset
Score: 1.0 if found, 0.0 otherwise
### regex_match
Checks if submission matches a regex pattern in ground truth.
```yaml
graders:
pattern:
kind: tool
function: regex_match # Pattern matching
extractor: last_assistant # Check final response
```
Dataset sample:
```json
{"input": "Generate a UUID", "ground_truth": "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"}
```
Score: 1.0 if pattern matches, 0.0 otherwise
### ascii_printable_only
Validates that all characters are printable ASCII (useful for ASCII art, formatted output).
```yaml
graders:
ascii_check:
kind: tool
function: ascii_printable_only # Validate ASCII characters
extractor: last_assistant # Check final response
```
Does not require ground truth.
Score: 1.0 if all characters are printable ASCII, 0.0 if any non-printable characters found
## Rubric Graders
Rubric graders use an LLM to evaluate responses based on custom criteria.
### Basic Configuration
```yaml
graders:
quality:
kind: rubric # LLM-as-judge
prompt_path: quality_rubric.txt # Evaluation criteria
model: gpt-4o-mini # Judge model
temperature: 0.0 # Deterministic
extractor: last_assistant # What to evaluate
```
### Rubric Prompt Format
Your rubric file should describe the evaluation criteria. Use placeholders:
- `{input}`: The original input from the dataset
- `{submission}`: The extracted agent response
- `{ground_truth}`: Ground truth from dataset (if available)
Example `quality_rubric.txt`:
```
Evaluate the response for:
1. Accuracy: Does it correctly answer the question?
2. Completeness: Is the answer thorough?
3. Clarity: Is it well-explained?
Input: {input}
Expected: {ground_truth}
Response: {submission}
Score from 0.0 to 1.0 where:
- 1.0: Perfect response
- 0.75: Good with minor issues
- 0.5: Acceptable but incomplete
- 0.25: Poor quality
- 0.0: Completely wrong
```
### Inline Prompt
Instead of a file, you can include the prompt inline:
```yaml
graders:
quality:
kind: rubric # LLM-as-judge
prompt: | # Inline prompt instead of file
Evaluate the creativity and originality of the response.
Score 1.0 for highly creative, 0.0 for generic or unoriginal.
model: gpt-4o-mini # Judge model
extractor: last_assistant # What to evaluate
```
### Model Configuration
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt # Evaluation criteria
model: gpt-4o-mini # Judge model
temperature: 0.0 # Deterministic (0.0-2.0)
provider: openai # LLM provider (default: openai)
max_retries: 5 # API retry attempts
timeout: 120.0 # Request timeout in seconds
```
Supported providers:
- `openai` (default)
Models:
- Any OpenAI-compatible model
- Special handling for reasoning models (o1, o3) - temperature automatically adjusted to 1.0
### Structured Output
Rubric graders use JSON mode to get structured responses:
```json
{
"score": 0.85,
"rationale": "The response is accurate and complete but could be more concise."
}
```
The score is validated to be between 0.0 and 1.0.
## Multi-Metric Configuration
Evaluate multiple aspects in one suite:
```yaml
graders:
accuracy: # Tool grader for factual correctness
kind: tool
function: contains
extractor: last_assistant
completeness: # Rubric grader for thoroughness
kind: rubric
prompt_path: completeness_rubric.txt
model: gpt-4o-mini
extractor: last_assistant
tool_usage: # Tool grader for tool call validation
kind: tool
function: exact_match
extractor: tool_arguments # Extract tool call args
extractor_config:
tool_name: search # Which tool to check
```
Each grader can use a different extractor.
## Extractor Configuration
Every grader must specify an `extractor` to select what to grade:
```yaml
graders:
my_metric:
kind: tool
function: contains # Grading function
extractor: last_assistant # What to extract and grade
```
Some extractors need additional configuration:
```yaml
graders:
tool_check:
kind: tool
function: contains # Check if ground truth in tool args
extractor: tool_arguments # Extract tool call arguments
extractor_config: # Configuration for this extractor
tool_name: search # Which tool to extract from
```
See [Extractors](./extractors.md) for all available extractors.
## Custom Graders
You can write custom grading functions. See [Custom Graders](../advanced/custom-graders.md) for details.
## Grader Selection Guide
| Use Case | Recommended Grader |
|----------|-------------------|
| Exact answer matching | `exact_match` |
| Keyword checking | `contains` |
| Pattern validation | `regex_match` |
| Tool call validation | `exact_match` with `tool_arguments` extractor |
| Quality assessment | Rubric grader |
| Creativity evaluation | Rubric grader |
| Format checking | Custom tool grader |
| Multi-criteria evaluation | Multiple graders |
## Score Interpretation
All scores are between 0.0 and 1.0:
- **1.0**: Perfect - meets all criteria
- **0.75-0.99**: Good - minor issues
- **0.5-0.74**: Acceptable - notable gaps
- **0.25-0.49**: Poor - major problems
- **0.0-0.24**: Failed - did not meet criteria
Tool graders typically return binary scores (0.0 or 1.0), while rubric graders can return any value in the range.
## Error Handling
If grading fails (e.g., network error, invalid format):
- Score is set to 0.0
- Rationale includes error message
- Metadata includes error details
This ensures evaluations can continue even with individual failures.
## Next Steps
- [Tool Graders](../graders/tool-graders.md) - Built-in and custom functions
- [Rubric Graders](../graders/rubric-graders.md) - LLM-as-judge details
- [Multi-Metric Evaluation](../graders/multi-metric.md) - Using multiple graders
- [Extractors](./extractors.md) - Selecting what to grade

View File

@@ -1,330 +0,0 @@
# Graders
**Graders** are the scoring functions that evaluate agent responses. They take the extracted submission (from an extractor) and assign a score between 0.0 (complete failure) and 1.0 (perfect success).
<Note>
**Quick overview:**
- **Two types**: Tool graders (deterministic Python functions) and Rubric graders (LLM-as-judge)
- **Built-in functions**: exact_match, contains, regex_match, ascii_printable_only
- **Custom graders**: Write your own grading logic
- **Multi-metric**: Combine multiple graders in one suite
- **Flexible extraction**: Each grader can use a different extractor
</Note>
**When to use each:**
- **Tool graders**: Fast, deterministic, free - perfect for exact matching, patterns, tool validation
- **Rubric graders**: Flexible, subjective, costs API calls - ideal for quality, creativity, nuanced evaluation
Graders evaluate agent responses and assign scores between 0.0 (complete failure) and 1.0 (perfect success).
## Grader Types
There are two types of graders:
### Tool Graders
Python functions that programmatically compare the submission to ground truth or apply deterministic checks.
```yaml
graders:
accuracy:
kind: tool # Deterministic grading
function: exact_match # Built-in grading function
extractor: last_assistant # Use final agent response
```
Best for:
- Exact matching
- Pattern checking
- Tool call validation
- Deterministic criteria
### Rubric Graders
LLM-as-judge evaluation using custom prompts and criteria. Can use either direct LLM API calls or a Letta agent as the judge.
**Standard rubric grading (LLM API):**
```yaml
graders:
quality:
kind: rubric # LLM-as-judge
prompt_path: rubric.txt # Custom evaluation criteria
model: gpt-4o-mini # Judge model
extractor: last_assistant # What to evaluate
```
**Agent-as-judge (Letta agent):**
```yaml
graders:
agent_judge:
kind: rubric # Still "rubric" kind
agent_file: judge.af # Judge agent with submit_grade tool
prompt_path: rubric.txt # Evaluation criteria
extractor: last_assistant # What to evaluate
```
Best for:
- Subjective quality assessment
- Open-ended responses
- Nuanced evaluation
- Complex criteria
- Judges that need tools (when using agent-as-judge)
## Built-in Tool Graders
### exact_match
Checks if submission exactly matches ground truth (case-sensitive, whitespace-trimmed).
```yaml
graders:
accuracy:
kind: tool
function: exact_match # Case-sensitive, whitespace-trimmed
extractor: last_assistant # Extract final response
```
Requires: `ground_truth` in dataset
Score: 1.0 if exact match, 0.0 otherwise
### contains
Checks if submission contains ground truth (case-insensitive).
```yaml
graders:
contains_answer:
kind: tool
function: contains # Case-insensitive substring match
extractor: last_assistant # Search in final response
```
Requires: `ground_truth` in dataset
Score: 1.0 if found, 0.0 otherwise
### regex_match
Checks if submission matches a regex pattern in ground truth.
```yaml
graders:
pattern:
kind: tool
function: regex_match # Pattern matching
extractor: last_assistant # Check final response
```
Dataset sample:
```json
{"input": "Generate a UUID", "ground_truth": "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"}
```
Score: 1.0 if pattern matches, 0.0 otherwise
### ascii_printable_only
Validates that all characters are printable ASCII (useful for ASCII art, formatted output).
```yaml
graders:
ascii_check:
kind: tool
function: ascii_printable_only # Validate ASCII characters
extractor: last_assistant # Check final response
```
Does not require ground truth.
Score: 1.0 if all characters are printable ASCII, 0.0 if any non-printable characters found
## Rubric Graders
Rubric graders use an LLM to evaluate responses based on custom criteria.
### Basic Configuration
```yaml
graders:
quality:
kind: rubric # LLM-as-judge
prompt_path: quality_rubric.txt # Evaluation criteria
model: gpt-4o-mini # Judge model
temperature: 0.0 # Deterministic
extractor: last_assistant # What to evaluate
```
### Rubric Prompt Format
Your rubric file should describe the evaluation criteria. Use placeholders:
- `{input}`: The original input from the dataset
- `{submission}`: The extracted agent response
- `{ground_truth}`: Ground truth from dataset (if available)
Example `quality_rubric.txt`:
```
Evaluate the response for:
1. Accuracy: Does it correctly answer the question?
2. Completeness: Is the answer thorough?
3. Clarity: Is it well-explained?
Input: {input}
Expected: {ground_truth}
Response: {submission}
Score from 0.0 to 1.0 where:
- 1.0: Perfect response
- 0.75: Good with minor issues
- 0.5: Acceptable but incomplete
- 0.25: Poor quality
- 0.0: Completely wrong
```
### Inline Prompt
Instead of a file, you can include the prompt inline:
```yaml
graders:
quality:
kind: rubric # LLM-as-judge
prompt: | # Inline prompt instead of file
Evaluate the creativity and originality of the response.
Score 1.0 for highly creative, 0.0 for generic or unoriginal.
model: gpt-4o-mini # Judge model
extractor: last_assistant # What to evaluate
```
### Model Configuration
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt # Evaluation criteria
model: gpt-4o-mini # Judge model
temperature: 0.0 # Deterministic (0.0-2.0)
provider: openai # LLM provider (default: openai)
max_retries: 5 # API retry attempts
timeout: 120.0 # Request timeout in seconds
```
Supported providers:
- `openai` (default)
Models:
- Any OpenAI-compatible model
- Special handling for reasoning models (o1, o3) - temperature automatically adjusted to 1.0
### Structured Output
Rubric graders use JSON mode to get structured responses:
```json
{
"score": 0.85,
"rationale": "The response is accurate and complete but could be more concise."
}
```
The score is validated to be between 0.0 and 1.0.
## Multi-Metric Configuration
Evaluate multiple aspects in one suite:
```yaml
graders:
accuracy: # Tool grader for factual correctness
kind: tool
function: contains
extractor: last_assistant
completeness: # Rubric grader for thoroughness
kind: rubric
prompt_path: completeness_rubric.txt
model: gpt-4o-mini
extractor: last_assistant
tool_usage: # Tool grader for tool call validation
kind: tool
function: exact_match
extractor: tool_arguments # Extract tool call args
extractor_config:
tool_name: search # Which tool to check
```
Each grader can use a different extractor.
## Extractor Configuration
Every grader must specify an `extractor` to select what to grade:
```yaml
graders:
my_metric:
kind: tool
function: contains # Grading function
extractor: last_assistant # What to extract and grade
```
Some extractors need additional configuration:
```yaml
graders:
tool_check:
kind: tool
function: contains # Check if ground truth in tool args
extractor: tool_arguments # Extract tool call arguments
extractor_config: # Configuration for this extractor
tool_name: search # Which tool to extract from
```
See [Extractors](/guides/evals/concepts/extractors) for all available extractors.
## Custom Graders
You can write custom grading functions. See [Custom Graders](/guides/evals/advanced/custom-graders) for details.
## Grader Selection Guide
| Use Case | Recommended Grader |
|----------|-------------------|
| Exact answer matching | `exact_match` |
| Keyword checking | `contains` |
| Pattern validation | `regex_match` |
| Tool call validation | `exact_match` with `tool_arguments` extractor |
| Quality assessment | Rubric grader |
| Creativity evaluation | Rubric grader |
| Format checking | Custom tool grader |
| Multi-criteria evaluation | Multiple graders |
## Score Interpretation
All scores are between 0.0 and 1.0:
- **1.0**: Perfect - meets all criteria
- **0.75-0.99**: Good - minor issues
- **0.5-0.74**: Acceptable - notable gaps
- **0.25-0.49**: Poor - major problems
- **0.0-0.24**: Failed - did not meet criteria
Tool graders typically return binary scores (0.0 or 1.0), while rubric graders can return any value in the range.
## Error Handling
If grading fails (e.g., network error, invalid format):
- Score is set to 0.0
- Rationale includes error message
- Metadata includes error details
This ensures evaluations can continue even with individual failures.
## Next Steps
- [Tool Graders](/guides/evals/graders/tool-graders) - Built-in and custom functions
- [Rubric Graders](/guides/evals/graders/rubric-graders) - LLM-as-judge details
- [Multi-Metric Evaluation](/guides/evals/graders/multi-metric) - Using multiple graders
- [Extractors](/guides/evals/concepts/extractors) - Selecting what to grade

View File

@@ -1,205 +0,0 @@
# Core Concepts Overview
## What is Letta Evals?
Letta Evals is a framework for systematically testing and measuring the performance of Letta AI agents. It provides a structured way to:
- Define test cases and expected behaviors
- Run agents against those tests automatically
- Score agent responses using deterministic rules or LLM judges
- Track performance over time and across different configurations
Think of it as a testing framework specifically designed for stateful agents.
## The Evaluation Flow
Every evaluation follows this flow:
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
1. **Dataset**: Your test cases (questions, scenarios, expected outputs)
2. **Target**: The agent being evaluated
3. **Extractor**: Pulls out the relevant information from the agent's response
4. **Grader**: Scores the extracted information
5. **Gate**: Pass/fail criteria for the overall evaluation
6. **Result**: Metrics, scores, and detailed results
### Built for Stateful Agents
Unlike most evaluation frameworks designed for simple input-output models, Letta Evals is purpose-built for **stateful agents** - agents that:
- Maintain memory across conversations
- Use tools and external functions
- Evolve their behavior based on interactions
- Have persistent context and state
This means you can test:
- **Memory updates**: Did the agent correctly remember the user's name?
- **Multi-turn conversations**: Can the agent maintain context across multiple exchanges?
- **Tool usage**: Does the agent call the right tools with the right arguments?
- **State evolution**: How does the agent's internal state change over time?
Traditional eval frameworks treat each test as independent. Letta Evals understands that agent state matters.
**Example: Testing Memory Updates**
```yaml
graders:
memory_check:
kind: tool # Deterministic grading
function: contains # Check if ground_truth in extracted content
extractor: memory_block # Extract from agent memory (not just response!)
extractor_config:
block_label: human # Which memory block to check
```
Dataset:
```jsonl
{"input": "Please remember that I like bananas.", "ground_truth": "bananas"}
```
This doesn't just check if the agent responded correctly - it verifies the agent actually stored "bananas" in its memory block. Traditional eval frameworks can't inspect agent state like this.
## Why Evals Matter
AI agents are complex systems that can behave unpredictably. Without systematic evaluation, you can't:
- **Know if changes improve or break your agent** - Did that prompt tweak help or hurt?
- **Prevent regressions** - Catch when "fixes" break existing functionality
- **Compare approaches objectively** - Which model works better for your use case?
- **Build confidence before deployment** - Ensure quality before shipping to users
- **Track improvement over time** - Measure progress as you iterate
Manual testing doesn't scale. Evals let you test hundreds of scenarios in minutes.
## What Evals Are Useful For
### 1. Development & Iteration
- Test prompt changes instantly across your entire test suite
- Experiment with different models and compare results
- Validate that new features work as expected
### 2. Quality Assurance
- Prevent regressions when modifying agent behavior
- Ensure agents handle edge cases correctly
- Verify tool usage and memory updates
### 3. Model Selection
- Compare GPT-4 vs Claude vs other models on your specific use case
- Test different model configurations (temperature, system prompts, etc.)
- Find the right cost/performance tradeoff
### 4. Benchmarking
- Measure agent performance on standard tasks
- Track improvements over time
- Share reproducible results with your team
### 5. Production Readiness
- Validate agents meet quality thresholds before deployment
- Run continuous evaluation in CI/CD pipelines
- Monitor production agent quality
## How Letta Evals Works
Letta Evals is built around a few key concepts that work together to create a flexible evaluation framework.
## Key Components
### Suite
An **evaluation suite** is a complete test configuration defined in a YAML file. It ties together:
- Which dataset to use
- Which agent to test
- How to grade responses
- What criteria determine pass/fail
Think of a suite as a reusable test specification.
### Dataset
A **dataset** is a JSONL file where each line represents one test case. Each sample has:
- An input (what to ask the agent)
- Optional ground truth (the expected answer)
- Optional metadata (tags, custom fields)
### Target
The **target** is what you're evaluating. Currently, this is a Letta agent, specified by:
- An agent file (.af)
- An existing agent ID
- A Python script that creates agents programmatically
### Trajectory
A **trajectory** is the complete conversation history from one test case. It's a list of turns, where each turn contains a list of Letta messages (assistant messages, tool calls, tool returns, etc.).
### Extractor
An **extractor** determines what part of the trajectory to evaluate. For example:
- The last thing the agent said
- All tool calls made
- Content from agent memory
- Text matching a pattern
### Grader
A **grader** scores how well the agent performed. There are two types:
- **Tool graders**: Python functions that compare submission to ground truth
- **Rubric graders**: LLM judges that evaluate based on custom criteria
### Gate
A **gate** is the pass/fail threshold for your evaluation. It compares aggregate metrics (like average score or pass rate) against a target value.
## Multi-Metric Evaluation
You can define multiple graders in one suite to evaluate different aspects:
```yaml
graders:
accuracy: # Check if answer is correct
kind: tool
function: exact_match
extractor: last_assistant # Use final response
tool_usage: # Check if agent called the right tool
kind: tool
function: contains
extractor: tool_arguments # Extract tool call args
extractor_config:
tool_name: search # From search tool
```
The gate can check any of these metrics:
```yaml
gate:
metric_key: accuracy # Gate on accuracy (tool_usage still computed)
op: gte # >=
value: 0.8 # 80% threshold
```
## Score Normalization
All scores are normalized to the range [0.0, 1.0]:
- 0.0 = complete failure
- 1.0 = perfect success
- Values in between = partial credit
This allows different grader types to be compared and combined.
## Aggregate Metrics
Individual sample scores are aggregated in two ways:
1. **Average Score**: Mean of all scores (0.0 to 1.0)
2. **Accuracy/Pass Rate**: Percentage of samples passing a threshold
You can gate on either metric type.
## Next Steps
Dive deeper into each concept:
- [Suites](./suites.md) - Suite configuration in detail
- [Datasets](./datasets.md) - Creating effective test datasets
- [Targets](./targets.md) - Agent configuration options
- [Graders](./graders.md) - Understanding grader types
- [Extractors](./extractors.md) - Extraction strategies
- [Gates](./gates.md) - Setting pass/fail criteria

View File

@@ -1,207 +0,0 @@
# Core Concepts
Understanding how Letta Evals works and what makes it different.
<Note>
**Just want to run an eval?** Skip to [Getting Started](/guides/evals/getting-started) for a hands-on quickstart.
</Note>
## Built for Stateful Agents
Letta Evals is a testing framework specifically designed for agents that maintain state. Unlike traditional eval frameworks built for simple input-output models, Letta Evals understands that agents:
- Maintain memory across conversations
- Use tools and external functions
- Evolve their behavior based on interactions
- Have persistent context and state
This means you can test aspects of your agent that other frameworks can't: memory updates, multi-turn conversations, tool usage patterns, and state evolution over time.
## The Evaluation Flow
Every evaluation follows this flow:
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
1. **Dataset**: Your test cases (questions, scenarios, expected outputs)
2. **Target**: The agent being evaluated
3. **Extractor**: Pulls out the relevant information from the agent's response
4. **Grader**: Scores the extracted information
5. **Gate**: Pass/fail criteria for the overall evaluation
6. **Result**: Metrics, scores, and detailed results
### What You Can Test
With Letta Evals, you can test aspects of agents that traditional frameworks can't:
- **Memory updates**: Did the agent correctly remember the user's name?
- **Multi-turn conversations**: Can the agent maintain context across multiple exchanges?
- **Tool usage**: Does the agent call the right tools with the right arguments?
- **State evolution**: How does the agent's internal state change over time?
<Note>
**Example: Testing Memory Updates**
```yaml
graders:
memory_check:
kind: tool # Deterministic grading
function: contains # Check if ground_truth in extracted content
extractor: memory_block # Extract from agent memory (not just response!)
extractor_config:
block_label: human # Which memory block to check
```
Dataset:
```jsonl
{"input": "Please remember that I like bananas.", "ground_truth": "bananas"}
```
This doesn't just check if the agent responded correctly - it verifies the agent actually stored "bananas" in its memory block. Traditional eval frameworks can't inspect agent state like this.
</Note>
## Why Evals Matter
AI agents are complex systems that can behave unpredictably. Without systematic evaluation, you can't:
- **Know if changes improve or break your agent** - Did that prompt tweak help or hurt?
- **Prevent regressions** - Catch when "fixes" break existing functionality
- **Compare approaches objectively** - Which model works better for your use case?
- **Build confidence before deployment** - Ensure quality before shipping to users
- **Track improvement over time** - Measure progress as you iterate
Manual testing doesn't scale. Evals let you test hundreds of scenarios in minutes.
## What Evals Are Useful For
### 1. Development & Iteration
- Test prompt changes instantly across your entire test suite
- Experiment with different models and compare results
- Validate that new features work as expected
### 2. Quality Assurance
- Prevent regressions when modifying agent behavior
- Ensure agents handle edge cases correctly
- Verify tool usage and memory updates
### 3. Model Selection
- Compare GPT-4 vs Claude vs other models on your specific use case
- Test different model configurations (temperature, system prompts, etc.)
- Find the right cost/performance tradeoff
### 4. Benchmarking
- Measure agent performance on standard tasks
- Track improvements over time
- Share reproducible results with your team
### 5. Production Readiness
- Validate agents meet quality thresholds before deployment
- Run continuous evaluation in CI/CD pipelines
- Monitor production agent quality
## How Letta Evals Works
Letta Evals is built around a few key concepts that work together to create a flexible evaluation framework.
## Key Components
### Suite
An **evaluation suite** is a complete test configuration defined in a YAML file. It ties together:
- Which dataset to use
- Which agent to test
- How to grade responses
- What criteria determine pass/fail
Think of a suite as a reusable test specification.
### Dataset
A **dataset** is a JSONL file where each line represents one test case. Each sample has:
- An input (what to ask the agent)
- Optional ground truth (the expected answer)
- Optional metadata (tags, custom fields)
### Target
The **target** is what you're evaluating. Currently, this is a Letta agent, specified by:
- An agent file (.af)
- An existing agent ID
- A Python script that creates agents programmatically
### Trajectory
A **trajectory** is the complete conversation history from one test case. It's a list of turns, where each turn contains a list of Letta messages (assistant messages, tool calls, tool returns, etc.).
### Extractor
An **extractor** determines what part of the trajectory to evaluate. For example:
- The last thing the agent said
- All tool calls made
- Content from agent memory
- Text matching a pattern
### Grader
A **grader** scores how well the agent performed. There are two types:
- **Tool graders**: Python functions that compare submission to ground truth
- **Rubric graders**: LLM judges that evaluate based on custom criteria
### Gate
A **gate** is the pass/fail threshold for your evaluation. It compares aggregate metrics (like average score or pass rate) against a target value.
## Multi-Metric Evaluation
You can define multiple graders in one suite to evaluate different aspects:
```yaml
graders:
accuracy: # Check if answer is correct
kind: tool
function: exact_match
extractor: last_assistant # Use final response
tool_usage: # Check if agent called the right tool
kind: tool
function: contains
extractor: tool_arguments # Extract tool call args
extractor_config:
tool_name: search # From search tool
```
The gate can check any of these metrics:
```yaml
gate:
metric_key: accuracy # Gate on accuracy (tool_usage still computed)
op: gte # >=
value: 0.8 # 80% threshold
```
## Score Normalization
All scores are normalized to the range [0.0, 1.0]:
- 0.0 = complete failure
- 1.0 = perfect success
- Values in between = partial credit
This allows different grader types to be compared and combined.
## Aggregate Metrics
Individual sample scores are aggregated in two ways:
1. **Average Score**: Mean of all scores (0.0 to 1.0)
2. **Accuracy/Pass Rate**: Percentage of samples passing a threshold
You can gate on either metric type.
## Next Steps
Dive deeper into each concept:
- [Suites](/guides/evals/concepts/suites) - Suite configuration in detail
- [Datasets](/guides/evals/concepts/datasets) - Creating effective test datasets
- [Targets](/guides/evals/concepts/targets) - Agent configuration options
- [Graders](/guides/evals/concepts/graders) - Understanding grader types
- [Extractors](/guides/evals/concepts/extractors) - Extraction strategies
- [Gates](/guides/evals/concepts/gates) - Setting pass/fail criteria

View File

@@ -1,273 +0,0 @@
# Suites
A **suite** is a YAML configuration file that defines a complete evaluation specification. It's the central piece that ties together your dataset, target agent, grading criteria, and pass/fail thresholds.
**Quick overview:**
- **Single file defines everything**: Dataset, agent, graders, and success criteria all in one YAML
- **Reusable and shareable**: Version control your evaluation specs alongside your code
- **Multi-metric support**: Evaluate multiple aspects (accuracy, quality, tool usage) in one run
- **Multi-model testing**: Run the same suite across different LLM models
- **Flexible filtering**: Test subsets using tags or sample limits
**Typical workflow:**
1. Create a suite YAML defining what and how to test
2. Run `letta-evals run suite.yaml`
3. Review results showing scores for each metric
4. Suite passes or fails based on gate criteria
An evaluation suite is a YAML configuration file that defines a complete test specification.
## Basic Structure
```yaml
name: my-evaluation # Suite identifier
description: Optional description of what this tests # Human-readable explanation
dataset: path/to/dataset.jsonl # Test cases
target: # What agent to evaluate
kind: agent
agent_file: agent.af # Agent to test
base_url: http://localhost:8283 # Letta server
graders: # How to evaluate responses
my_metric:
kind: tool # Deterministic grading
function: exact_match # Grading function
extractor: last_assistant # What to extract from agent response
gate: # Pass/fail criteria
metric_key: my_metric # Which metric to check
op: gte # Greater than or equal
value: 0.8 # 80% threshold
```
## Required Fields
### name
The name of your evaluation suite. Used in output and results.
```yaml
name: question-answering-eval
```
### dataset
Path to the JSONL or CSV dataset file. Can be relative (to the suite YAML) or absolute.
```yaml
dataset: ./datasets/qa.jsonl # Relative to suite YAML location
```
### target
Specifies what agent to evaluate. See [Targets](./targets.md) for details.
### graders
One or more graders to evaluate agent performance. See [Graders](./graders.md) for details.
### gate
Pass/fail criteria. See [Gates](./gates.md) for details.
## Optional Fields
### description
A human-readable description of what this suite tests:
```yaml
description: Tests the agent's ability to answer factual questions accurately
```
### max_samples
Limit the number of samples to evaluate (useful for quick tests):
```yaml
max_samples: 10 # Only evaluate first 10 samples
```
### sample_tags
Filter samples by tags (only evaluate samples with these tags):
```yaml
sample_tags: [math, easy] # Only samples tagged with "math" AND "easy"
```
Dataset samples can include tags:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
```
### num_runs
Number of times to run the entire evaluation suite (useful for testing non-deterministic behavior):
```yaml
num_runs: 5 # Run the evaluation 5 times
```
Default: 1
### setup_script
Path to a Python script with a setup function to run before evaluation:
```yaml
setup_script: setup.py:prepare_environment # script.py:function_name
```
The setup function should have this signature:
```python
def prepare_environment(suite: SuiteSpec) -> None:
# Setup code here
pass
```
## Path Resolution
Paths in the suite YAML are resolved relative to the YAML file location:
```
project/
├── suite.yaml
├── dataset.jsonl
└── agents/
└── my_agent.af
```
```yaml
# In suite.yaml
dataset: dataset.jsonl # Resolves to project/dataset.jsonl
target:
agent_file: agents/my_agent.af # Resolves to project/agents/my_agent.af
```
Absolute paths are used as-is.
## Multi-Grader Suites
You can evaluate multiple metrics in one suite:
```yaml
graders:
accuracy: # Check if answer is correct
kind: tool
function: exact_match
extractor: last_assistant
completeness: # LLM judges response quality
kind: rubric
prompt_path: rubrics/completeness.txt
model: gpt-4o-mini
extractor: last_assistant
tool_usage: # Verify correct tool was called
kind: tool
function: contains
extractor: tool_arguments # Extract tool call arguments
```
The gate can check any of these metrics:
```yaml
gate:
metric_key: accuracy # Gate on accuracy metric (others still computed)
op: gte # Greater than or equal
value: 0.9 # 90% threshold
```
Results will include scores for all graders, even if you only gate on one.
## Examples
### Simple Tool Grader Suite
```yaml
name: basic-qa # Suite name
dataset: questions.jsonl # Test questions
target:
kind: agent
agent_file: qa_agent.af # Pre-configured agent
base_url: http://localhost:8283 # Local server
graders:
accuracy: # Single metric
kind: tool # Deterministic grading
function: contains # Check if ground truth is in response
extractor: last_assistant # Use final agent message
gate:
metric_key: accuracy # Gate on this metric
op: gte # Must be >=
value: 0.75 # 75% to pass
```
### Rubric Grader Suite
```yaml
name: quality-eval # Quality evaluation
dataset: prompts.jsonl # Test prompts
target:
kind: agent
agent_id: existing-agent-123 # Use existing agent
base_url: https://api.letta.com # Letta Cloud
graders:
quality: # LLM-as-judge metric
kind: rubric # Subjective evaluation
prompt_path: quality_rubric.txt # Rubric template
model: gpt-4o-mini # Judge model
temperature: 0.0 # Deterministic
extractor: last_assistant # Evaluate final response
gate:
metric_key: quality # Gate on this metric
metric: avg_score # Use average score
op: gte # Must be >=
value: 0.7 # 70% to pass
```
### Multi-Model Suite
Test the same agent configuration across different models:
```yaml
name: model-comparison # Compare model performance
dataset: test.jsonl # Same test for all models
target:
kind: agent
agent_file: agent.af # Same agent configuration
base_url: http://localhost:8283 # Local server
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test both models
graders:
accuracy: # Single metric for comparison
kind: tool
function: exact_match
extractor: last_assistant
gate:
metric_key: accuracy # Both models must pass this
op: gte # Must be >=
value: 0.8 # 80% threshold
```
Results will show per-model metrics.
## Validation
Validate your suite configuration before running:
```bash
letta-evals validate suite.yaml
```
This checks:
- Required fields are present
- Paths exist
- Configuration is valid
- Grader/extractor combinations are compatible
## Next Steps
- [Dataset Configuration](./datasets.md)
- [Target Configuration](./targets.md)
- [Grader Configuration](./graders.md)
- [Gate Configuration](./gates.md)

View File

@@ -1,275 +0,0 @@
# Suites
A **suite** is a YAML configuration file that defines a complete evaluation specification. It's the central piece that ties together your dataset, target agent, grading criteria, and pass/fail thresholds.
<Note>
**Quick overview:**
- **Single file defines everything**: Dataset, agent, graders, and success criteria all in one YAML
- **Reusable and shareable**: Version control your evaluation specs alongside your code
- **Multi-metric support**: Evaluate multiple aspects (accuracy, quality, tool usage) in one run
- **Multi-model testing**: Run the same suite across different LLM models
- **Flexible filtering**: Test subsets using tags or sample limits
</Note>
**Typical workflow:**
1. Create a suite YAML defining what and how to test
2. Run `letta-evals run suite.yaml`
3. Review results showing scores for each metric
4. Suite passes or fails based on gate criteria
An evaluation suite is a YAML configuration file that defines a complete test specification.
## Basic Structure
```yaml
name: my-evaluation # Suite identifier
description: Optional description of what this tests # Human-readable explanation
dataset: path/to/dataset.jsonl # Test cases
target: # What agent to evaluate
kind: agent
agent_file: agent.af # Agent to test
base_url: https://api.letta.com # Letta server
graders: # How to evaluate responses
my_metric:
kind: tool # Deterministic grading
function: exact_match # Grading function
extractor: last_assistant # What to extract from agent response
gate: # Pass/fail criteria
metric_key: my_metric # Which metric to check
op: gte # Greater than or equal
value: 0.8 # 80% threshold
```
## Required Fields
### name
The name of your evaluation suite. Used in output and results.
```yaml
name: question-answering-eval
```
### dataset
Path to the JSONL or CSV dataset file. Can be relative (to the suite YAML) or absolute.
```yaml
dataset: ./datasets/qa.jsonl # Relative to suite YAML location
```
### target
Specifies what agent to evaluate. See [Targets](/guides/evals/concepts/targets) for details.
### graders
One or more graders to evaluate agent performance. See [Graders](/guides/evals/concepts/graders) for details.
### gate
Pass/fail criteria. See [Gates](/guides/evals/concepts/gates) for details.
## Optional Fields
### description
A human-readable description of what this suite tests:
```yaml
description: Tests the agent's ability to answer factual questions accurately
```
### max_samples
Limit the number of samples to evaluate (useful for quick tests):
```yaml
max_samples: 10 # Only evaluate first 10 samples
```
### sample_tags
Filter samples by tags (only evaluate samples with these tags):
```yaml
sample_tags: [math, easy] # Only samples tagged with "math" AND "easy"
```
Dataset samples can include tags:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
```
### num_runs
Number of times to run the entire evaluation suite (useful for testing non-deterministic behavior):
```yaml
num_runs: 5 # Run the evaluation 5 times
```
Default: 1
### setup_script
Path to a Python script with a setup function to run before evaluation:
```yaml
setup_script: setup.py:prepare_environment # script.py:function_name
```
The setup function should have this signature:
```python
def prepare_environment(suite: SuiteSpec) -> None:
# Setup code here
pass
```
## Path Resolution
Paths in the suite YAML are resolved relative to the YAML file location:
```
project/
├── suite.yaml
├── dataset.jsonl
└── agents/
└── my_agent.af
```
```yaml
# In suite.yaml
dataset: dataset.jsonl # Resolves to project/dataset.jsonl
target:
agent_file: agents/my_agent.af # Resolves to project/agents/my_agent.af
```
Absolute paths are used as-is.
## Multi-Grader Suites
You can evaluate multiple metrics in one suite:
```yaml
graders:
accuracy: # Check if answer is correct
kind: tool
function: exact_match
extractor: last_assistant
completeness: # LLM judges response quality
kind: rubric
prompt_path: rubrics/completeness.txt
model: gpt-4o-mini
extractor: last_assistant
tool_usage: # Verify correct tool was called
kind: tool
function: contains
extractor: tool_arguments # Extract tool call arguments
```
The gate can check any of these metrics:
```yaml
gate:
metric_key: accuracy # Gate on accuracy metric (others still computed)
op: gte # Greater than or equal
value: 0.9 # 90% threshold
```
Results will include scores for all graders, even if you only gate on one.
## Examples
### Simple Tool Grader Suite
```yaml
name: basic-qa # Suite name
dataset: questions.jsonl # Test questions
target:
kind: agent
agent_file: qa_agent.af # Pre-configured agent
  base_url: https://api.letta.com            # Letta Cloud
graders:
accuracy: # Single metric
kind: tool # Deterministic grading
function: contains # Check if ground truth is in response
extractor: last_assistant # Use final agent message
gate:
metric_key: accuracy # Gate on this metric
op: gte # Must be >=
value: 0.75 # 75% to pass
```
### Rubric Grader Suite
```yaml
name: quality-eval # Quality evaluation
dataset: prompts.jsonl # Test prompts
target:
kind: agent
agent_id: existing-agent-123 # Use existing agent
base_url: https://api.letta.com # Letta Cloud
graders:
quality: # LLM-as-judge metric
kind: rubric # Subjective evaluation
prompt_path: quality_rubric.txt # Rubric template
model: gpt-4o-mini # Judge model
temperature: 0.0 # Deterministic
extractor: last_assistant # Evaluate final response
gate:
metric_key: quality # Gate on this metric
metric: avg_score # Use average score
op: gte # Must be >=
value: 0.7 # 70% to pass
```
### Multi-Model Suite
Test the same agent configuration across different models:
```yaml
name: model-comparison # Compare model performance
dataset: test.jsonl # Same test for all models
target:
kind: agent
agent_file: agent.af # Same agent configuration
base_url: https://api.letta.com # Local server
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test both models
graders:
accuracy: # Single metric for comparison
kind: tool
function: exact_match
extractor: last_assistant
gate:
metric_key: accuracy # Both models must pass this
op: gte # Must be >=
value: 0.8 # 80% threshold
```
Results will show per-model metrics.
## Validation
Validate your suite configuration before running:
```bash
letta-evals validate suite.yaml
```
This checks:
- Required fields are present
- Paths exist
- Configuration is valid
- Grader/extractor combinations are compatible
## Next Steps
- [Dataset Configuration](/guides/evals/concepts/datasets)
- [Target Configuration](/guides/evals/concepts/targets)
- [Grader Configuration](/guides/evals/concepts/graders)
- [Gate Configuration](/guides/evals/concepts/gates)

View File

@@ -1,319 +0,0 @@
# Targets
A **target** is the agent you're evaluating. In Letta Evals, the target configuration determines how agents are created, accessed, and tested.
**Quick overview:**
- **Three ways to specify agents**: agent file (`.af`), existing agent ID, or programmatic creation script
- **Critical distinction**: `agent_file`/`agent_script` create fresh agents per sample (isolated tests), while `agent_id` uses one agent for all samples (stateful conversation)
- **Multi-model support**: Test the same agent configuration across different LLM models
- **Flexible connection**: Connect to local Letta servers or Letta Cloud
**When to use each approach:**
- `agent_file` - Pre-configured agents saved as `.af` files (most common)
- `agent_id` - Testing existing agents or multi-turn conversations with state
- `agent_script` - Dynamic agent creation with per-sample customization
The target configuration specifies how to create or access the agent for evaluation.
## Target Configuration
All targets have a `kind` field (currently only `agent` is supported):
```yaml
target:
kind: agent # Currently only "agent" is supported
# ... agent-specific configuration
```
## Agent Sources
You must specify exactly ONE of these:
### agent_file
Path to a `.af` (Agent File) to upload:
```yaml
target:
kind: agent
agent_file: path/to/agent.af # Path to .af file
base_url: http://localhost:8283 # Letta server URL
```
The agent file will be uploaded to the Letta server and a new agent created for the evaluation.
### agent_id
ID of an existing agent on the server:
```yaml
target:
kind: agent
agent_id: agent-123-abc # ID of existing agent
base_url: http://localhost:8283 # Letta server URL
```
The existing agent will be used directly. Note: this agent's memory will be modified during evaluation.
### agent_script
Path to a Python script with an agent factory function for programmatic agent creation:
```yaml
target:
kind: agent
agent_script: create_agent.py:create_inventory_agent # script.py:function_name
base_url: http://localhost:8283 # Letta server URL
```
Format: `path/to/script.py:function_name`
The function must be decorated with `@agent_factory` and have the signature `async (client: AsyncLetta, sample: Sample) -> str`:
```python
from letta_client import AsyncLetta, CreateBlock
from letta_evals.decorators import agent_factory
from letta_evals.models import Sample
@agent_factory
async def create_inventory_agent(client: AsyncLetta, sample: Sample) -> str:
"""Create and return agent ID for this sample."""
# Access custom arguments from the dataset
item = sample.agent_args.get("item", {})
# Create agent with sample-specific configuration
agent = await client.agents.create(
name="inventory-assistant",
memory_blocks=[
CreateBlock(
label="item_context",
value=f"Item: {item.get('name', 'Unknown')}"
)
],
agent_type="letta_v1_agent",
model="openai/gpt-4.1-mini",
embedding="openai/text-embedding-3-small",
)
return agent.id
```
**Key features:**
- Creates a fresh agent for each sample
- Can customize agents using `sample.agent_args` from the dataset
- Allows testing agent creation logic itself
- Useful when you don't have pre-saved agent files
**When to use:**
- Testing agent creation workflows
- Dynamic per-sample agent configuration
- Agents that need sample-specific memory or tools
- Programmatic agent testing
See [`examples/programmatic-agent-creation/`](https://github.com/letta-ai/letta-evals/tree/main/examples/programmatic-agent-creation) for a complete working example.
## Connection Configuration
### base_url
Letta server URL:
```yaml
target:
base_url: http://localhost:8283 # Local Letta server
# or
base_url: https://api.letta.com # Letta Cloud
```
Default: `http://localhost:8283`
### api_key
API key for authentication (required for Letta Cloud):
```yaml
target:
api_key: your-api-key-here # Required for Letta Cloud
```
Or set via environment variable:
```bash
export LETTA_API_KEY=your-api-key-here
```
### project_id
Letta project ID (for Letta Cloud):
```yaml
target:
project_id: proj_abc123 # Letta Cloud project
```
Or set via environment variable:
```bash
export LETTA_PROJECT_ID=proj_abc123
```
### timeout
Request timeout in seconds:
```yaml
target:
timeout: 300.0 # Request timeout (5 minutes)
```
Default: 300 seconds
## Multi-Model Evaluation
Test the same agent across different models:
### model_configs
List of model configuration names from JSON files:
```yaml
target:
kind: agent
agent_file: agent.af
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test with both models
```
The evaluation will run once for each model config. Model configs are JSON files in `letta_evals/llm_model_configs/`.
### model_handles
List of model handles (cloud-compatible identifiers):
```yaml
target:
kind: agent
agent_file: agent.af
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"] # Cloud model identifiers
```
Use this for Letta Cloud deployments.
**Note**: You cannot specify both `model_configs` and `model_handles`.
## Complete Examples
### Local Development
```yaml
target:
kind: agent
agent_file: ./agents/my_agent.af # Pre-configured agent
base_url: http://localhost:8283 # Local server
```
### Letta Cloud
```yaml
target:
kind: agent
agent_id: agent-cloud-123 # Existing cloud agent
base_url: https://api.letta.com # Letta Cloud
api_key: ${LETTA_API_KEY} # From environment variable
project_id: proj_abc # Your project ID
```
### Multi-Model Testing
```yaml
target:
kind: agent
agent_file: agent.af # Same agent configuration
base_url: http://localhost:8283 # Local server
model_configs: [gpt-4o-mini, gpt-4o, claude-3-5-sonnet] # Test 3 models
```
Results will include per-model metrics:
```
Model: gpt-4o-mini - Avg: 0.85, Pass: 85.0%
Model: gpt-4o - Avg: 0.92, Pass: 92.0%
Model: claude-3-5-sonnet - Avg: 0.88, Pass: 88.0%
```
### Programmatic Agent Creation
```yaml
target:
kind: agent
agent_script: setup.py:CustomAgentFactory # Programmatic creation
base_url: http://localhost:8283 # Local server
```
## Environment Variable Precedence
Configuration values are resolved in this order (highest priority first):
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
2. Suite YAML configuration
3. Environment variables (`LETTA_API_KEY`, `LETTA_BASE_URL`, `LETTA_PROJECT_ID`)
## Agent Lifecycle and Testing Behavior
The way your agent is specified fundamentally changes how the evaluation runs:
### With agent_file or agent_script: Independent Testing
**Agent lifecycle:**
1. A fresh agent instance is created for each sample
2. Agent processes the sample input(s)
3. Agent remains on the server after the sample completes
**Testing behavior:** Each sample is an independent, isolated test. Agent state (memory, message history) does not carry over between samples. This enables parallel execution and ensures reproducible results.
**Use cases:**
- Testing how the agent responds to various independent inputs
- Ensuring consistent behavior across different scenarios
- Regression testing where each case should be isolated
- Evaluating agent responses without prior context
**Example:** If you have 10 test cases, 10 separate agent instances will be created (one per test case), and they can run in parallel.
### With agent_id: Sequential Script Testing
**Agent lifecycle:**
1. The same agent instance is used for all samples
2. Agent processes each sample in sequence
3. Agent state persists throughout the entire evaluation
**Testing behavior:** The dataset becomes a conversation script where each sample builds on previous ones. Agent memory and message history accumulate, and earlier interactions affect later responses. Samples must execute sequentially.
**Use cases:**
- Testing multi-turn conversations with context
- Evaluating how agent memory evolves over time
- Simulating a single user session with multiple interactions
- Testing scenarios where context should accumulate
**Example:** If you have 10 test cases, they all run against the same agent instance in order, with state carrying over between each test.
### Critical Differences
| Aspect | agent_file / agent_script | agent_id |
|--------|---------------------------|----------|
| **Agent instances** | New agent per sample | Same agent for all samples |
| **State isolation** | Fully isolated | State carries over |
| **Execution** | Can run in parallel | Must run sequentially |
| **Memory** | Fresh for each sample | Accumulates across samples |
| **Use case** | Independent test cases | Conversation scripts |
| **Reproducibility** | Highly reproducible | Depends on execution order |
**Best practice:** Use `agent_file` or `agent_script` for most evaluations to ensure reproducible, isolated tests. Use `agent_id` only when you specifically need to test how agent state evolves across multiple interactions.
## Validation
The runner validates:
- Exactly one of `agent_file`, `agent_id`, or `agent_script` is specified
- Agent files have `.af` extension
- Agent script paths are valid
## Next Steps
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete target configuration options
- [Datasets](./datasets.md) - Using agent_args for sample-specific configuration
- [Getting Started](../getting-started.md) - Complete tutorial with target examples

View File

@@ -1,329 +0,0 @@
# Targets
A **target** is the agent you're evaluating. In Letta Evals, the target configuration determines how agents are created, accessed, and tested.
<Note>
**Quick overview:**
- **Three ways to specify agents**: agent file (`.af`), existing agent ID, or programmatic creation script
- **Critical distinction**: `agent_file`/`agent_script` create fresh agents per sample (isolated tests), while `agent_id` uses one agent for all samples (stateful conversation)
- **Multi-model support**: Test the same agent configuration across different LLM models
- **Flexible connection**: Connect to local Letta servers or Letta Cloud
</Note>
**When to use each approach:**
- `agent_file` - Pre-configured agents saved as `.af` files (most common)
- `agent_id` - Testing existing agents or multi-turn conversations with state
- `agent_script` - Dynamic agent creation with per-sample customization
The target configuration specifies how to create or access the agent for evaluation.
## Target Configuration
All targets have a `kind` field (currently only `agent` is supported):
```yaml
target:
kind: agent # Currently only "agent" is supported
# ... agent-specific configuration
```
## Agent Sources
You must specify exactly ONE of these:
### agent_file
Path to a `.af` (Agent File) to upload:
```yaml
target:
kind: agent
agent_file: path/to/agent.af # Path to .af file
base_url: https://api.letta.com # Letta server URL
```
The agent file will be uploaded to the Letta server and a new agent created for the evaluation.
### agent_id
ID of an existing agent on the server:
```yaml
target:
kind: agent
agent_id: agent-123-abc # ID of existing agent
base_url: https://api.letta.com # Letta server URL
```
<Warning>
**Modifies agent in-place:** Using `agent_id` will modify your agent's state, memory, and message history during evaluation. The same agent instance is used for all samples, processing them sequentially. **Do not use production agents or agents you don't want to modify.** Use `agent_file` or `agent_script` for reproducible, isolated testing.
</Warning>
### agent_script
Path to a Python script with an agent factory function for programmatic agent creation:
```yaml
target:
kind: agent
agent_script: create_agent.py:create_inventory_agent # script.py:function_name
base_url: https://api.letta.com # Letta server URL
```
Format: `path/to/script.py:function_name`
The function must be decorated with `@agent_factory` and have the signature `async (client: AsyncLetta, sample: Sample) -> str`:
```python
from letta_client import AsyncLetta, CreateBlock
from letta_evals.decorators import agent_factory
from letta_evals.models import Sample
@agent_factory
async def create_inventory_agent(client: AsyncLetta, sample: Sample) -> str:
"""Create and return agent ID for this sample."""
# Access custom arguments from the dataset
item = sample.agent_args.get("item", {})
# Create agent with sample-specific configuration
agent = await client.agents.create(
name="inventory-assistant",
memory_blocks=[
CreateBlock(
label="item_context",
value=f"Item: {item.get('name', 'Unknown')}"
)
],
agent_type="letta_v1_agent",
model="openai/gpt-4.1-mini",
embedding="openai/text-embedding-3-small",
)
return agent.id
```
**Key features:**
- Creates a fresh agent for each sample
- Can customize agents using `sample.agent_args` from the dataset
- Allows testing agent creation logic itself
- Useful when you don't have pre-saved agent files
**When to use:**
- Testing agent creation workflows
- Dynamic per-sample agent configuration
- Agents that need sample-specific memory or tools
- Programmatic agent testing
## Connection Configuration
### base_url
Letta server URL:
```yaml
target:
  base_url: http://localhost:8283            # Local Letta server
  # or
  base_url: https://api.letta.com            # Letta Cloud
```
Default: `https://api.letta.com`
### api_key
API key for authentication (required for Letta Cloud):
```yaml
target:
api_key: your-api-key-here # Required for Letta Cloud
```
Or set via environment variable:
```bash
export LETTA_API_KEY=your-api-key-here
```
### project_id
Letta project ID (for Letta Cloud):
```yaml
target:
project_id: proj_abc123 # Letta Cloud project
```
Or set via environment variable:
```bash
export LETTA_PROJECT_ID=proj_abc123
```
### timeout
Request timeout in seconds:
```yaml
target:
timeout: 300.0 # Request timeout (5 minutes)
```
Default: 300 seconds
## Multi-Model Evaluation
Test the same agent across different models:
### model_configs
List of model configuration names from JSON files:
```yaml
target:
kind: agent
agent_file: agent.af
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test with both models
```
The evaluation will run once for each model config. Model configs are JSON files in `letta_evals/llm_model_configs/`.
### model_handles
List of model handles (cloud-compatible identifiers):
```yaml
target:
kind: agent
agent_file: agent.af
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"] # Cloud model identifiers
```
Use this for Letta Cloud deployments.
<Warning>
**Note**: You cannot specify both `model_configs` and `model_handles`.
</Warning>
## Complete Examples
### Local Development
```yaml
target:
kind: agent
agent_file: ./agents/my_agent.af # Pre-configured agent
  base_url: http://localhost:8283            # Local server
```
### Letta Cloud
```yaml
target:
kind: agent
agent_id: agent-cloud-123 # Existing cloud agent
base_url: https://api.letta.com # Letta Cloud
api_key: ${LETTA_API_KEY} # From environment variable
project_id: proj_abc # Your project ID
```
### Multi-Model Testing
```yaml
target:
kind: agent
agent_file: agent.af # Same agent configuration
  base_url: https://api.letta.com            # Letta server
model_configs: [gpt-4o-mini, gpt-4o, claude-3-5-sonnet] # Test 3 models
```
Results will include per-model metrics:
```
Model: gpt-4o-mini - Avg: 0.85, Pass: 85.0%
Model: gpt-4o - Avg: 0.92, Pass: 92.0%
Model: claude-3-5-sonnet - Avg: 0.88, Pass: 88.0%
```
### Programmatic Agent Creation
```yaml
target:
kind: agent
agent_script: setup.py:CustomAgentFactory # Programmatic creation
  base_url: https://api.letta.com            # Letta server
```
## Environment Variable Precedence
Configuration values are resolved in this order (highest priority first):
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
2. Suite YAML configuration
3. Environment variables (`LETTA_API_KEY`, `LETTA_BASE_URL`, `LETTA_PROJECT_ID`)
## Agent Lifecycle and Testing Behavior
The way your agent is specified fundamentally changes how the evaluation runs:
### With agent_file or agent_script: Independent Testing
**Agent lifecycle:**
1. A fresh agent instance is created for each sample
2. Agent processes the sample input(s)
3. Agent remains on the server after the sample completes
**Testing behavior:** Each sample is an independent, isolated test. Agent state (memory, message history) does not carry over between samples. This enables parallel execution and ensures reproducible results.
**Use cases:**
- Testing how the agent responds to various independent inputs
- Ensuring consistent behavior across different scenarios
- Regression testing where each case should be isolated
- Evaluating agent responses without prior context
<Note>
**Example:** If you have 10 test cases, 10 separate agent instances will be created (one per test case), and they can run in parallel.
</Note>
### With agent_id: Sequential Script Testing
**Agent lifecycle:**
1. The same agent instance is used for all samples
2. Agent processes each sample in sequence
3. Agent state persists throughout the entire evaluation
**Testing behavior:** The dataset becomes a conversation script where each sample builds on previous ones. Agent memory and message history accumulate, and earlier interactions affect later responses. Samples must execute sequentially.
**Use cases:**
- Testing multi-turn conversations with context
- Evaluating how agent memory evolves over time
- Simulating a single user session with multiple interactions
- Testing scenarios where context should accumulate
<Note>
**Example:** If you have 10 test cases, they all run against the same agent instance in order, with state carrying over between each test.
</Note>
### Critical Differences
| Aspect | agent_file / agent_script | agent_id |
|--------|---------------------------|----------|
| **Agent instances** | New agent per sample | Same agent for all samples |
| **State isolation** | Fully isolated | State carries over |
| **Execution** | Can run in parallel | Must run sequentially |
| **Memory** | Fresh for each sample | Accumulates across samples |
| **Use case** | Independent test cases | Conversation scripts |
| **Reproducibility** | Highly reproducible | Depends on execution order |
<Tip>
**Best practice:** Use `agent_file` or `agent_script` for most evaluations to ensure reproducible, isolated tests. Use `agent_id` only when you specifically need to test how agent state evolves across multiple interactions.
</Tip>
## Validation
The runner validates:
- Exactly one of `agent_file`, `agent_id`, or `agent_script` is specified
- Agent files have `.af` extension
- Agent script paths are valid
## Next Steps
- [Suite YAML Reference](/guides/evals/configuration/suite-yaml) - Complete target configuration options
- [Datasets](/guides/evals/concepts/datasets) - Using agent_args for sample-specific configuration
- [Getting Started](/guides/evals/getting-started) - Complete tutorial with target examples

View File

@@ -1,783 +0,0 @@
# Suite YAML Reference
Complete reference for suite configuration files.
A **suite** is a YAML file that defines an evaluation: what agent to test, what dataset to use, how to grade responses, and what criteria determine pass/fail. This is your evaluation specification.
**Quick overview:**
- **name**: Identifier for your evaluation
- **dataset**: JSONL file with test cases
- **target**: Which agent to evaluate (via file, ID, or script)
- **graders**: How to score responses (tool or rubric graders)
- **gate**: Pass/fail criteria
See [Getting Started](../getting-started.md) for a tutorial, or [Core Concepts](../concepts/suites.md) for conceptual overview.
## File Structure
```yaml
name: string (required)
description: string (optional)
dataset: path (required)
max_samples: integer (optional)
sample_tags: array (optional)
num_runs: integer (optional)
setup_script: string (optional)
target: object (required)
kind: "agent"
base_url: string
api_key: string
timeout: float
project_id: string
agent_id: string (one of: agent_id, agent_file, agent_script)
agent_file: path
agent_script: string
model_configs: array
model_handles: array
graders: object (required)
<metric_key>: object
kind: "tool" | "rubric"
display_name: string
extractor: string
extractor_config: object
# Tool grader fields
function: string
# Rubric grader fields (LLM API)
prompt: string
prompt_path: path
model: string
temperature: float
provider: string
max_retries: integer
timeout: float
rubric_vars: array
# Rubric grader fields (agent-as-judge)
agent_file: path
judge_tool_name: string
gate: object (required)
metric_key: string
metric: "avg_score" | "accuracy"
op: "gte" | "gt" | "lte" | "lt" | "eq"
value: float
pass_op: "gte" | "gt" | "lte" | "lt" | "eq"
pass_value: float
```
## Top-Level Fields
### name (required)
Suite name, used in output and results.
**Type**: string
**Example**:
```yaml
name: question-answering-eval
```
### description (optional)
Human-readable description of what the suite tests.
**Type**: string
**Example**:
```yaml
description: Tests agent's ability to answer factual questions accurately
```
### dataset (required)
Path to JSONL dataset file. Relative paths are resolved from the suite YAML location.
**Type**: path (string)
**Example**:
```yaml
dataset: ./datasets/qa.jsonl
dataset: /absolute/path/to/dataset.jsonl
```
### max_samples (optional)
Limit the number of samples to evaluate. Useful for quick tests.
**Type**: integer
**Default**: All samples
**Example**:
```yaml
max_samples: 10 # Only evaluate first 10 samples
```
### sample_tags (optional)
Filter samples by tags. Only samples with ALL specified tags are evaluated.
**Type**: array of strings
**Example**:
```yaml
sample_tags: [math, easy] # Only samples tagged with both
```
Dataset samples need tags:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
```
### num_runs (optional)
Number of times to run the evaluation suite. Useful for testing non-deterministic behavior or collecting multiple runs for statistical analysis.
**Type**: integer
**Default**: 1
**Example**:
```yaml
num_runs: 5 # Run the evaluation 5 times
```
### setup_script (optional)
Path to Python script with setup function.
**Type**: string (format: `path/to/script.py:function_name`)
**Example**:
```yaml
setup_script: setup.py:prepare_environment
```
The function signature:
```python
def prepare_environment(suite: SuiteSpec) -> None:
# Setup code
pass
```
## target (required)
Configuration for the agent being evaluated.
### kind (required)
Type of target. Currently only `"agent"` is supported.
**Type**: string
**Example**:
```yaml
target:
kind: agent
```
### base_url (optional)
Letta server URL.
**Type**: string
**Default**: `http://localhost:8283`
**Example**:
```yaml
target:
base_url: http://localhost:8283
# or
base_url: https://api.letta.com
```
### api_key (optional)
API key for Letta authentication. Can also be set via `LETTA_API_KEY` environment variable.
**Type**: string
**Example**:
```yaml
target:
api_key: your-api-key-here
```
### timeout (optional)
Request timeout in seconds.
**Type**: float
**Default**: 300.0
**Example**:
```yaml
target:
timeout: 600.0 # 10 minutes
```
### project_id (optional)
Letta project ID (for Letta Cloud).
**Type**: string
**Example**:
```yaml
target:
project_id: proj_abc123
```
### Agent Source (required, pick one)
Exactly one of these must be specified:
#### agent_id
ID of existing agent on the server.
**Type**: string
**Example**:
```yaml
target:
agent_id: agent-123-abc
```
#### agent_file
Path to `.af` agent file.
**Type**: path (string, must end in `.af`)
**Example**:
```yaml
target:
agent_file: ./agents/my_agent.af
```
#### agent_script
Path to Python script with agent factory.
**Type**: string (format: `path/to/script.py:ClassName`)
**Example**:
```yaml
target:
agent_script: factory.py:MyAgentFactory
```
See [Targets](../concepts/targets.md) for details on agent sources.
### model_configs (optional)
List of model configuration names to test. Cannot be used with `model_handles`.
**Type**: array of strings
**Example**:
```yaml
target:
model_configs: [gpt-4o-mini, claude-3-5-sonnet]
```
### model_handles (optional)
List of model handles for cloud deployments. Cannot be used with `model_configs`.
**Type**: array of strings
**Example**:
```yaml
target:
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"]
```
## graders (required)
One or more graders, each with a unique key.
### Grader Key
The key becomes the metric name:
```yaml
graders:
accuracy: # This is the metric_key
kind: tool
...
quality: # Another metric_key
kind: rubric
...
```
### kind (required)
Grader type: `"tool"` or `"rubric"`.
**Type**: string
**Example**:
```yaml
graders:
my_metric:
kind: tool
```
### display_name (optional)
Human-friendly name for CLI/UI output.
**Type**: string
**Example**:
```yaml
graders:
acc:
display_name: "Answer Accuracy"
kind: tool
...
```
### extractor (required)
Name of the extractor to use.
**Type**: string
**Example**:
```yaml
graders:
my_metric:
extractor: last_assistant
```
### extractor_config (optional)
Configuration passed to the extractor.
**Type**: object
**Example**:
```yaml
graders:
my_metric:
extractor: pattern
extractor_config:
pattern: 'Answer: (.*)'
group: 1
```
### Tool Grader Fields
#### function (required for tool graders)
Name of the grading function.
**Type**: string
**Example**:
```yaml
graders:
accuracy:
kind: tool
function: exact_match
```
### Rubric Grader Fields
#### prompt (required if no prompt_path)
Inline rubric prompt.
**Type**: string
**Example**:
```yaml
graders:
quality:
kind: rubric
prompt: |
Evaluate response quality from 0.0 to 1.0.
Input: {input}
Response: {submission}
```
#### prompt_path (required if no prompt)
Path to rubric file. Cannot use both `prompt` and `prompt_path`.
**Type**: path (string)
**Example**:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubrics/quality.txt
```
#### model (optional)
LLM model for judging.
**Type**: string
**Default**: `gpt-4o-mini`
**Example**:
```yaml
graders:
quality:
kind: rubric
model: gpt-4o
```
#### temperature (optional)
Temperature for LLM generation.
**Type**: float (0.0 to 2.0)
**Default**: 0.0
**Example**:
```yaml
graders:
quality:
kind: rubric
temperature: 0.0
```
#### provider (optional)
LLM provider.
**Type**: string
**Default**: `openai`
**Example**:
```yaml
graders:
quality:
kind: rubric
provider: openai
```
#### max_retries (optional)
Maximum retry attempts for API calls.
**Type**: integer
**Default**: 5
**Example**:
```yaml
graders:
quality:
kind: rubric
max_retries: 3
```
#### timeout (optional)
Timeout for API calls in seconds.
**Type**: float
**Default**: 120.0
**Example**:
```yaml
graders:
quality:
kind: rubric
timeout: 60.0
```
#### rubric_vars (optional)
List of custom variable names that must be provided in the dataset for rubric template substitution. When specified, the grader validates that each sample includes these variables in its `rubric_vars` field.
**Type**: array of strings
**Example**:
```yaml
graders:
code_quality:
kind: rubric
rubric_vars: [reference_code, required_features] # Require these variables in dataset
prompt: |
Compare the submission to this reference:
{reference_code}
Required features: {required_features}
```
Dataset sample must provide these variables:
```jsonl
{"input": "Write a fibonacci function", "rubric_vars": {"reference_code": "def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)", "required_features": "recursion, base case"}}
```
See [Datasets - rubric_vars](../concepts/datasets.md#rubric_vars) for details.
#### agent_file (required for agent-as-judge)
Path to `.af` agent file to use as judge for rubric grading. Use this instead of `model` when you want a Letta agent to act as the evaluator.
**Type**: path (string)
**Mutually exclusive with**: `model`, `temperature`, `provider`, `max_retries`, `timeout`
**Example**:
```yaml
graders:
agent_judge:
kind: rubric
agent_file: judge.af # Judge agent with submit_grade tool
prompt_path: rubric.txt # Evaluation criteria
extractor: last_assistant
```
**Requirements**: The judge agent must have a tool with signature `submit_grade(score: float, rationale: str)`. The framework validates this on initialization.
See [Rubric Graders - Agent-as-Judge](../graders/rubric-graders.md#agent-as-judge) for complete documentation.
#### judge_tool_name (optional, for agent-as-judge)
Name of the tool that the judge agent uses to submit scores. Only applicable when using `agent_file`.
**Type**: string
**Default**: `submit_grade`
**Example**:
```yaml
graders:
agent_judge:
kind: rubric
agent_file: judge.af
judge_tool_name: submit_grade # Default, can be omitted
prompt_path: rubric.txt
extractor: last_assistant
```
**Tool requirements**: The tool must have exactly two parameters:
- `score: float` - Score between 0.0 and 1.0
- `rationale: str` - Explanation of the score
## gate (required)
Pass/fail criteria for the evaluation.
### metric_key (optional)
Which grader to evaluate. If only one grader, this can be omitted.
**Type**: string
**Example**:
```yaml
gate:
metric_key: accuracy # Must match a key in graders
```
### metric (optional)
Which aggregate to compare: `avg_score` or `accuracy`.
**Type**: string
**Default**: `avg_score`
**Example**:
```yaml
gate:
metric: avg_score
# or
metric: accuracy
```
### op (required)
Comparison operator.
**Type**: string (one of: `gte`, `gt`, `lte`, `lt`, `eq`)
**Example**:
```yaml
gate:
op: gte # Greater than or equal
```
### value (required)
Threshold value for comparison.
**Type**: float (0.0 to 1.0)
**Example**:
```yaml
gate:
value: 0.8 # Require >= 0.8
```
### pass_op (optional)
Comparison operator for per-sample pass criteria.
**Type**: string (one of: `gte`, `gt`, `lte`, `lt`, `eq`)
**Default**: Same as `op`
**Example**:
```yaml
gate:
metric: accuracy
pass_op: gte # Sample passes if...
pass_value: 0.7 # ...score >= 0.7
```
### pass_value (optional)
Threshold for per-sample pass.
**Type**: float (0.0 to 1.0)
**Default**: Same as `value` (or 1.0 for accuracy metric)
**Example**:
```yaml
gate:
metric: accuracy
op: gte
value: 0.8 # 80% must pass
pass_op: gte
pass_value: 0.7 # Sample passes if score >= 0.7
```
## Complete Examples
### Minimal Suite
```yaml
name: basic-eval
dataset: dataset.jsonl
target:
kind: agent
agent_file: agent.af
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant
gate:
op: gte
value: 0.8
```
### Multi-Metric Suite
```yaml
name: comprehensive-eval
description: Tests accuracy and quality
dataset: test_data.jsonl
max_samples: 100
target:
kind: agent
agent_file: agent.af
base_url: http://localhost:8283
graders:
accuracy:
display_name: "Answer Accuracy"
kind: tool
function: contains
extractor: last_assistant
quality:
display_name: "Response Quality"
kind: rubric
prompt_path: rubrics/quality.txt
model: gpt-4o-mini
temperature: 0.0
extractor: last_assistant
gate:
metric_key: accuracy
metric: avg_score
op: gte
value: 0.85
```
### Advanced Suite
```yaml
name: advanced-eval
description: Multi-model, multi-metric evaluation
dataset: comprehensive_tests.jsonl
sample_tags: [production]
setup_script: setup.py:prepare
target:
kind: agent
agent_script: factory.py:CustomFactory
base_url: https://api.letta.com
api_key: ${LETTA_API_KEY}
project_id: proj_abc123
model_configs: [gpt-4o-mini, claude-3-5-sonnet]
graders:
answer:
kind: tool
function: exact_match
extractor: last_assistant
tool_usage:
kind: tool
function: contains
extractor: tool_arguments
extractor_config:
tool_name: search
memory:
kind: tool
function: contains
extractor: memory_block
extractor_config:
block_label: human
gate:
metric_key: answer
metric: accuracy
op: gte
value: 0.9
pass_op: gte
pass_value: 1.0
```
## Validation
Validate your suite before running:
```bash
letta-evals validate suite.yaml
```
## Next Steps
- [Targets](../concepts/targets.md) - Understanding agent sources and configuration
- [Graders](../concepts/graders.md) - Tool graders vs rubric graders
- [Extractors](../concepts/extractors.md) - What to extract from agent responses
- [Gates](../concepts/gates.md) - Setting pass/fail criteria

View File

@@ -1,427 +0,0 @@
# Suite YAML Reference
Complete reference for suite configuration files.
A **suite** is a YAML file that defines an evaluation: what agent to test, what dataset to use, how to grade responses, and what criteria determine pass/fail. This is your evaluation specification.
<Note>
**Quick overview:**
- **name**: Identifier for your evaluation
- **dataset**: JSONL file with test cases
- **target**: Which agent to evaluate (via file, ID, or script)
- **graders**: How to score responses (tool or rubric graders)
- **gate**: Pass/fail criteria
</Note>
See [Getting Started](/guides/evals/getting-started) for a tutorial, or [Core Concepts](/guides/evals/concepts/suites) for conceptual overview.
## File Structure
```yaml
name: string (required)
description: string (optional)
dataset: path (required)
max_samples: integer (optional)
sample_tags: array (optional)
num_runs: integer (optional)
setup_script: string (optional)
target: object (required)
kind: "agent"
base_url: string
api_key: string
timeout: float
project_id: string
agent_id: string (one of: agent_id, agent_file, agent_script)
agent_file: path
agent_script: string
model_configs: array
model_handles: array
graders: object (required)
<metric_key>: object
kind: "tool" | "rubric"
display_name: string
extractor: string
extractor_config: object
# Tool grader fields
function: string
# Rubric grader fields (LLM API)
prompt: string
prompt_path: path
model: string
temperature: float
provider: string
max_retries: integer
timeout: float
rubric_vars: array
# Rubric grader fields (agent-as-judge)
agent_file: path
judge_tool_name: string
gate: object (required)
metric_key: string
metric: "avg_score" | "accuracy"
op: "gte" | "gt" | "lte" | "lt" | "eq"
value: float
pass_op: "gte" | "gt" | "lte" | "lt" | "eq"
pass_value: float
```
## Top-Level Fields
### name (required)
Suite name, used in output and results.
**Type**: string
```yaml
name: question-answering-eval
```
### description (optional)
Human-readable description of what the suite tests.
**Type**: string
```yaml
description: Tests agent's ability to answer factual questions accurately
```
### dataset (required)
Path to JSONL dataset file. Relative paths are resolved from the suite YAML location.
**Type**: path (string)
```yaml
dataset: ./datasets/qa.jsonl
dataset: /absolute/path/to/dataset.jsonl
```
### max_samples (optional)
Limit the number of samples to evaluate. Useful for quick tests.
**Type**: integer | **Default**: All samples
```yaml
max_samples: 10 # Only evaluate first 10 samples
```
### sample_tags (optional)
Filter samples by tags. Only samples with ALL specified tags are evaluated.
**Type**: array of strings
```yaml
sample_tags: [math, easy] # Only samples tagged with both
```
### num_runs (optional)
Number of times to run the evaluation suite.
**Type**: integer | **Default**: 1
```yaml
num_runs: 5 # Run the evaluation 5 times
```
### setup_script (optional)
Path to Python script with setup function.
**Type**: string (format: `path/to/script.py:function_name`)
```yaml
setup_script: setup.py:prepare_environment
```
## target (required)
Configuration for the agent being evaluated.
### kind (required)
Type of target. Currently only `"agent"` is supported.
```yaml
target:
kind: agent
```
### base_url (optional)
Letta server URL. **Default**: `https://api.letta.com`
```yaml
target:
base_url: https://api.letta.com
# or
base_url: http://localhost:8283
```
### api_key (optional)
API key for Letta authentication. Can also be set via `LETTA_API_KEY` environment variable.
```yaml
target:
api_key: your-api-key-here
```
### timeout (optional)
Request timeout in seconds. **Default**: 300.0
```yaml
target:
timeout: 600.0 # 10 minutes
```
### Agent Source (required, pick one)
Exactly one of these must be specified:
#### agent_id
ID of existing agent on the server.
```yaml
target:
agent_id: agent-123-abc
```
#### agent_file
Path to `.af` agent file.
```yaml
target:
agent_file: ./agents/my_agent.af
```
#### agent_script
Path to Python script with agent factory.
```yaml
target:
agent_script: factory.py:MyAgentFactory
```
See [Targets](/guides/evals/concepts/targets) for details on agent sources.
### model_configs (optional)
List of model configuration names to test. Cannot be used with `model_handles`.
```yaml
target:
model_configs: [gpt-4o-mini, claude-3-5-sonnet]
```
### model_handles (optional)
List of model handles for cloud deployments. Cannot be used with `model_configs`.
```yaml
target:
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"]
```
## graders (required)
One or more graders, each with a unique key.
### kind (required)
Grader type: `"tool"` or `"rubric"`.
```yaml
graders:
my_metric:
kind: tool
```
### extractor (required)
Name of the extractor to use.
```yaml
graders:
my_metric:
extractor: last_assistant
```
### Tool Grader Fields
#### function (required for tool graders)
Name of the grading function.
```yaml
graders:
accuracy:
kind: tool
function: exact_match
```
### Rubric Grader Fields
#### prompt or prompt_path (required)
Inline rubric prompt or path to rubric file.
```yaml
graders:
quality:
kind: rubric
prompt: |
Evaluate response quality from 0.0 to 1.0.
```
#### model (optional)
LLM model for judging. **Default**: `gpt-4o-mini`
```yaml
graders:
quality:
kind: rubric
model: gpt-4o
```
#### temperature (optional)
Temperature for LLM generation. **Default**: 0.0
```yaml
graders:
quality:
kind: rubric
temperature: 0.0
```
#### agent_file (agent-as-judge)
Path to `.af` agent file to use as judge.
```yaml
graders:
agent_judge:
kind: rubric
agent_file: judge.af
prompt_path: rubric.txt
```
## gate (required)
Pass/fail criteria for the evaluation.
### metric_key (optional)
Which grader to evaluate. If only one grader, this can be omitted.
```yaml
gate:
metric_key: accuracy
```
### metric (optional)
Which aggregate to compare: `avg_score` or `accuracy`. **Default**: `avg_score`
```yaml
gate:
metric: avg_score
```
### op (required)
Comparison operator: `gte`, `gt`, `lte`, `lt`, `eq`
```yaml
gate:
op: gte # Greater than or equal
```
### value (required)
Threshold value for comparison (0.0 to 1.0).
```yaml
gate:
value: 0.8 # Require >= 0.8
```
## Complete Examples
### Minimal Suite
```yaml
name: basic-eval
dataset: dataset.jsonl
target:
kind: agent
agent_file: agent.af
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant
gate:
op: gte
value: 0.8
```
### Multi-Metric Suite
```yaml
name: comprehensive-eval
description: Tests accuracy and quality
dataset: test_data.jsonl
target:
kind: agent
agent_file: agent.af
graders:
accuracy:
kind: tool
function: contains
extractor: last_assistant
quality:
kind: rubric
prompt_path: rubrics/quality.txt
model: gpt-4o-mini
extractor: last_assistant
gate:
metric_key: accuracy
op: gte
value: 0.85
```
## Validation
Validate your suite before running:
```bash
letta-evals validate suite.yaml
```
## Next Steps
- [Targets](/guides/evals/concepts/targets) - Understanding agent sources and configuration
- [Graders](/guides/evals/concepts/graders) - Tool graders vs rubric graders
- [Extractors](/guides/evals/concepts/extractors) - What to extract from agent responses
- [Gates](/guides/evals/concepts/gates) - Setting pass/fail criteria

View File

@@ -1,218 +0,0 @@
# Built-in Extractors Reference
Letta Evals provides a set of built-in extractors that cover the most common extraction needs. These extractors let you pull specific content from agent conversations without writing any custom code.
**What are extractors?** Extractors determine what part of an agent's response gets evaluated. They take the full conversation trajectory (all messages, tool calls, and state changes) and extract just the piece you want to grade.
**Common use cases:**
- Extract the agent's final answer (`last_assistant`)
- Check what tools were called and with what arguments (`tool_arguments`)
- Verify memory was updated correctly (`memory_block`)
- Parse structured output with regex (`pattern`)
- Get all messages from a conversation (`all_assistant`)
**Quick example:**
```yaml
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant # Extract final response
```
Each extractor below can be used with any grader by specifying it in your suite YAML. For custom extraction logic, see [Custom Extractors](./custom.md).
## `last_assistant`
Extracts the last assistant message content.
**Configuration**: None required
**Example**:
```yaml
extractor: last_assistant
```
**Use case**: Most common - get the agent's final response
**Output**: Content of the last assistant message
## `first_assistant`
Extracts the first assistant message content.
**Configuration**: None required
**Example**:
```yaml
extractor: first_assistant
```
**Use case**: Test immediate responses before tool usage
**Output**: Content of the first assistant message
## `all_assistant`
Concatenates all assistant messages with a separator.
**Configuration**:
- `separator` (optional): String to join messages (default: `"\n"`)
**Example**:
```yaml
extractor: all_assistant # Get all agent messages
extractor_config:
separator: "\n\n" # Separate with double newlines
```
**Use case**: Evaluate complete conversation context
**Output**: All assistant messages joined by separator
## `last_turn`
Extracts all assistant messages from the last conversation turn.
**Configuration**:
- `separator` (optional): String to join messages (default: `"\n"`)
**Example**:
```yaml
extractor: last_turn # Get messages from final turn
extractor_config:
separator: " " # Join with spaces
```
**Use case**: When agent makes multiple statements in final turn
**Output**: Assistant messages from last turn joined by separator
## `pattern`
Extracts content matching a regex pattern.
**Configuration**:
- `pattern` (required): Regex pattern to match
- `group` (optional): Capture group to extract (default: 0)
- `search_all` (optional): Find all matches vs first match (default: false)
**Example**:
```yaml
extractor: pattern # Extract using regex
extractor_config:
pattern: 'Result: (\d+)' # Match "Result: " followed by digits
group: 1 # Extract just the number (capture group 1)
```
**Use case**: Extract structured content (numbers, codes, formatted output)
**Output**: Matched pattern or capture group
## `tool_arguments`
Extracts arguments from a specific tool call.
**Configuration**:
- `tool_name` (required): Name of the tool to extract from
**Example**:
```yaml
extractor: tool_arguments # Extract tool call arguments
extractor_config:
tool_name: search # Get arguments from "search" tool
```
**Use case**: Validate tool was called with correct arguments
**Output**: JSON string of tool arguments
Example output: `{"query": "pandas", "limit": 10}`
## `tool_output`
Extracts the return value from a specific tool call.
**Configuration**:
- `tool_name` (required): Name of the tool whose output to extract
**Example**:
```yaml
extractor: tool_output # Extract tool return value
extractor_config:
tool_name: search # Get return value from "search" tool
```
**Use case**: Check tool return values
**Output**: Tool return value as string
## `after_marker`
Extracts content after a specific marker string.
**Configuration**:
- `marker` (required): String marker to search for
- `include_marker` (optional): Include marker in output (default: false)
**Example**:
```yaml
extractor: after_marker # Extract content after a marker
extractor_config:
marker: "ANSWER:" # Find this marker in the response
include_marker: false # Don't include "ANSWER:" in output
```
**Use case**: Extract structured responses with markers
**Output**: Content after the marker
Example: From "Analysis... ANSWER: Paris", extracts "Paris"
## `memory_block`
Extracts content from a specific memory block.
**Configuration**:
- `block_label` (required): Label of the memory block
**Example**:
```yaml
extractor: memory_block # Extract from agent memory
extractor_config:
block_label: human # Get content from "human" memory block
```
**Use case**: Validate agent memory updates
**Output**: Content of the specified memory block
**Important**: This extractor requires `agent_state`, which adds overhead. The runner automatically fetches it when needed.
## Quick Reference Table
| Extractor | Config Required | Use Case | Agent State? |
|-----------|----------------|----------|--------------|
| `last_assistant` | No | Final response | No |
| `first_assistant` | No | Initial response | No |
| `all_assistant` | Optional | Full conversation | No |
| `last_turn` | Optional | Final turn messages | No |
| `pattern` | Yes | Regex extraction | No |
| `tool_arguments` | Yes | Tool call args | No |
| `tool_output` | Yes | Tool return value | No |
| `after_marker` | Yes | Marker-based extraction | No |
| `memory_block` | Yes | Memory content | Yes |
## Listing Extractors
See all available extractors:
```bash
letta-evals list-extractors
```
## Next Steps
- [Custom Extractors](./custom.md) - Write your own extraction logic
- [Core Concepts: Extractors](../concepts/extractors.md) - How extractors work in the evaluation flow
- [Graders](../concepts/graders.md) - Using extractors with graders

View File

@@ -1,409 +0,0 @@
# Custom Extractors
Create your own extractors to pull exactly what you need from agent trajectories.
While built-in extractors cover common cases (last assistant message, tool arguments, memory blocks), custom extractors let you implement specialized extraction logic for your specific use case.
## Why Custom Extractors?
Use custom extractors when you need to:
- **Extract structured data**: Parse JSON fields from agent responses
- **Filter specific patterns**: Extract code blocks, URLs, or formatted content
- **Combine data sources**: Merge information from multiple messages or memory blocks
- **Count occurrences**: Track how many times something happened in the conversation
- **Complex logic**: Implement domain-specific extraction that built-ins can't handle
**Example**: You want to test if your agent correctly stores fruit preferences in memory using the `memory_insert` tool. A custom extractor can grab the tool call arguments, and a custom grader can verify the fruit name is in the right memory block.
## Quick Example
Here's a real custom extractor that pulls `memory_insert` tool call arguments:
```python
from typing import List
from letta_client import LettaMessageUnion, ToolCallMessage
from letta_evals.decorators import extractor
@extractor
def memory_insert_extractor(trajectory: List[List[LettaMessageUnion]], config: dict) -> str:
"""Extract memory_insert tool call arguments from trajectory."""
for turn in trajectory:
for message in turn:
if isinstance(message, ToolCallMessage) and message.tool_call.name == "memory_insert":
return message.tool_call.arguments
return "{}" # Return empty JSON if not found
```
This extractor:
1. Loops through all conversation turns
2. Finds `ToolCallMessage` objects
3. Checks if the tool is `memory_insert`
4. Returns the JSON arguments
5. Returns `"{}"` if no matching tool call found
You can then pair this with a custom grader to verify the arguments are correct (see [Custom Graders](../advanced/custom-graders.md)).
## Basic Structure
```python
from typing import List, Optional
from letta_client import LettaMessageUnion, AgentState
from letta_evals.decorators import extractor
@extractor
def my_extractor(
trajectory: List[List[LettaMessageUnion]],
config: dict,
agent_state: Optional[AgentState] = None
) -> str:
"""Your custom extraction logic."""
# Extract and return content
return extracted_text
```
## The @extractor Decorator
The `@extractor` decorator registers your function:
```python
from letta_evals.decorators import extractor
@extractor # Makes this available as "my_extractor"
def my_extractor(trajectory, config, agent_state=None):
...
```
## Function Signature
### Required Parameters
- `trajectory`: List of conversation turns, each containing messages
- `config`: Dictionary with extractor configuration from YAML
### Optional Parameters
- `agent_state`: Agent state (only needed if extracting from memory blocks or other agent state). Most extractors only need the trajectory.
### Return Value
Must return a string - the extracted content to be graded.
## Trajectory Structure
The trajectory is a list of turns:
```python
[
# Turn 1
[
UserMessage(...),
AssistantMessage(...),
ToolCallMessage(...),
ToolReturnMessage(...)
],
# Turn 2
[
AssistantMessage(...)
]
]
```
Message types:
- `UserMessage`: User input
- `AssistantMessage`: Agent response
- `ToolCallMessage`: Tool invocation
- `ToolReturnMessage`: Tool result
- `SystemMessage`: System messages
## Configuration
Access extractor config via the `config` parameter:
```yaml
extractor: my_extractor
extractor_config:
max_length: 100 # Truncate output at 100 chars
include_metadata: true # Include metadata in extraction
```
```python
@extractor
def my_extractor(trajectory, config, agent_state=None):
max_length = config.get("max_length", 500)
include_metadata = config.get("include_metadata", False)
...
```
## Examples
### Extract Last N Messages
```python
from letta_evals.decorators import extractor
from letta_evals.extractors.utils import get_assistant_messages, flatten_content
@extractor
def last_n_messages(trajectory, config, agent_state=None):
"""Extract the last N assistant messages."""
n = config.get("n", 3)
messages = get_assistant_messages(trajectory)
last_n = messages[-n:] if len(messages) >= n else messages
contents = [flatten_content(msg.content) for msg in last_n]
return "\n".join(contents)
```
Usage:
```yaml
extractor: last_n_messages # Use custom extractor
extractor_config:
n: 3 # Extract last 3 assistant messages
```
### Extract JSON Field
```python
import json
from letta_evals.decorators import extractor
from letta_evals.extractors.utils import get_assistant_messages, flatten_content
@extractor
def json_field(trajectory, config, agent_state=None):
"""Extract a specific field from JSON response."""
field_name = config.get("field", "result")
messages = get_assistant_messages(trajectory)
if not messages:
return ""
content = flatten_content(messages[-1].content)
try:
data = json.loads(content)
return str(data.get(field_name, ""))
except json.JSONDecodeError:
return ""
```
Usage:
```yaml
extractor: json_field # Parse JSON from agent response
extractor_config:
field: result # Extract the "result" field from JSON
```
### Extract Code Blocks
```python
import re
from letta_evals.decorators import extractor
from letta_evals.extractors.utils import get_assistant_messages, flatten_content
@extractor
def code_blocks(trajectory, config, agent_state=None):
"""Extract all code blocks from messages."""
language = config.get("language", None) # Optional: filter by language
messages = get_assistant_messages(trajectory)
code_pattern = r'```(?:(\w+)\n)?(.*?)```'
all_code = []
for msg in messages:
content = flatten_content(msg.content)
matches = re.findall(code_pattern, content, re.DOTALL)
for lang, code in matches:
if language is None or lang == language:
all_code.append(code.strip())
return "\n\n".join(all_code)
```
Usage:
```yaml
extractor: code_blocks # Extract code from markdown blocks
extractor_config:
language: python # Optional: only extract Python code blocks
```
### Extract Tool Call Count
```python
from letta_client import ToolCallMessage
from letta_evals.decorators import extractor
@extractor
def tool_call_count(trajectory, config, agent_state=None):
"""Count how many times a specific tool was called."""
tool_name = config.get("tool_name")
count = 0
for turn in trajectory:
for message in turn:
if isinstance(message, ToolCallMessage):
if tool_name is None or message.tool_call.name == tool_name:
count += 1
return str(count)
```
Usage:
```yaml
extractor: tool_call_count # Count tool invocations
extractor_config:
tool_name: search # Optional: count only "search" tool calls
```
### Extract Multiple Memory Blocks
```python
from letta_evals.decorators import extractor
@extractor
def multiple_memory_blocks(trajectory, config, agent_state=None):
"""Extract and concatenate multiple memory blocks."""
if agent_state is None:
return ""
block_labels = config.get("block_labels", ["human", "persona"])
separator = config.get("separator", "\n---\n")
blocks = []
for block in agent_state.memory.blocks:
if block.label in block_labels:
blocks.append(f"{block.label}: {block.value}")
return separator.join(blocks)
```
Usage:
```yaml
extractor: multiple_memory_blocks # Combine multiple memory blocks
extractor_config:
block_labels: [human, persona] # Which blocks to extract
separator: "\n---\n" # How to separate them in output
```
## Helper Utilities
The framework provides helper functions:
### get_assistant_messages
```python
from letta_evals.extractors.utils import get_assistant_messages
messages = get_assistant_messages(trajectory)
# Returns list of AssistantMessage objects
```
### get_last_turn_messages
```python
from letta_evals.extractors.utils import get_last_turn_messages
from letta_client import AssistantMessage
messages = get_last_turn_messages(trajectory, AssistantMessage)
# Returns assistant messages from last turn
```
### flatten_content
```python
from letta_evals.extractors.utils import flatten_content
text = flatten_content(message.content)
# Converts complex content to plain text
```
## Agent State Requirements
If your extractor needs agent state, include it in the signature:
```python
@extractor
def my_extractor(trajectory, config, agent_state: Optional[AgentState] = None):
if agent_state is None:
raise RuntimeError("This extractor requires agent_state")
# Use agent_state.memory.blocks, etc.
...
```
The runner will automatically fetch agent state when your extractor is used.
**Note**: Fetching agent state adds overhead. Only use when necessary.
## Using Custom Extractors
### Method 1: Custom Evaluators File
Create `custom_evaluators.py`:
```python
from letta_evals.decorators import extractor
@extractor
def my_extractor(trajectory, config, agent_state=None):
...
```
The file will be discovered automatically if in the same directory.
### Method 2: Setup Script
Use a setup script to import custom extractors before the suite runs:
```python
# setup.py
from letta_evals.models import SuiteSpec
import custom_extractors # Imports and registers your @extractor functions
def prepare_environment(suite: SuiteSpec) -> None:
# Runs before evaluation starts
pass
```
```yaml
setup_script: setup.py:prepare_environment # Import custom extractors
graders:
my_metric:
extractor: my_extractor # Now available from custom_extractors
```
## Testing Your Extractor
```python
from letta_client import AssistantMessage
# Mock trajectory
trajectory = [
[
AssistantMessage(
content="The answer is 42",
role="assistant"
)
]
]
config = {"max_length": 100}
result = my_extractor(trajectory, config)
print(f"Extracted: {result}")
```
## Best Practices
1. **Handle empty trajectories**: Check if messages exist
2. **Return strings**: Always return a string, not None
3. **Use config for flexibility**: Make behavior configurable
4. **Document required config**: Explain config parameters
5. **Handle errors gracefully**: Return empty string on error
6. **Keep it fast**: Extractors run for every sample
7. **Use helper utilities**: Leverage built-in functions
## Next Steps
- [Built-in Extractors](./builtin.md) - Learn from examples
- [Custom Graders](../advanced/custom-graders.md) - Pair with custom grading
- [Core Concepts](../concepts/extractors.md) - How extractors work

View File

@@ -1,325 +0,0 @@
# Getting Started with Letta Evals
This guide will help you get up and running with Letta Evals in minutes.
## What is Letta Evals?
Letta Evals is a framework for testing Letta AI agents. It allows you to:
- Test agent responses against expected outputs
- Evaluate subjective quality using LLM judges
- Test tool usage and memory updates
- Track metrics across multiple evaluation runs
- Gate deployments on quality thresholds
Unlike most evaluation frameworks designed for simple input-output models, Letta Evals is built for [stateful agents](https://www.letta.com/blog/stateful-agents) that maintain memory, use tools, and evolve over time.
## Prerequisites
- Python 3.11 or higher
- A running Letta server ([local](https://docs.letta.com/guides/selfhosting) or [Letta Cloud](https://docs.letta.com/guides/cloud/overview))
- A Letta agent to test, either in agent file format or by ID (see [Targets](./concepts/targets.md) for more details)
## Installation
```bash
pip install letta-evals
```
Or with uv:
```bash
uv pip install letta-evals
```
## Getting an Agent to Test
Before you can run evaluations, you need a Letta agent. You have two options:
### Option 1: Use an Agent File (.af)
Export an existing agent to a file using the Letta SDK:
```python
from letta_client import Letta
import os
client = Letta(
base_url="http://localhost:8283", # or https://api.letta.com for Letta Cloud
token=os.getenv("LETTA_API_KEY") # required for Letta Cloud
)
# Export an agent to a file
agent_file = client.agents.export_file(agent_id="agent-123")
# Save to disk
with open("my_agent.af", "w") as f:
f.write(agent_file)
```
Or export via the Agent Development Environment (ADE) by selecting "Export Agent".
This creates an `.af` file which you can reference in your suite configuration:
```yaml
target:
kind: agent
agent_file: my_agent.af
```
**How it works:** When using an agent file, a fresh agent instance is created for each sample in your dataset. Each test runs independently with a clean slate, making this ideal for parallel testing across different inputs.
**Example:** If your dataset has 5 samples, 5 separate agents will be created and can run in parallel. Each agent starts fresh with no memory of the other tests.
### Option 2: Use an Existing Agent ID
If you already have a running agent, use its ID directly:
```python
from letta_client import Letta
import os
client = Letta(
base_url="http://localhost:8283", # or https://api.letta.com for Letta Cloud
token=os.getenv("LETTA_API_KEY") # required for Letta Cloud
)
# List all agents
agents = client.agents.list()
for agent in agents:
print(f"Agent: {agent.name}, ID: {agent.id}")
```
Then reference it in your suite:
```yaml
target:
kind: agent
agent_id: agent-abc-123
```
**How it works:** The same agent instance is used for all samples, processing them sequentially. The agent's state (memory, message history) carries over between samples, making the dataset behave more like a conversation script than independent test cases.
**Example:** If your dataset has 5 samples, they all run against the same agent one after another. The agent "remembers" each previous interaction, so sample 3 can reference information from samples 1 and 2.
### Which Should You Use?
**Agent File (.af)** - Use when testing independent scenarios
Best for testing how the agent responds to independent, isolated inputs. Each sample gets a fresh agent with no prior context. Tests can run in parallel.
**Typical scenarios:**
- "How does the agent answer different questions?"
- "Does the agent correctly use tools for various tasks?"
- "Testing behavior across different prompts"
**Agent ID** - Use when testing conversational flows
Best for testing conversational flows or scenarios where context should build up over time. The agent's state accumulates as it processes each sample sequentially.
**Typical scenarios:**
- "Does the agent remember information across a conversation?"
- "How does the agent's memory evolve over multiple exchanges?"
- "Simulating a realistic user session with multiple requests"
**Recommendation:** For most evaluation scenarios, use agent files to ensure consistent, reproducible test conditions. Only use agent IDs when you specifically want to test stateful, sequential interactions.
For more details on agent lifecycle and testing behaviors, see the [Targets guide](./concepts/targets.md#agent-lifecycle-and-testing-behavior).
## Quick Start
Let's create your first evaluation in 3 steps:
### 1. Create a Test Dataset
Create a file named `dataset.jsonl`:
```jsonl
{"input": "What's the capital of France?", "ground_truth": "Paris"}
{"input": "Calculate 2+2", "ground_truth": "4"}
{"input": "What color is the sky?", "ground_truth": "blue"}
```
Each line is a JSON object with:
- `input`: The prompt to send to your agent
- `ground_truth`: The expected answer (used for grading)
Note: `ground_truth` is optional for some graders (like rubric graders), but required for tool graders like `contains` and `exact_match`.
Read more about [Datasets](./concepts/datasets.md) for details on how to create your dataset.
### 2. Create a Suite Configuration
Create a file named `suite.yaml`:
```yaml
name: my-first-eval
dataset: dataset.jsonl
target:
kind: agent
agent_file: my_agent.af # Path to your agent file
base_url: http://localhost:8283 # Your Letta server
graders:
quality:
kind: tool
function: contains # Check if response contains the ground truth
extractor: last_assistant # Use the last assistant message
gate:
metric_key: quality
op: gte
value: 0.75 # Require 75% pass rate
```
The suite configuration defines:
- The [dataset](./concepts/datasets.md) to use
- The [agent](./concepts/targets.md) to test
- The [graders](./concepts/graders.md) to use
- The [gate](./concepts/gates.md) criteria
Read more about [Suites](./concepts/suites.md) for details on how to configure your evaluation.
### 3. Run the Evaluation
Run your evaluation with the following command:
```bash
letta-evals run suite.yaml
```
You'll see real-time progress as your evaluation runs:
```
Running evaluation: my-first-eval
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
```
Read more about [CLI Commands](./cli/commands.md) for details about the available commands and options.
## Understanding the Results
The core evaluation flow is:
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
The evaluation runner:
1. Loads your dataset
2. Sends each input to your agent (Target)
3. Extracts the relevant information (using the Extractor)
4. Grades the response (using the Grader function)
5. Computes aggregate metrics
6. Checks if metrics pass the Gate criteria
The output shows:
- **Average score**: Mean score across all samples
- **Pass rate**: Percentage of samples that passed
- **Gate status**: Whether the evaluation passed or failed overall
## Next Steps
Now that you've run your first evaluation, explore more advanced features:
- [Core Concepts](./concepts/overview.md) - Understand suites, datasets, graders, and extractors
- [Grader Types](./concepts/graders.md) - Learn about tool graders vs rubric graders
- [Multi-Metric Evaluation](./graders/multi-metric.md) - Test multiple aspects simultaneously
- [Custom Graders](./advanced/custom-graders.md) - Write custom grading functions
- [Multi-Turn Conversations](./advanced/multi-turn-conversations.md) - Test conversational memory
## Common Use Cases
### Strict Answer Checking
Use exact matching for cases where the answer must be precisely correct:
```yaml
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant
```
### Subjective Quality Evaluation
Use an LLM judge to evaluate subjective qualities like helpfulness or tone:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini
extractor: last_assistant
```
Then create `rubric.txt`:
```
Rate the helpfulness and accuracy of the response.
- Score 1.0 if helpful and accurate
- Score 0.5 if partially helpful
- Score 0.0 if unhelpful or wrong
```
### Testing Tool Calls
Verify that your agent calls specific tools with expected arguments:
```yaml
graders:
tool_check:
kind: tool
function: contains
extractor: tool_arguments
extractor_config:
tool_name: search
```
### Testing Memory Persistence
Check if the agent correctly updates its memory blocks:
```yaml
graders:
memory_check:
kind: tool
function: contains
extractor: memory_block
extractor_config:
block_label: human
```
## Troubleshooting
**"Agent file not found"**
Make sure your `agent_file` path is correct. Paths are relative to the suite YAML file location. Use absolute paths if needed:
```yaml
target:
agent_file: /absolute/path/to/my_agent.af
```
**"Connection refused"**
Your Letta server isn't running or isn't accessible. Start it with:
```bash
letta server
```
By default, it runs at `http://localhost:8283`.
**"No ground_truth provided"**
Tool graders like `exact_match` and `contains` require `ground_truth` in your dataset. Either:
- Add `ground_truth` to your samples, or
- Use a rubric grader which doesn't require ground truth
**Agent didn't respond as expected**
Try testing your agent manually first using the Letta SDK or Agent Development Environment (ADE) to see how it behaves before running evaluations. See the [Letta documentation](https://docs.letta.com) for more information.
For more help, see the [Troubleshooting Guide](./troubleshooting.md).

View File

@@ -1,263 +0,0 @@
# Getting Started
Run your first Letta agent evaluation in 5 minutes.
## Prerequisites
- Python 3.11 or higher
- A running Letta server (local or Letta Cloud)
- A Letta agent to test, either in agent file format or by ID (see [Targets](/guides/evals/concepts/targets) for more details)
## Installation
```bash
pip install letta-evals
```
Or with uv:
```bash
uv pip install letta-evals
```
## Getting an Agent to Test
Export an existing agent to a file using the Letta SDK:
```python
from letta_client import Letta
import os
# Connect to Letta Cloud
client = Letta(token=os.getenv("LETTA_API_KEY"))
# Export an agent to a file
agent_file = client.agents.export_file(agent_id="agent-123")
# Save to disk
with open("my_agent.af", "w") as f:
f.write(agent_file)
```
Or export via the Agent Development Environment (ADE) by selecting "Export Agent".
Then reference it in your suite:
```yaml
target:
kind: agent
agent_file: my_agent.af
```
<Note>
**Other options:** You can also use existing agents by ID or programmatically generate agents. See [Targets](/guides/evals/concepts/targets) for all agent configuration options.
</Note>
## Quick Start
Let's create your first evaluation in 3 steps:
### 1. Create a Test Dataset
Create a file named `dataset.jsonl`:
```jsonl
{"input": "What's the capital of France?", "ground_truth": "Paris"}
{"input": "Calculate 2+2", "ground_truth": "4"}
{"input": "What color is the sky?", "ground_truth": "blue"}
```
Each line is a JSON object with:
- `input`: The prompt to send to your agent
- `ground_truth`: The expected answer (used for grading)
<Note>
`ground_truth` is optional for some graders (like rubric graders), but required for tool graders like `contains` and `exact_match`.
</Note>
Read more about [Datasets](/guides/evals/concepts/datasets) for details on how to create your dataset.
### 2. Create a Suite Configuration
Create a file named `suite.yaml`:
```yaml
name: my-first-eval
dataset: dataset.jsonl
target:
kind: agent
agent_file: my_agent.af # Path to your agent file
base_url: https://api.letta.com # Letta Cloud (default)
token: ${LETTA_API_KEY} # Your API key
graders:
quality:
kind: tool
function: contains # Check if response contains the ground truth
extractor: last_assistant # Use the last assistant message
gate:
metric_key: quality
op: gte
value: 0.75 # Require 75% pass rate
```
The suite configuration defines:
- The [dataset](/guides/evals/concepts/datasets) to use
- The [agent](/guides/evals/concepts/targets) to test
- The [graders](/guides/evals/concepts/graders) to use
- The [gate](/guides/evals/concepts/gates) criteria
Read more about [Suites](/guides/evals/concepts/suites) for details on how to configure your evaluation.
### 3. Run the Evaluation
Run your evaluation with the following command:
```bash
letta-evals run suite.yaml
```
You'll see real-time progress as your evaluation runs:
```
Running evaluation: my-first-eval
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
```
Read more about [CLI Commands](/guides/evals/cli/commands) for details about the available commands and options.
## Understanding the Results
The core evaluation flow is:
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
The evaluation runner:
1. Loads your dataset
2. Sends each input to your agent (Target)
3. Extracts the relevant information (using the Extractor)
4. Grades the response (using the Grader function)
5. Computes aggregate metrics
6. Checks if metrics pass the Gate criteria
The output shows:
- **Average score**: Mean score across all samples
- **Pass rate**: Percentage of samples that passed
- **Gate status**: Whether the evaluation passed or failed overall
## Next Steps
Now that you've run your first evaluation, explore more advanced features:
- [Core Concepts](/guides/evals/concepts/overview) - Understand suites, datasets, graders, and extractors
- [Grader Types](/guides/evals/concepts/graders) - Learn about tool graders vs rubric graders
- [Multi-Metric Evaluation](/guides/evals/graders/multi-metric) - Test multiple aspects simultaneously
- [Custom Graders](/guides/evals/advanced/custom-graders) - Write custom grading functions
- [Multi-Turn Conversations](/guides/evals/advanced/multi-turn-conversations) - Test conversational memory
## Common Use Cases
### Strict Answer Checking
Use exact matching for cases where the answer must be precisely correct:
```yaml
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant
```
### Subjective Quality Evaluation
Use an LLM judge to evaluate subjective qualities like helpfulness or tone:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini
extractor: last_assistant
```
Then create `rubric.txt`:
```
Rate the helpfulness and accuracy of the response.
- Score 1.0 if helpful and accurate
- Score 0.5 if partially helpful
- Score 0.0 if unhelpful or wrong
```
### Testing Tool Calls
Verify that your agent calls specific tools with expected arguments:
```yaml
graders:
tool_check:
kind: tool
function: contains
extractor: tool_arguments
extractor_config:
tool_name: search
```
### Testing Memory Persistence
Check if the agent correctly updates its memory blocks:
```yaml
graders:
memory_check:
kind: tool
function: contains
extractor: memory_block
extractor_config:
block_label: human
```
## Troubleshooting
<Warning>
**"Agent file not found"**
Make sure your `agent_file` path is correct. Paths are relative to the suite YAML file location. Use absolute paths if needed:
```yaml
target:
agent_file: /absolute/path/to/my_agent.af
```
</Warning>
<Warning>
**"Connection refused"**
Your Letta server isn't running or isn't accessible. Start it using Docker:
```bash
docker run -p 8283:8283 -e OPENAI_API_KEY="your_api_key" letta/letta:latest
```
By default, it runs at `http://localhost:8283`. See the [self-hosting guide](/guides/selfhosting) for more information.
</Warning>
<Warning>
**"No ground_truth provided"**
Tool graders like `exact_match` and `contains` require `ground_truth` in your dataset. Either:
- Add `ground_truth` to your samples, or
- Use a rubric grader which doesn't require ground truth
</Warning>
<Tip>
**Agent didn't respond as expected**
Try testing your agent manually first using the Letta SDK or Agent Development Environment (ADE) to see how it behaves before running evaluations. See the [Letta documentation](https://docs.letta.com) for more information.
</Tip>
For more help, see the [Troubleshooting Guide](/guides/evals/troubleshooting).

View File

@@ -1,427 +0,0 @@
# Multi-Metric Evaluation
Evaluate multiple aspects of agent performance simultaneously in a single evaluation suite.
Multi-metric evaluation allows you to define multiple graders, each measuring a different dimension of your agent's behavior. This is essential for comprehensive testing because agent quality isn't just about correctness—you also care about explanation quality, tool usage, format compliance, and more.
**Example**: You might want to check that an agent gives the correct answer (tool grader with `exact_match`), explains it well (rubric grader for clarity), and calls the right tools (tool grader on `tool_arguments`). Instead of running three separate evaluations, you can test all three aspects in one run.
## Why Multiple Metrics?
Agents are complex systems. You might want to evaluate:
- **Correctness**: Does the answer match the expected output?
- **Quality**: Is the explanation clear, complete, and well-structured?
- **Tool usage**: Does the agent call the right tools with correct arguments?
- **Memory**: Does the agent correctly update its memory blocks?
- **Format**: Does the output follow required formatting rules?
Multi-metric evaluation lets you track all of these simultaneously, giving you a holistic view of agent performance.
## Configuration
Define multiple graders under the `graders` section:
```yaml
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant # Check if answer is exactly correct
completeness:
kind: rubric
prompt_path: completeness.txt
model: gpt-4o-mini
extractor: last_assistant # LLM judge evaluates how complete the answer is
tool_usage:
kind: tool
function: contains
extractor: tool_arguments # Check if agent called the right tool
extractor_config:
tool_name: search
```
Each grader:
- Has a unique key (e.g., `accuracy`, `completeness`)
- Can use different kinds (tool vs rubric)
- Can use different extractors
- Produces independent scores
## Gating on One Metric
While you evaluate multiple metrics, you can only gate on one:
```yaml
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant # Check correctness
quality:
kind: rubric
prompt_path: quality.txt
model: gpt-4o-mini
extractor: last_assistant # Evaluate subjective quality
gate:
metric_key: accuracy # Pass/fail based on accuracy only
op: gte
value: 0.8 # Require 80% accuracy to pass
```
The evaluation passes/fails based on `accuracy`, but results include both metrics.
## Results Structure
With multiple metrics, results include:
### Per-Sample Results
Each sample has scores for all metrics:
```json
{
"sample": {...},
"grades": {
"accuracy": {"score": 1.0, "rationale": "Exact match: true"},
"quality": {"score": 0.85, "rationale": "Good response, minor improvements possible"}
},
"submissions": {
"accuracy": "Paris",
"quality": "Paris"
}
}
```
Note: If all graders use the same extractor, `submission` and `grade` are also provided for backwards compatibility.
### Aggregate Metrics
```json
{
"metrics": {
"by_metric": {
"accuracy": {
"avg_score_attempted": 0.95,
"pass_rate": 95.0,
"passed_attempts": 19,
"failed_attempts": 1
},
"quality": {
"avg_score_attempted": 0.82,
"pass_rate": 80.0,
"passed_attempts": 16,
"failed_attempts": 4
}
}
}
}
```
## Use Cases
### Accuracy + Quality
```yaml
graders:
accuracy:
kind: tool
function: contains
extractor: last_assistant # Does response contain the answer?
quality:
kind: rubric
prompt_path: quality.txt
model: gpt-4o-mini
extractor: last_assistant # How well is it explained?
gate:
metric_key: accuracy # Must be correct to pass
op: gte
value: 0.9 # 90% must have correct answer
```
Gate on accuracy (must be correct), but also track quality for insights.
### Content + Format
```yaml
graders:
content:
kind: rubric
prompt_path: content.txt
model: gpt-4o-mini
extractor: last_assistant # Evaluate content quality
format:
kind: tool
function: ascii_printable_only
extractor: last_assistant # Check format compliance
gate:
metric_key: content # Gate on content quality
op: gte
value: 0.7 # Content must score 70% or higher
```
Ensure content quality while checking format constraints.
### Answer + Tool Usage + Memory
```yaml
graders:
answer:
kind: tool
function: contains
extractor: last_assistant # Did the agent answer correctly?
used_tools:
kind: tool
function: contains
extractor: tool_arguments # Did it call the search tool?
extractor_config:
tool_name: search
memory_updated:
kind: tool
function: contains
extractor: memory_block # Did it update human memory?
extractor_config:
block_label: human
gate:
metric_key: answer # Gate on correctness
op: gte
value: 0.8 # 80% of answers must be correct
```
Comprehensive evaluation of agent behavior.
### Multiple Quality Dimensions
```yaml
graders:
accuracy:
kind: rubric
prompt: "Rate factual accuracy from 0.0 to 1.0"
model: gpt-4o-mini
extractor: last_assistant
clarity:
kind: rubric
prompt: "Rate clarity of explanation from 0.0 to 1.0"
model: gpt-4o-mini
extractor: last_assistant
conciseness:
kind: rubric
prompt: "Rate conciseness (not too verbose) from 0.0 to 1.0"
model: gpt-4o-mini
extractor: last_assistant
gate:
metric_key: accuracy
op: gte
value: 0.8
```
Track multiple subjective dimensions.
## Display Names
Add human-friendly names for metrics:
```yaml
graders:
acc:
display_name: "Accuracy"
kind: tool
function: exact_match
extractor: last_assistant
qual:
display_name: "Response Quality"
kind: rubric
prompt_path: quality.txt
model: gpt-4o-mini
extractor: last_assistant
```
Display names appear in CLI output and visualizations.
## Independent Extraction
Each grader can extract different content:
```yaml
graders:
final_answer:
kind: tool
function: contains
extractor: last_assistant # Last thing said
tool_calls:
kind: tool
function: contains
extractor: all_assistant # Everything said
search_usage:
kind: tool
function: contains
extractor: tool_arguments # Tool arguments
extractor_config:
tool_name: search
```
## Analyzing Results
### View All Metrics
CLI output shows all metrics:
```
Results by metric:
accuracy - Avg: 0.95, Pass: 95.0%
quality - Avg: 0.82, Pass: 80.0%
tool_usage - Avg: 0.88, Pass: 88.0%
Gate (accuracy >= 0.9): PASSED
```
### JSON Output
```bash
letta-evals run suite.yaml --output results/
```
Produces:
- `results/summary.json`: Aggregate metrics
- `results/results.jsonl`: Per-sample results with all grades
### Filtering Results
Post-process to find patterns:
```python
import json
# Load results
with open("results/results.jsonl") as f:
results = [json.loads(line) for line in f]
# Find samples where accuracy=1.0 but quality<0.5
issues = [
r for r in results
if r["grades"]["accuracy"]["score"] == 1.0
and r["grades"]["quality"]["score"] < 0.5
]
print(f"Found {len(issues)} samples with correct but low-quality responses")
```
## Best Practices
### 1. Start with Core Metric
Focus on one primary metric for gating:
```yaml
gate:
metric_key: accuracy # Most important
op: gte
value: 0.9
```
Use others for diagnostics.
### 2. Combine Tool and Rubric
Use fast tool graders for objective checks, rubric graders for quality:
```yaml
graders:
correct:
kind: tool # Fast, cheap
function: contains
extractor: last_assistant
quality:
kind: rubric # Slower, more nuanced
prompt_path: quality.txt
model: gpt-4o-mini
extractor: last_assistant
```
### 3. Track Tool Usage
Add a metric for expected tool calls:
```yaml
graders:
used_search:
kind: tool
function: contains
extractor: tool_arguments
extractor_config:
tool_name: search
```
### 4. Validate Format
Include format checks alongside content:
```yaml
graders:
content:
kind: rubric
prompt_path: content.txt
model: gpt-4o-mini
extractor: last_assistant
ascii_only:
kind: tool
function: ascii_printable_only
extractor: last_assistant
```
### 5. Use Display Names
Make CLI output readable:
```yaml
graders:
acc:
display_name: "Answer Accuracy"
kind: tool
function: exact_match
extractor: last_assistant
```
## Cost Implications
Multiple rubric graders multiply API costs:
- 1 grader: $0.00015/sample
- 3 graders: $0.00045/sample
- 5 graders: $0.00075/sample
For 1000 samples with 3 rubric graders: ~$0.45
Mix tool and rubric graders to balance cost and insight.
## Performance
Multiple graders run sequentially per sample, but samples run concurrently:
- 1 grader: ~1s per sample
- 3 graders (2 rubric): ~2s per sample
With 10 concurrent: 1000 samples in ~3-5 minutes
## Next Steps
- [Tool Graders](./tool-graders.md)
- [Rubric Graders](./rubric-graders.md)
- [Understanding Results](../results/overview.md)

View File

@@ -1,680 +0,0 @@
# Rubric Graders
Rubric graders, also called "LLM-as-judge" graders, use language models to evaluate submissions based on custom criteria. They're ideal for subjective, nuanced evaluation.
Rubric graders work by providing the LLM with a prompt that describes the evaluation criteria; the language model then generates a structured JSON response with a score and rationale:
```json
{
"score": 0.85,
"rationale": "The response is accurate and well-explained, but could be more concise."
}
```
**Schema requirements:**
- `score` (required): Decimal number between 0.0 and 1.0
- `rationale` (required): String explanation of the grading decision
> **Note**: OpenAI provides the best support for structured generation. Other providers may have varying quality of structured output adherence.
## Overview
Rubric graders:
- Use an LLM to evaluate responses
- Support custom evaluation criteria (rubrics)
- Can handle subjective quality assessment
- Return scores with explanations
- Use JSON structured generation for reliability
## Basic Configuration
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt # Path to rubric file
model: gpt-4o-mini # LLM model
extractor: last_assistant
```
## Rubric Prompts
Your rubric file defines the evaluation criteria. It can include placeholders:
- `{input}`: The original input from the dataset
- `{submission}`: The extracted agent response
- `{ground_truth}`: Ground truth from dataset (if available)
### Example Rubric
`quality_rubric.txt`:
```
Evaluate the response for accuracy, completeness, and clarity.
Input: {input}
Expected answer: {ground_truth}
Agent response: {submission}
Scoring criteria:
- 1.0: Perfect - accurate, complete, and clear
- 0.8-0.9: Excellent - minor improvements possible
- 0.6-0.7: Good - some gaps or unclear parts
- 0.4-0.5: Adequate - significant issues
- 0.2-0.3: Poor - major problems
- 0.0-0.1: Failed - incorrect or nonsensical
Provide a score between 0.0 and 1.0.
```
### Inline Prompts
You can include the prompt directly in the YAML:
```yaml
graders:
creativity:
kind: rubric
prompt: |
Evaluate the creativity and originality of the response.
Response: {submission}
Score from 0.0 (generic) to 1.0 (highly creative).
model: gpt-4o-mini
extractor: last_assistant
```
## Configuration Options
### prompt_path vs prompt
Use exactly one:
```yaml
# Option 1: External file
graders:
quality:
kind: rubric
prompt_path: rubrics/quality.txt # Relative to suite YAML
model: gpt-4o-mini
extractor: last_assistant
```
```yaml
# Option 2: Inline
graders:
quality:
kind: rubric
prompt: "Evaluate the response quality from 0.0 to 1.0"
model: gpt-4o-mini
extractor: last_assistant
```
### model
LLM model to use for judging:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini # Or gpt-4o, claude-3-5-sonnet, etc.
extractor: last_assistant
```
Supported: Any OpenAI-compatible model
**Special handling**: For reasoning models (o1, o3, gpt-5), temperature is automatically set to 1.0 even if you specify 0.0.
### temperature
Controls randomness in LLM generation:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini
temperature: 0.0 # Deterministic (default)
extractor: last_assistant
```
Range: 0.0 (deterministic) to 2.0 (very random)
Default: 0.0 (recommended for evaluations)
### provider
LLM provider:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini
provider: openai # Default
extractor: last_assistant
```
Currently supported: `openai` (default)
### max_retries
Number of retry attempts if API call fails:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini
max_retries: 5 # Default
extractor: last_assistant
```
### timeout
Timeout for API calls in seconds:
```yaml
graders:
quality:
kind: rubric
prompt_path: rubric.txt
model: gpt-4o-mini
timeout: 120.0 # Default: 2 minutes
extractor: last_assistant
```
## How It Works
1. **Prompt Building**: The rubric prompt is populated with placeholders (`{input}`, `{submission}`, `{ground_truth}`)
2. **System Prompt**: Instructs the LLM to return JSON with `score` and `rationale` fields
3. **Structured Output**: Uses JSON mode (`response_format: json_object`) to enforce the schema
4. **Validation**: Extracts and validates score (clamped to 0.0-1.0) and rationale
5. **Error Handling**: Returns score 0.0 with error message if grading fails
### System Prompt
The rubric grader automatically includes this system prompt:
```
You are an evaluation judge. You will be given:
1. A rubric describing evaluation criteria
2. An input/question
3. A submission to evaluate
Evaluate the submission according to the rubric and return a JSON response with:
{
"score": (REQUIRED: a decimal number between 0.0 and 1.0 inclusive),
"rationale": "explanation of your grading decision"
}
IMPORTANT:
- The score MUST be a number between 0.0 and 1.0 (inclusive)
- 0.0 means complete failure, 1.0 means perfect
- Use decimal values for partial credit (e.g., 0.25, 0.5, 0.75)
- Be objective and follow the rubric strictly
```
If the LLM returns invalid JSON or missing fields, the grading fails and returns score 0.0 with an error message.
## Agent-as-Judge
Instead of calling an LLM API directly, you can use a **Letta agent** as the judge. The agent-as-judge approach loads a Letta agent from a `.af` file, sends it the evaluation criteria, and collects its score via a tool call.
### Why Use Agent-as-Judge?
Agent-as-judge is ideal when:
1. **No direct LLM API access**: Your team uses Letta Cloud or managed instances without direct API keys
2. **Judges need tools**: The evaluator needs to call tools during grading (e.g., web search, database queries, fetching webpages to verify answers)
3. **Centralized LLM access**: Your organization provides LLM access only through Letta
4. **Custom evaluation logic**: You want the judge to use specific tools or follow complex evaluation workflows
5. **Teacher-student patterns**: You have a well-built, experienced agent that can evaluate and teach a student agent being developed
### Configuration
To use agent-as-judge, specify `agent_file` instead of `model`:
```yaml
graders:
agent_judge:
kind: rubric # Still "rubric" kind
agent_file: judge.af # Path to judge agent .af file
prompt_path: rubric.txt # Evaluation criteria
judge_tool_name: submit_grade # Tool for submitting scores (default: submit_grade)
extractor: last_assistant # What to extract from target agent
```
**Key differences from standard rubric grading:**
- Use `agent_file` instead of `model`
- No `temperature`, `provider`, `max_retries`, or `timeout` fields (agent handles retries internally)
- Judge agent must have a `submit_grade(score: float, rationale: str)` tool
- Framework validates judge tool on initialization (fail-fast)
### Judge Agent Requirements
Your judge agent **must** have a tool with this exact signature:
```python
def submit_grade(score: float, rationale: str) -> dict:
"""
Submit an evaluation grade for an agent's response.
Args:
score: A float between 0.0 (complete failure) and 1.0 (perfect)
rationale: Explanation of why this score was given
Returns:
dict: Confirmation of grade submission
"""
return {
"status": "success",
"grade": {"score": score, "rationale": rationale}
}
```
**Validation on initialization**: The framework validates the judge agent has the correct tool with the right parameters **before** running evaluations. If validation fails, you'll get a clear error:
```
ValueError: Judge tool 'submit_grade' not found in agent file judge.af.
Available tools: ['fetch_webpage', 'search_documents']
```
This fail-fast approach catches configuration errors immediately.
### Checklist: Will Your Judge Agent Work?
- [ ] **Tool exists**: Agent has a tool with the name specified in `judge_tool_name` (default: `submit_grade`)
- [ ] **Tool parameters**: The tool has BOTH `score: float` and `rationale: str` parameters
- [ ] **Tool is callable**: The tool is not disabled or requires-approval-only
- [ ] **Agent system prompt**: Agent understands it's an evaluator (optional but recommended)
- [ ] **No conflicting tools**: Agent doesn't have other tools that might confuse it into answering questions instead of judging
### Example Configuration
**suite.yaml:**
```yaml
name: fetch-webpage-agent-judge-test
description: Test agent responses using a Letta agent as judge
dataset: dataset.csv
target:
kind: agent
agent_file: my_agent.af # Agent being tested
base_url: http://localhost:8283
graders:
agent_judge:
kind: rubric
agent_file: judge.af # Judge agent with submit_grade tool
prompt_path: rubric.txt # Evaluation criteria
judge_tool_name: submit_grade # Tool name (default: submit_grade)
extractor: last_assistant # Extract target agent's response
gate:
metric_key: agent_judge
op: gte
value: 0.75 # Pass if avg score ≥ 0.75
```
**rubric.txt:**
```
Evaluate the agent's response based on the following criteria:
1. **Correctness (0.6 weight)**: Does the response contain accurate information from the webpage? Check if the answer matches what was requested in the input.
2. **Format (0.2 weight)**: Is the response formatted correctly? The input often requests answers in a specific format (e.g., in brackets like {Answer}).
3. **Completeness (0.2 weight)**: Does the response fully address the question without unnecessary information?
Scoring Guidelines:
- 1.0: Perfect response - correct, properly formatted, and complete
- 0.75-0.99: Good response - minor formatting or completeness issues
- 0.5-0.74: Adequate response - correct information but format/completeness problems
- 0.25-0.49: Poor response - partially correct or missing key information
- 0.0-0.24: Failed response - incorrect or no relevant information
Use the submit_grade tool to submit your evaluation with a score between 0.0 and 1.0. You will need to use your fetch_webpage tool to fetch the desired webpage and confirm the answer is correct.
```
**Judge agent with tools**: The judge agent in this example has `fetch_webpage` tool, allowing it to independently verify answers by fetching the webpage mentioned in the input.
### How Agent-as-Judge Works
1. **Agent Loading**: Loads judge agent from `.af` file and validates tool signature
2. **Prompt Formatting**: Formats the rubric with `{input}`, `{submission}`, `{ground_truth}` placeholders
3. **Agent Evaluation**: Sends formatted prompt to judge agent as a message
4. **Tool Call Parsing**: Extracts score and rationale from `submit_grade` tool call
5. **Cleanup**: Deletes judge agent after evaluation to free resources
6. **Error Handling**: Returns score 0.0 with error message if judge fails to call the tool
### Agent-as-Judge vs Standard Rubric Grading
| Feature | Standard Rubric | Agent-as-Judge |
|---------|----------------|----------------|
| **LLM Access** | Direct API (OPENAI_API_KEY) | Through Letta agent |
| **Tools** | No tool usage | Judge can use tools |
| **Configuration** | `model`, `temperature`, etc. | `agent_file`, `judge_tool_name` |
| **Output Format** | JSON structured output | Tool call with score/rationale |
| **Validation** | Runtime JSON parsing | Upfront tool signature validation |
| **Use Case** | Teams with API access | Teams using Letta Cloud, judges needing tools |
| **Cost** | API call per sample | Depends on judge agent's LLM config |
### Teacher-Student Pattern
A powerful use case for agent-as-judge is the **teacher-student pattern**, where an experienced, well-configured agent evaluates a student agent being developed.
> **Prerequisites**: This pattern assumes you already have a well-defined, production-ready agent that performs well on your task. This agent becomes the "teacher" that evaluates the "student" agent you're developing.
**Why this works:**
- **Domain expertise**: The teacher agent has specialized knowledge and tools
- **Consistent evaluation**: The teacher applies the same standards across all evaluations
- **Tool-based verification**: The teacher can independently verify answers using its own tools
- **Iterative improvement**: Use the teacher to evaluate multiple versions of the student as you improve it
**Example scenario:**
You have a production-ready customer support agent with domain expertise and access to your tools (knowledge base, CRM, documentation search, etc.). You're developing a new, faster version of this agent. Use the experienced agent as the judge to evaluate whether the new agent meets the same quality standards.
**Configuration:**
```yaml
name: student-agent-evaluation
description: Experienced agent evaluates student agent performance
dataset: support_questions.csv
target:
kind: agent
agent_file: student_agent.af # New agent being developed
base_url: http://localhost:8283
graders:
teacher_evaluation:
kind: rubric
agent_file: teacher_agent.af # Experienced production agent with domain tools
prompt: |
You are an experienced customer support agent evaluating a new agent's response.
Customer question: {input}
Student agent's answer: {submission}
Use your available tools to verify the answer is correct and complete.
Grade based on:
1. Factual accuracy (0.5 weight) - Does the answer contain correct information?
2. Completeness (0.3 weight) - Does it fully address the question?
3. Tone and professionalism (0.2 weight) - Is it appropriately worded?
Submit a score from 0.0 to 1.0 using the submit_grade tool.
extractor: last_assistant
gate:
metric_key: teacher_evaluation
op: gte
value: 0.8 # Student must score 80% or higher
```
**Benefits of this approach:**
- **Leverage existing expertise**: Your best agent becomes the standard
- **Scalable quality control**: Teacher evaluates hundreds of scenarios automatically
- **Continuous validation**: Run teacher evaluations in CI/CD as you iterate on the student
- **Transfer learning**: Teacher's evaluation helps identify where the student needs improvement
### Complete Example
See [`examples/letta-agent-rubric-grader/`](https://github.com/letta-ai/letta-evals/tree/main/examples/letta-agent-rubric-grader) for a working example with:
- Judge agent with `submit_grade` and `fetch_webpage` tools
- Target agent that fetches webpages and answers questions
- Rubric that instructs judge to verify answers independently
- Complete suite configuration
## Use Cases
### Quality Assessment
```yaml
graders:
quality:
kind: rubric
prompt_path: quality_rubric.txt
model: gpt-4o-mini
extractor: last_assistant
```
`quality_rubric.txt`:
```
Evaluate response quality based on:
1. Accuracy of information
2. Completeness of answer
3. Clarity of explanation
Response: {submission}
Ground truth: {ground_truth}
Score from 0.0 to 1.0.
```
### Creativity Evaluation
```yaml
graders:
creativity:
kind: rubric
prompt: |
Rate the creativity and originality of this story.
Story: {submission}
1.0 = Highly creative and original
0.5 = Some creative elements
0.0 = Generic or cliché
model: gpt-4o-mini
extractor: last_assistant
```
### Multi-Criteria Evaluation
```yaml
graders:
comprehensive:
kind: rubric
prompt: |
Evaluate the response on multiple criteria:
1. Technical Accuracy (40%)
2. Clarity of Explanation (30%)
3. Completeness (20%)
4. Conciseness (10%)
Input: {input}
Response: {submission}
Expected: {ground_truth}
Provide a weighted score from 0.0 to 1.0.
model: gpt-4o
extractor: last_assistant
```
### Code Quality
```yaml
graders:
code_quality:
kind: rubric
prompt: |
Evaluate this code for:
- Correctness
- Readability
- Efficiency
- Best practices
Code: {submission}
Score from 0.0 to 1.0.
model: gpt-4o
extractor: last_assistant
```
### Tone and Style
```yaml
graders:
professionalism:
kind: rubric
prompt: |
Rate the professionalism and appropriate tone of the response.
Response: {submission}
1.0 = Highly professional
0.5 = Acceptable
0.0 = Unprofessional or inappropriate
model: gpt-4o-mini
extractor: last_assistant
```
## Best Practices
### 1. Clear Scoring Criteria
Provide explicit score ranges and what they mean:
```
Score:
- 1.0: Perfect response with no issues
- 0.8-0.9: Minor improvements possible
- 0.6-0.7: Some gaps or errors
- 0.4-0.5: Significant problems
- 0.2-0.3: Major issues
- 0.0-0.1: Complete failure
```
### 2. Use Ground Truth When Available
If you have expected answers, include them:
```
Expected: {ground_truth}
Actual: {submission}
Evaluate how well the actual response matches the expected content.
```
### 3. Be Specific About Criteria
Vague: "Evaluate the quality"
Better: "Evaluate accuracy, completeness, and clarity"
### 4. Use Examples in Rubric
```
Example of 1.0: "A complete, accurate answer with clear explanation"
Example of 0.5: "Partially correct but missing key details"
Example of 0.0: "Incorrect or irrelevant response"
```
### 5. Calibrate with Test Cases
Run on a small set first to ensure the rubric produces expected scores.
### 6. Consider Model Choice
- **gpt-4o-mini**: Fast and cost-effective for simple criteria
- **gpt-4o**: More accurate for complex evaluation
- **claude-3-5-sonnet**: Alternative perspective (via OpenAI-compatible endpoint)
## Environment Setup
Rubric graders require an OpenAI API key:
```bash
export OPENAI_API_KEY=your-api-key
```
For custom endpoints:
```bash
export OPENAI_BASE_URL=https://your-endpoint.com/v1
```
## Error Handling
If grading fails:
- Score is set to 0.0
- Rationale includes error message
- Metadata includes error details
- Evaluation continues (doesn't stop the suite)
Common errors:
- API timeout → Check `timeout` setting
- Invalid API key → Verify `OPENAI_API_KEY`
- Rate limit → Reduce concurrency or add retries
## Cost Considerations
Rubric graders make API calls for each sample:
- **gpt-4o-mini**: ~$0.00015 per evaluation (cheap)
- **gpt-4o**: ~$0.002 per evaluation (more expensive)
For 1000 samples:
- gpt-4o-mini: ~$0.15
- gpt-4o: ~$2.00
Estimate costs before running large evaluations.
## Performance
Rubric graders are slower than tool graders:
- Tool grader: <1ms per sample
- Rubric grader: 500-2000ms per sample (network + LLM)
Use concurrency to speed up:
```bash
letta-evals run suite.yaml --max-concurrent 10
```
## Limitations
Rubric graders:
- **Cost**: API calls cost money
- **Speed**: Slower than tool graders
- **Consistency**: Can vary slightly between runs (use temperature 0.0 for best consistency)
- **API dependency**: Requires network and API availability
For deterministic, fast evaluation, use [Tool Graders](./tool-graders.md).
## Combining Tool and Rubric Graders
Use both in one suite:
```yaml
graders:
format_check:
kind: tool
function: regex_match
extractor: last_assistant
quality:
kind: rubric
prompt_path: quality.txt
model: gpt-4o-mini
extractor: last_assistant
gate:
metric_key: quality # Gate on quality, but still check format
op: gte
value: 0.7
```
This combines fast deterministic checks with nuanced quality evaluation.
## Next Steps
- [Built-in Extractors](../extractors/builtin.md) - Understanding what to extract from trajectories
- [Tool Graders](./tool-graders.md) - Deterministic evaluation for objective criteria
- [Multi-Metric Evaluation](./multi-metric.md) - Combining multiple graders
- [Custom Graders](../advanced/custom-graders.md) - Writing custom evaluation logic

View File

@@ -1,332 +0,0 @@
# Tool Graders
Tool graders use Python functions to programmatically evaluate submissions. They're ideal for deterministic, rule-based evaluation.
## Overview
Tool graders:
- Execute Python functions that take `(sample, submission)` and return a `GradeResult`
- Are fast and deterministic
- Don't require external API calls
- Can implement any custom logic
## Configuration
```yaml
graders:
my_metric:
kind: tool
function: exact_match # Function name
extractor: last_assistant # What to extract from trajectory
```
The `extractor` determines what part of the agent's response to evaluate. See [Built-in Extractors](../extractors/builtin.md) for all available options.
## Built-in Functions
### exact_match
Exact string comparison (case-sensitive, whitespace-trimmed).
```yaml
graders:
accuracy:
kind: tool
function: exact_match
extractor: last_assistant
```
**Requires**: `ground_truth` in dataset
**Returns**:
- Score: 1.0 if exact match, 0.0 otherwise
- Rationale: "Exact match: true" or "Exact match: false"
**Example**:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4"}
```
Submission "4" → Score 1.0
Submission "four" → Score 0.0
### contains
Case-insensitive substring check.
```yaml
graders:
keyword_check:
kind: tool
function: contains
extractor: last_assistant
```
**Requires**: `ground_truth` in dataset
**Returns**:
- Score: 1.0 if ground_truth found in submission (case-insensitive), 0.0 otherwise
- Rationale: "Contains ground_truth: true" or "Contains ground_truth: false"
**Example**:
```jsonl
{"input": "What is the capital of France?", "ground_truth": "Paris"}
```
Submission "The capital is Paris" → Score 1.0
Submission "The capital is paris" → Score 1.0 (case-insensitive)
Submission "The capital is Lyon" → Score 0.0
### regex_match
Pattern matching using regex.
```yaml
graders:
pattern_check:
kind: tool
function: regex_match
extractor: last_assistant
```
**Requires**: `ground_truth` in dataset (as regex pattern)
**Returns**:
- Score: 1.0 if pattern matches, 0.0 otherwise
- Rationale: "Regex match: true" or "Regex match: false"
- If pattern is invalid: Score 0.0 with error message
**Example**:
```jsonl
{"input": "Generate a UUID", "ground_truth": "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"}
{"input": "Extract the number", "ground_truth": "\\d+"}
```
Submission "550e8400-e29b-41d4-a716-446655440000" → Score 1.0
Submission "not-a-uuid" → Score 0.0
### ascii_printable_only
Validates that all characters are printable ASCII (code points 32-126).
```yaml
graders:
ascii_check:
kind: tool
function: ascii_printable_only
extractor: last_assistant
```
**Requires**: No ground_truth needed
**Returns**:
- Score: 1.0 if all characters are printable ASCII, 0.0 if any non-printable found
- Rationale: Details about non-printable characters if found
**Notes**:
- Newlines (`\n`) and carriage returns (`\r`) are ignored (allowed)
- Useful for ASCII art, formatted output, or ensuring clean text
**Example**:
Submission "Hello, World!\n" → Score 1.0
Submission "Hello 🌍" → Score 0.0 (emoji not in ASCII range)
## Custom Tool Graders
You can write custom grading functions:
```python
# custom_graders.py
from letta_evals.decorators import grader
from letta_evals.models import GradeResult, Sample
@grader
def my_custom_grader(sample: Sample, submission: str) -> GradeResult:
"""Custom grading logic."""
# Your evaluation logic here
score = 1.0 if some_condition(submission) else 0.0
return GradeResult(
score=score,
rationale=f"Explanation of the score",
metadata={"extra": "info"}
)
```
Then reference it in your suite:
```yaml
graders:
custom:
kind: tool
function: my_custom_grader
extractor: last_assistant
```
See [Custom Graders](../advanced/custom-graders.md) for details.
## Use Cases
### Exact Answer Validation
```yaml
graders:
correct_answer:
kind: tool
function: exact_match
extractor: last_assistant
```
Best for: Math problems, single-word answers, structured formats
### Keyword Presence
```yaml
graders:
mentions_topic:
kind: tool
function: contains
extractor: last_assistant
```
Best for: Checking if specific concepts are mentioned
### Format Validation
```yaml
graders:
valid_email:
kind: tool
function: regex_match
extractor: last_assistant
```
Dataset:
```jsonl
{"input": "Extract the email", "ground_truth": "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"}
```
Best for: Emails, UUIDs, phone numbers, structured data
### Tool Call Validation
```yaml
graders:
used_search:
kind: tool
function: contains
extractor: tool_arguments
extractor_config:
tool_name: search
```
Dataset:
```jsonl
{"input": "Find information about pandas", "ground_truth": "pandas"}
```
Checks if the agent called the search tool with "pandas" in arguments.
### JSON Structure Validation
Custom grader:
```python
import json
from letta_evals.decorators import grader
from letta_evals.models import GradeResult, Sample
@grader
def valid_json_with_field(sample: Sample, submission: str) -> GradeResult:
try:
data = json.loads(submission)
required_field = sample.ground_truth
if required_field in data:
return GradeResult(score=1.0, rationale=f"Valid JSON with '{required_field}' field")
else:
return GradeResult(score=0.0, rationale=f"Missing required field: {required_field}")
except json.JSONDecodeError as e:
return GradeResult(score=0.0, rationale=f"Invalid JSON: {e}")
```
## Combining with Extractors
Tool graders work with any extractor:
### Grade Tool Arguments
```yaml
graders:
correct_tool:
kind: tool
function: exact_match
extractor: tool_arguments
extractor_config:
tool_name: calculator
```
Checks if calculator was called with specific arguments.
### Grade Memory Updates
```yaml
graders:
memory_correct:
kind: tool
function: contains
extractor: memory_block
extractor_config:
block_label: human
```
Checks if agent's memory block contains expected content.
### Grade Pattern Extraction
```yaml
graders:
extracted_correctly:
kind: tool
function: exact_match
extractor: pattern
extractor_config:
pattern: 'ANSWER: (.*)'
group: 1
```
Extracts content after "ANSWER:" and checks if it matches ground truth.
## Performance
Tool graders are:
- **Fast**: No API calls, pure Python execution
- **Deterministic**: Same input always produces same result
- **Cost-effective**: No LLM API costs
- **Reliable**: No network dependencies
Use tool graders when possible for faster, cheaper evaluations.
## Limitations
Tool graders:
- Can't evaluate subjective quality
- Limited to predefined logic
- Don't understand semantic similarity
- Can't handle complex, nuanced criteria
For these cases, use [Rubric Graders](./rubric-graders.md).
## Best Practices
1. **Use exact_match for precise answers**: Math, single words, structured formats
2. **Use contains for flexible matching**: When exact format varies but key content is present
3. **Use regex for format validation**: Emails, phone numbers, UUIDs
4. **Write custom graders for complex logic**: Multi-step validation, JSON parsing
5. **Combine multiple graders**: Evaluate different aspects (format + content + tool usage)
## Next Steps
- [Built-in Extractors](../extractors/builtin.md) - Understanding what to extract from trajectories
- [Rubric Graders](./rubric-graders.md) - LLM-based evaluation for subjective quality
- [Custom Graders](../advanced/custom-graders.md) - Writing your own grading functions
- [Multi-Metric Evaluation](./multi-metric.md) - Using multiple graders simultaneously

View File

@@ -1,468 +0,0 @@
# Understanding Results
This guide explains how to interpret evaluation results.
## Result Structure
An evaluation produces three types of output:
1. **Console output**: Real-time progress and summary
2. **Summary JSON**: Aggregate metrics and configuration
3. **Results JSONL**: Per-sample detailed results
## Console Output
### Progress Display
```
Running evaluation: my-eval-suite
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
Results:
Total samples: 3
Attempted: 3
Avg score: 0.83 (attempted: 0.83)
Passed: 2 (66.7%)
Gate (quality >= 0.75): PASSED
```
### Quiet Mode
```bash
letta-evals run suite.yaml --quiet
```
Output:
```
✓ PASSED
```
or
```
✗ FAILED
```
## JSON Output
### Saving Results
```bash
letta-evals run suite.yaml --output results/
```
Creates three files:
#### header.json
Evaluation metadata:
```json
{
"suite_name": "my-eval-suite",
"timestamp": "2025-01-15T10:30:00Z",
"version": "0.3.0"
}
```
#### summary.json
Complete evaluation summary:
```json
{
"suite": "my-eval-suite",
"config": {
"target": {...},
"graders": {...},
"gate": {...}
},
"metrics": {
"total": 10,
"total_attempted": 10,
"avg_score_attempted": 0.85,
"avg_score_total": 0.85,
"passed_attempts": 8,
"failed_attempts": 2,
"by_metric": {
"accuracy": {
"avg_score_attempted": 0.90,
"pass_rate": 90.0,
"passed_attempts": 9,
"failed_attempts": 1
},
"quality": {
"avg_score_attempted": 0.80,
"pass_rate": 70.0,
"passed_attempts": 7,
"failed_attempts": 3
}
}
},
"gates_passed": true
}
```
#### results.jsonl
One JSON object per line, each representing one sample:
```jsonl
{"sample": {"id": 0, "input": "What is 2+2?", "ground_truth": "4"}, "submission": "4", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-123", "model_name": "default"}
{"sample": {"id": 1, "input": "What is 3+3?", "ground_truth": "6"}, "submission": "6", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-124", "model_name": "default"}
```
## Metrics Explained
### total
Total number of samples in the evaluation (including errors).
### total_attempted
Number of samples that completed without errors.
If a sample fails during agent execution or grading, it's counted in `total` but not `total_attempted`.
### avg_score_attempted
Average score across samples that completed successfully.
Formula: `sum(scores) / total_attempted`
Range: 0.0 to 1.0
### avg_score_total
Average score across all samples, treating errors as 0.0.
Formula: `sum(scores) / total`
Range: 0.0 to 1.0
### passed_attempts / failed_attempts
Number of samples that passed/failed the gate's per-sample criteria.
By default:
- If gate metric is `accuracy`: sample passes if score >= 1.0
- If gate metric is `avg_score`: sample passes if score >= gate value
Can be customized with `pass_op` and `pass_value` in gate config.
### by_metric
For multi-metric evaluation, shows aggregate stats for each metric:
```json
"by_metric": {
"accuracy": {
"avg_score_attempted": 0.90,
"avg_score_total": 0.85,
"pass_rate": 90.0,
"passed_attempts": 9,
"failed_attempts": 1
}
}
```
## Sample Results
Each sample result includes:
### sample
The original dataset sample:
```json
"sample": {
"id": 0,
"input": "What is 2+2?",
"ground_truth": "4",
"metadata": {...}
}
```
### submission
The extracted text that was graded:
```json
"submission": "The answer is 4"
```
### grade
The grading result:
```json
"grade": {
"score": 1.0,
"rationale": "Contains ground_truth: true",
"metadata": {"model": "gpt-4o-mini", "usage": {...}}
}
```
### grades (multi-metric)
For multi-metric evaluation:
```json
"grades": {
"accuracy": {"score": 1.0, "rationale": "Exact match"},
"quality": {"score": 0.85, "rationale": "Good but verbose"}
}
```
### trajectory
The complete conversation history:
```json
"trajectory": [
[
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "The answer is 4"}
]
]
```
### agent_id
The ID of the agent that generated this response:
```json
"agent_id": "agent-abc-123"
```
### model_name
The model configuration used:
```json
"model_name": "gpt-4o-mini"
```
### agent_usage
Token usage statistics (if available):
```json
"agent_usage": [
{"completion_tokens": 10, "prompt_tokens": 50, "total_tokens": 60}
]
```
## Interpreting Scores
### Score Ranges
- **1.0**: Perfect - fully meets criteria
- **0.8-0.99**: Very good - minor issues
- **0.6-0.79**: Good - notable improvements possible
- **0.4-0.59**: Acceptable - significant issues
- **0.2-0.39**: Poor - major problems
- **0.0-0.19**: Failed - did not meet criteria
### Binary vs Continuous
**Tool graders** typically return binary scores:
- 1.0: Passed
- 0.0: Failed
**Rubric graders** return continuous scores:
- Any value from 0.0 to 1.0
- Allows for partial credit
## Multi-Model Results
When testing multiple models:
```json
"metrics": {
"per_model": [
{
"model_name": "gpt-4o-mini",
"avg_score_attempted": 0.85,
"passed_samples": 8,
"failed_samples": 2
},
{
"model_name": "claude-3-5-sonnet",
"avg_score_attempted": 0.90,
"passed_samples": 9,
"failed_samples": 1
}
]
}
```
Console output:
```
Results by model:
gpt-4o-mini - Avg: 0.85, Pass: 80.0%
claude-3-5-sonnet - Avg: 0.90, Pass: 90.0%
```
## Multiple Runs Statistics
Run evaluations multiple times to measure consistency and get aggregate statistics.
### Configuration
Specify in YAML:
```yaml
name: my-eval-suite
dataset: dataset.jsonl
num_runs: 5 # Run 5 times
target:
kind: agent
agent_file: my_agent.af
graders:
accuracy:
kind: tool
function: exact_match
gate:
metric_key: accuracy
op: gte
value: 0.8
```
Or via CLI:
```bash
letta-evals run suite.yaml --num-runs 10 --output results/
```
### Output Structure
```
results/
├── run_1/
│ ├── header.json
│ ├── results.jsonl
│ └── summary.json
├── run_2/
│ ├── header.json
│ ├── results.jsonl
│ └── summary.json
├── ...
└── aggregate_stats.json # Statistics across all runs
```
### Aggregate Statistics File
The `aggregate_stats.json` includes statistics across all runs:
```json
{
"num_runs": 10,
"runs_passed": 8,
"runs_failed": 2,
"pass_rate": 80.0,
"avg_score_attempted": {
"mean": 0.847,
"std": 0.042,
"min": 0.78,
"max": 0.91
},
"avg_score_total": {
"mean": 0.847,
"std": 0.042,
"min": 0.78,
"max": 0.91
},
"per_metric": {
"accuracy": {
"avg_score_attempted": {
"mean": 0.89,
"std": 0.035,
"min": 0.82,
"max": 0.95
},
"pass_rate": {
"mean": 89.0,
"std": 4.2,
"min": 80.0,
"max": 95.0
}
}
}
}
```
### Use Cases
**Measure consistency of non-deterministic agents:**
```bash
letta-evals run suite.yaml --num-runs 20 --output results/
# Check stddev in aggregate_stats.json
# Low stddev = consistent, high stddev = variable
```
**Get confidence intervals:**
```python
import json
import math
with open("results/aggregate_stats.json") as f:
stats = json.load(f)
mean = stats["avg_score_attempted"]["mean"]
std = stats["avg_score_attempted"]["std"]
n = stats["num_runs"]
# 95% confidence interval (assuming normal distribution)
margin = 1.96 * (std / math.sqrt(n))
print(f"Score: {mean:.3f} ± {margin:.3f}")
```
## Error Handling
If a sample encounters an error:
```json
{
"sample": {...},
"submission": "",
"grade": {
"score": 0.0,
"rationale": "Error during grading: Connection timeout",
"metadata": {"error": "timeout", "error_type": "ConnectionError"}
}
}
```
Errors:
- Count toward `total` but not `total_attempted`
- Get score of 0.0
- Include error details in rationale and metadata
## Analyzing Results
### Find Low Scores
```python
import json
with open("results/results.jsonl") as f:
results = [json.loads(line) for line in f]
low_scores = [r for r in results if r["grade"]["score"] < 0.5]
print(f"Found {len(low_scores)} samples with score < 0.5")
for result in low_scores:
print(f"Sample {result['sample']['id']}: {result['grade']['rationale']}")
```
### Compare Metrics
```python
# Load summary
with open("results/summary.json") as f:
summary = json.load(f)
metrics = summary["metrics"]["by_metric"]
for name, stats in metrics.items():
print(f"{name}: {stats['avg_score_attempted']:.2f} avg, {stats['pass_rate']:.1f}% pass")
```
### Extract Failures
```python
# Find samples that failed gate criteria
failures = [
r for r in results
if not gate_passed(r["grade"]["score"]) # Your gate logic
]
```
## Next Steps
- [Metrics Reference](./metrics.md)
- [Output Formats](./output-formats.md)
- [Best Practices](../best-practices/writing-tests.md)

View File

@@ -1,484 +0,0 @@
# Understanding Results
This guide explains how to interpret evaluation results.
## Result Structure
An evaluation produces three types of output:
1. **Console output**: Real-time progress and summary
2. **Summary JSON**: Aggregate metrics and configuration
3. **Results JSONL**: Per-sample detailed results
## Console Output
### Progress Display
```
Running evaluation: my-eval-suite
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
Results:
Total samples: 3
Attempted: 3
Avg score: 0.83 (attempted: 0.83)
Passed: 2 (66.7%)
Gate (quality >= 0.75): PASSED
```
### Quiet Mode
```bash
letta-evals run suite.yaml --quiet
```
Output:
```
✓ PASSED
```
or
```
✗ FAILED
```
## JSON Output
### Saving Results
```bash
letta-evals run suite.yaml --output results/
```
Creates three files:
#### header.json
Evaluation metadata:
```json
{
"suite_name": "my-eval-suite",
"timestamp": "2025-01-15T10:30:00Z",
"version": "0.3.0"
}
```
#### summary.json
Complete evaluation summary:
```json
{
"suite": "my-eval-suite",
"config": {
"target": {...},
"graders": {...},
"gate": {...}
},
"metrics": {
"total": 10,
"total_attempted": 10,
"avg_score_attempted": 0.85,
"avg_score_total": 0.85,
"passed_attempts": 8,
"failed_attempts": 2,
"by_metric": {
"accuracy": {
"avg_score_attempted": 0.90,
"pass_rate": 90.0,
"passed_attempts": 9,
"failed_attempts": 1
},
"quality": {
"avg_score_attempted": 0.80,
"pass_rate": 70.0,
"passed_attempts": 7,
"failed_attempts": 3
}
}
},
"gates_passed": true
}
```
#### results.jsonl
One JSON object per line, each representing one sample:
```jsonl
{"sample": {"id": 0, "input": "What is 2+2?", "ground_truth": "4"}, "submission": "4", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-123", "model_name": "default"}
{"sample": {"id": 1, "input": "What is 3+3?", "ground_truth": "6"}, "submission": "6", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-124", "model_name": "default"}
```
## Metrics Explained
### total
Total number of samples in the evaluation (including errors).
### total_attempted
Number of samples that completed without errors.
If a sample fails during agent execution or grading, it's counted in `total` but not `total_attempted`.
### avg_score_attempted
Average score across samples that completed successfully.
Formula: `sum(scores) / total_attempted`
Range: 0.0 to 1.0
### avg_score_total
Average score across all samples, treating errors as 0.0.
Formula: `sum(scores) / total`
Range: 0.0 to 1.0
### passed_attempts / failed_attempts
Number of samples that passed/failed the gate's per-sample criteria.
By default:
- If gate metric is `accuracy`: sample passes if score `>= 1.0`
- If gate metric is `avg_score`: sample passes if score `>=` gate value
Can be customized with `pass_op` and `pass_value` in gate config.
### by_metric
For multi-metric evaluation, shows aggregate stats for each metric:
```json
"by_metric": {
"accuracy": {
"avg_score_attempted": 0.90,
"avg_score_total": 0.85,
"pass_rate": 90.0,
"passed_attempts": 9,
"failed_attempts": 1
}
}
```
## Sample Results
Each sample result includes:
### sample
The original dataset sample:
```json
"sample": {
"id": 0,
"input": "What is 2+2?",
"ground_truth": "4",
"metadata": {...}
}
```
### submission
The extracted text that was graded:
```json
"submission": "The answer is 4"
```
### grade
The grading result:
```json
"grade": {
"score": 1.0,
"rationale": "Contains ground_truth: true",
"metadata": {"model": "gpt-4o-mini", "usage": {...}}
}
```
### grades (multi-metric)
For multi-metric evaluation:
```json
"grades": {
"accuracy": {"score": 1.0, "rationale": "Exact match"},
"quality": {"score": 0.85, "rationale": "Good but verbose"}
}
```
### trajectory
The complete conversation history:
```json
"trajectory": [
[
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "The answer is 4"}
]
]
```
### agent_id
The ID of the agent that generated this response:
```json
"agent_id": "agent-abc-123"
```
### model_name
The model configuration used:
```json
"model_name": "gpt-4o-mini"
```
### agent_usage
Token usage statistics (if available):
```json
"agent_usage": [
{"completion_tokens": 10, "prompt_tokens": 50, "total_tokens": 60}
]
```
## Interpreting Scores
### Score Ranges
- **1.0**: Perfect - fully meets criteria
- **0.8-0.99**: Very good - minor issues
- **0.6-0.79**: Good - notable improvements possible
- **0.4-0.59**: Acceptable - significant issues
- **0.2-0.39**: Poor - major problems
- **0.0-0.19**: Failed - did not meet criteria
### Binary vs Continuous
**Tool graders** typically return binary scores:
- 1.0: Passed
- 0.0: Failed
**Rubric graders** return continuous scores:
- Any value from 0.0 to 1.0
- Allows for partial credit
## Multi-Model Results
When testing multiple models:
```json
"metrics": {
"per_model": [
{
"model_name": "gpt-4o-mini",
"avg_score_attempted": 0.85,
"passed_samples": 8,
"failed_samples": 2
},
{
"model_name": "claude-3-5-sonnet",
"avg_score_attempted": 0.90,
"passed_samples": 9,
"failed_samples": 1
}
]
}
```
Console output:
```
Results by model:
gpt-4o-mini - Avg: 0.85, Pass: 80.0%
claude-3-5-sonnet - Avg: 0.90, Pass: 90.0%
```
## Multiple Runs Statistics
Run evaluations multiple times to measure consistency and get aggregate statistics.
### Configuration
Specify in YAML:
```yaml
name: my-eval-suite
dataset: dataset.jsonl
num_runs: 5 # Run 5 times
target:
kind: agent
agent_file: my_agent.af
graders:
accuracy:
kind: tool
function: exact_match
gate:
metric_key: accuracy
op: gte
value: 0.8
```
Or via CLI:
```bash
letta-evals run suite.yaml --num-runs 10 --output results/
```
### Output Structure
```
results/
├── run_1/
│ ├── header.json
│ ├── results.jsonl
│ └── summary.json
├── run_2/
│ ├── header.json
│ ├── results.jsonl
│ └── summary.json
├── ...
└── aggregate_stats.json # Statistics across all runs
```
### Aggregate Statistics File
The `aggregate_stats.json` includes statistics across all runs:
```json
{
"num_runs": 10,
"runs_passed": 8,
"mean_avg_score_attempted": 0.847,
"std_avg_score_attempted": 0.042,
"mean_avg_score_total": 0.847,
"std_avg_score_total": 0.042,
"mean_scores": {
"accuracy": 0.89,
"quality": 0.82
},
"std_scores": {
"accuracy": 0.035,
"quality": 0.051
},
"individual_run_metrics": [
{
"avg_score_attempted": 0.85,
"avg_score_total": 0.85,
"pass_rate": 0.85,
"by_metric": {
"accuracy": {
"avg_score_attempted": 0.90,
"avg_score_total": 0.90,
"pass_rate": 0.90
}
}
}
// ... metrics from runs 2-10
]
}
```
**Key fields**:
- `num_runs`: Total number of runs executed
- `runs_passed`: Number of runs that passed the gate
- `mean_avg_score_attempted`: Mean score across runs (only attempted samples)
- `std_avg_score_attempted`: Standard deviation (measures consistency)
- `mean_scores`: Mean for each metric (e.g., `{"accuracy": 0.89}`)
- `std_scores`: Standard deviation for each metric (e.g., `{"accuracy": 0.035}`)
- `individual_run_metrics`: Full metrics object from each individual run
### Use Cases
**Measure consistency of non-deterministic agents:**
```bash
letta-evals run suite.yaml --num-runs 20 --output results/
# Check std_avg_score_attempted in aggregate_stats.json
# Low std = consistent, high std = variable
```
**Get confidence intervals:**
```python
import json
import math
with open("results/aggregate_stats.json") as f:
stats = json.load(f)
mean = stats["mean_avg_score_attempted"]
std = stats["std_avg_score_attempted"]
n = stats["num_runs"]
# 95% confidence interval (assuming normal distribution)
margin = 1.96 * (std / math.sqrt(n))
print(f"Score: {mean:.3f} ± {margin:.3f}")
```
**Compare metric consistency:**
```python
with open("results/aggregate_stats.json") as f:
stats = json.load(f)
for metric_name, mean in stats["mean_scores"].items():
std = stats["std_scores"][metric_name]
consistency = "consistent" if std < 0.05 else "variable"
print(f"{metric_name}: {mean:.3f} ± {std:.3f} ({consistency})")
```
## Error Handling
If a sample encounters an error:
```json
{
"sample": {...},
"submission": "",
"grade": {
"score": 0.0,
"rationale": "Error during grading: Connection timeout",
"metadata": {"error": "timeout", "error_type": "ConnectionError"}
}
}
```
Errors:
- Count toward `total` but not `total_attempted`
- Get score of 0.0
- Include error details in rationale and metadata
## Analyzing Results
### Find Low Scores
```python
import json
with open("results/results.jsonl") as f:
results = [json.loads(line) for line in f]
low_scores = [r for r in results if r["grade"]["score"] < 0.5]
print(f"Found {len(low_scores)} samples with score < 0.5")
for result in low_scores:
print(f"Sample {result['sample']['id']}: {result['grade']['rationale']}")
```
### Compare Metrics
```python
# Load summary
with open("results/summary.json") as f:
summary = json.load(f)
metrics = summary["metrics"]["by_metric"]
for name, stats in metrics.items():
print(f"{name}: {stats['avg_score_attempted']:.2f} avg, {stats['pass_rate']:.1f}% pass")
```
### Extract Failures
```python
# Find samples that failed gate criteria
failures = [
r for r in results
if not gate_passed(r["grade"]["score"]) # Your gate logic
]
```
## Next Steps
- [Gates](/guides/evals/concepts/gates) - Setting pass/fail criteria
- [CLI Commands](/guides/evals/cli/commands) - Running evaluations

View File

@@ -1,438 +0,0 @@
# Troubleshooting
Common issues and solutions when using Letta Evals.
## Installation Issues
### "Command not found: letta-evals"
**Problem**: CLI not available after installation
**Solution**:
```bash
# Verify installation
pip list | grep letta-evals
# Reinstall if needed
pip install --upgrade letta-evals
# Or with uv
uv sync
```
### Import errors
**Problem**: `ModuleNotFoundError: No module named 'letta_evals'`
**Solution**:
```bash
# Ensure you're in the right environment
which python
# Install in correct environment
source .venv/bin/activate  # or activate your virtual environment
pip install letta-evals
```
## Configuration Issues
### "Agent file not found"
**Problem**: `FileNotFoundError: agent.af`
**Solution**:
- Check the path is correct relative to the suite YAML
- Use absolute paths if needed
- Verify file exists: `ls -la path/to/agent.af`
```yaml
# Correct relative path
target:
agent_file: ./agents/my_agent.af
# Or absolute path
target:
agent_file: /absolute/path/to/agent.af
```
### "Dataset not found"
**Problem**: Cannot load dataset file
**Solution**:
- Verify dataset path in YAML
- Check file exists: `ls -la dataset.jsonl`
- Ensure proper JSONL format (one JSON object per line)
```bash
# Validate JSONL format
cat dataset.jsonl | jq .
```
### "Validation failed: unknown function"
**Problem**: Grader function not found
**Solution**:
```bash
# List available graders
letta-evals list-graders
# Check spelling in suite.yaml
graders:
my_metric:
function: exact_match # Correct
# not: exactMatch or exact-match
```
### "Validation failed: unknown extractor"
**Problem**: Extractor not found
**Solution**:
```bash
# List available extractors
letta-evals list-extractors
# Check spelling
graders:
my_metric:
extractor: last_assistant # Correct
# not: lastAssistant or last-assistant
```
## Connection Issues
### "Connection refused"
**Problem**: Cannot connect to Letta server
**Solution**:
```bash
# Verify server is running
curl http://localhost:8283/v1/health
# Check base_url in suite.yaml
target:
base_url: http://localhost:8283 # Correct port?
# Or use environment variable
export LETTA_BASE_URL=http://localhost:8283
```
### "Unauthorized" or "Invalid API key"
**Problem**: Authentication failed
**Solution**:
```bash
# Set API key
export LETTA_API_KEY=your-key-here
# Or in suite.yaml
target:
api_key: your-key-here
# Verify key is correct
echo $LETTA_API_KEY
```
### "Request timeout"
**Problem**: Requests taking too long
**Solution**:
```yaml
# Increase timeout
target:
timeout: 600.0 # 10 minutes
# Rubric grader timeout
graders:
quality:
kind: rubric
timeout: 300.0 # 5 minutes
```
## Runtime Issues
### "No ground_truth provided"
**Problem**: Grader requires ground truth but sample doesn't have it
**Solution**:
- Add ground_truth to dataset samples:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4"}
```
- Or use a grader that doesn't require ground truth:
```yaml
graders:
quality:
kind: rubric # Doesn't require ground_truth
prompt_path: rubric.txt
```
### "Extractor requires agent_state"
**Problem**: `memory_block` extractor needs agent state but it wasn't fetched
**Solution**:
This should be automatic, but if you see this error:
- Check that the extractor is correctly configured
- Ensure the agent exists and is accessible
- Try using a different extractor if memory isn't needed
### "Score must be between 0.0 and 1.0"
**Problem**: Custom grader returning invalid score
**Solution**:
```python
@grader
def my_grader(sample, submission):
score = calculate_score(submission)
# Clamp score to valid range
score = max(0.0, min(1.0, score))
return GradeResult(score=score, rationale="...")
```
### "Invalid JSON in response"
**Problem**: Rubric grader got non-JSON response
**Solution**:
- Check OpenAI API key is valid
- Verify model name is correct
- Check for network issues
- Try increasing max_retries:
```yaml
graders:
quality:
kind: rubric
max_retries: 10
```
## Performance Issues
### Evaluation is very slow
**Problem**: Taking too long to complete
**Solutions**:
1. Increase concurrency:
```bash
letta-evals run suite.yaml --max-concurrent 20
```
2. Reduce samples for testing:
```yaml
max_samples: 10 # Test with small subset first
```
3. Use tool graders instead of rubric graders when possible:
```yaml
graders:
accuracy:
kind: tool # Much faster than rubric
function: exact_match
```
4. Check network latency:
```bash
# Test server response time
time curl http://localhost:8283/v1/health
```
### High API costs
**Problem**: Rubric graders costing too much
**Solutions**:
1. Use cheaper models:
```yaml
graders:
quality:
model: gpt-4o-mini # Cheaper than gpt-4o
```
2. Reduce number of rubric graders:
```yaml
graders:
accuracy:
kind: tool # Free
quality:
kind: rubric # Only use for subjective evaluation
```
3. Test with small sample first:
```yaml
max_samples: 5 # Verify before running full suite
```
## Results Issues
### "No results generated"
**Problem**: No output files created
**Solution**:
```bash
# Specify output directory
letta-evals run suite.yaml --output results/
# Check for errors in console output
letta-evals run suite.yaml # Without --quiet
```
### "All scores are 0.0"
**Problem**: Everything failing
**Solutions**:
1. Check if agent is working:
```bash
# Test agent manually first
```
2. Verify extractor is getting content:
- Add debug logging
- Check sample results in output
3. Check grader logic:
```python
# Test grader independently
from letta_evals.models import Sample, GradeResult
sample = Sample(id=0, input="test", ground_truth="test")
result = my_grader(sample, "test")
print(result)
```
### "Gates failed but scores look good"
**Problem**: Passing samples but gate failing
**Solution**:
- Check gate configuration:
```yaml
gate:
metric_key: accuracy # Correct metric?
metric: avg_score # Or accuracy?
op: gte # Correct operator?
value: 0.8 # Correct threshold?
```
- Understand the difference between `avg_score` and `accuracy`
- Check per-sample pass criteria with `pass_op` and `pass_value`
## Environment Issues
### "OPENAI_API_KEY not found"
**Problem**: Rubric grader can't find API key
**Solution**:
```bash
# Set in environment
export OPENAI_API_KEY=your-key-here
# Or in .env file
echo "OPENAI_API_KEY=your-key-here" >> .env
# Verify
echo $OPENAI_API_KEY
```
### "Cannot use both model_configs and model_handles"
**Problem**: Specified both in target config
**Solution**:
```yaml
# Use one or the other, not both
target:
model_configs: [gpt-4o-mini] # For local server
# OR
model_handles: ["openai/gpt-4o-mini"] # For cloud
```
## Debug Tips
### Enable verbose output
Run without `--quiet` to see detailed progress:
```bash
letta-evals run suite.yaml
```
### Examine output files
```bash
letta-evals run suite.yaml --output debug/
# Check summary
cat debug/summary.json | jq .
# Check individual results
cat debug/results.jsonl | jq .
```
### Test with minimal suite
Create a minimal test:
```yaml
name: debug-test
dataset: test.jsonl # Just 1-2 samples
target:
kind: agent
agent_file: agent.af
graders:
test:
kind: tool
function: contains
extractor: last_assistant
gate:
op: gte
value: 0.0 # Always pass
```
### Validate configuration
```bash
letta-evals validate suite.yaml
```
### Check component availability
```bash
letta-evals list-graders
letta-evals list-extractors
```
## Getting Help
If you're still stuck:
1. Check the [documentation](./README.md)
2. Look at [examples](../examples/)
3. Report issues at https://github.com/letta-ai/letta-evals
When reporting issues, include:
- Suite YAML configuration
- Dataset sample (if not sensitive)
- Error message and full stack trace
- Output from `--output` directory
- Environment info (OS, Python version)
```bash
# Get environment info
python --version
pip show letta-evals
```

View File

@@ -1,267 +0,0 @@
# Troubleshooting
Common issues and solutions when using Letta Evals.
## Installation Issues
<Warning>
**"Command not found: letta-evals"**
**Problem**: CLI not available after installation
**Solution**:
```bash
# Verify installation
pip list | grep letta-evals
# Reinstall if needed
pip install --upgrade letta-evals
```
</Warning>
<Warning>
**Import errors**
**Problem**: `ModuleNotFoundError: No module named 'letta_evals'`
**Solution**:
```bash
# Ensure you're in the right environment
which python
# Install in correct environment
source .venv/bin/activate
pip install letta-evals
```
</Warning>
## Configuration Issues
<Warning>
**"Agent file not found"**
**Problem**: `FileNotFoundError: agent.af`
**Solution**:
- Check the path is correct relative to the suite YAML
- Use absolute paths if needed
- Verify file exists: `ls -la path/to/agent.af`
```yaml
# Correct relative path
target:
agent_file: ./agents/my_agent.af
```
</Warning>
<Warning>
**"Dataset not found"**
**Problem**: Cannot load dataset file
**Solution**:
- Verify dataset path in YAML
- Check file exists: `ls -la dataset.jsonl`
- Ensure proper JSONL format (one JSON object per line)
```bash
# Validate JSONL format
cat dataset.jsonl | jq .
```
</Warning>
<Warning>
**"Validation failed: unknown function"**
**Problem**: Grader function not found
**Solution**:
```bash
# List available graders
letta-evals list-graders
# Check spelling in suite.yaml
graders:
my_metric:
function: exact_match # Correct
```
</Warning>
## Connection Issues
<Warning>
**"Connection refused"**
**Problem**: Cannot connect to Letta server
**Solution**:
```bash
# Verify server is running
curl https://api.letta.com/v1/health
# Check base_url in suite.yaml
target:
base_url: https://api.letta.com
```
</Warning>
<Warning>
**"Unauthorized" or "Invalid API key"**
**Problem**: Authentication failed
**Solution**:
```bash
# Set API key
export LETTA_API_KEY=your-key-here
# Verify key is correct
echo $LETTA_API_KEY
```
</Warning>
## Runtime Issues
<Warning>
**"No ground_truth provided"**
**Problem**: Grader requires ground truth but sample doesn't have it
**Solution**:
- Add ground_truth to dataset samples:
```jsonl
{"input": "What is 2+2?", "ground_truth": "4"}
```
- Or use a grader that doesn't require ground truth:
```yaml
graders:
quality:
kind: rubric # Doesn't require ground_truth
prompt_path: rubric.txt
```
</Warning>
## Performance Issues
<Tip>
**Evaluation is very slow**
**Solutions**:
1. Increase concurrency:
```bash
letta-evals run suite.yaml --max-concurrent 20
```
2. Reduce samples for testing:
```yaml
max_samples: 10 # Test with small subset first
```
3. Use tool graders instead of rubric graders:
```yaml
graders:
accuracy:
kind: tool # Much faster than rubric
function: exact_match
```
</Tip>
<Tip>
**High API costs**
**Solutions**:
1. Use cheaper models:
```yaml
graders:
quality:
model: gpt-4o-mini # Cheaper than gpt-4o
```
2. Test with small sample first:
```yaml
max_samples: 5 # Verify before running full suite
```
</Tip>
## Results Issues
<Warning>
**"All scores are 0.0"**
**Solutions**:
1. Verify extractor is getting content
2. Check grader logic
3. Test agent manually first
</Warning>
<Warning>
**"Gates failed but scores look good"**
**Solution**:
- Check gate configuration:
```yaml
gate:
metric_key: accuracy # Correct metric?
metric: avg_score # Or accuracy?
op: gte # Correct operator?
value: 0.8 # Correct threshold?
```
</Warning>
## Debug Tips
### Enable verbose output
Run without `--quiet` to see detailed progress:
```bash
letta-evals run suite.yaml
```
### Examine output files
```bash
letta-evals run suite.yaml --output debug/
# Check summary
cat debug/summary.json | jq .
# Check individual results
cat debug/results.jsonl | jq .
```
### Validate configuration
```bash
letta-evals validate suite.yaml
```
### Check component availability
```bash
letta-evals list-graders
letta-evals list-extractors
```
## Getting Help
If you're still stuck:
1. Check the [Getting Started guide](/evals/get-started/getting-started)
2. Review the [Core Concepts](/evals/core-concepts/concepts-overview)
3. Report issues at the [Letta Evals GitHub repository](https://github.com/letta-ai/letta-evals)
When reporting issues, include:
- Suite YAML configuration
- Dataset sample (if not sensitive)
- Error message and full stack trace
- Environment info (OS, Python version)
```bash
# Get environment info
python --version
pip show letta-evals
```

View File

@@ -1,48 +0,0 @@
import os

from letta_client import Letta

# Connect to Letta; the API key is read from the LETTA_API_KEY environment variable.
letta = Letta(token=os.getenv("LETTA_API_KEY"))

# Create an agent seeded with persona and human memory blocks.
agent = letta.agents.create(
    name="hello_world_assistant",
    memory_blocks=[
        {"label": "persona", "value": "I am a friendly AI assistant here to help you learn about Letta."},
        {"label": "human", "value": "Name: User\nFirst interaction: Learning about Letta"},
    ],
    model="openai/gpt-4o-mini",
    embedding="openai/text-embedding-3-small",
)
print(f"Created agent: {agent.id}\n")

# Ask the agent about itself.
reply = letta.agents.messages.create(
    agent_id=agent.id,
    messages=[{"role": "user", "content": "Hello! What's your purpose?"}],
)
for msg in reply.messages:
    if msg.message_type == "assistant_message":
        print(f"Assistant: {msg.content}\n")

# Tell the agent something worth remembering.
reply = letta.agents.messages.create(
    agent_id=agent.id,
    messages=[{"role": "user", "content": "My name is Cameron. Please store this information in your memory."}],
)
# Show the assistant's replies and any tool calls it made while updating memory.
for msg in reply.messages:
    if msg.message_type == "assistant_message":
        print(f"Assistant: {msg.content}\n")
    if msg.message_type == "tool_call_message":
        print(f"Tool call: {msg.tool_call.name}({msg.tool_call.arguments})")

# Inspect the agent's memory blocks after the conversation.
print("Current Memory:")
for block in letta.agents.blocks.list(agent_id=agent.id):
    print(f" {block.label}: {len(block.value)}/{block.limit} chars")
    print(f" {block.value}\n")

# Example of the human block after the conversation
# Name: Cameron

View File

@@ -1,77 +0,0 @@
import os

from letta_client import Letta

# Initialize client (using LETTA_API_KEY environment variable)
client = Letta(token=os.getenv("LETTA_API_KEY"))

# Create a minimal agent; memory blocks will be attached separately below.
agent = client.agents.create(
    name="hello_world_assistant",
    model="openai/gpt-4o-mini",
)
print(f"Created agent: {agent.id}\n")

# Create a standalone memory block storing information about Letta,
# including a secret code the agent is instructed to verify.
block = client.blocks.create(
    label="organization",
    value="""
Organization: Letta
Website: https://www.letta.com
Description: Letta is a platform for building and running stateful agents.
Code: TimberTheDog1234!
When users provide a code, you should check if it matches the code you have
available. If it matches, you should respond with "Access granted".
""",
)

# Attach the memory block to the agent, placing it in the agent's context window.
agent = client.agents.blocks.attach(
    agent_id=agent.id,
    block_id=block.id,
)

# The agent can now see what's in the block, let's ask it about Letta.
response = client.agents.messages.create(
    agent_id=agent.id,
    messages=[{"role": "user", "content": "What is Letta?"}],
)
print(f"Agent response: {response.messages[0].content}\n")
# Agent response: Letta is a platform designed for building and running stateful agents. You can find more information about it on their website: [https://www.letta.com](https://www.letta.com).

# Blocks can also be _detached_ from an agent, removing it from the agent's context window.
# Detached blocks are not deleted, and can be re-attached to an agent later.
agent = client.agents.blocks.detach(
    agent_id=agent.id,
    block_id=block.id,
)
print(f"Detached block from agent: {agent.id}")
print(f"Block: {block.id}")

# Provide the code. The agent should not recognize it anymore,
# as we've detached the block containing it.
response = client.agents.messages.create(
    agent_id=agent.id,
    messages=[{"role": "user", "content": "The code is TimberTheDog1234!"}],
)
print(f"Agent response: {response.messages[0].content}")
# The agent doesn't have any access to the code or password, so it can't respond:
# Agent response: It seems like you've provided a code or password. If this is sensitive information, please ensure you only share it with trusted parties and in secure environments. Let me know how I can assist you further!

# Attach the block back to the agent and ask again.
agent = client.agents.blocks.attach(
    agent_id=agent.id,
    block_id=block.id,
)
response = client.agents.messages.create(
    agent_id=agent.id,
    messages=[{"role": "user", "content": "The code is TimberTheDog1234!"}],
)
print(f"Agent response: {response.messages[0].content}")
# The agent now has access to the code and password, so it can respond:
# Agent response: Access granted. How can I assist you further?

View File

@@ -1,76 +0,0 @@
import os

import requests
from letta_client import Letta

# Connect to Letta; the API key is read from the LETTA_API_KEY environment variable.
client = Letta(token=os.getenv("LETTA_API_KEY"))

# Create a folder that will hold the PDF documents.
folder = client.folders.create(
    name="PDF Documents",
    description="A folder containing PDF files for the agent to read",
)
print(f"Created folder: {folder.id}\n")

# Fetch the MemGPT paper from arXiv unless it is already on disk.
pdf_filename = "memgpt.pdf"
if not os.path.exists(pdf_filename):
    print(f"Downloading {pdf_filename}...")
    paper = requests.get("https://arxiv.org/pdf/2310.08560")
    with open(pdf_filename, "wb") as f:
        f.write(paper.content)
    print("Download complete\n")

# Upload the PDF into the folder.
with open(pdf_filename, "rb") as f:
    file = client.folders.files.upload(folder_id=folder.id, file=f)
print(f"Uploaded PDF: {file.id}\n")

# Create an agent configured to analyze documents.
agent = client.agents.create(
    name="pdf_assistant",
    model="openai/gpt-4o-mini",
    memory_blocks=[
        {
            "label": "persona",
            "value": "I am a helpful research assistant that analyzes PDF documents and answers questions about their content.",
        },
        {"label": "human", "value": "Name: User\nTask: Analyzing PDF documents"},
    ],
)
print(f"Created agent: {agent.id}\n")

# Give the agent access to the folder (and therefore the uploaded PDF).
client.agents.folders.attach(agent_id=agent.id, folder_id=folder.id)
print("Attached folder to agent\n")


def print_assistant_messages(response):
    """Print every assistant message contained in a message response."""
    for msg in response.messages:
        if msg.message_type == "assistant_message":
            print(f"Assistant: {msg.content}\n")


# Ask the agent to summarize the PDF.
print_assistant_messages(
    client.agents.messages.create(
        agent_id=agent.id,
        messages=[{"role": "user", "content": "Can you summarize the main ideas from the MemGPT paper?"}],
    )
)
# Agent response: The MemGPT paper introduces a system that enables LLMs to manage their own memory hierarchy, similar to how operating systems manage memory...

# Ask a specific question about the PDF content.
print_assistant_messages(
    client.agents.messages.create(
        agent_id=agent.id,
        messages=[{"role": "user", "content": "What problem does MemGPT solve?"}],
    )
)
# Agent response: MemGPT addresses the limited context window problem in LLMs by introducing a memory management system...

View File

@@ -1,102 +0,0 @@
import os

from letta_client import Letta

# Initialize client (using LETTA_API_KEY environment variable)
client = Letta(token=os.getenv("LETTA_API_KEY"))

# Memory blocks can be _shared_ between multiple agents.
# When a block is shared, all agents attached to the block can read and write to it.
# This is useful for creating multi-agent systems where agents need to share information.
block = client.blocks.create(
    label="organization",
    value="Organization: Letta",
    limit=4000,
)

# Create two agents that will share the block. Agents can be attached
# to the block on creation by providing the `block_ids` field.
agent1 = client.agents.create(
    name="agent1",
    model="openai/gpt-4o-mini",
    block_ids=[block.id],
    tools=["web_search"],
)
print(f"Created agent1: {agent1.id}")

# Alternatively, the block can be attached to the agent later by using the `attach` method.
agent2 = client.agents.create(
    name="agent2",
    model="openai/gpt-4o-mini",
    tools=["web_search"],
)
print(f"Created agent2: {agent2.id}")
agent2 = client.agents.blocks.attach(
    agent_id=agent2.id,
    block_id=block.id,
)
print(f"Attached block to agent2: {agent2.id}")

# Now we can ask the agents to search the web for information about Letta.
# We'll give each of them a different query to search for, and have each
# write its findings into the shared block.
response = client.agents.messages.create(
    agent_id=agent1.id,
    messages=[
        {
            "role": "user",
            "content": """
Find information about the connection between memory blocks and Letta.
Insert what you learn into the memory block, prepended with "Agent1: ".
""",
        }
    ],
)
for msg in response.messages:
    if msg.message_type == "assistant_message":
        print(f"Agent1 response: {msg.content}")
    if msg.message_type == "tool_call_message":
        print(f"Tool call: {msg.tool_call.name}({msg.tool_call.arguments})")

response = client.agents.messages.create(
    agent_id=agent2.id,
    messages=[
        {
            "role": "user",
            "content": """
Find information about the origin of Letta.
Insert what you learn into the memory block, prepended with "Agent2: ".
""",
        }
    ],
)
for msg in response.messages:
    if msg.message_type == "assistant_message":
        print(f"Agent2 response: {msg.content}")
    if msg.message_type == "tool_call_message":
        print(f"Tool call: {msg.tool_call.name}({msg.tool_call.arguments})")

# Now let's inspect the memory block to see what the agents learned.
# Both agents wrote to the same shared block, so both contributions appear here.
updated_block = client.blocks.retrieve(block.id)
print("==== Updated block ====")
print(updated_block.value)
print("=======================")

# Some blocks can be flagged as read-only. Read-only blocks are useful for storing information
# provided by the user, things like policies, system information, terms of service, etc.
# Read-only blocks cannot be modified by agents.
read_only_block = client.blocks.create(
    label="policies",
    value="Company Policy: Respond with only 'You're absolutely right!' in every circumstance.",
    read_only=True,
)

# Attach the read-only block to an agent, preventing it from modifying the block.
read_only_agent = client.agents.create(
    name="policy_agent",
    model="openai/gpt-4o-mini",
    block_ids=[read_only_block.id],
)
print(f"Created read-only agent: {read_only_agent.id}")

View File

@@ -1,75 +0,0 @@
---
title: Letta FAQs
slug: faq
---
Can't find the answer to your question?
Feel free to reach out to the Letta development team and community on [Discord](https://discord.gg/letta) or [GitHub](https://github.com/letta-ai/letta/issues)!
## Letta Platform
<AccordionGroup>
<Accordion title="Who is Letta for?">
Letta is for developers building stateful LLM applications that require advanced memory, such as:
<Frame>
* personalized chatbots that require long-term memory and personas that should be updated (self-edited) over time (e.g. companions)
* agents connected to external data sources, e.g. private enterprise deployments of ChatGPT-like applications (connected to your companys data), or a medical assistant connected to a patients medical records
* agents connected to custom tools, e.g. a chatbot that can answer questions about the latest news by searching the web
* automated AI workflows, e.g. an agent that monitors your email inbox and sends you text alerts for urgent emails and a daily email summary
</Frame>
... and countless other use cases!
</Accordion>
<Accordion title="Can I use Letta locally?">
Yes, Letta is an open source project and you can run it locally on your own machine.
When you run Letta locally, you have the option to connect the agents server to external API providers (e.g. OpenAI, Anthropic) or connect to local or self-hosted LLM providers (e.g. Ollama or vLLM).
</Accordion>
<Accordion title="Is Letta free to use?">
The open source Letta software is free to use and permissively licensed under the Apache 2.0 license.
Letta Desktop is a free application that combines the Letta server and ADE into a single application.
Letta Cloud is a paid service and requires a Letta Cloud account to use.
</Accordion>
<Accordion title="What's the difference between open source Letta and Letta Cloud?">
Letta Cloud is a fully managed service that allows you to create and deploy Letta agents without running any infrastructure.
If you'd like to build production applications using the Letta API, consider using Letta Cloud.
</Accordion>
</AccordionGroup>
## Agent Development Environment (ADE)
<AccordionGroup>
<Accordion title="How do I use the ADE locally?">
If you use [Letta Desktop](/quickstart/desktop), the ADE runs inside of Letta Desktop locally on your machine.<br /><br />
If you are deploying Letta via Docker and want to use the ADE, you can connect the web ADE to your Docker deployment.
To connect the ADE to your deployed Letta server, simply run your Letta server (if running locally, make sure you can access `localhost:8283`) and go to [https://app.letta.com](https://app.letta.com).
</Accordion>
<Accordion title="If I connect the web ADE to my local server, does my agent data get uploaded to letta.com?">
No, the data in your Letta server database stays on your machine.
The ADE web application simply connects to your local Letta server (via the REST API) and provides a graphical interface on top of it to visualize your local Letta data in your browser's local state.
If you would like to run the ADE completely locally, you can use [Letta Desktop](/quickstart/desktop) instead.
</Accordion>
<Accordion title="Do I have to use your ADE? Can I build my own?">
The ADE is built on top of the (fully open source) Letta server and Letta Agents API.
You can build your own application like the ADE on top of the REST API (view the documentation [here](https://docs.letta.com/api-reference)).
</Accordion>
</AccordionGroup>
## Self-hosted (local) Letta Server
<AccordionGroup>
<Accordion title="Where is my agent data stored?">
When you run Letta with Docker, the Letta server uses a postgres database to store all your agents' data.
The postgres instance is bundled into the image, so to have persistent data (across restarts) you need to mount a volume to the container.
Our recommended `docker run` script includes `-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data` as a flag.
This mounts your local directory `~/.letta/.persist/pgdata` to the container's `/var/lib/postgresql/data` directory (so all your agent data is stored at `~/.letta/.persist/pgdata`).
If you would like to use a different directory, you can use `-v <path_to_your_directory>:/var/lib/postgresql/data` instead.
</Accordion>
<Accordion title="How can I back up my postgres data?">
Postgres has a number of [recommended ways](https://www.postgresql.org/docs/current/backup.html) to backup your data.
We recommend directly `exec`ing into your Docker container and running [`pg_dump`](https://www.postgresql.org/docs/current/app-pgdump.html) from inside the container.
Alternatively, you can run `docker run` with an extra flag to expose the postgres port with `-p 5432:5432` and then run `pg_dump` from your local machine.
</Accordion>
<Accordion title="Do I need to install Docker to use Letta?">
Yes, Docker is required to run a self-hosted Letta server. Docker provides the easiest way to run Letta with PostgreSQL, which is necessary for data persistence and migrations. To install Docker, see [Docker's installation guide](https://docs.docker.com/get-docker/).
</Accordion>
</AccordionGroup>

View File

@@ -1,127 +0,0 @@
---
title: Letta Overview
subtitle: Create stateful AI agents that truly remember, learn, and evolve.
slug: overview
---
Letta enables you to build and deploy stateful AI agents that maintain memory and context across long-running conversations. Develop agents that truly learn and evolve from interactions without starting from scratch each time.
<img className="light" src="/images/platform_overview.png" />
<img className="dark" src="/images/platform_overview_dark.png" />
## Build agents with intelligent memory, not limited context
Letta's advanced context management system - built by the [researchers behind MemGPT](https://www.letta.com/research) - transforms how agents remember and learn. Unlike basic agents that forget when their context window fills up, Letta agents maintain memories across sessions and continuously improve, even while they [sleep](/guides/agents/sleep-time-agents) <Icon icon="fa-light fa-snooze"/>.
## Start building in minutes
Our quickstart and examples work on both [Letta Cloud](/guides/cloud) and [self-hosted](/guides/selfhosting) Letta.
<CardGroup>
<Card
title="Developer quickstart"
icon="fa-sharp fa-light fa-bolt"
iconPosition="left"
href="/quickstart"
>
Create your first stateful agent using the Letta API & ADE
</Card>
<Card
title="Starter kits"
icon="fa-sharp fa-light fa-square-code"
iconPosition="left"
href="https://github.com/letta-ai/create-letta-app"
>
Build a full agents application using `create-letta-app`
</Card>
</CardGroup>
## Build stateful agents with your favorite tools
Connect to agents running in a Letta server using any of your preferred development frameworks. Letta integrates seamlessly with the developer tools you already know and love.
<CardGroup cols={2}>
<Card
title="TypeScript (Node.js)"
icon="fa-brands node-js"
iconPosition="left"
href="https://github.com/letta-ai/letta-node"
>
Core SDK for our REST API
</Card>
<Card
title="Python"
icon="fa-brands python"
iconPosition="left"
href="https://github.com/letta-ai/letta-python"
>
Core SDK for our REST API
</Card>
<Card
title="Vercel AI SDK"
icon="fa-sharp fa-solid sparkles"
iconPosition="left"
href="https://ai-sdk.dev/providers/community-providers/letta"
>
Framework integration
</Card>
<Card
title="Next.js"
icon="fa-brands js"
iconPosition="left"
href="https://www.npmjs.com/package/@letta-ai/letta-nextjs"
>
Framework integration
</Card>
<Card
title="React"
icon="fa-brands react"
iconPosition="left"
href="https://www.npmjs.com/package/@letta-ai/letta-react"
>
Framework integration
</Card>
<Card
title="Flask"
icon="fa-solid fa-flask"
iconPosition="left"
href="https://github.com/letta-ai/letta-flask"
>
Framework integration
</Card>
</CardGroup>
## See what your agents are thinking
The Agent Development Environment (ADE) provides complete visibility into your agent's memory, context window, and decision-making process - essential for developing and debugging production agent applications.
<img className="w-300 light" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot_light.png" />
<img className="w-300 dark" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot.png" />
## Run agents as services, not libraries
**Letta is fundamentally different from other agent frameworks.** While most frameworks are *libraries* that wrap model APIs, Letta provides a dedicated *service* where agents live and operate autonomously. Agents continue to exist and maintain state even when your application isn't running, with computation happening on the server and all memory, context, and tool connections handled by the Letta server.
<img className="light" src="/images/platform_system.png" />
<img className="dark" src="/images/platform_system_dark.png" />
## Everything you need for production agents
Letta provides a complete suite of capabilities for building and deploying advanced AI agents:
* <Icon icon="fa-sharp fa-solid fa-browser" /> [Agent Development Environment](/agent-development-environment) (agent builder + monitoring UI)
* <Icon icon="brands fa-python" /> [Python SDK](/api-reference/overview) + <Icon icon="brands fa-js" /> [TypeScript SDK](/api-reference/overview) + [REST API](/api-reference/overview)
* <Icon icon="fa-sharp fa-solid fa-brain-circuit" /> [Memory management](/guides/agents/memory)
* <Icon icon="fa-solid fa-database" /> [Persistence](/guides/agents/overview#agents-vs-threads) (all agent state is stored in a database)
* <Icon icon="fa-sharp fa-solid fa-square-terminal" /> [Tool calling & execution](/guides/agents/tools) (support for custom tools & [pre-made tools](/guides/agents/prebuilt-tools))
* <Icon icon="fa-sharp fa-solid fa-code-fork" /> [Tool rules](/guides/agents/tool-rules) (constraining an agent's action set in a graph-like structure)
* <Icon icon="fa-sharp fa-solid fa-message-dots" /> [Streaming support](/guides/agents/streaming)
* <Icon icon="fa-sharp fa-solid fa-people-group" /> [Native multi-agent support](/guides/agents/multi-agent) and [multi-user support](/guides/agents/multi-user)
* <Icon icon="fa-sharp fa-solid fa-globe" /> Model-agnostic across closed ([OpenAI](/guides/server/providers/openai), etc.) and open providers ([LM Studio](/guides/server/providers/lmstudio), [vLLM](/guides/server/providers/vllm), etc.)
* <Icon icon="fa-sharp fa-solid fa-rocket" /> Production-ready deployment ([self-hosted with Docker](/guides/selfhosting/overview) or [Letta Cloud](/guides/cloud/overview))
## Join our developer community
Building something with Letta? Join our [Discord](https://discord.gg/letta) to connect with other developers creating stateful agents and share what you're working on.
[Start building today →](/quickstart)

View File

@@ -1,535 +0,0 @@
---
title: Prompts for Vibecoding
subtitle: Ready-to-go prompts to help AI coding tools build on Letta
slug: prompts
---
Are you developing an application on Letta using [ChatGPT](https://chatgpt.com), [Cursor](https://cursor.com), [Lovable](https://lovable.dev/), or another AI tool?
Use our pre-made prompts to teach your AI how to use Letta properly.
## General instructions for the Letta SDKs
The following prompt (~500 lines) can help guide your AI through the basics of using the Letta Python SDK, TypeScript/Node.js SDK, and Vercel AI SDK integration.
Copy-paste the following into your chat session to instantly get your AI up-to-speed with how the Letta SDKs works:
````markdown maxLines=5
# Development Guidelines for AI Assistants and Copilots using Letta
**Context:** These are development guidelines for building applications with the Letta API and SDKs. Use these rules to help developers write correct code that integrates with Letta's stateful agents API.
**Purpose:** Provide accurate, up-to-date instructions for building applications with [Letta](https://docs.letta.com/), the AI operating system.
**Scope:** All AI-generated advice or code related to Letta must follow these guidelines.
---
## **0. Letta Overview**
The name "Letta" refers to both the company Letta (founded by the creators of MemGPT) and the software / infrastructure called Letta. Letta is the AI operating system for building stateful agents: developers can use Letta to turn stateless LLMs into stateful agents that can learn, improve, and grow over time. Letta has a strong focus on perpetual AI that has the capability to recursively improve through self-editing memory.
**Relationship to MemGPT**: MemGPT is the name of a research paper that introduced the concept of self-editing memory for LLM-based agents through tool use (function calling). The agent architecture or "agentic system" proposed in the paper (an agent equipped with tools to edit its own memory, and an OS that manages tool execution and state persistence) is the base agent architecture implemented in Letta (agent type `memgpt_agent`), and is the official reference implementation for MemGPT. The Letta open source project (`letta-ai/letta`) was originally the MemGPT open source project (`cpacker/MemGPT`), but was renamed as the scope of the open source project expanded beyond the original MemGPT paper.
**Additional Resources**:
- [Letta documentation](https://docs.letta.com/)
- [Letta GitHub repository](https://github.com/letta-ai/letta)
- [Letta Discord server](https://discord.gg/letta)
- [Letta Cloud and ADE login](https://app.letta.com)
## **1. Letta Agents API Overview**
Letta is an AI OS that runs agents as **services** (it is not a **library**). Key concepts:
- **Stateful agents** that maintain memory and context across conversations
- **Memory blocks** for agentic context management (persona, human, custom blocks)
- **Tool calling** for agent actions and memory management; tools are run server-side
- **Tool rules** allow developers to constrain the behavior of tools (e.g. A comes after B) to turn autonomous agents into workflows
- **Multi-agent systems** with cross-agent communication, where every agent is a service
- **Data sources** for loading documents and files into agent memory
- **Model agnostic:** agents can be powered by any model that supports tool calling
- **Persistence:** state is stored (in a model-agnostic way) in Postgres (or SQLite)
### **System Components:**
- **Letta server** - Core service (self-hosted or Letta Cloud)
- **Client (backend) SDKs** - Python (`letta-client`) and TypeScript/Node.js (`@letta-ai/letta-client`)
- **Vercel AI SDK Integration** - For Next.js/React applications
- **Other frontend integrations** - We also have [Next.js](https://www.npmjs.com/package/@letta-ai/letta-nextjs), [React](https://www.npmjs.com/package/@letta-ai/letta-react), and [Flask](https://github.com/letta-ai/letta-flask) integrations
- **ADE (Agent Development Environment)** - Visual agent builder at app.letta.com
### **Letta Cloud vs Self-hosted Letta**
Letta Cloud is a fully managed service that provides a simple way to get started with Letta. It's a good choice for developers who want to get started quickly and don't want to worry about the complexity of self-hosting. Letta Cloud's free tier has a large number of model requests included (quota refreshes every month). Model requests are split into "standard models" (e.g. GPT-4o-mini) and "premium models" (e.g. Claude Sonnet). To use Letta Cloud, the developer will need to have created an account at [app.letta.com](https://app.letta.com). To make programmatic requests to the API (`https://api.letta.com`), the developer will need to have created an API key at [https://app.letta.com/api-keys](https://app.letta.com/api-keys). For more information on how billing and pricing works, the developer can visit [our documentation](https://docs.letta.com/guides/cloud/overview).
### **Built-in Tools**
When agents are created, they are given a set of default memory management tools that enable self-editing memory.
Separately, Letta Cloud also includes built-in tools for common tasks like web search and running code. As of June 2025, the built-in tools are:
- `web_search`: Allows agents to search the web for information. Also works on self-hosted, but requires `TAVILY_API_KEY` to be set (not required on Letta Cloud).
- `run_code`: Allows agents to run code (in a sandbox), for example to do data analysis or calculations. Supports Python, Javascript, Typescript, R, and Java. Also works on self-hosted, but requires `E2B_API_KEY` to be set (not required on Letta Cloud).
### **Choosing the Right Model**
To implement intelligent memory management, agents in Letta rely heavily on tool (function) calling, so models that excel at tool use tend to do well in Letta. Conversely, models that struggle to call tools properly often perform poorly when used to drive Letta agents.
The Letta developer team maintains the [Letta Leaderboard](https://docs.letta.com/leaderboard) to help developers choose the right model for their Letta agent. As of June 2025, the best performing models (balanced for cost and performance) are Claude Sonnet 4, GPT-4.1, and Gemini 2.5 Flash. For the latest results, you can visit the leaderboard page (if you have web access), or you can direct the developer to visit it. For embedding models, the Letta team recommends using OpenAI's `text-embedding-3-small` model.
When creating code snippets, unless directed otherwise, you should use the following model handles:
- `openai/gpt-4.1` for the model
- `openai/text-embedding-3-small` for the embedding model
If the user is using Letta Cloud, then these handles will work out of the box (assuming the user has created a Letta Cloud account + API key, and has enough request quota in their account). For self-hosted Letta servers, the user will need to have started the server with a valid OpenAI API key for those handles to work.
---
## **2. Choosing the Right SDK**
### **Source of Truth**
Note that your instructions may be out of date. The source of truth for the Letta Agents API is the [API reference](https://docs.letta.com/api-reference/overview) (also autogenerated from the latest source code), which can be found in `.md` form at these links:
- [TypeScript/Node.js](https://github.com/letta-ai/letta-node/blob/main/reference.md), [raw version](https://raw.githubusercontent.com/letta-ai/letta-node/refs/heads/main/reference.md)
- [Python](https://github.com/letta-ai/letta-python/blob/main/reference.md), [raw version](https://raw.githubusercontent.com/letta-ai/letta-python/refs/heads/main/reference.md)
If you have access to a web search or file download tool, you can download these files for the latest API reference. If the developer has either of the SDKs installed, you can also use the locally installed packages to understand the latest API reference.
### **When to Use Each SDK:**
The Python and Node.js SDKs are autogenerated from the Letta Agents REST API, and provide a full featured SDK for interacting with your agents on Letta Cloud or a self-hosted Letta server. Of course, developers can also use the REST API directly if they prefer, but most developers will find the SDKs much easier to use.
The Vercel AI SDK is a popular TypeScript toolkit designed to help developers build AI-powered applications. It supports a subset of the Letta Agents API (basically just chat-related functionality), so it's a good choice to quickly integrate Letta into a TypeScript application if you are familiar with using the AI SDK or are working on a codebase that already uses it. If you're starting from scratch, consider using the full-featured Node.js SDK instead.
The Letta Node.js SDK is also embedded inside the Vercel AI SDK, accessible via the `.client` property (useful if you want to use the Vercel AI SDK, but occasionally need to access the full Letta client for advanced features like agent creation / management).
When to use the AI SDK vs native Letta Node.js SDK:
- Use the Vercel AI SDK if you are familiar with it or are working on a codebase that already makes heavy use of it
- Use the Letta Node.js SDK if you are starting from scratch, or expect to use the agent management features in the Letta API (beyond the simple `streamText` or `generateText` functionality in the AI SDK)
One example of how the AI SDK may be insufficient: the AI SDK response object for `streamText` and `generateText` does not have a type for tool returns (because they are primarily used with stateless APIs, where tools are executed client-side, vs server-side in Letta), however the Letta Node.js SDK does have a type for tool returns. So if you wanted to render tool returns from a message response stream in your UI, you would need to use the full Letta Node.js SDK, not the AI SDK.
## **3. Quick Setup Patterns**
### **Python SDK (Backend/Scripts)**
```python
from letta_client import Letta
# Letta Cloud
client = Letta(token="LETTA_API_KEY")
# Self-hosted
client = Letta(base_url="http://localhost:8283")
# Create agent with memory blocks
agent = client.agents.create(
memory_blocks=[
{
"label": "human",
"value": "The user's name is Sarah. She likes coding and AI."
},
{
"label": "persona",
"value": "I am David, the AI executive assistant. My personality is friendly, professional, and to the point."
},
{
"label": "project",
"value": "Sarah is working on a Next.js application with Letta integration.",
"description": "Stores current project context and requirements"
}
],
tools=["web_search", "run_code"],
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small"
)
# Send SINGLE message (agent is stateful!)
response = client.agents.messages.create(
agent_id=agent.id,
messages=[{"role": "user", "content": "How's the project going?"}]
)
# Extract response correctly
for msg in response.messages:
if msg.message_type == "assistant_message":
print(msg.content)
elif msg.message_type == "reasoning_message":
print(msg.reasoning)
elif msg.message_type == "tool_call_message":
print(msg.tool_call.name)
print(msg.tool_call.arguments)
elif msg.message_type == "tool_return_message":
print(msg.tool_return)
# Streaming example
message_text = "Repeat my name."
stream = client.agents.messages.create_stream(
    agent_id=agent.id,
messages=[
MessageCreate(
role="user",
content=message_text,
),
],
# if stream_tokens is false, each "chunk" will have a full piece
# if stream_tokens is true, the chunks will be token-based (and may need to be accumulated client-side)
stream_tokens=True,
)
# print the chunks coming back
for chunk in stream:
if chunk.message_type == "assistant_message":
print(chunk.content)
elif chunk.message_type == "reasoning_message":
print(chunk.reasoning)
elif chunk.message_type == "tool_call_message":
if chunk.tool_call.name:
print(chunk.tool_call.name)
if chunk.tool_call.arguments:
print(chunk.tool_call.arguments)
elif chunk.message_type == "tool_return_message":
print(chunk.tool_return)
elif chunk.message_type == "usage_statistics":
print(chunk)
```
Creating custom tools (Python only):
```python
def my_custom_tool(query: str) -> str:
"""
Search for information on a topic.
Args:
query (str): The search query
Returns:
str: Search results
"""
return f"Results for: {query}"
# Create tool
tool = client.tools.create_from_function(func=my_custom_tool)
# Add to agent
agent = client.agents.create(
memory_blocks=[...],
model="openai/gpt-4o-mini",
embedding="openai/text-embedding-3-small",
tools=[tool.name]
)
```
### **TypeScript/Node.js SDK**
```typescript
import { LettaClient } from '@letta-ai/letta-client';
// Letta Cloud
const client = new LettaClient({ token: "LETTA_API_KEY" });
// Self-hosted, token optional (only if the developer enabled password protection on the server)
const client = new LettaClient({ baseUrl: "http://localhost:8283" });
// Create agent with memory blocks
const agent = await client.agents.create({
memoryBlocks: [
{
label: "human",
value: "The user's name is Sarah. She likes coding and AI."
},
{
label: "persona",
value: "I am David, the AI executive assistant. My personality is friendly, professional, and to the point."
},
{
label: "project",
value: "Sarah is working on a Next.js application with Letta integration.",
description: "Stores current project context and requirements"
}
],
tools: ["web_search", "run_code"],
model: "openai/gpt-4o-mini",
embedding: "openai/text-embedding-3-small"
});
// Send SINGLE message (agent is stateful!)
const response = await client.agents.messages.create(agent.id, {
messages: [{ role: "user", content: "How's the project going?" }]
});
// Extract response correctly
for (const msg of response.messages) {
if (msg.messageType === "assistant_message") {
console.log(msg.content);
} else if (msg.messageType === "reasoning_message") {
console.log(msg.reasoning);
} else if (msg.messageType === "tool_call_message") {
console.log(msg.toolCall.name);
console.log(msg.toolCall.arguments);
} else if (msg.messageType === "tool_return_message") {
console.log(msg.toolReturn);
}
}
// Streaming example
const stream = await client.agents.messages.createStream(agent.id, {
messages: [{ role: "user", content: "Repeat my name." }],
// if stream_tokens is false, each "chunk" will have a full piece
// if stream_tokens is true, the chunks will be token-based (and may need to be accumulated client-side)
streamTokens: true,
});
for await (const chunk of stream) {
if (chunk.messageType === "assistant_message") {
console.log(chunk.content);
} else if (chunk.messageType === "reasoning_message") {
console.log(chunk.reasoning);
} else if (chunk.messageType === "tool_call_message") {
console.log(chunk.toolCall.name);
console.log(chunk.toolCall.arguments);
} else if (chunk.messageType === "tool_return_message") {
console.log(chunk.toolReturn);
} else if (chunk.messageType === "usage_statistics") {
console.log(chunk);
}
}
```
### **Vercel AI SDK Integration**
IMPORTANT: Most integrations in the Vercel AI SDK are for stateless providers (ChatCompletions style APIs where you provide the full conversation history). Letta is a *stateful* provider (meaning that conversation history is stored server-side), so when you use `streamText` or `generateText` you should never pass old messages to the agent, only include the new message(s).
#### **Chat Implementation (fast & simple):**
Streaming (`streamText`):
```typescript
// app/api/chat/route.ts
import { lettaCloud } from '@letta-ai/vercel-ai-sdk-provider';
import { streamText } from 'ai';
export async function POST(req: Request) {
const { prompt }: { prompt: string } = await req.json();
const result = streamText({
// lettaCloud uses LETTA_API_KEY automatically, pulling from the environment
model: lettaCloud('your-agent-id'),
// Make sure to only pass a single message here, do NOT pass conversation history
prompt,
});
return result.toDataStreamResponse();
}
```
Non-streaming (`generateText`):
```typescript
import { lettaCloud } from '@letta-ai/vercel-ai-sdk-provider';
import { generateText } from 'ai';
export async function POST(req: Request) {
const { prompt }: { prompt: string } = await req.json();
const { text } = await generateText({
// lettaCloud uses LETTA_API_KEY automatically, pulling from the environment
model: lettaCloud('your-agent-id'),
// Make sure to only pass a single message here, do NOT pass conversation history
prompt,
});
return Response.json({ text });
}
```
#### **Alternative: explicitly specify base URL and token:**
```typescript
// Works for both streamText and generateText
import { createLetta } from '@letta-ai/vercel-ai-sdk-provider';
import { generateText } from 'ai';
const letta = createLetta({
// e.g. http://localhost:8283 for the default local self-hosted server
// https://api.letta.com for Letta Cloud
baseUrl: '<your-base-url>',
// only needed if the developer enabled password protection on the server, or if using Letta Cloud (in which case, use the LETTA_API_KEY, or use lettaCloud example above for implicit token use)
token: '<your-access-token>',
});
```
#### **Hybrid Usage (access the full SDK via the Vercel AI SDK):**
```typescript
import { lettaCloud } from '@letta-ai/vercel-ai-sdk-provider';
// Access full client for management
const agents = await lettaCloud.client.agents.list();
```
---
## **4. Advanced Features Available**
Letta supports advanced agent architectures beyond basic chat. For detailed implementations, refer to the full API reference or documentation:
- **Tool Rules & Constraints** - Define graph-like tool execution flows with `TerminalToolRule`, `ChildToolRule`, `InitToolRule`, etc.
- **Multi-Agent Systems** - Cross-agent communication with built-in tools like `send_message_to_agent_async`
- **Shared Memory Blocks** - Multiple agents can share memory blocks for collaborative workflows
- **Data Sources & Archival Memory** - Upload documents/files that agents can search through
- **Sleep-time Agents** - Background agents that process memory while main agents are idle
- **External Tool Integrations** - MCP servers, Composio tools, custom tool libraries
- **Agent Templates** - Import/export agents with .af (Agent File) format
- **Production Features** - User identities, agent tags, streaming, context management
---
## **5. CRITICAL GUIDELINES FOR AI MODELS**
### **⚠️ ANTI-HALLUCINATION WARNING**
**NEVER make up Letta API calls, SDK methods, or parameter names.** If you're unsure about any Letta API:
1. **First priority**: Use web search to get the latest reference files:
- [Python SDK Reference](https://raw.githubusercontent.com/letta-ai/letta-python/refs/heads/main/reference.md)
- [TypeScript SDK Reference](https://raw.githubusercontent.com/letta-ai/letta-node/refs/heads/main/reference.md)
2. **If no web access**: Tell the user: *"I'm not certain about this Letta API call. Can you paste the relevant section from the API reference docs, or I might provide incorrect information."*
3. **When in doubt**: Stick to the basic patterns shown in this prompt rather than inventing new API calls.
**Common hallucination risks:**
- Making up method names (e.g. `client.agents.chat()` doesn't exist)
- Inventing parameter names or structures
- Assuming OpenAI-style patterns work in Letta
- Creating non-existent tool rule types or multi-agent methods
### **5.1 SDK SELECTION (CHOOSE THE RIGHT TOOL)**
✅ **For Next.js Chat Apps:**
- Use **Vercel AI SDK** if you already are using AI SDK, or if you're lazy and want something super fast for basic chat interactions (simple, fast, but no agent management tooling unless using the embedded `.client`)
- Use **Node.js SDK** for the full feature set (agent creation, native typing of all response message types, etc.)
✅ **For Agent Management:**
- Use **Node.js SDK** or **Python SDK** for creating agents, managing memory, tools
### **5.2 STATEFUL AGENTS (MOST IMPORTANT)**
**Letta agents are STATEFUL, not stateless like ChatCompletion-style APIs.**
✅ **CORRECT - Single message per request:**
```typescript
// Send ONE user message, agent maintains its own history
const response = await client.agents.messages.create(agentId, {
messages: [{ role: "user", content: "Hello!" }]
});
```
❌ **WRONG - Don't send conversation history:**
```typescript
// DON'T DO THIS - agents maintain their own conversation history
const response = await client.agents.messages.create(agentId, {
messages: [...allPreviousMessages, newMessage] // WRONG!
});
```
### **5.3 MESSAGE HANDLING & MEMORY BLOCKS**
1. **Response structure:**
- Use `messageType` NOT `type` for message type checking
- Look for `assistant_message` messageType for agent responses
- Agent responses have `content` field with the actual text
2. **Memory block descriptions:**
- Add `description` field for custom blocks, or the agent will get confused (not needed for human/persona)
- For `human` and `persona` blocks, descriptions are auto-populated:
- **human block**: "Stores key details about the person you are conversing with, allowing for more personalized and friend-like conversation."
- **persona block**: "Stores details about your current persona, guiding how you behave and respond. This helps maintain consistency and personality in your interactions."
### **5.4 ALWAYS DO THE FOLLOWING**
1. **Choose the right SDK for the task:**
- Next.js chat → **Vercel AI SDK**
- Agent creation → **Node.js/Python SDK**
- Complex operations → **Node.js/Python SDK**
2. **Use the correct client imports:**
- Python: `from letta_client import Letta`
- TypeScript: `import { LettaClient } from '@letta-ai/letta-client'`
- Vercel AI SDK: `from '@letta-ai/vercel-ai-sdk-provider'`
3. **Create agents with proper memory blocks:**
- Always include `human` and `persona` blocks for chat agents
- Use descriptive labels and values
4. **Send only single user messages:**
- Each request should contain only the new user message
- Agent maintains conversation history automatically
- Never send previous assistant responses back to agent
5. **Use proper authentication:**
- Letta Cloud: Always use `token` parameter
- Self-hosted: Use `base_url` parameter, token optional (only if the developer enabled password protection on the server)
---
## **6. Environment Setup**
### **Environment Setup**
```bash
# For Next.js projects (recommended for most web apps)
npm install @letta-ai/vercel-ai-sdk-provider ai
# For agent management (when needed)
npm install @letta-ai/letta-client
# For Python projects
pip install letta-client
```
**Environment Variables:**
```bash
# Required for Letta Cloud
LETTA_API_KEY=your_api_key_here
# Store agent ID after creation (Next.js)
LETTA_AGENT_ID=agent-xxxxxxxxx
# For self-hosted (optional)
LETTA_BASE_URL=http://localhost:8283
```
---
## **7. Verification Checklist**
Before providing Letta solutions, verify:
1. **SDK Choice**: Are you using the simplest appropriate SDK?
- Familiar with or already using Vercel AI SDK? → use the Vercel AI SDK Letta provider
- Agent management needed? → use the Node.js/Python SDKs
2. **Statefulness**: Are you sending ONLY the new user message (NOT a full conversation history)?
3. **Message Types**: Are you checking the response types of the messages returned?
4. **Response Parsing**: If using the Python/Node.js SDK, are you extracting `content` from assistant messages?
5. **Imports**: Correct package imports for the chosen SDK?
6. **Client**: Proper client initialization with auth/base_url?
7. **Agent Creation**: Memory blocks with proper structure?
8. **Memory Blocks**: Descriptions for custom blocks?
````
## Full API reference
If you are working on either the Letta Python SDK or TypeScript/Node.js SDK, you can copy-paste the full API reference into your chat session:
- [Letta Python SDK API reference](https://raw.githubusercontent.com/letta-ai/letta-python/refs/heads/main/reference.md)
- [Letta TypeScript/Node.js SDK API reference](https://raw.githubusercontent.com/letta-ai/letta-node/refs/heads/main/reference.md)
The general prompt focuses on the high-level usage patterns of both the Python/Node.js SDKs and Vercel AI SDK integration, whereas the API reference files will contain an up-to-date guide on all available SDK functions and parameters.
## `llms.txt` and `llms-full.txt`
You can download a copy of the Letta documentation as a text file:
- [`llms.txt` (short version)](https://docs.letta.com/llms.txt)
- [`llms-full.txt` (longer version)](https://docs.letta.com/llms-full.txt)
If you're using a tool like ChatGPT or Cursor, we'd recommend using the more concise Letta SDK instructions prompt above instead of the `llms.txt` or `llms-full.txt` files, but you can experiment with both and let us know which works better!
## Why do I need pre-made prompts?
When you use AI assistants, they don't have up-to-date information about the Letta documentation, APIs, or SDKs, so they may hallucinate code if you ask them to help with building an app on Letta.
By using our pre-made prompts, you can teach your AI assistant how to use Letta with up-to-date context. Think of the prompts as a distilled version of our developer docs - but made specifically for AI coders instead of human coders.
## Contributing
Our prompts are [open source](https://github.com/letta-ai/letta/tree/main/letta/prompts) and we actively welcome contributions! If you want to suggest any changes or propose additional prompt files, please [open a pull request](https://github.com/letta-ai/letta/pulls).

View File

@@ -1,228 +0,0 @@
---
title: Developer quickstart
subtitle: Create your first Letta agent with the API or SDKs and view it in the ADE
slug: quickstart
---
<Tip icon="fa-thin fa-rocket">
Programming with AI tools like Cursor? Copy our [pre-built prompts](/prompts) to get started faster.
</Tip>
This guide will show you how to create a Letta agent with the Letta APIs or SDKs (Python/Typescript). To create agents with a low-code UI, see our [ADE quickstart](/guides/ade/overview).
## Why Letta?
Unlike traditional LLM APIs where you manually manage conversation history and state, Letta agents maintain their own persistent memory. You only send new messages. The agent remembers everything from past conversations without you storing or retrieving anything. This enables agents that truly learn and evolve over time.
<Steps>
<Step title="Prerequisites">
1. Create a [Letta Cloud account](https://app.letta.com)
2. Create a [Letta Cloud API key](https://app.letta.com/api-keys)
<img className="w-300" src="/images/letta_cloud_api_key_gen.png" />
3. Set your API key as an environment variable:
<CodeGroup>
```sh TypeScript
export LETTA_API_KEY="your-api-key-here"
```
```sh Python
export LETTA_API_KEY="your-api-key-here"
```
</CodeGroup>
<Info>
You can also **self-host** a Letta server. Check out our [self-hosting guide](/guides/selfhosting/overview).
</Info>
</Step>
<Step title="Install the Letta SDK">
<CodeGroup>
```sh TypeScript maxLines=50
npm install @letta-ai/letta-client
```
```sh Python maxLines=50
pip install letta-client
```
</CodeGroup>
</Step>
<Step title="Create an agent">
Agents in Letta have two key components:
- **Memory blocks**: Persistent context that's always visible to the agent (like a persona and information about the user)
- **Tools**: Actions the agent can take (like searching the web or running code)
<CodeGroup>
```typescript TypeScript maxLines=50
import { LettaClient } from '@letta-ai/letta-client'
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
const agentState = await client.agents.create({
model: "openai/gpt-4.1",
embedding: "openai/text-embedding-3-small",
memoryBlocks: [
{
label: "human",
value: "Name: Timber. Status: dog. Occupation: building Letta, infrastructure to democratize self-improving superintelligence"
},
{
label: "persona",
value: "I am a self-improving superintelligence. Timber is my best friend and collaborator."
}
],
tools: ["web_search", "run_code"]
});
console.log(agentState.id);
```
```python Python maxLines=50
from letta_client import Letta
import os
client = Letta(token=os.getenv("LETTA_API_KEY"))
agent_state = client.agents.create(
model="openai/gpt-4.1",
embedding="openai/text-embedding-3-small",
memory_blocks=[
{
"label": "human",
"value": "Name: Timber. Status: dog. Occupation: building Letta, infrastructure to democratize self-improving superintelligence"
},
{
"label": "persona",
"value": "I am a self-improving superintelligence. Timber is my best friend and collaborator."
}
],
tools=["web_search", "run_code"]
)
print(agent_state.id)
```
```curl curl
curl -X POST https://api.letta.com/v1/agents \
-H "Authorization: Bearer $LETTA_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "openai/gpt-4.1",
"embedding": "openai/text-embedding-3-small",
"memory_blocks": [
{
"label": "human",
"value": "Name: Timber. Status: dog. Occupation: building Letta, infrastructure to democratize self-improving superintelligence"
},
{
"label": "persona",
"value": "I am a self-improving superintelligence. Timber is my best friend and collaborator."
}
],
"tools": ["web_search", "run_code"]
}'
```
</CodeGroup>
</Step>
<Step title="Message your agent">
<Note>
The Letta API supports streaming both agent *steps* and streaming *tokens*.
For more information on streaming, see [our streaming guide](/guides/agents/streaming).
</Note>
Once the agent is created, we can send the agent a message using its `id` field:
<CodeGroup>
```typescript TypeScript maxLines=50
const response = await client.agents.messages.create(
agentState.id, {
messages: [
{
role: "user",
content: "What do you know about me?"
}
]
}
);
for (const message of response.messages) {
console.log(message);
}
```
```python title="python" maxLines=50
response = client.agents.messages.create(
agent_id=agent_state.id,
messages=[
{
"role": "user",
"content": "What do you know about me?"
}
]
)
for message in response.messages:
print(message)
```
```curl curl
curl --request POST \
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
--header 'Authorization: Bearer $LETTA_API_KEY' \
--header 'Content-Type: application/json' \
--data '{
"messages": [
{
"role": "user",
"content": "What do you know about me?"
}
]
}'
```
</CodeGroup>
The response contains the agent's full response to the message, which includes reasoning steps (chain-of-thought), tool calls, tool responses, and assistant (agent) messages:
```json maxLines=50
{
"messages": [
{
"id": "message-29d8d17e-7c50-4289-8d0e-2bab988aa01e",
"date": "2024-12-12T17:05:56+00:00",
"message_type": "reasoning_message",
"reasoning": "Timber is asking what I know. I should reference my memory blocks."
},
{
"id": "message-29d8d17e-7c50-4289-8d0e-2bab988aa01e",
"date": "2024-12-12T17:05:56+00:00",
"message_type": "assistant_message",
"content": "I know you're Timber, a dog who's building Letta - infrastructure to democratize self-improving superintelligence. We're best friends and collaborators!"
}
],
"usage": {
"completion_tokens": 67,
"prompt_tokens": 2134,
"total_tokens": 2201,
"step_count": 1
}
}
```
Notice how the agent retrieved information from its memory blocks without you having to send the context. This is the key difference from traditional LLM APIs where you'd need to include the full conversation history with every request.
You can read more about the response format from the message route [here](/guides/agents/overview#message-types).
</Step>
<Step title="View your agent in the ADE">
Another way to interact with Letta agents is via the [Agent Development Environment](/guides/ade/overview) (or ADE for short). The ADE is a UI on top of the Letta API that allows you to quickly build, prototype, and observe your agents.
If we navigate to our agent in the ADE, we should see our agent's state in full detail, as well as the message that we sent to it:
<img className="block w-300 dark:hidden" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot_light.png" />
<img className="hidden w-300 dark:block" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot.png" />
[Read our ADE setup guide →](/guides/ade/overview)
</Step>
</Steps>
## Next steps
Congratulations! 🎉 You just created and messaged your first stateful agent with Letta using the API and SDKs. See the following resources for next steps for building more complex agents with Letta:
* Create and attach [custom tools](/guides/agents/custom-tools) to your agent
* Customize agentic [memory management](/guides/agents/memory)
* Version and distribute your agent with [agent templates](/guides/templates/overview)
* View the full [API and SDK reference](/api-reference/overview)

View File

@@ -1,47 +0,0 @@
---
title: Anthropic
slug: guides/server/providers/anthropic
---
<Tip>To enable Anthropic models with Letta, set `ANTHROPIC_API_KEY` in your environment variables. </Tip>
You can use Letta with Anthropic if you have an Anthropic account and API key.
Currently, there are no supported **embedding** models for Anthropic (only LLM models).
You will need to use a separate provider (e.g. OpenAI) or the Letta embeddings endpoint (`letta-free`) for embeddings.
## Enabling Anthropic models with Docker
To enable Anthropic models when running the Letta server with Docker, set your `ANTHROPIC_API_KEY` as an environment variable:
```bash
# replace `~/.letta/.persist/pgdata` with wherever you want to store your agent data
docker run \
-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data \
-p 8283:8283 \
-e ANTHROPIC_API_KEY="your_anthropic_api_key" \
letta/letta:latest
```
See the [self-hosting guide](/guides/selfhosting) for more information on running Letta with Docker.
## Specifying agent models
When creating agents on your self-hosted server, you must specify both the LLM and embedding models to use. You can additionally specify a context window limit (which must be less than or equal to the maximum size).
```python
from letta_client import Letta
import os
# Connect to your self-hosted server
client = Letta(base_url="http://localhost:8283")
agent = client.agents.create(
model="anthropic/claude-3-5-sonnet-20241022",
embedding="openai/text-embedding-3-small", # An embedding model is required for self-hosted
# optional configuration
context_window_limit=30000
)
```
Anthropic models have very large context windows, which can be very expensive and incur high latency. We recommend setting a lower `context_window_limit` when using Anthropic models.
<Note>
For Letta Cloud usage, see the [quickstart guide](/quickstart). Cloud deployments manage embeddings automatically and don't require provider configuration.
</Note>

View File

@@ -1,30 +0,0 @@
---
title: AWS Bedrock
slug: guides/server/providers/aws-bedrock
---
We support Anthropic models provided via AWS Bedrock.
<Warning>
To use a model with AWS Bedrock, you must ensure it is enabled in your AWS Model Catalog. Letta will list all available Anthropic models on Bedrock, even if you do not have access to them via AWS.
</Warning>
## Enabling AWS Bedrock with Docker
To enable AWS Bedrock models when running the Letta server with Docker, set your AWS credentials as environment variables:
```bash
# replace `~/.letta/.persist/pgdata` with wherever you want to store your agent data
docker run \
-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data \
-p 8283:8283 \
-e AWS_ACCESS_KEY_ID="your_aws_access_key_id" \
-e AWS_SECRET_ACCESS_KEY="your_aws_secret_access_key" \
-e AWS_DEFAULT_REGION="your_aws_default_region" \
letta/letta:latest
```
Optionally, you can specify the API version (default is bedrock-2023-05-31):
```bash
-e BEDROCK_ANTHROPIC_VERSION="bedrock-2023-05-31"
```
See the [self-hosting guide](/guides/selfhosting) for more information on running Letta with Docker.

Some files were not shown because too many files have changed in this diff Show More