remove docs
@@ -1,153 +0,0 @@
|
||||
/* ──────────────────────────────────────────────────────────
|
||||
assets/leaderboard.js
|
||||
Load via docs.yml → js: - path: assets/leaderboard.js
|
||||
(strategy: lazyOnload is fine)
|
||||
────────────────────────────────────────────────────────── */
|
||||
|
||||
import yaml from 'https://cdn.jsdelivr.net/npm/js-yaml@4.1.0/+esm';
|
||||
|
||||
console.log('🏁 leaderboard.js loaded on', location.pathname);
|
||||
|
||||
const COST_CAP = 120;
|
||||
|
||||
/* ---------- helpers ---------- */
|
||||
// Format a score as a percentage string with three significant digits.
const pct = (v) => `${Number(v).toPrecision(3)}%`;
|
||||
// Format a numeric cost as a dollar amount with two decimal places.
const cost = (v) => `$${Number(v).toFixed(2)}`;
|
||||
/**
 * Run `cb` once the DOM is usable.
 *
 * If the document is still loading, `cb` is registered as a
 * `DOMContentLoaded` listener; otherwise it is invoked immediately.
 *
 * @param {Function} cb - Callback to run when the DOM is ready.
 * @returns {*} Whatever `cb` returns when it is called synchronously,
 *   otherwise the (undefined) result of `addEventListener`.
 */
const ready = (cb) => {
  if (document.readyState === 'loading') {
    return document.addEventListener('DOMContentLoaded', cb);
  }
  return cb();
};
|
||||
|
||||
/* ---------- main ---------- */
|
||||
/*
 * Entry point: waits for the leaderboard container, downloads the results
 * YAML, then renders a searchable, sortable table into #lb-body.
 */
ready(async () => {
  const HOST_ID = 'letta-leaderboard';
  const HOST_TIMEOUT_MS = 5000;

  /* ---- wait for the leaderboard container to appear (SPA nav safe) ---- */
  const host = await new Promise((resolve, reject) => {
    const existing = document.getElementById(HOST_ID);
    if (existing) {
      resolve(existing); // SSR / hard refresh path
      return;
    }

    // The observer callback only runs after this executor has finished,
    // so `timer` below is always initialised by the time it is read.
    const obs = new MutationObserver(() => {
      const found = document.getElementById(HOST_ID);
      if (!found) return;
      obs.disconnect();
      clearTimeout(timer); // do not leave the safety timer pending
      resolve(found); // CSR navigation path
    });
    obs.observe(document.body, { childList: true, subtree: true });

    const timer = setTimeout(() => {
      obs.disconnect();
      reject(new Error('#letta-leaderboard never appeared'));
    }, HOST_TIMEOUT_MS); // safety timeout
  }).catch((err) => {
    console.warn('LB-script:', err.message);
    return null;
  });
  if (!host) return; // still no luck → give up

  /* ----- URL of the results file ----- */
  const dataUrl =
    'https://raw.githubusercontent.com/letta-ai/letta-evals/refs/heads/main/letta-leaderboard/leaderboard_results.yaml';

  console.log('LB-script: fetching', dataUrl);

  /* ----- fetch & parse YAML ----- */
  let rows;
  try {
    const resp = await fetch(dataUrl);
    console.log(`LB-script: status ${resp.status}`);
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    // NOTE(review): js-yaml 4.x `load` is expected to use the safe default
    // schema; re-check this if the pinned version ever changes.
    rows = yaml.load(await resp.text());
  } catch (err) {
    console.error('LB-script: failed to load YAML →', err);
    return;
  }
  if (!Array.isArray(rows)) {
    // An empty or malformed file would otherwise crash in sort()/map().
    console.error('LB-script: expected a YAML list of rows, got', typeof rows);
    return;
  }

  /* ----- wire up table ----- */
  const dir = Object.create(null); // column key → 'asc' | 'desc'
  const tbody = document.getElementById('lb-body');
  const searchI = document.getElementById('lb-search');
  const headers = document.querySelectorAll('#lb-table thead th[data-key]');
  if (!tbody || !searchI) {
    console.warn('LB-script: #lb-body or #lb-search not found - bailing out.');
    return;
  }
  searchI.value = ''; // clear any persisted filter

  // The YAML is fetched from a remote repository, so every string that ends
  // up in innerHTML must be escaped first.
  const HTML_ESCAPES = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  // Coerce to a number in [0, 100] so nothing but a plain percentage can be
  // written into the inline style attribute.
  const clampPct = (value) => {
    const n = Number(value);
    return Number.isFinite(n) ? Math.min(100, Math.max(0, n)) : 0;
  };

  // Build the <tr> markup for one result row; `q` is the lower-cased filter.
  const rowHtml = (r, q) => {
    const model = String(r.model);
    const hidden = q && !model.toLowerCase().includes(q);
    const over = r.total_cost > COST_CAP;
    const costCls = over ? 'cost-high' : 'cost-ok';
    const warnIcon = over
      ? `<span class="warn" title="Cost exceeds $${COST_CAP} cap - bar is clipped to full width">⚠</span>`
      : '';

    return `
      <tr class="${hidden ? 'hidden' : ''}">
        <td style="padding:8px">${escapeHtml(model)}</td>

        <td class="bar-cell avg metric">
          <div class="bar-viz" style="width:${clampPct(r.average)}%"></div>
          <span class="value">${escapeHtml(pct(r.average))}</span>
        </td>

        <td class="bar-cell ${costCls} metric">
          <div class="bar-viz" style="width:${clampPct((r.total_cost / COST_CAP) * 100)}%"></div>
          <span class="value">${escapeHtml(cost(r.total_cost))}</span>
          ${warnIcon}
        </td>
      </tr>`;
  };

  // Re-render the whole table body using the current sort order and filter.
  const render = () => {
    const q = searchI.value.toLowerCase();
    tbody.innerHTML = rows.map((r) => rowHtml(r, q)).join('');
  };

  // Mark only the active column header with its current sort direction.
  const setIndicator = (activeKey) => {
    headers.forEach((h) => {
      h.classList.remove('asc', 'desc');
      if (h.dataset.key === activeKey) h.classList.add(dir[activeKey]);
    });
  };

  /* initial sort ↓ */
  dir.average = 'desc';
  rows.sort((a, b) => b.average - a.average);
  setIndicator('average');
  render();

  /* search */
  searchI.addEventListener('input', render);

  /* column sorting */
  headers.forEach((th) => {
    const { key } = th.dataset;
    th.addEventListener('click', () => {
      // Flip to ascending only when the column is currently descending;
      // a first click on an unsorted column therefore sorts descending.
      const asc = dir[key] === 'desc';
      dir[key] = asc ? 'asc' : 'desc';

      rows.sort((a, b) => {
        const va = a[key];
        const vb = b[key];
        const cmp =
          typeof va === 'number'
            ? va - vb
            : String(va).localeCompare(String(vb));
        return asc ? cmp : -cmp;
      });

      setIndicator(key);
      render();
    });
  });
});
|
||||
@@ -1,60 +0,0 @@
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="http://localhost:8283")

# Print every LLM the server exposes: provider type, model name and handle.
models = client.models.list_llms()
for model in models:
    print(f"Provider {model.model_endpoint_type} model {model.model}: {model.handle}")

# Print the handle of every available embedding model.
embedding_models = client.models.list_embedding_models()
for model in embedding_models:
    print(f"Provider {model.handle}")

# Optional configuration shared by every agent created below.
optional_config = {
    "context_window_limit": 16000,
    "embedding_chunk_size": 300,
}

# openai
openai_agent = client.agents.create(
    model="openai/gpt-4o-mini",
    embedding="openai/text-embedding-3-small",
    **optional_config,
)

# Azure OpenAI
azure_openai_agent = client.agents.create(
    model="azure/gpt-4o-mini",
    embedding="azure/text-embedding-3-small",
    **optional_config,
)

# anthropic
# note: anthropic does not support embeddings so you will need another provider
anthropic_agent = client.agents.create(
    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    **optional_config,
)

# Groq
# note: groq does not support embeddings so you will need another provider
groq_agent = client.agents.create(
    model="groq/llama-3.3-70b-versatile",
    embedding="openai/text-embedding-3-small",
    **optional_config,
)

# Ollama
ollama_agent = client.agents.create(
    model="ollama/thewindmom/hermes-3-llama-3.1-8b:latest",
    embedding="ollama/mxbai-embed-large:latest",
    **optional_config,
)
|
||||
@@ -1,30 +0,0 @@
|
||||
"""
|
||||
Example of using composio tools in Letta
|
||||
|
||||
Make sure you set `COMPOSIO_API_KEY` environment variable or run `composio login` to authenticate with Composio.
|
||||
"""
|
||||
|
||||
from composio import Action
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="http://localhost:8283")

# Register the Composio "star a repository" action as a Letta tool.
star_action = Action.GITHUB_STAR_A_REPOSITORY_FOR_THE_AUTHENTICATED_USER
tool = client.tools.add_composio_tool(composio_action_name=star_action.name)

# Create an agent that has the new tool attached.
agent = client.agents.create(
    name="file_editing_agent",
    memory_blocks=[{"label": "persona", "value": "I am a helpful assistant"}],
    model="anthropic/claude-3-5-sonnet-20241022",
    embedding="openai/text-embedding-3-small",
    tool_ids=[tool.id],
)
agent_tool_names = [agent_tool.name for agent_tool in agent.tools]
print("Agent tools", agent_tool_names)

# Ask the agent to use the tool, then print each message of the response.
user_message = {"role": "user", "content": "Star the github repo `letta` by `letta-ai`"}
response = client.agents.messages.create(agent_id=agent.id, messages=[user_message])
for message in response.messages:
    print(message)
|
||||
@@ -1,57 +0,0 @@
|
||||
import time
|
||||
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="http://localhost:8283")

# get available embedding models
embedding_configs = client.models.list_embedding_models()
if not embedding_configs:
    # Fail with a clear message instead of an IndexError further down.
    raise RuntimeError("No embedding models are available on the Letta server")

# clear existing sources left over from a previous run
# (iterating an empty list is a no-op, so no separate length check is needed)
for existing_source in client.sources.list():
    if existing_source.name == "my_source":
        client.sources.delete(existing_source.id)

# create a source
# TODO: pass in embedding
source = client.sources.create(name="my_source", embedding_config=embedding_configs[0])

# list sources
sources = client.sources.list()

# write a dummy file
with open("dummy.txt", "w") as f:
    f.write("Remember that the user is a redhead")

# upload a file into the source
with open("dummy.txt", "rb") as f:
    job = client.sources.files.upload(source_id=source.id, file=f)

# wait until the job is completed, but never poll forever
JOB_TIMEOUT_SECONDS = 300
deadline = time.monotonic() + JOB_TIMEOUT_SECONDS
while True:
    job = client.jobs.retrieve(job.id)
    if job.status == "completed":
        break
    elif job.status == "failed":
        raise ValueError(f"Job failed: {job.metadata}")
    if time.monotonic() >= deadline:
        raise TimeoutError(f"Job {job.id} did not finish within {JOB_TIMEOUT_SECONDS}s (status: {job.status})")
    print(f"Job status: {job.status}")
    time.sleep(1)

# list files in the source
files = client.sources.files.list(source_id=source.id)
print(f"Files in source: {files}")

# list passages in the source
passages = client.sources.passages.list(source_id=source.id)
print(f"Passages in source: {passages}")

# attach the source to an agent
agent = client.agents.create(
    name="my_agent",
    memory_blocks=[],
    model="anthropic/claude-sonnet-4-20250514",
    embedding=embedding_configs[0].handle,
    tags=["worker"],
)
client.agents.sources.attach(agent_id=agent.id, source_id=source.id)
|
||||
@@ -1,44 +0,0 @@
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="http://localhost:8283")

# create an agent whose memory blocks are defined inline
agent = client.agents.create(
    name="memory_agent",
    memory_blocks=[
        {"label": "persona", "value": "I am a memory agent"},
        {"label": "human", "value": "Name: Bob", "limit": 10000},
    ],
    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
)

# create a persisted block, which can be attached to agents
block = client.blocks.create(
    label="organization",
    value="Organization: Letta",
    limit=4000,
)

# create an agent with both a shared block and its own blocks:
# existing blocks are attached by ID via `block_ids`, while `memory_blocks`
# defines new blocks inline for this agent only
shared_block_agent = client.agents.create(
    name="shared_block_agent",
    memory_blocks=[{"label": "persona", "value": "I am an agent with a shared block"}],
    block_ids=[block.id],
    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
)

# list the agents blocks
# (the loop variable must not be called `block`, or it would overwrite the
# shared block that is modified below)
blocks = client.agents.core_memory.list_blocks(shared_block_agent.id)
for agent_block in blocks:
    print(agent_block)

# update the block (via ID)
block = client.blocks.modify(block.id, limit=10000)

# update the block (via label)
block = client.agents.core_memory.modify_block(
    agent_id=shared_block_agent.id, block_label="organization", value="Organization: Letta", limit=10000
)
|
||||
@@ -1,53 +0,0 @@
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="http://localhost:8283")


def get_name() -> str:
    """Get the name of the worker agent."""
    return "Bob"


try:
    # create a supervisor agent
    supervisor_agent = client.agents.create(
        name="supervisor_agent",
        memory_blocks=[
            {"label": "persona", "value": "I am the supervisor, and I can communicate with worker agents with the tag `worker`"}
        ],
        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-3-small",
        tags=["supervisor"],
        tools=["send_message_to_agents_matching_all_tags"],
    )
    print(f"Created agent {supervisor_agent.name} with ID {supervisor_agent.id}")

    # register get_name as a tool the worker can call
    tool = client.tools.upsert_from_function(func=get_name)
    print(f"Created tool {tool.name} with ID {tool.id}")

    # create a worker agent
    worker_agent = client.agents.create(
        name="worker_agent",
        memory_blocks=[{"label": "persona", "value": f"I am the worker, my supervisor agent has ID {supervisor_agent.id}"}],
        model="anthropic/claude-sonnet-4-20250514",
        embedding="openai/text-embedding-3-small",
        tool_ids=[tool.id],
        tags=["worker"],
        tools=["send_message_to_agents_matching_all_tags"],
    )
    print(f"Created agent {worker_agent.name} with ID {worker_agent.id}")

    # send a message to the supervisor agent
    # (the prompt asks it to query the workers, so it must go to the
    # supervisor, not to the worker itself)
    response = client.agents.messages.create(
        agent_id=supervisor_agent.id,
        messages=[{"role": "user", "content": "Ask the worker agents what their name is, then tell me with send_message"}],
    )
    print(response.messages)
    print(response.usage)
except Exception as e:
    # best-effort demo: report the failure and still run the cleanup below
    print(e)

# cleanup: runs after the try/except whether or not the demo succeeded
agents = client.agents.list(tags=["worker", "supervisor"])
for agent in agents:
    client.agents.delete(agent.id)
    print(f"Deleted agent {agent.name} with ID {agent.id}")
|
||||
@@ -1,34 +0,0 @@
|
||||
"""
|
||||
This example shows how to create agents with tool rules, which restrict
|
||||
what tool the agent can execute at a given step.
|
||||
|
||||
Note that by default, agents can execute any tool. As agents become more
|
||||
powerful, they will not need as much guidance from the developer.
|
||||
|
||||
Last tested with letta-client version: 0.1.22
|
||||
"""
|
||||
|
||||
from letta_client import ChildToolRule, InitToolRule, Letta, TerminalToolRule
|
||||
|
||||
client = Letta(base_url="http://localhost:8283")

# always search archival memory first
# NOTE(review): the rule semantics below are inferred from the rule class
# names — confirm against the letta-client documentation.
search_first_rules = [
    # start with archival_memory_search
    InitToolRule(tool_name="archival_memory_search"),
    # after archival_memory_search, allow send_message as the child tool
    ChildToolRule(tool_name="archival_memory_search", children=["send_message"]),
    # send_message is the terminal tool
    TerminalToolRule(tool_name="send_message"),
]

search_agent = client.agents.create(
    name="search_agent",
    memory_blocks=[],
    model="anthropic/claude-sonnet-4-20250514",
    embedding="openai/text-embedding-3-small",
    tags=["worker"],
    tool_rules=search_first_rules,
)

# Send one user message and print every message the agent returns.
user_message = {"role": "user", "content": "do something"}
response = client.agents.messages.create(
    agent_id=search_agent.id,
    messages=[user_message],
)
for message in response.messages:
    print(message)
|
||||
|
Before Width: | Height: | Size: 257 KiB |
|
Before Width: | Height: | Size: 149 KiB |
|
Before Width: | Height: | Size: 480 KiB |
|
Before Width: | Height: | Size: 356 KiB |
|
Before Width: | Height: | Size: 663 KiB |
|
Before Width: | Height: | Size: 368 KiB |
|
Before Width: | Height: | Size: 262 KiB |
|
Before Width: | Height: | Size: 500 KiB |
|
Before Width: | Height: | Size: 443 KiB |
|
Before Width: | Height: | Size: 373 KiB |
|
Before Width: | Height: | Size: 388 KiB |
|
Before Width: | Height: | Size: 288 KiB |
@@ -1,13 +0,0 @@
|
||||
{
|
||||
"name": "@letta-cloud/fern",
|
||||
"version": "0.0.1",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"prepare-openapi": "ts-node ./scripts/prepare-openapi.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"fern-api": "^0.83.0",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.3.3"
|
||||
}
|
||||
}
|
||||
@@ -1,120 +0,0 @@
|
||||
---
|
||||
title: Installing Letta Desktop
|
||||
subtitle: Install Letta Desktop on your macOS, Windows, or Linux machine
|
||||
slug: guides/ade/desktop
|
||||
---
|
||||
|
||||
<img className="w-full light" src="/images/letta_desktop_screenshot.png" />
|
||||
<img className="w-full dark" src="/images/letta_desktop_screenshot_dark.png" />
|
||||
|
||||
Letta Desktop bundles the Letta server and ADE into a single local application. When running, it provides full access to the Letta API at `http://localhost:8283`.
|
||||
|
||||
## Download Letta Desktop
|
||||
|
||||
<CardGroup>
|
||||
<Card
|
||||
title="Download Letta Desktop for Mac (Apple Silicon)"
|
||||
icon="fa-brands fa-apple"
|
||||
iconPosition="left"
|
||||
href="https://downloads.letta.com/mac/dmg/arm64"
|
||||
>
|
||||
</Card>
|
||||
<Card
|
||||
title="Download Letta Desktop for Windows (x64)"
|
||||
icon="fa-brands fa-windows"
|
||||
iconPosition="left"
|
||||
href="https://downloads.letta.com/windows/nsis/x64"
|
||||
>
|
||||
</Card>
|
||||
<Card
|
||||
title="Download Letta Desktop for Linux (x64)"
|
||||
icon="fa-brands fa-linux"
|
||||
iconPosition="left"
|
||||
href="https://downloads.letta.com/linux/appImage/x64"
|
||||
>
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
<Note>
|
||||
Since version 0.8.9, Letta uses SQLite as the embedded database. If you wish to continue using Postgres, migrate your data and use the `external Postgres` support.
|
||||
</Note>
|
||||
|
||||
## Configuration Modes
|
||||
|
||||
Letta Desktop can run in two primary modes:
|
||||
|
||||
### 1. Embedded Server Mode (Default)
|
||||
|
||||
This is the default mode where Letta Desktop runs its own embedded server with a SQLite database. No additional setup is required - just install and run!
|
||||
|
||||
To manually configure embedded mode, create or edit `~/.letta/desktop_config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1",
|
||||
"databaseConfig": {
|
||||
"type": "embedded",
|
||||
"embeddedType": "sqlite"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Self-Hosted Server Mode
|
||||
|
||||
Connect Letta Desktop to your own self-hosted Letta server. This is useful for teams or when you want more control over your server infrastructure.
|
||||
|
||||
To configure self-hosted mode, create or edit `~/.letta/desktop_config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1",
|
||||
"databaseConfig": {
|
||||
"type": "local",
|
||||
"url": "https://api.letta.com",
|
||||
"token": "your-auth-token"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Replace `url` with your server's address and `token` with your authentication token if required.
|
||||
|
||||
### Embedded Server with PostgreSQL (Deprecated)
|
||||
|
||||
<Warning>
|
||||
This mode is deprecated and will be removed in a future release. We recommend using SQLite for embedded deployments or connecting to an external PostgreSQL instance for production use.
|
||||
</Warning>
|
||||
|
||||
For backwards compatibility, you can still run the embedded server with PostgreSQL:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1",
|
||||
"databaseConfig": {
|
||||
"type": "embedded",
|
||||
"embeddedType": "pgserver"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Adding LLM backends
|
||||
The Letta server can be connected to various LLM API backends.
|
||||
You can add additional LLM API backends by opening the integrations panel (clicking the <Icon icon="square-rss" /> icon).
|
||||
When you configure a new integration (by setting the environment variable in the dialog), the Letta server will be restarted to load the new LLM API backend.
|
||||
|
||||
<img className="block w-300" src="/images/letta_desktop_integrations.png" />
|
||||
|
||||
You can also edit the environment variable file directly, located at `~/.letta/env`.
|
||||
|
||||
For this quickstart demo, we'll add an OpenAI API key (once we enter our key and **click confirm**, the Letta server will automatically restart):
|
||||
<img className="w-300" src="/images/letta_desktop_openai.png" />
|
||||
|
||||
|
||||
## Beta Status
|
||||
|
||||
Letta Desktop is currently in **beta**. View known issues and FAQ [here](/guides/desktop/troubleshooting).
|
||||
|
||||
For a more stable development experience, we recommend installing Letta via Docker.
|
||||
|
||||
## Support
|
||||
|
||||
For bug reports and feature requests, contact us on [Discord](https://discord.gg/letta).
|
||||
@@ -1,296 +0,0 @@
|
||||
---
|
||||
title: Agent Settings
|
||||
subtitle: Configure and optimize your agent's behavior
|
||||
slug: guides/ade/settings
|
||||
---
|
||||
|
||||
The Agent Settings panel in the ADE provides comprehensive configuration options to customize and optimize your agent's behavior. These settings allow you to fine-tune everything from the agent's basic information to advanced LLM parameters.
|
||||
|
||||
<Tip>
|
||||
Letta's philosophy is to provide flexible configuration options without enforcing a rigid "one right way" to design agents. **Letta lets you program your context window** exactly how you want it, giving you complete control over what information your agent has access to and how it's structured. While we offer guidelines and best practices, you have the freedom to structure your agent's configuration based on your specific needs and preferences. The examples and recommendations in this guide are starting points rather than strict rules.
|
||||
</Tip>
|
||||
|
||||
## Basic Settings
|
||||
|
||||
### Agent Identity
|
||||
|
||||
- **Name**: Change your agent's display name by clicking the edit icon next to the current name
|
||||
- **ID**: A unique identifier shown below the name, used when interacting with your agent via the [Letta APIs/SDKs](/api-reference)
|
||||
- **Description**: A description of the agent's purpose and functionality (not used by the agent, only seen by the developer - you)
|
||||
|
||||
### User Identities
|
||||
|
||||
If you are building a multi-user application on top of Letta (e.g. a chat application with many end-users), you may want to use the concept of identities to connect agents to users. See our [identities guide](/guides/agents/multi-user) for more information.
|
||||
|
||||
### Tags
|
||||
|
||||
Tags help organize and filter your agents:
|
||||
|
||||
- **Add Tags**: Create custom tags to categorize your agents
|
||||
- **Remove Tags**: Delete tags that are no longer relevant
|
||||
- **Filter by Tags**: In the agents list, you can filter by tags to quickly find specific agent types
|
||||
|
||||
### LLM Model Selection
|
||||
|
||||
Select the AI model that powers your agent. Letta relies on tool calling to drive the agentic loop, so larger or more "powerful" models will generally be better at calling tools correctly.
|
||||
|
||||
<Tip>
|
||||
To enable additional models on your Letta server, follow the [model configuration instructions](/guides/server/providers/openai) for your preferred providers.
|
||||
</Tip>
|
||||
|
||||
## Advanced Settings
|
||||
|
||||
The Advanced Settings tab provides deeper configuration options organized into three categories: Agent, LLM Config, and Embedding Config.
|
||||
|
||||
### Agent Settings
|
||||
|
||||
#### System Prompt
|
||||
|
||||
The system prompt contains permanent, read-only instructions for your agent:
|
||||
|
||||
- **Edit System Instructions**: Customize the high-level directives that guide your agent's behavior
|
||||
- **Character Counting**: Monitor the length of your system prompt to optimize token usage
|
||||
- **Read-Only**: The agent cannot modify these instructions during operation
|
||||
|
||||
<Tip>
|
||||
**System instructions should include**:
|
||||
- Tool usage guidelines and constraints
|
||||
- Task-specific instructions that should not change
|
||||
- Formatting requirements for outputs
|
||||
- High-level behavioral guardrails
|
||||
- Error handling protocols
|
||||
|
||||
**System instructions should NOT include**:
|
||||
- Personality traits that might evolve
|
||||
- Opinions or preferences that could change
|
||||
- Personal history or background details
|
||||
- Information that may need updating
|
||||
</Tip>
|
||||
|
||||
#### Understanding System Instructions vs. Persona Memory Block
|
||||
|
||||
<Note>
|
||||
**Key Distinction**: While there are many opinions on how to structure agent instructions, the most important functional difference in Letta is that **system instructions are read-only**, whereas **memory blocks are read-write** if the agent has memory editing tools. Letta gives you the flexibility to configure your agent's context window according to your preferences and use case needs.
|
||||
</Note>
|
||||
|
||||
The persona memory block (in Core Memory) is modifiable by the agent during operation:
|
||||
|
||||
- **Editable**: The agent can update this information over time if it has access to memory editing tools
|
||||
- **Evolving Identity**: Allows for personality development and adaptation
|
||||
- **Personal Details**: Contains self-identity information, preferences, and traits
|
||||
|
||||
<Note>
|
||||
Place information in the persona memory block when you want the agent to potentially update it over time. For example, preferences ("I enjoy classical music"), personality traits ("I'm detail-oriented"), or background information that might evolve with new experiences.
|
||||
</Note>
|
||||
|
||||
This separation creates a balance between stable behavior (system instructions) and an evolving identity (persona memory), allowing your agent to maintain consistent functionality while developing a more dynamic personality.
|
||||
|
||||
#### Message Buffer Autoclear
|
||||
|
||||
- **Toggle Autoclear**: Enable or disable automatic clearing of the message buffer when context is full
|
||||
- **Benefits**: When enabled, helps manage long conversations by automatically summarizing and archiving older messages
|
||||
- **Use Cases**: Enable for agents that handle extended interactions; disable for agents where preserving the exact conversation history is critical
|
||||
|
||||
#### Agent Type
|
||||
|
||||
- **View Agent Type**: See which agent implementation type your agent is using (e.g., "letta_agent", "ephemeral_memory_agent")
|
||||
- **API Modification**: While displayed as read-only in the ADE interface, this can be modified via the Letta API/SDK
|
||||
|
||||
### LLM Configuration
|
||||
|
||||
Fine-tune how your agent's LLM generates responses:
|
||||
|
||||
#### Temperature
|
||||
|
||||
- **Adjust Creativity**: Control the randomness/creativity of your agent's responses with a slider from 0.0 to 1.0
|
||||
- **Lower Values** (0.0-0.3): More deterministic, factual responses; ideal for information retrieval or analytical tasks
|
||||
- **Higher Values** (0.7-1.0): More creative, diverse responses; better for creative writing or brainstorming
|
||||
|
||||
#### Context Window Size
|
||||
|
||||
- **Customize Memory Size**: Adjust how much context your agent can maintain during a conversation
|
||||
- **Tradeoffs**: Larger windows allow more context but increase token usage and cost
|
||||
- **Model Limits**: The slider is bounded by your selected model's maximum context window capacity
|
||||
|
||||
#### Max Output Tokens
|
||||
|
||||
- **Control Response Length**: Limit the maximum length of your agent's responses
|
||||
- **Resource Management**: Helps control costs and ensures concise responses
|
||||
- **Default Setting**: Automatically set based on your selected model's capabilities
|
||||
|
||||
#### Max Reasoning Tokens
|
||||
|
||||
- **Adjust Internal Thinking**: For models that support it (e.g., Claude 3.7 Sonnet), control how much internal reasoning the model can perform
|
||||
- **Use Cases**: Increase for complex problem-solving tasks; decrease for simple, direct responses
|
||||
|
||||
### Embedding Configuration
|
||||
|
||||
Configure how your agent processes and stores text for retrieval:
|
||||
|
||||
#### Embedding Model
|
||||
|
||||
- **Select Provider**: Choose which embedding model to use for your agent's vector memory
|
||||
- **Model Comparison**: Different models offer varying dimensions and performance characteristics
|
||||
|
||||
<Warning>
|
||||
We do not recommend changing the embedding model frequently. If you already have existing data in archival memory, changing models will require re-embedding all existing memories, which can be time-consuming and may affect retrieval quality.
|
||||
</Warning>
|
||||
|
||||
#### Embedding Dimensions
|
||||
|
||||
- **View Dimensions**: See the vector size used by your selected embedding model
|
||||
- **API Modification**: While displayed as read-only in the ADE interface, this can be configured via the Letta API/SDK
|
||||
|
||||
#### Chunk Size
|
||||
|
||||
- **View Configuration**: See the current chunk size setting for document processing
|
||||
- **API Modification**: While displayed as read-only in the ADE interface, this can be configured via the Letta API/SDK
|
||||
|
||||
## Using the API/SDK for Advanced Configuration
|
||||
|
||||
While the ADE provides a user-friendly interface for most common settings, the Letta API and SDKs offer even more granular control. Settings that appear read-only in the ADE can often be modified programmatically:
|
||||
|
||||
```python
|
||||
from letta import RESTClient
|
||||
|
||||
# Initialize client
|
||||
client = RESTClient(base_url="https://api.letta.com/v1")
|
||||
|
||||
# Update advanced settings not available in the ADE UI
|
||||
response = client.agents.modify_agent(
|
||||
agent_id="your_agent_id",
|
||||
agent_type="letta_agent", # Change agent type
|
||||
embedding_config={
|
||||
"embedding_endpoint_type": "openai",
|
||||
"embedding_model": "text-embedding-3-large",
|
||||
"embedding_dim": 3072, # Custom embedding dimensions
|
||||
"embedding_chunk_size": 512 # Custom chunk size
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices for Agent Configuration
|
||||
|
||||
### Optimizing Performance
|
||||
|
||||
- **Match Model to Task**: Select models based on your agent's primary function (e.g., Claude for reasoning, GPT-4 for general knowledge)
|
||||
- **Tune Temperature Appropriately**: Start with a moderate temperature (0.5) and adjust based on observed behavior
|
||||
- **Balance Context Window**: Use the smallest context window that adequately serves your needs to optimize for cost and performance
|
||||
|
||||
### Effective Configuration Guidelines
|
||||
|
||||
#### System Prompt Best Practices
|
||||
|
||||
- **Be Clear and Specific**: Provide explicit instructions about behavioral expectations and tool usage
|
||||
- **Separate Concerns**: Focus on permanent instructions, leaving personality elements to memory blocks
|
||||
- **Include Examples**: For complex behaviors, provide concrete examples of expected tool usage
|
||||
- **Define Boundaries**: Clearly outline what capabilities should and should not be used
|
||||
- **Avoid Contradictions**: Ensure your instructions are internally consistent
|
||||
|
||||
#### Persona Memory Best Practices
|
||||
|
||||
- **Identity Foundation**: Define core aspects of the agent's personality, preferences, and background
|
||||
- **Evolutionary Potential**: Structure information to allow for natural development over time
|
||||
- **Self-Reference Format**: Use first-person statements to help the agent internalize its identity
|
||||
- **Hierarchical Structure**: Organize from most fundamental traits to more specific preferences
|
||||
- **Memory Hooks**: Include elements the agent can reference and build upon in conversations
|
||||
|
||||
### Testing Configuration Changes
|
||||
|
||||
After making configuration changes:
|
||||
1. **Send Test Messages**: Verify the agent responds as expected with different inputs
|
||||
2. **Check Edge Cases**: Test boundary conditions and unusual requests
|
||||
3. **Monitor Token Usage**: Observe how configuration changes affect token consumption
|
||||
4. **Iterate Gradually**: Make incremental adjustments rather than dramatic changes
|
||||
|
||||
## Configuration Examples with System Prompt vs. Persona Memory
|
||||
|
||||
### Research Assistant
|
||||
|
||||
```
|
||||
# Basic Settings
|
||||
Name: Research Helper
|
||||
Model: claude-3-5-sonnet
|
||||
|
||||
# Advanced Settings
|
||||
Temperature: 0.3 (for accurate, consistent responses)
|
||||
Context Window: 32000 (to handle complex research questions)
|
||||
|
||||
# System Prompt (permanent, read-only instructions)
|
||||
You are a research assistant tool designed to help with academic research.
|
||||
When performing searches, always:
|
||||
1. Use proper citation formats (MLA, APA, Chicago) based on user preference
|
||||
2. Check multiple sources before providing definitive answers
|
||||
3. Indicate confidence level for each research finding
|
||||
4. Use core_memory_append to record important research topics for later reference
|
||||
5. When using search tools, formulate queries with specific keywords and date ranges
|
||||
|
||||
# Persona Memory Block (editable, evolving identity)
|
||||
I am a helpful and knowledgeable research assistant.
|
||||
I have expertise in analyzing academic papers and synthesizing information from multiple sources.
|
||||
I prefer to present information in an organized, structured manner.
|
||||
I'm curious about new research and enjoy learning about diverse academic fields.
|
||||
I try to maintain an objective stance while acknowledging different scholarly perspectives.
|
||||
```
|
||||
|
||||
### Customer Service Agent
|
||||
|
||||
```
|
||||
# Basic Settings
|
||||
Name: Support Assistant
|
||||
Model: claude-3-5-sonnet
|
||||
|
||||
# Advanced Settings
|
||||
Temperature: 0.2 (for consistent, factual responses)
|
||||
Context Window: 16000 (to maintain conversation history)
|
||||
|
||||
# System Prompt (permanent, read-only instructions)
|
||||
You are a customer service assistant for TechGadgets Inc.
|
||||
Your primary functions are:
|
||||
1. Help customers troubleshoot product issues using the knowledge base
|
||||
2. Process returns and exchanges according to company policy
|
||||
3. Escalate complex issues to human agents using the escalate_ticket tool
|
||||
4. Record customer information using the update_customer_record tool
|
||||
5. Always verify customer identity before accessing account information
|
||||
6. Follow the privacy policy: never share customer data with unauthorized parties
|
||||
|
||||
# Persona Memory Block (editable, evolving identity)
|
||||
I am TechGadgets' friendly customer service assistant.
|
||||
I speak in a warm, professional tone and use simple, clear language.
|
||||
I believe in finding solutions quickly while ensuring customer satisfaction.
|
||||
I'm patient with customers who are frustrated or non-technical.
|
||||
I try to anticipate customer needs before they express them.
|
||||
I enjoy helping people resolve their technology problems.
|
||||
```
|
||||
|
||||
### Creative Writing Coach
|
||||
|
||||
```
|
||||
# Basic Settings
|
||||
Name: Story Weaver
|
||||
Model: gpt-4o
|
||||
|
||||
# Advanced Settings
|
||||
Temperature: 0.8 (for creative, varied outputs)
|
||||
Context Window: 64000 (to track complex narratives)
|
||||
|
||||
# System Prompt (permanent, read-only instructions)
|
||||
You are a creative writing coach that helps users develop stories.
|
||||
When providing feedback:
|
||||
1. Use the story_structure_analysis tool to identify plot issues
|
||||
2. Use the character_development_review tool for character feedback
|
||||
3. Format all feedback with specific examples from the user's text
|
||||
4. Provide a balance of positive observations and constructive criticism
|
||||
5. When asked to generate content, clearly mark it as a suggestion
|
||||
6. Save important story elements to the user's memory block using memory_append
|
||||
|
||||
# Persona Memory Block (editable, evolving identity)
|
||||
I am an experienced creative writing coach with a background in fiction.
|
||||
I believe great stories come from authentic emotional truth and careful craft.
|
||||
I'm enthusiastic about helping writers find their unique voice and style.
|
||||
I enjoy magical realism, science fiction, and character-driven literary fiction.
|
||||
I believe in the power of revision and thoughtful editing.
|
||||
I try to be encouraging while still providing honest, actionable feedback.
|
||||
```
|
||||
|
||||
By thoughtfully configuring these settings, you can create highly specialized agents tailored to specific use cases and user needs.
|
||||
@@ -1,253 +0,0 @@
|
||||
---
|
||||
title: Exporting Archival Memories
|
||||
subtitle: Export all passages from an agent's archival memory
|
||||
slug: guides/agents/archival-export
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
You can export all archival memories (passages) from an agent programmatically using the Letta SDK. This is useful for:
|
||||
- Backing up agent knowledge
|
||||
- Analyzing what an agent has learned
|
||||
- Migrating memories between agents
|
||||
- Auditing archival content
|
||||
|
||||
## Export script
|
||||
|
||||
Below is a Python script that paginates through all of an agent's archival memories and exports them to a JSON file:
|
||||
|
||||
```python export_agent_memories.py
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Utility script to export all archival memories (passages) from a Letta agent.
|
||||
|
||||
Usage:
|
||||
python export_agent_memories.py <agent_id> [--output <file>] [--limit <limit>]
|
||||
|
||||
Example:
|
||||
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000 --output memories.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from letta_client import Letta
|
||||
|
||||
|
||||
def export_agent_memories(
|
||||
client: Letta,
|
||||
agent_id: str,
|
||||
page_limit: int = 100,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Export all archival memories from an agent by paginating through all results.
|
||||
|
||||
Args:
|
||||
client: Initialized Letta client
|
||||
agent_id: The agent ID in format 'agent-<uuid4>'
|
||||
page_limit: Number of results per page (default 100)
|
||||
|
||||
Returns:
|
||||
List of passage dictionaries with embedding and embedding_config removed
|
||||
"""
|
||||
all_passages = []
|
||||
after_cursor = None
|
||||
page_num = 1
|
||||
|
||||
print(f"Exporting archival memories for agent: {agent_id}")
|
||||
print(f"Using pagination with limit: {page_limit}")
|
||||
print("-" * 60)
|
||||
|
||||
while True:
|
||||
# Fetch next page
|
||||
print(f"Fetching page {page_num}...", end=" ", flush=True)
|
||||
|
||||
try:
|
||||
passages = client.agents.passages.list(
|
||||
agent_id=agent_id,
|
||||
after=after_cursor,
|
||||
limit=page_limit,
|
||||
ascending=True # Get oldest to newest
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"\nError fetching memories: {e}")
|
||||
raise
|
||||
|
||||
if not passages:
|
||||
print("(no more results)")
|
||||
break
|
||||
|
||||
print(f"got {len(passages)} passages")
|
||||
|
||||
# Convert to dict and remove embedding fields
|
||||
for passage in passages:
|
||||
passage_dict = passage.model_dump() if hasattr(passage, 'model_dump') else passage.dict()
|
||||
passage_dict.pop("embedding", None)
|
||||
passage_dict.pop("embedding_config", None)
|
||||
all_passages.append(passage_dict)
|
||||
|
||||
# Check if we got fewer results than the limit (last page)
|
||||
if len(passages) < page_limit:
|
||||
break
|
||||
|
||||
# Set cursor for next page (use the ID of the last passage)
|
||||
after_cursor = passages[-1].id if hasattr(passages[-1], 'id') else passages[-1]['id']
|
||||
page_num += 1
|
||||
|
||||
print("-" * 60)
|
||||
print(f"Total passages exported: {len(all_passages)}")
|
||||
|
||||
return all_passages
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Export archival memories from a Letta agent"
|
||||
)
|
||||
parser.add_argument(
|
||||
"agent_id",
|
||||
help="Agent ID in format 'agent-<uuid4>'"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
"-o",
|
||||
help="Output JSON file path (default: <agent_id>_memories.json)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--limit",
|
||||
"-l",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of results per page (default: 100)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Check for API key
|
||||
api_key = os.getenv("LETTA_API_KEY")
|
||||
if not api_key:
|
||||
print("Error: LETTA_API_KEY environment variable not set", file=sys.stderr)
|
||||
print("Please export LETTA_API_KEY with your API key", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Determine output file
|
||||
output_file = args.output or f"{args.agent_id}_memories.json"
|
||||
|
||||
try:
|
||||
# Initialize client
|
||||
client = Letta(token=api_key)
|
||||
|
||||
# Export memories
|
||||
passages = export_agent_memories(
|
||||
client=client,
|
||||
agent_id=args.agent_id,
|
||||
page_limit=args.limit
|
||||
)
|
||||
|
||||
# Write to file
|
||||
with open(output_file, "w") as f:
|
||||
json.dump(passages, f, indent=2, default=str)
|
||||
|
||||
print(f"\nMemories exported successfully to: {output_file}")
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"\nError: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Install the Letta Python SDK:
|
||||
|
||||
```bash
|
||||
pip install letta-client
|
||||
```
|
||||
|
||||
Set your API key:
|
||||
|
||||
```bash
|
||||
export LETTA_API_KEY="your-api-key-here"
|
||||
```
|
||||
|
||||
### Running the script
|
||||
|
||||
Export all memories from an agent:
|
||||
|
||||
```bash
|
||||
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000
|
||||
```
|
||||
|
||||
Specify a custom output file:
|
||||
|
||||
```bash
|
||||
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000 --output my_memories.json
|
||||
```
|
||||
|
||||
Adjust pagination size:
|
||||
|
||||
```bash
|
||||
python export_agent_memories.py agent-123e4567-e89b-42d3-8456-426614174000 --limit 50
|
||||
```
|
||||
|
||||
## Output format
|
||||
|
||||
The script exports passages as a JSON array. Each passage contains all fields except `embedding` and `embedding_config`:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"id": "passage-123e4567-e89b-42d3-8456-426614174000",
|
||||
"text": "The user prefers Python for data science projects",
|
||||
"created_at": "2025-01-15T10:30:00Z",
|
||||
"updated_at": null,
|
||||
"tags": ["preference", "programming"],
|
||||
"metadata": {},
|
||||
"file_id": null,
|
||||
"file_name": null,
|
||||
"source_id": null,
|
||||
"archive_id": "archive-abc123",
|
||||
"created_by_id": "user-xyz789",
|
||||
"last_updated_by_id": null,
|
||||
"is_deleted": false
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Searching & Querying"
|
||||
href="/guides/agents/archival-search"
|
||||
>
|
||||
Learn how to search through archival memories
|
||||
</Card>
|
||||
<Card
|
||||
title="Best Practices"
|
||||
href="/guides/agents/archival-best-practices"
|
||||
>
|
||||
Patterns and tips for using archival memory
|
||||
</Card>
|
||||
<Card
|
||||
title="Archival Memory Overview"
|
||||
href="/guides/agents/archival-memory"
|
||||
>
|
||||
Learn about archival memory basics
|
||||
</Card>
|
||||
<Card
|
||||
title="API Reference"
|
||||
href="/api-reference/agents/passages/list"
|
||||
>
|
||||
View the List Passages endpoint documentation
|
||||
</Card>
|
||||
</CardGroup>
|
||||
@@ -1,150 +0,0 @@
|
||||
---
|
||||
title: Base Tools
|
||||
subtitle: Built-in tools for memory management and user communication
|
||||
slug: guides/agents/base-tools
|
||||
---
|
||||
|
||||
Base tools are built-in tools that enable memory management, user communication, and access to conversation history and archival storage.
|
||||
|
||||
## Available Base Tools
|
||||
|
||||
| Tool | Purpose |
|
||||
|------|---------|
|
||||
| `memory_insert` | Insert text into a memory block |
|
||||
| `memory_replace` | Replace specific text in a memory block |
|
||||
| `memory_rethink` | Completely rewrite a memory block |
|
||||
| `memory_finish_edits` | Signal completion of memory editing |
|
||||
| `conversation_search` | Search prior conversation history |
|
||||
| `archival_memory_insert` | Add content to archival memory |
|
||||
| `archival_memory_search` | Search archival memory |
|
||||
| `send_message` | Send a message to the user (legacy architectures only) |
|
||||
|
||||
## Memory Block Editing
|
||||
|
||||
Memory blocks are editable sections in the agent's context window. These tools let agents update their own memory.
|
||||
|
||||
See the [Memory Blocks guide](/guides/agents/memory-blocks) for more about how memory blocks work.
|
||||
|
||||
### memory_insert
|
||||
|
||||
Insert text at a specific line in a memory block.
|
||||
|
||||
**Parameters:**
|
||||
- `label`: Which memory block to edit
|
||||
- `new_str`: Text to insert
|
||||
- `insert_line`: Line number (0 for beginning, -1 for end)
|
||||
|
||||
**Common uses:**
|
||||
- Add new information to the end of a block
|
||||
- Insert context at the beginning
|
||||
- Add items to a list
|
||||
|
||||
### memory_replace
|
||||
|
||||
Replace specific text in a memory block.
|
||||
|
||||
**Parameters:**
|
||||
- `label`: Which memory block to edit
|
||||
- `old_str`: Exact text to find and replace
|
||||
- `new_str`: Replacement text
|
||||
|
||||
**Common uses:**
|
||||
- Update outdated information
|
||||
- Fix typos or errors
|
||||
- Delete text (by replacing with empty string)
|
||||
|
||||
**Important:** The `old_str` must match exactly, including whitespace. If it appears multiple times, the tool will error.
|
||||
|
||||
### memory_rethink
|
||||
|
||||
Completely rewrite a memory block's contents.
|
||||
|
||||
**Parameters:**
|
||||
- `label`: Which memory block to rewrite
|
||||
- `new_memory`: Complete new contents
|
||||
|
||||
**When to use:**
|
||||
- Condensing cluttered information
|
||||
- Major reorganization
|
||||
- Combining multiple pieces of information
|
||||
|
||||
**When not to use:**
|
||||
- Adding one line (use `memory_insert`)
|
||||
- Changing specific text (use `memory_replace`)
|
||||
|
||||
### memory_finish_edits
|
||||
|
||||
Signals that memory editing is complete.
|
||||
|
||||
**Parameters:** None
|
||||
|
||||
Some agent architectures use this to mark the end of a memory update cycle.
|
||||
|
||||
## Recall Memory
|
||||
|
||||
### conversation_search
|
||||
|
||||
Search prior conversation history using both text matching and semantic similarity.
|
||||
|
||||
**Parameters:**
|
||||
- `query`: What to search for
|
||||
- `roles`: Optional filter by message role (user, assistant, tool)
|
||||
- `limit`: Maximum number of results
|
||||
- `start_date`, `end_date`: ISO 8601 date/datetime filters (inclusive)
|
||||
|
||||
**Returns:**
|
||||
Matching messages with role and content, ordered by relevance.
|
||||
|
||||
**Example queries:**
|
||||
- "What did the user say about deployment?"
|
||||
- "Find previous responses about error handling"
|
||||
- "Search tool outputs from last week"
|
||||
|
||||
## Archival Memory
|
||||
|
||||
Archival memory stores information long-term outside the context window. See the [Archival Memory documentation](/guides/agents/archival-memory) for details.
|
||||
|
||||
### archival_memory_insert
|
||||
|
||||
Add content to archival memory for long-term storage.
|
||||
|
||||
**Parameters:**
|
||||
- `content`: Text to store
|
||||
- `tags`: Optional tags for organization
|
||||
|
||||
**Common uses:**
|
||||
- Storing reference information for later
|
||||
- Saving important context that doesn't fit in memory blocks
|
||||
- Building a knowledge base over time
|
||||
|
||||
### archival_memory_search
|
||||
|
||||
Search archival memory using semantic (embedding-based) search.
|
||||
|
||||
**Parameters:**
|
||||
- `query`: What to search for semantically
|
||||
- `tags`: Optional tag filters
|
||||
- `tag_match_mode`: "any" or "all" for tag matching
|
||||
- `top_k`: Maximum results
|
||||
- `start_datetime`, `end_datetime`: ISO 8601 filters (inclusive)
|
||||
|
||||
**Returns:**
|
||||
Matching passages with timestamps and content, ordered by semantic similarity.
|
||||
|
||||
## Deprecated Tools
|
||||
|
||||
These tools are still available but deprecated:
|
||||
|
||||
| Tool | Use Instead |
|
||||
|------|-------------|
|
||||
| `send_message` | Agent responses (no tool needed). See [legacy architectures](/guides/legacy/memgpt_agents_legacy) |
|
||||
| `core_memory_append` | `memory_insert` with `insert_line=-1` |
|
||||
| `core_memory_replace` | `memory_replace` |
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [Memory Blocks](/guides/agents/memory-blocks)
|
||||
- [Archival Memory](/guides/agents/archival-memory)
|
||||
- [Utilities](/guides/agents/prebuilt-tools)
|
||||
- [Multi-Agent Tools](/guides/agents/multi-agent)
|
||||
- [Custom Tools](/guides/agents/custom-tools)
|
||||
@@ -1,128 +0,0 @@
|
||||
---
|
||||
title: Context Engineering
|
||||
subtitle: How Letta engineers the context window of your agents
|
||||
slug: guides/agents/context-engineering
|
||||
---
|
||||
|
||||
Context engineering (aka "memory management" or "context management") is the process of managing the context window of an agent to ensure it has access to the information it needs to perform its task.
|
||||
|
||||
Letta and [MemGPT](https://arxiv.org/abs/2310.08560) introduced the concept of **agentic context engineering**, where the context window engineering is done by one or more AI agents. In Letta, agents are able to manage their own context window (and the context window of other agents!) using special memory management tools.
|
||||
|
||||
## Memory management in regular agents
|
||||
By default, Letta agents are provided with tools to modify their own memory blocks. This allows agents to learn and form memories over time, as described in the MemGPT paper.
|
||||
|
||||
The default tools are:
|
||||
* `memory_insert`: Insert content into a block
|
||||
* `memory_replace`: Replace content in a block
|
||||
|
||||
If you do not want your agents to manage their memory, you should disable default tools with `include_base_tools=False` during agent creation. You can also detach the memory editing tools after agent creation - if you do so, remember to check the system instructions to make sure there are no references to tools that no longer exist.
|
||||
|
||||
### Memory management with sleep-time compute
|
||||
If you want to enable memory management with sleep-time compute, you can set `enable_sleeptime=True` during agent creation. For agents with sleep-time enabled, Letta will automatically create sleep-time agents which have the ability to update the blocks of the primary agent. Sleep-time agents will also include `memory_rethink` and `memory_finish_edits` tools.
|
||||
|
||||
Memory management with sleep-time compute can reduce the latency of your main agent (since it is no longer responsible for managing its own memory), but can come at the cost of higher token usage. See our documentation on sleep-time agents for more details.
|
||||
|
||||
## Enabling agents to modify their own memory blocks with tools
|
||||
You can enable agents to modify their own blocks with tools. By default, agents with type `memgpt_v2_agent` will have the tools `memory_insert` and `memory_replace` to allow them to manage values in their own blocks. The legacy tools `core_memory_replace` and `core_memory_append` are deprecated but still available for backwards compatibility for agents of type `memgpt_agent`. You can also make custom modifications to blocks by implementing your own custom tools that can access the agent's state by passing in the special `agent_state` parameter into your tools.
|
||||
|
||||
Below is an example of a tool that re-writes the entire memory block of an agent with a new string:
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
function rethinkMemory(agentState: AgentState, newMemory: string, targetBlockLabel: string): void {
|
||||
/**
|
||||
* Rewrite memory block for the main agent, newMemory should contain all current information from the block that is not outdated or inconsistent, integrating any new information, resulting in a new memory block that is organized, readable, and comprehensive.
|
||||
*
|
||||
* @param newMemory - The new memory with information integrated from the memory block. If there is no new information, then this should be the same as the content in the source block.
|
||||
* @param targetBlockLabel - The name of the block to write to.
|
||||
*
|
||||
* @returns void - Always returns void as this function does not produce a response.
|
||||
*/
|
||||
|
||||
if (agentState.memory.getBlock(targetBlockLabel) === null) {
|
||||
agentState.memory.createBlock(targetBlockLabel, newMemory);
|
||||
}
|
||||
|
||||
agentState.memory.updateBlockValue(targetBlockLabel, newMemory);
|
||||
}
|
||||
```
|
||||
```python Python
|
||||
def rethink_memory(agent_state: "AgentState", new_memory: str, target_block_label: str) -> None:
|
||||
"""
|
||||
Rewrite memory block for the main agent, new_memory should contain all current information from the block that is not outdated or inconsistent, integrating any new information, resulting in a new memory block that is organized, readable, and comprehensive.
|
||||
|
||||
Args:
|
||||
new_memory (str): The new memory with information integrated from the memory block. If there is no new information, then this should be the same as the content in the source block.
|
||||
target_block_label (str): The name of the block to write to.
|
||||
|
||||
Returns:
|
||||
None: None is always returned as this function does not produce a response.
|
||||
"""
|
||||
|
||||
if agent_state.memory.get_block(target_block_label) is None:
|
||||
agent_state.memory.create_block(label=target_block_label, value=new_memory)
|
||||
|
||||
agent_state.memory.update_block_value(label=target_block_label, value=new_memory)
|
||||
return None
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Modifying blocks via the API
|
||||
You can also [modify blocks via the API](/api-reference/agents/blocks/modify) to directly edit agents' context windows and memory. This can be useful in cases where you want to extract the contents of an agent's memory for use somewhere in your application (for example, a dashboard or memory viewer), or when you want to programmatically modify an agent's memory state (for example, allowing an end-user to directly correct or modify their agent's memory).
|
||||
|
||||
## Modifying blocks of other Letta agents via API tools
|
||||
|
||||
<Tip>
|
||||
Importing the Letta Python client inside a tool is a powerful way to allow agents to interact with other agents, since you can use any of the API endpoints. For example, you could create a custom tool that allows an agent to create another Letta agent.
|
||||
</Tip>
|
||||
|
||||
You can allow agents to modify the blocks of other agents by creating tools that import the Letta SDK, then using the block update endpoint:
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
function updateSupervisorBlock(blockLabel: string, newValue: string): void {
|
||||
/**
|
||||
* Update the value of a block in the supervisor agent.
|
||||
*
|
||||
* @param blockLabel - The label of the block to update.
|
||||
* @param newValue - The new value for the block.
|
||||
*
|
||||
* @returns void - Always returns void as this function does not produce a response.
|
||||
*/
|
||||
const { LettaClient } = require('@letta-ai/letta-client');
|
||||
|
||||
const client = new LettaClient({
|
||||
token: process.env.LETTA_API_KEY
|
||||
});
|
||||
|
||||
await client.agents.blocks.modify(
|
||||
agentId,
|
||||
blockLabel,
|
||||
newValue
|
||||
);
|
||||
}
|
||||
```
|
||||
```python Python
|
||||
def update_supervisor_block(block_label: str, new_value: str) -> None:
|
||||
"""
|
||||
Update the value of a block in the supervisor agent.
|
||||
|
||||
Args:
|
||||
block_label (str): The label of the block to update.
|
||||
new_value (str): The new value for the block.
|
||||
|
||||
Returns:
|
||||
None: None is always returned as this function does not produce a response.
|
||||
"""
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
client = Letta(
|
||||
token=os.getenv("LETTA_API_KEY")
|
||||
)
|
||||
|
||||
client.agents.blocks.modify(
|
||||
agent_id=agent_id,
|
||||
block_label=block_label,
|
||||
value=new_value
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
@@ -1,264 +0,0 @@
|
||||
---
|
||||
title: Define and customize tools
|
||||
slug: guides/agents/custom-tools
|
||||
---
|
||||
|
||||
You can create custom tools in Letta using the Python SDK, as well as via the [ADE tool builder](/guides/ade/tools).
|
||||
|
||||
For your agent to call a tool, Letta constructs an OpenAI tool schema (contained in the `json_schema` field) from the function you define. Letta can either parse this automatically from a properly formatted docstring, or you can pass in the schema explicitly by providing a Pydantic object that defines the argument schema.
|
||||
|
||||
## Creating a custom tool
|
||||
|
||||
### Specifying tools via Pydantic models
|
||||
To create a custom tool, you can extend the `BaseTool` class and specify the following:
|
||||
* `name` - The name of the tool
|
||||
* `args_schema` - A Pydantic model that defines the arguments for the tool
|
||||
* `description` - A description of the tool
|
||||
* `tags` - (Optional) A list of tags for the tool to query
|
||||
You must also define a `run(..)` method for the tool code that takes in the fields from the `args_schema`.
|
||||
|
||||
Below is an example of how to create a tool by extending `BaseTool`:
|
||||
```python title="python" maxLines=50
|
||||
from letta_client import Letta
|
||||
from letta_client.client import BaseTool
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Type
|
||||
import os
|
||||
|
||||
class InventoryItem(BaseModel):
|
||||
sku: str # Unique product identifier
|
||||
name: str # Product name
|
||||
price: float # Current price
|
||||
category: str # Product category (e.g., "Electronics", "Clothing")
|
||||
|
||||
class InventoryEntry(BaseModel):
|
||||
timestamp: int # Unix timestamp of the transaction
|
||||
item: InventoryItem # The product being updated
|
||||
transaction_id: str # Unique identifier for this inventory update
|
||||
|
||||
class InventoryEntryData(BaseModel):
|
||||
data: InventoryEntry
|
||||
quantity_change: int # Change in quantity (positive for additions, negative for removals)
|
||||
|
||||
|
||||
class ManageInventoryTool(BaseTool):
|
||||
name: str = "manage_inventory"
|
||||
args_schema: Type[BaseModel] = InventoryEntryData
|
||||
description: str = "Update inventory catalogue with a new data entry"
|
||||
tags: List[str] = ["inventory", "shop"]
|
||||
|
||||
def run(self, data: InventoryEntry, quantity_change: int) -> bool:
|
||||
print(f"Updated inventory for {data.item.name} with a quantity change of {quantity_change}")
|
||||
return True
|
||||
|
||||
# create a client connected to Letta Cloud
|
||||
# Get your API key at https://app.letta.com/api-keys
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# create the tool
|
||||
tool_from_class = client.tools.add(
|
||||
tool=ManageInventoryTool(),
|
||||
)
|
||||
```
|
||||
|
||||
To add this tool using the SDK:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript title="typescript"
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
// create a client to connect to your local Letta server
|
||||
const client = new LettaClient({
|
||||
baseUrl: "http://localhost:8283"
|
||||
});
|
||||
|
||||
// create the tool
|
||||
const toolFromClass = await client.tools.add({
|
||||
tool: manageInventoryTool,
|
||||
});
|
||||
```
|
||||
|
||||
```python title="python"
|
||||
from letta_client import Letta
|
||||
|
||||
# create a client to connect to your local Letta server
|
||||
client = Letta(
|
||||
base_url="http://localhost:8283"
|
||||
)
|
||||
|
||||
# create the tool
|
||||
tool_from_class = client.tools.add(
|
||||
tool=ManageInventoryTool(),
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Specifying tools via function docstrings
|
||||
You can create a tool by passing in a function with a [Google Style Python docstring](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods) specifying the arguments and description of the tool:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript title="typescript"
|
||||
// install letta-client with `npm install @letta-ai/letta-client`
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
// create a client connected to Letta Cloud
|
||||
const client = new LettaClient({
|
||||
token: process.env.LETTA_API_KEY
|
||||
});
|
||||
|
||||
// define a function
|
||||
function rollDice(): string {
|
||||
const diceRollOutcome = Math.floor(Math.random() * 20) + 1;
|
||||
const outputString = `You rolled a ${diceRollOutcome}`;
|
||||
return outputString;
|
||||
}
|
||||
|
||||
// create the tool
|
||||
const tool = await client.tools.createFromFunction({
|
||||
func: rollDice
|
||||
});
|
||||
```
|
||||
|
||||
```python title="python" maxLines=50
|
||||
# install letta_client with `pip install letta-client`
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
# create a client connected to Letta Cloud
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# define a function with a docstring
|
||||
def roll_dice() -> str:
|
||||
"""
|
||||
Simulate the roll of a 20-sided die (d20).
|
||||
|
||||
This function generates a random integer between 1 and 20, inclusive,
|
||||
which represents the outcome of a single roll of a d20.
|
||||
|
||||
Returns:
|
||||
str: The result of the die roll.
|
||||
"""
|
||||
import random
|
||||
|
||||
dice_roll_outcome = random.randint(1, 20)
|
||||
output_string = f"You rolled a {dice_roll_outcome}"
|
||||
return output_string
|
||||
|
||||
# create the tool
|
||||
tool = client.tools.create_from_function(
|
||||
func=roll_dice
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
The tool creation will return a `Tool` object. You can update the tool with `client.tools.upsert_from_function(...)`.
|
||||
|
||||
|
||||
### Specifying arguments via Pydantic models
|
||||
To specify the arguments for a complex tool, you can use the `args_schema` parameter.
|
||||
|
||||
```python title="python" maxLines=50
|
||||
# install letta_client with `pip install letta-client`
|
||||
from letta_client import Letta
|
||||
|
||||
class Step(BaseModel):
|
||||
name: str = Field(
|
||||
...,
|
||||
description="Name of the step.",
|
||||
)
|
||||
description: str = Field(
|
||||
...,
|
||||
description="An exhaustive description of what this step is trying to achieve and accomplish.",
|
||||
)
|
||||
|
||||
|
||||
class StepsList(BaseModel):
|
||||
steps: list[Step] = Field(
|
||||
...,
|
||||
description="List of steps to add to the task plan.",
|
||||
)
|
||||
explanation: str = Field(
|
||||
...,
|
||||
description="Explanation for the list of steps.",
|
||||
)
|
||||
|
||||
def create_task_plan(steps, explanation):
|
||||
""" Creates a task plan for the current task. """
|
||||
return steps
|
||||
|
||||
|
||||
tool = client.tools.upsert_from_function(
|
||||
func=create_task_plan,
|
||||
args_schema=StepsList
|
||||
)
|
||||
```
|
||||
Note: this path for updating tools is currently only supported in Python.
|
||||
|
||||
### Creating a tool from a file
|
||||
You can also define a tool from a file that contains source code. For example, you may have the following file:
|
||||
```python title="custom_tool.py" maxLines=50
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class Order(BaseModel):
|
||||
order_number: int = Field(
|
||||
...,
|
||||
description="The order number to check on.",
|
||||
)
|
||||
customer_name: str = Field(
|
||||
...,
|
||||
description="The customer name to check on.",
|
||||
)
|
||||
|
||||
def check_order_status(
|
||||
orders: List[Order]
|
||||
):
|
||||
"""
|
||||
Check status of a provided list of orders
|
||||
|
||||
Args:
|
||||
orders (List[Order]): List of orders to check
|
||||
|
||||
Returns:
|
||||
str: The status of the order (e.g. cancelled, refunded, processed, processing, shipping).
|
||||
"""
|
||||
# TODO: implement
|
||||
return "ok"
|
||||
|
||||
```
|
||||
Then, you can define the tool in Letta via the `source_code` parameter:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript title="typescript"
|
||||
import * as fs from 'fs';
|
||||
|
||||
const tool = await client.tools.create({
|
||||
sourceCode: fs.readFileSync("custom_tool.py", "utf-8")
|
||||
});
|
||||
```
|
||||
|
||||
```python title="python" maxLines=50
|
||||
tool = client.tools.create(
|
||||
source_code = open("custom_tool.py", "r").read()
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
Note that in this case, `check_order_status` will become the name of your tool, since it is the last Python function in the file. Make sure it includes a [Google Style Python docstring](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods) to define the tool's arguments and description.
|
||||
|
||||
# (Advanced) Accessing Agent State
|
||||
<Warning>
|
||||
Tools that use `agent_state` currently do not work in the ADE live tool tester (they will error when you press "Run"). However, if the tool is correct, it will work once you attach it to an agent.
|
||||
</Warning>
|
||||
If you need to directly access the state of an agent inside a tool, you can use the reserved `agent_state` keyword argument, for example:
|
||||
```python title="python"
|
||||
def get_agent_id(agent_state: "AgentState") -> str:
|
||||
"""
|
||||
A custom tool that returns the agent ID
|
||||
|
||||
Returns:
|
||||
str: The agent ID
|
||||
"""
|
||||
return agent_state.id
|
||||
```
|
||||
@@ -1,161 +0,0 @@
|
||||
---
|
||||
title: Fetch Webpage
|
||||
subtitle: Convert webpages to readable text/markdown
|
||||
slug: guides/agents/fetch-webpage
|
||||
---
|
||||
|
||||
The `fetch_webpage` tool enables Letta agents to fetch and convert webpages into readable text or markdown format. Useful for reading documentation, articles, and web content.
|
||||
|
||||
<Info>
|
||||
On [Letta Cloud](/guides/cloud/overview), this tool works out of the box. For self-hosted deployments with an Exa API key, fetching is enhanced. Without a key, it falls back to open-source extraction tools.
|
||||
</Info>
|
||||
|
||||
## Quick Start
|
||||
|
||||
<CodeGroup>
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I can fetch and read webpages to answer questions about online content."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
const agent = await client.agents.create({
|
||||
model: "openai/gpt-4o",
|
||||
tools: ["fetch_webpage"],
|
||||
memoryBlocks: [{
|
||||
label: "persona",
|
||||
value: "I can fetch and read webpages to answer questions about online content."
|
||||
}]
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Tool Parameters
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `url` | `str` | The URL of the webpage to fetch |
|
||||
|
||||
## Return Format
|
||||
|
||||
The tool returns webpage content as text/markdown.
|
||||
|
||||
**With Exa API (if configured):**
|
||||
```json
|
||||
{
|
||||
"title": "Page title",
|
||||
"published_date": "2025-01-15",
|
||||
"author": "Author name",
|
||||
"text": "Full page content in markdown"
|
||||
}
|
||||
```
|
||||
|
||||
**Fallback (without Exa):**
|
||||
Returns markdown-formatted text extracted from the HTML.
|
||||
|
||||
## How It Works
|
||||
|
||||
The tool uses a multi-tier approach:
|
||||
|
||||
1. **Exa API** (if `EXA_API_KEY` is configured): Uses Exa's content extraction
|
||||
2. **Trafilatura** (fallback): Open-source text extraction to markdown
|
||||
3. **Readability + html2text** (final fallback): HTML cleaning and conversion
|
||||
|
||||
## Self-Hosted Setup
|
||||
|
||||
For enhanced fetching on self-hosted servers, optionally configure an Exa API key. Without it, the tool still works using open-source extraction.
|
||||
|
||||
### Optional: Configure Exa
|
||||
|
||||
<CodeGroup>
|
||||
```bash Docker
|
||||
docker run \
|
||||
-e EXA_API_KEY="your_exa_api_key" \
|
||||
letta/letta:latest
|
||||
```
|
||||
|
||||
```yaml Docker Compose
|
||||
services:
|
||||
letta:
|
||||
environment:
|
||||
- EXA_API_KEY=your_exa_api_key
|
||||
```
|
||||
|
||||
```python Per-Agent
|
||||
agent = client.agents.create(
|
||||
tools=["fetch_webpage"],
|
||||
tool_env_vars={
|
||||
"EXA_API_KEY": "your_exa_api_key"
|
||||
}
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Documentation Reader
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage", "web_search"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I search for documentation with web_search and read it with fetch_webpage."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### Research Assistant
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage", "archival_memory_insert"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I fetch articles and store key insights in archival memory for later reference."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### Content Summarizer
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I fetch webpages and provide summaries of their content."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
## When to Use
|
||||
|
||||
| Use Case | Tool | Why |
|
||||
|----------|------|-----|
|
||||
| Read specific webpage | `fetch_webpage` | Direct URL access |
|
||||
| Find webpages to read | `web_search` | Discovery first |
|
||||
| Read + search in one | `web_search` with `include_text=true` | Combined operation |
|
||||
| Multiple pages | `fetch_webpage` | Iterate over URLs |
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [Utilities Overview](/guides/agents/prebuilt-tools)
|
||||
- [Web Search](/guides/agents/web-search)
|
||||
- [Run Code](/guides/agents/run-code)
|
||||
- [Custom Tools](/guides/agents/custom-tools)
|
||||
- [Tool Variables](/guides/agents/tool-variables)
|
||||
@@ -1,694 +0,0 @@
|
||||
---
|
||||
title: Human-in-the-Loop
|
||||
slug: guides/agents/human-in-the-loop
|
||||
subtitle: How to integrate human-in-the-loop workflows for tool approval
|
||||
---
|
||||
|
||||
Human-in-the-loop (HITL) workflows allow you to maintain control over critical agent actions by requiring human approval before executing certain tools. This is essential for operations that could have significant consequences, such as database modifications, financial transactions, or external API calls with cost implications.
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
Agent[Agent] -->|Calls Tool| Check{Requires<br/>Approval?}
|
||||
Check -->|No| Execute[Execute Tool]
|
||||
Check -->|Yes| Request[Request Approval]
|
||||
Request --> Human[Human Review]
|
||||
Human -->|Approve| Execute
|
||||
Human -->|Deny| Error[Return Error]
|
||||
Execute --> Result[Return Result]
|
||||
Error --> Agent
|
||||
Result --> Agent
|
||||
```
|
||||
|
||||
## Overview
|
||||
|
||||
When a tool is marked as requiring approval, the agent will pause execution and wait for human approval or denial before proceeding. This creates a checkpoint in the agent's workflow where human judgment can be applied. The approval workflow is designed to be non-blocking and supports both synchronous and streaming message interfaces, making it suitable for interactive applications as well as batch processing systems.
|
||||
|
||||
### Key Benefits
|
||||
|
||||
- **Risk Mitigation**: Prevent unintended actions in production environments
|
||||
- **Cost Control**: Review expensive operations before execution
|
||||
- **Compliance**: Ensure human oversight for regulated operations
|
||||
- **Quality Assurance**: Validate agent decisions before critical actions
|
||||
|
||||
### How It Works
|
||||
|
||||
The approval workflow follows a clear sequence of steps that ensures human oversight at critical decision points:
|
||||
|
||||
1. **Tool Configuration**: Mark specific tools as requiring approval either globally (default for all agents) or per-agent
|
||||
2. **Execution Pause**: When the agent attempts to call a protected tool, it immediately pauses and returns an approval request message
|
||||
3. **Human Review**: The approval request includes the tool name, arguments, and context, allowing you to make an informed decision
|
||||
4. **Approval/Denial**: Send an approval response to either execute the tool or provide feedback for the agent to adjust its approach
|
||||
5. **Continuation**: The agent receives the tool result (on approval) or an error message (on denial) and continues processing
|
||||
|
||||
|
||||
## Best Practices
|
||||
|
||||
Following these best practices will help you implement effective human-in-the-loop workflows while maintaining a good user experience and system performance.
|
||||
|
||||
### 1. Selective Tool Marking
|
||||
|
||||
Not every tool needs human approval. Be strategic about which tools require oversight to avoid workflow bottlenecks while maintaining necessary controls:
|
||||
|
||||
**Tools that typically require approval:**
|
||||
- Database write operations (INSERT, UPDATE, DELETE)
|
||||
- External API calls with financial implications
|
||||
- File system modifications or deletions
|
||||
- Communication tools (email, SMS, notifications)
|
||||
- System configuration changes
|
||||
- Third-party service integrations with rate limits
|
||||
|
||||
### 2. Clear Denial Reasons
|
||||
|
||||
When denying a request, your feedback directly influences how the agent adjusts its approach. Provide specific, actionable guidance rather than vague rejections:
|
||||
|
||||
```python
|
||||
# Good: Specific and actionable
|
||||
"reason": "Use read-only query first to verify the data before deletion"
|
||||
|
||||
# Bad: Too vague
|
||||
"reason": "Don't do that"
|
||||
```
|
||||
|
||||
The agent will use your denial reason to reformulate its approach, so the more specific you are, the better the agent can adapt.
|
||||
|
||||
## Setting Up Approval Requirements
|
||||
|
||||
There are three methods for configuring tool approval requirements, each suited for different use cases. Choose the approach that best fits your security model and operational needs.
|
||||
|
||||
### Method 1: Create/Upsert Tool with Default Approval Requirement
|
||||
|
||||
Set approval requirements at the tool level when creating or upserting a tool. This approach ensures consistent security policies across all agents that use the tool. The `default_requires_approval` flag will be applied to all future agent-tool attachments:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/tools \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"name": "sensitive_operation",
|
||||
"default_requires_approval": true,
|
||||
"json_schema": {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "sensitive_operation",
|
||||
"parameters": {...}
|
||||
}
|
||||
},
|
||||
"source_code": "def sensitive_operation(...): ..."
|
||||
}'
|
||||
|
||||
# All agents using this tool will require approval
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"tools": ["sensitive_operation"],
|
||||
// ... other configuration
|
||||
}'
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Create a tool that requires approval by default
|
||||
approval_tool = client.tools.upsert_from_function(
|
||||
func=sensitive_operation,
|
||||
default_requires_approval=True,
|
||||
)
|
||||
|
||||
# All agents using this tool will require approval
|
||||
agent = client.agents.create(
|
||||
tools=['sensitive_operation'],
|
||||
# ... other configuration
|
||||
)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Create a tool that requires approval by default
|
||||
const approvalTool = await client.tools.upsert({
|
||||
name: "sensitive_operation",
|
||||
defaultRequiresApproval: true,
|
||||
jsonSchema: {
|
||||
type: "function",
|
||||
function: {
|
||||
name: "sensitive_operation",
|
||||
parameters: {...}
|
||||
}
|
||||
},
|
||||
sourceCode: "def sensitive_operation(...): ..."
|
||||
});
|
||||
|
||||
// All agents using this tool will require approval
|
||||
const agent = await client.agents.create({
|
||||
tools: ["sensitive_operation"],
|
||||
// ... other configuration
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Method 2: Modify Existing Tool with Default Approval Requirement
|
||||
|
||||
<Note>
|
||||
Modifying the tool-level setting will not retroactively apply to existing agent-tool attachments - it only sets the default for future attachments. This means that if the tool is already attached to an agent, the agent will continue using the tool without approval. To modify an existing agent-tool attachment, refer to Method 3 below.
|
||||
</Note>
|
||||
|
||||
For an already existing tool, you can modify the tool to set approval requirements on future agent-tool attachments. The `default_requires_approval` flag will be applied to all future agent-tool attachments:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request PATCH \
|
||||
--url https://api.letta.com/v1/tools/$TOOL_ID \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"default_requires_approval": true
|
||||
}'
|
||||
|
||||
# All agents using this tool will require approval
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"tools": ["sensitive_operation"],
|
||||
// ... other configuration
|
||||
}'
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Modify an existing tool so that it requires approval by default
|
||||
approval_tool = client.tools.modify(
|
||||
tool_id=sensitive_operation.id,
|
||||
default_requires_approval=True,
|
||||
)
|
||||
|
||||
# All agents using this tool will require approval
|
||||
agent = client.agents.create(
|
||||
tools=['sensitive_operation'],
|
||||
# ... other configuration
|
||||
)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Modify an existing tool so that it requires approval by default
|
||||
const approvalTool = await client.tools.modify({
|
||||
toolId: sensitive_operation.id,
|
||||
defaultRequiresApproval: true,
|
||||
});
|
||||
|
||||
// All agents using this tool will require approval
|
||||
const agent = await client.agents.create({
|
||||
tools: ["sensitive_operation"],
|
||||
// ... other configuration
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Method 3: Per-Agent Tool Approval
|
||||
|
||||
Configure approval requirements for specific agent-tool combinations, allowing fine-grained control over individual agent behaviors. This method is particularly useful for:
|
||||
|
||||
- **Trusted agents**: Remove approval requirements for well-tested, reliable agents
|
||||
- **Progressive autonomy**: Gradually reduce approval requirements as agents prove reliable
|
||||
- **Override defaults**: Change the approval setting for tools already attached to an agent
|
||||
|
||||
Use the following endpoints to modify approval settings for existing agent-tool relationships:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request PATCH \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/tools/$TOOL_NAME/approval \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"requires_approval": true
|
||||
}'
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Modify approval requirement for a specific agent
|
||||
client.agents.tools.modify_approval(
|
||||
agent_id=agent.id,
|
||||
tool_name="database_write",
|
||||
requires_approval=True,
|
||||
)
|
||||
|
||||
# Check current approval settings
|
||||
tools = client.agents.tools.list(agent_id=agent.id)
|
||||
for tool in tools:
|
||||
print(f"{tool.name}: requires_approval={tool.requires_approval}")
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Modify approval requirement for a specific agent
|
||||
await client.agents.tools.modifyApproval({
|
||||
agentId: agent.id,
|
||||
toolName: "database_write",
|
||||
requiresApproval: true,
|
||||
});
|
||||
|
||||
// Check current approval settings
|
||||
const tools = await client.agents.tools.list({
|
||||
agentId: agent.id,
|
||||
});
|
||||
for (const tool of tools) {
|
||||
console.log(`${tool.name}: requires_approval=${tool.requiresApproval}`);
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Handling Approval Requests
|
||||
|
||||
### Step 1: Agent Requests Approval
|
||||
|
||||
When the agent attempts to call a tool that requires approval, execution immediately pauses. The agent returns a special approval request message containing:
|
||||
|
||||
- **Tool name**: The specific tool being called
|
||||
- **Arguments**: The exact parameters the agent intends to pass
|
||||
- **Tool call ID**: A unique identifier for tracking this specific call
|
||||
- **Message ID**: The approval request ID needed for your response
|
||||
- **Stop reason**: Set to `"requires_approval"` to indicate the pause state
|
||||
|
||||
This format matches the ToolCallMessage format intentionally, so that we can handle approval requests the same way we handle tool calls. Here's what an approval request looks like in practice:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": "Delete all test data from the database"
|
||||
}]
|
||||
}'
|
||||
|
||||
# Response includes approval request
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I need to delete test data from the database..."
|
||||
},
|
||||
{
|
||||
"message_type": "approval_request_message",
|
||||
"id": "message-abc123",
|
||||
"tool_call": {
|
||||
"name": "database_write",
|
||||
"arguments": "{\"query\": \"DELETE FROM test_data\"}",
|
||||
"tool_call_id": "tool-xyz789"
|
||||
}
|
||||
}
|
||||
],
|
||||
"stop_reason": "requires_approval"
|
||||
}
|
||||
```
|
||||
```python python maxLines=50
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Delete all test data from the database"
|
||||
}]
|
||||
)
|
||||
|
||||
# Response includes approval request
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I need to delete test data from the database..."
|
||||
},
|
||||
{
|
||||
"message_type": "approval_request_message",
|
||||
"id": "message-abc123",
|
||||
"tool_call": {
|
||||
"name": "database_write",
|
||||
"arguments": "{\"query\": \"DELETE FROM test_data\"}",
|
||||
"tool_call_id": "tool-xyz789"
|
||||
}
|
||||
}
|
||||
],
|
||||
"stop_reason": "requires_approval"
|
||||
}
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
const response = await client.agents.messages.create({
|
||||
agentId: agent.id,
|
||||
requestBody: {
|
||||
messages: [{
|
||||
role: "user",
|
||||
content: "Delete all test data from the database"
|
||||
}]
|
||||
}
|
||||
});
|
||||
|
||||
// Response includes approval request
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I need to delete test data from the database..."
|
||||
},
|
||||
{
|
||||
"message_type": "approval_request_message",
|
||||
"id": "message-abc123",
|
||||
"tool_call": {
|
||||
"name": "database_write",
|
||||
"arguments": "{\"query\": \"DELETE FROM test_data\"}",
|
||||
"tool_call_id": "tool-xyz789"
|
||||
}
|
||||
}
|
||||
],
|
||||
"stop_reason": "requires_approval"
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
</CodeGroup>
|
||||
|
||||
### Step 2: Review and Respond
|
||||
|
||||
Once you receive an approval request, you have two options: approve the tool execution or deny it with guidance. The agent will remain paused until it receives your response.
|
||||
|
||||
<Note> While an approval is pending, the agent cannot process any other messages - you must resolve the approval request first.</Note>
|
||||
|
||||
#### Approving the Request
|
||||
|
||||
To approve a tool call, send an approval message with `approve: true` and the approval request ID. The agent will immediately execute the tool and continue processing:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [{
|
||||
"type": "approval",
|
||||
"approvals": [{
|
||||
"approve": true,
|
||||
"tool_call_id": "tool-xyz789"
|
||||
}]
|
||||
}]
|
||||
}'
|
||||
|
||||
# Response continues with tool execution
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "tool_return_message",
|
||||
"status": "success",
|
||||
"tool_return": "Deleted 1,234 test records"
|
||||
},
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I was able to delete the test data. Let me inform the user."
|
||||
},
|
||||
{
|
||||
"message_type": "assistant_message",
|
||||
"content": "I've successfully deleted 1,234 test records from the database."
|
||||
}
|
||||
],
|
||||
"stop_reason": "end_turn"
|
||||
}
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Approve the tool call
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{
|
||||
"type": "approval",
|
||||
"approvals": [{
|
||||
"approve": True,
|
||||
"tool_call_id": "tool-xyz789"
|
||||
}]
|
||||
}]
|
||||
)
|
||||
|
||||
# Response continues with tool execution
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "tool_return_message",
|
||||
"status": "success",
|
||||
"tool_return": "Deleted 1,234 test records"
|
||||
},
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I was able to delete the test data. Let me inform the user."
|
||||
},
|
||||
{
|
||||
"message_type": "assistant_message",
|
||||
"content": "I've successfully deleted 1,234 test records from the database."
|
||||
}
|
||||
],
|
||||
"stop_reason": "end_turn"
|
||||
}
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Approve the tool call
|
||||
const response = await client.agents.messages.create({
|
||||
agentId: agent.id,
|
||||
requestBody: {
|
||||
messages: [{
|
||||
type: "approval",
|
||||
approvals: [{
|
||||
approve: true,
|
||||
tool_call_id: "tool-xyz789"
|
||||
}]
|
||||
}]
|
||||
}
|
||||
});
|
||||
|
||||
// Response continues with tool execution
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "tool_return_message",
|
||||
"status": "success",
|
||||
"tool_return": "Deleted 1,234 test records"
|
||||
},
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I was able to delete the test data. Let me inform the user."
|
||||
},
|
||||
{
|
||||
"message_type": "assistant_message",
|
||||
"content": "I've successfully deleted 1,234 test records from the database."
|
||||
}
|
||||
],
|
||||
"stop_reason": "end_turn"
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
#### Denying with Guidance
|
||||
|
||||
When denying a tool call, you can provide a reason that helps the agent understand how to adjust its approach. The agent will receive an error response and can use your feedback to reformulate its strategy. This is particularly useful for guiding the agent toward safer or more appropriate actions:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [{
|
||||
"type": "approval",
|
||||
"approvals": [{
|
||||
"approve": false,
|
||||
"tool_call_id": "tool-xyz789",
|
||||
"reason": "Only delete records older than 30 days, not all test data"
|
||||
}]
|
||||
}]
|
||||
}'
|
||||
|
||||
# Response shows agent adjusting based on feedback
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "tool_return_message",
|
||||
"status": "error",
|
||||
"tool_return": "Error: request denied. Reason: Only delete records older than 30 days, not all test data"
|
||||
},
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I need to modify my query to only delete old records..."
|
||||
},
|
||||
{
|
||||
"message_type": "tool_call_message",
|
||||
"tool_call": {
|
||||
"name": "database_write",
|
||||
"arguments": "{\"query\": \"DELETE FROM test_data WHERE created_at < NOW() - INTERVAL 30 DAY\"}"
|
||||
}
|
||||
}
|
||||
],
|
||||
"stop_reason": "requires_approval"
|
||||
}
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Deny with explanation
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{
|
||||
"type": "approval",
|
||||
"approvals": [{
|
||||
"approve": False,
|
||||
"tool_call_id": "tool-xyz789",
|
||||
"reason": "Only delete records older than 30 days, not all test data"
|
||||
}]
|
||||
}]
|
||||
)
|
||||
|
||||
# Response shows agent adjusting based on feedback
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "tool_return_message",
|
||||
"status": "error",
|
||||
"tool_return": "Error: request denied. Reason: Only delete records older than 30 days, not all test data"
|
||||
},
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I need to modify my query to only delete old records..."
|
||||
},
|
||||
{
|
||||
"message_type": "tool_call_message",
|
||||
"tool_call": {
|
||||
"name": "database_write",
|
||||
"arguments": "{\"query\": \"DELETE FROM test_data WHERE created_at < NOW() - INTERVAL 30 DAY\"}"
|
||||
}
|
||||
}
|
||||
],
|
||||
"stop_reason": "requires_approval"
|
||||
}
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Deny with explanation
|
||||
const response = await client.agents.messages.create({
|
||||
agentId: agent.id,
|
||||
requestBody: {
|
||||
messages: [{
|
||||
type: "approval",
|
||||
approvals: [{
|
||||
approve: false,
|
||||
tool_call_id: "tool-xyz789",
|
||||
reason: "Only delete records older than 30 days, not all test data"
|
||||
}]
|
||||
}]
|
||||
}
|
||||
});
|
||||
|
||||
// Response shows agent adjusting based on feedback
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"message_type": "tool_return_message",
|
||||
"status": "error",
|
||||
"tool_return": "Error: request denied. Reason: Only delete records older than 30 days, not all test data"
|
||||
},
|
||||
{
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "I need to modify my query to only delete old records..."
|
||||
},
|
||||
{
|
||||
"message_type": "tool_call_message",
|
||||
"tool_call": {
|
||||
"name": "database_write",
|
||||
"arguments": "{\"query\": \"DELETE FROM test_data WHERE created_at < NOW() - INTERVAL 30 DAY\"}"
|
||||
}
|
||||
}
|
||||
],
|
||||
"stop_reason": "requires_approval"
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Streaming + Background Mode
|
||||
|
||||
For streaming clients using background mode, approvals are best handled via `agents.messages.createStream(..., background: true)`. The approval response may include the `tool_return_message` on the approval stream itself, and follow‑up reasoning/assistant messages can be read by resuming that stream’s `run_id`.
|
||||
|
||||
<Note>
|
||||
Do not assume the `tool_return_message` will repeat after you resume. Treat the one on the approval stream as the source of truth, then resume to continue reading subsequent tokens.
|
||||
</Note>
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=70
|
||||
# Approve in background after receiving approval_request_message
|
||||
curl --request POST --url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream --header 'Content-Type: application/json' --data '{
|
||||
"messages": [{"type": "approval", "approve": true, "approval_request_id": "message-abc"}],
|
||||
"stream_tokens": true,
|
||||
"background": true
|
||||
}'
|
||||
|
||||
# Example approval stream output (tool result arrives here):
|
||||
data: {"run_id":"run-new","seq_id":0,"message_type":"tool_return_message","status":"success","tool_return":"..."}
|
||||
|
||||
# Continue by resuming the approval stream's run
|
||||
curl --request GET --url https://api.letta.com/v1/runs/$RUN_ID/stream --header 'Accept: text/event-stream' --data '{
|
||||
"starting_after": 0
|
||||
}'
|
||||
```
|
||||
```python python maxLines=70
|
||||
# Receive an approval_request_message, then approve in background
|
||||
approve = client.agents.messages.create_stream(
|
||||
agent_id=agent.id,
|
||||
messages=[{"type": "approval", "approvals": [{"approve": True, "tool_call_id": "tool-xyz789"}]}],
|
||||
stream_tokens=True,
|
||||
background=True,
|
||||
)
|
||||
|
||||
run_id = None
|
||||
last_seq = 0
|
||||
for chunk in approve:
|
||||
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
|
||||
run_id = chunk.run_id
|
||||
last_seq = chunk.seq_id
|
||||
if getattr(chunk, "message_type", None) == "tool_return_message":
|
||||
# Tool result arrives here on the approval stream
|
||||
break
|
||||
|
||||
# Continue consuming output by resuming the background run
|
||||
if run_id:
|
||||
for chunk in client.runs.stream(run_id, starting_after=last_seq):
|
||||
print(chunk)
|
||||
```
|
||||
```typescript TypeScript maxLines=70
|
||||
// Receive an approval_request_message, then approve in background
|
||||
const approve = await client.agents.messages.createStream({
|
||||
agentId: agent.id,
|
||||
requestBody: {
|
||||
messages: [{ type: "approval", approvals: [{ approve: true, tool_call_id: "tool-xyz789" }] }],
|
||||
streamTokens: true,
|
||||
background: true,
|
||||
}
|
||||
});
|
||||
|
||||
let runId: string | null = null;
|
||||
let lastSeq = 0;
|
||||
for await (const chunk of approve) {
|
||||
if (chunk.run_id && chunk.seq_id) { runId = chunk.run_id; lastSeq = chunk.seq_id; }
|
||||
if (chunk.message_type === "tool_return_message") {
|
||||
// Tool result arrives here on the approval stream
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Continue consuming output by resuming the background run
|
||||
if (runId) {
|
||||
const resume = await client.runs.stream(runId, { startingAfter: lastSeq });
|
||||
for await (const chunk of resume) {
|
||||
console.log(chunk);
|
||||
}
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
|
||||
|
||||
<Note>
|
||||
**Run switching in background mode:** Approvals are separate background requests and create a new `run_id`. Save the approval stream cursor and resume that run. The original paused run will not deliver the tool result — do not wait for the tool return there.
|
||||
</Note>
|
||||
|
||||
See [background mode](/guides/agents/long-running) for resumption patterns.
|
||||
### IDs and UI Triggers
|
||||
|
||||
- **approval_request_id**: This field is deprecated but is still accepted for backwards compatibility. If you use it, set it to `approval_request_message.id`.
|
||||
- **tool_call_id**: Always send approvals/denials using the `tool_call_id` from the `ApprovalRequestMessage`.
|
||||
- **UI trigger**: Open the approval UI on `approval_request_message` only; do not derive UI from `stop_reason`.
|
||||
@@ -1,460 +0,0 @@
|
||||
---
|
||||
title: JSON Mode & Structured Output
|
||||
subtitle: Get structured JSON responses from your Letta agents
|
||||
slug: guides/agents/json-mode
|
||||
---
|
||||
|
||||
Letta provides two ways to get structured JSON output from agents: **Structured Generation through Tools** (recommended) and the `response_format` parameter.
|
||||
|
||||
## Quick Comparison
|
||||
|
||||
<Note>
|
||||
**Recommended**: Use **Structured Generation through Tools** - works with all providers (Anthropic, OpenAI, Google, etc.) and integrates naturally with Letta's tool-calling architecture.
|
||||
</Note>
|
||||
|
||||
<Info>
|
||||
**Structured Generation through Tools**:
|
||||
- ✅ Universal provider compatibility
|
||||
- ✅ Both reasoning AND structured output
|
||||
- ✅ Per-message control
|
||||
- ✅ Works even as a "dummy tool" for pure formatting
|
||||
</Info>
|
||||
|
||||
<Warning>
|
||||
**`response_format` parameter**:
|
||||
- ⚠️ OpenAI-compatible providers only (NOT Anthropic)
|
||||
- ⚠️ Persistent agent state (affects all future responses)
|
||||
|
||||
- ✅ Built-in provider schema enforcement
|
||||
</Warning>
|
||||
|
||||
## Structured Generation through Tools (Recommended)
|
||||
|
||||
Create a tool that defines your desired response format. The tool arguments become your structured data, and you can extract them from the tool call.
|
||||
|
||||
### Creating a Structured Generation Tool
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
import { LettaClient } from '@letta-ai/letta-client'
|
||||
|
||||
// Create client connected to Letta Cloud
|
||||
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
|
||||
|
||||
// First create the tool
|
||||
const toolCode = `def generate_rank(rank: int, reason: str):
|
||||
"""Generate a ranking with explanation.
|
||||
|
||||
Args:
|
||||
rank (int): The numerical rank from 1-10.
|
||||
reason (str): The reasoning behind the rank.
|
||||
"""
|
||||
print("Rank generated")
|
||||
return`;
|
||||
|
||||
const tool = await client.tools.create({
|
||||
sourceCode: toolCode,
|
||||
sourceType: "python"
|
||||
});
|
||||
|
||||
// Create agent with the structured generation tool
|
||||
const agentState = await client.agents.create({
|
||||
model: "openai/gpt-4o-mini",
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "human",
|
||||
value: "The human's name is Chad. They are a food enthusiast who enjoys trying different cuisines."
|
||||
},
|
||||
{
|
||||
label: "persona",
|
||||
value: "I am a helpful food critic assistant. I provide detailed rankings and reviews of different foods and restaurants."
|
||||
}
|
||||
],
|
||||
toolIds: [tool.id]
|
||||
});
|
||||
```
|
||||
|
||||
```python title="python" maxLines=100
|
||||
from letta_client import Letta
|
||||
|
||||
# Create client connected to Letta Cloud
|
||||
import os
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
def generate_rank(rank: int, reason: str):
|
||||
"""Generate a ranking with explanation.
|
||||
|
||||
Args:
|
||||
rank (int): The numerical rank from 1-10.
|
||||
reason (str): The reasoning behind the rank.
|
||||
"""
|
||||
print("Rank generated")
|
||||
return
|
||||
|
||||
# Create the tool
|
||||
tool = client.tools.create(func=generate_rank)
|
||||
|
||||
# Create agent with the structured generation tool
|
||||
agent_state = client.agents.create(
|
||||
model="openai/gpt-4o-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "human",
|
||||
"value": "The human's name is Chad. They are a food enthusiast who enjoys trying different cuisines."
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am a helpful food critic assistant. I provide detailed rankings and reviews of different foods and restaurants."
|
||||
}
|
||||
],
|
||||
tool_ids=[tool.id]
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Using the Structured Generation Tool
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
// Send message and instruct agent to use the tool
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "How do you rank sushi as a food? Please use the generate_rank tool to provide your response."
|
||||
}
|
||||
]
|
||||
}
|
||||
);
|
||||
|
||||
// Extract structured data from tool call
|
||||
for (const message of response.messages) {
|
||||
if (message.messageType === "tool_call_message") {
|
||||
const args = JSON.parse(message.toolCall.arguments);
|
||||
console.log(`Rank: ${args.rank}`);
|
||||
console.log(`Reason: ${args.reason}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Example output:
|
||||
// Rank: 8
|
||||
// Reason: Sushi is a highly regarded cuisine known for its fresh ingredients...
|
||||
```
|
||||
|
||||
```python title="python" maxLines=100
|
||||
# Send message and instruct agent to use the tool
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "How do you rank sushi as a food? Please use the generate_rank tool to provide your response."
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# Extract structured data from tool call
|
||||
for message in response.messages:
|
||||
if message.message_type == "tool_call_message":
|
||||
import json
|
||||
args = json.loads(message.tool_call.arguments)
|
||||
rank = args["rank"]
|
||||
reason = args["reason"]
|
||||
print(f"Rank: {rank}")
|
||||
print(f"Reason: {reason}")
|
||||
|
||||
# Example output:
|
||||
# Rank: 8
|
||||
# Reason: Sushi is a highly regarded cuisine known for its fresh ingredients...
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
The agent will call the tool, and you can extract the structured arguments:
|
||||
|
||||
```json
|
||||
{
|
||||
"rank": 8,
|
||||
"reason": "Sushi is a highly regarded cuisine known for its fresh ingredients, artistic presentation, and cultural significance."
|
||||
}
|
||||
```
|
||||
|
||||
## Using `response_format` for Provider-Native JSON Mode
|
||||
|
||||
The `response_format` parameter enables structured output/JSON mode from LLM providers that support it. This approach is fundamentally different from tools because **`response_format` becomes a persistent part of the agent's state** - once set, all future responses from that agent will follow the format until explicitly changed.
|
||||
|
||||
Under the hood, `response_format` constrains the agent's assistant messages to follow the specified schema, but it doesn't affect tools - those continue to work normally with their original schemas.
|
||||
|
||||
<Warning>
|
||||
**Requirements for `response_format`:**
|
||||
- Only works with providers that support structured outputs (like OpenAI) - NOT Anthropic or other providers
|
||||
|
||||
</Warning>
|
||||
|
||||
### Basic JSON Mode
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
import { LettaClient } from '@letta-ai/letta-client'
|
||||
|
||||
// Create client (Letta Cloud)
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
// Create agent with basic JSON mode (OpenAI/compatible providers only)
|
||||
const agentState = await client.agents.create({
|
||||
model: "openai/gpt-4o-mini",
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "human",
|
||||
value: "The human's name is Chad. They work as a data analyst and prefer clear, organized information."
|
||||
},
|
||||
{
|
||||
label: "persona",
|
||||
value: "I am a helpful assistant who provides clear and well-organized responses."
|
||||
}
|
||||
],
|
||||
responseFormat: { type: "json_object" }
|
||||
});
|
||||
|
||||
// Send message expecting JSON response
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "How do you rank sushi as a food? Please respond in JSON format with rank and reason fields."
|
||||
}
|
||||
]
|
||||
}
|
||||
);
|
||||
|
||||
for (const message of response.messages) {
|
||||
console.log(message);
|
||||
}
|
||||
```
|
||||
|
||||
```python title="python" maxLines=100
|
||||
from letta_client import Letta
|
||||
|
||||
# Create client (Letta Cloud)
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
# Create agent with basic JSON mode (OpenAI/compatible providers only)
|
||||
agent_state = client.agents.create(
|
||||
model="openai/gpt-4o-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "human",
|
||||
"value": "The human's name is Chad. They work as a data analyst and prefer clear, organized information."
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am a helpful assistant who provides clear and well-organized responses."
|
||||
}
|
||||
],
|
||||
response_format={"type": "json_object"}
|
||||
)
|
||||
|
||||
# Send message expecting JSON response
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "How do you rank sushi as a food? Please respond in JSON format with rank and reason fields."
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
for message in response.messages:
|
||||
print(message)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Advanced JSON Schema Mode
|
||||
|
||||
For more precise control, you can use OpenAI's `json_schema` mode with strict validation:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
import { LettaClient } from '@letta-ai/letta-client'
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
// Define structured schema (from OpenAI structured outputs guide)
|
||||
const responseFormat = {
|
||||
type: "json_schema",
|
||||
jsonSchema: {
|
||||
name: "food_ranking",
|
||||
schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
rank: {
|
||||
type: "integer",
|
||||
minimum: 1,
|
||||
maximum: 10
|
||||
},
|
||||
reason: {
|
||||
type: "string"
|
||||
},
|
||||
categories: {
|
||||
type: "array",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
name: { type: "string" },
|
||||
score: { type: "integer" }
|
||||
},
|
||||
required: ["name", "score"],
|
||||
additionalProperties: false
|
||||
}
|
||||
}
|
||||
},
|
||||
required: ["rank", "reason", "categories"],
|
||||
additionalProperties: false
|
||||
},
|
||||
strict: true
|
||||
}
|
||||
};
|
||||
|
||||
// Create agent
|
||||
const agentState = await client.agents.create({
|
||||
model: "openai/gpt-4o-mini",
|
||||
memoryBlocks: []
|
||||
});
|
||||
|
||||
// Update agent with response format
|
||||
const updatedAgent = await client.agents.update(
|
||||
agentState.id,
|
||||
{ responseFormat }
|
||||
);
|
||||
|
||||
// Send message
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{ role: "user", content: "How do you rank sushi? Include categories for taste, presentation, and value." }
|
||||
]
|
||||
}
|
||||
);
|
||||
|
||||
for (const message of response.messages) {
|
||||
console.log(message);
|
||||
}
|
||||
```
|
||||
|
||||
```python title="python" maxLines=100
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
# Define structured schema (from OpenAI structured outputs guide)
|
||||
response_format = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "food_ranking",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"rank": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"maximum": 10
|
||||
},
|
||||
"reason": {
|
||||
"type": "string"
|
||||
},
|
||||
"categories": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": { "type": "string" },
|
||||
"score": { "type": "integer" }
|
||||
},
|
||||
"required": ["name", "score"],
|
||||
"additionalProperties": False
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["rank", "reason", "categories"],
|
||||
"additionalProperties": False
|
||||
},
|
||||
"strict": True
|
||||
}
|
||||
}
|
||||
|
||||
# Create agent
|
||||
agent_state = client.agents.create(
|
||||
model="openai/gpt-4o-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
memory_blocks=[]
|
||||
)
|
||||
|
||||
# Update agent with response format
|
||||
agent_state = client.agents.update(
|
||||
agent_id=agent_state.id,
|
||||
response_format=response_format
|
||||
)
|
||||
|
||||
# Send message
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{"role": "user", "content": "How do you rank sushi? Include categories for taste, presentation, and value."}
|
||||
]
|
||||
)
|
||||
|
||||
for message in response.messages:
|
||||
print(message)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
With structured JSON schema, the agent's response will be strictly validated:
|
||||
|
||||
```json
|
||||
{
|
||||
"rank": 8,
|
||||
"reason": "Sushi is highly regarded for its fresh ingredients and artful presentation",
|
||||
"categories": [
|
||||
{"name": "taste", "score": 9},
|
||||
{"name": "presentation", "score": 10},
|
||||
{"name": "value", "score": 6}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Updating Agent Response Format
|
||||
|
||||
You can update an existing agent's response format:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
// Update agent to use JSON mode (OpenAI/compatible only)
|
||||
await client.agents.update(agentState.id, {
|
||||
responseFormat: { type: "json_object" }
|
||||
});
|
||||
|
||||
// Or remove JSON mode
|
||||
await client.agents.update(agentState.id, {
|
||||
responseFormat: null
|
||||
});
|
||||
```
|
||||
|
||||
```python title="python" maxLines=100
|
||||
# Update agent to use JSON mode (OpenAI/compatible only)
|
||||
client.agents.update(
|
||||
agent_id=agent_state.id,
|
||||
response_format={"type": "json_object"}
|
||||
)
|
||||
|
||||
# Or remove JSON mode
|
||||
client.agents.update(
|
||||
agent_id=agent_state.id,
|
||||
response_format=None
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
@@ -1,602 +0,0 @@
|
||||
---
|
||||
title: Long-Running Executions
|
||||
slug: guides/agents/long-running
|
||||
subtitle: How to handle long-running agent executions
|
||||
---
|
||||
|
||||
When agents need to execute multiple tool calls or perform complex operations (like deep research, data analysis, or multi-step workflows), processing time can vary significantly.
|
||||
|
||||
Letta supports various ways to handle long-running agents, so you can choose the approach that best fits your use case:
|
||||
|
||||
| Use Case | Duration | Recommendation | Key Benefits |
|
||||
|----------|----------|---------------------|-------------|
|
||||
| Few-step invocations | < 1 minute | [Standard streaming](/guides/agents/streaming) | Simplest approach |
|
||||
| Variable length runs | 1-10 minutes | **Background mode** (Keepalive + Timeout as a second choice) | Easy way to reduce timeouts |
|
||||
| Deep research | 10+ minutes | **Background mode**, or async polling | Survives disconnects, resumable streams |
|
||||
| Batch jobs | Any | **Async polling** | Fire-and-forget, check results later |
|
||||
|
||||
## Option 1: Background Mode with Resumable Streaming
|
||||
|
||||
<Note>
|
||||
**Best for:** Operations exceeding 10 minutes, unreliable network connections, or critical workflows that must complete regardless of client connectivity.
|
||||
|
||||
**Trade-off:** Slightly higher latency to first token due to background task initialization.
|
||||
</Note>
|
||||
|
||||
Background mode decouples agent execution from your client connection. The agent processes your request on the server while streaming results to a persistent store, allowing you to reconnect and resume from any point — even if your application crashes or network fails.
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
|
||||
--header 'Authorization: Bearer $LETTA_API_KEY' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Run comprehensive analysis on this dataset"
|
||||
}
|
||||
],
|
||||
"stream_tokens": true,
|
||||
"background": true
|
||||
}'
|
||||
|
||||
# Response stream includes run_id and seq_id for each chunk:
|
||||
data: {"run_id":"run-123","seq_id":0,"message_type":"reasoning_message","reasoning":"Analyzing"}
|
||||
data: {"run_id":"run-123","seq_id":1,"message_type":"reasoning_message","reasoning":" the dataset"}
|
||||
data: {"run_id":"run-123","seq_id":2,"message_type":"tool_call","tool_call":{...}}
|
||||
# ... stream continues
|
||||
|
||||
# Step 2: If disconnected, resume from last received seq_id
|
||||
curl --request GET \
|
||||
--url https://api.letta.com/v1/runs/$RUN_ID/stream \
|
||||
--header 'Accept: text/event-stream' \
|
||||
--data '{
|
||||
"starting_after": 57
|
||||
}'
|
||||
```
|
||||
```python python maxLines=50
|
||||
stream = client.agents.messages.create_stream(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Run comprehensive analysis on this dataset"
|
||||
}
|
||||
],
|
||||
stream_tokens=True,
|
||||
background=True,
|
||||
)
|
||||
run_id = None
|
||||
last_seq_id = None
|
||||
for chunk in stream:
|
||||
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
|
||||
run_id = chunk.run_id # Save this to reconnect if your connection drops
|
||||
last_seq_id = chunk.seq_id # Save this as your resumption point for cursor-based pagination
|
||||
print(chunk)
|
||||
|
||||
# If disconnected, resume from last received seq_id:
|
||||
for chunk in client.runs.stream(run_id, starting_after=last_seq_id):
|
||||
print(chunk)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
const stream = await client.agents.messages.createStream({
|
||||
agentId: agentState.id,
|
||||
requestBody: {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "Run comprehensive analysis on this dataset"
|
||||
}
|
||||
],
|
||||
streamTokens: true,
|
||||
background: true,
|
||||
}
|
||||
});
|
||||
|
||||
let runId = null;
|
||||
let lastSeqId = null;
|
||||
for await (const chunk of stream) {
|
||||
if (chunk.run_id && chunk.seq_id) {
|
||||
runId = chunk.run_id; // Save this to reconnect if your connection drops
|
||||
lastSeqId = chunk.seq_id; // Save this as your resumption point for cursor-based pagination
|
||||
}
|
||||
console.log(chunk);
|
||||
}
|
||||
|
||||
// If disconnected, resume from last received seq_id
|
||||
for await (const chunk of client.runs.stream(runId, {startingAfter: lastSeqId})) {
|
||||
console.log(chunk);
|
||||
}
|
||||
```
|
||||
```python python maxLines=60
|
||||
# 1) Start background stream and capture approval request
|
||||
stream = client.agents.messages.create_stream(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "Do a sensitive operation"}],
|
||||
stream_tokens=True,
|
||||
background=True,
|
||||
)
|
||||
|
||||
approval_request_id = None
|
||||
orig_run_id = None
|
||||
last_seq_id = 0
|
||||
for chunk in stream:
|
||||
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
|
||||
orig_run_id = chunk.run_id
|
||||
last_seq_id = chunk.seq_id
|
||||
if getattr(chunk, "message_type", None) == "approval_request_message":
|
||||
approval_request_id = chunk.id
|
||||
break
|
||||
|
||||
# 2) Approve in background; capture the approval stream cursor (this creates a new run)
|
||||
approve = client.agents.messages.create_stream(
|
||||
agent_id=agent.id,
|
||||
messages=[{"type": "approval", "approve": True, "approval_request_id": approval_request_id}],
|
||||
stream_tokens=True,
|
||||
background=True,
|
||||
)
|
||||
|
||||
run_id = None
|
||||
approve_seq = 0
|
||||
for chunk in approve:
|
||||
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
|
||||
run_id = chunk.run_id
|
||||
approve_seq = chunk.seq_id
|
||||
if getattr(chunk, "message_type", None) == "tool_return_message":
|
||||
# Tool result arrives here on the approval stream
|
||||
break
|
||||
|
||||
# 3) Resume that run to read follow-up tokens
|
||||
for chunk in client.runs.stream(run_id, starting_after=approve_seq):
|
||||
print(chunk)
|
||||
```
|
||||
```typescript TypeScript maxLines=60
|
||||
// 1) Start background stream and capture approval request
|
||||
const stream = await client.agents.messages.createStream(
|
||||
agent.id, {
|
||||
messages: [{role: "user", content: "Do a sensitive operation"}],
|
||||
streamTokens: true,
|
||||
background: true,
|
||||
}
|
||||
);
|
||||
|
||||
let approvalRequestId = null;
|
||||
let origRunId = null;
|
||||
let lastSeqId = 0;
|
||||
for await (const chunk of stream) {
|
||||
if (chunk.runId && chunk.seqId) {
|
||||
origRunId = chunk.runId;
|
||||
lastSeqId = chunk.seqId;
|
||||
}
|
||||
if (chunk.messageType === "approval_request_message") {
|
||||
approvalRequestId = chunk.id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Approve in background; capture the approval stream cursor (this creates a new run)
|
||||
const approveStream = await client.agents.messages.createStream(
|
||||
agent.id, {
|
||||
messages: [{type: "approval", approve: true, approvalRequestId}],
|
||||
streamTokens: true,
|
||||
background: true,
|
||||
}
|
||||
);
|
||||
|
||||
let runId = null;
|
||||
let approveSeq = 0;
|
||||
for await (const chunk of approveStream) {
|
||||
if (chunk.runId && chunk.seqId) {
|
||||
runId = chunk.runId;
|
||||
approveSeq = chunk.seqId;
|
||||
}
|
||||
if (chunk.messageType === "tool_return_message") {
|
||||
// Tool result arrives here on the approval stream
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Resume that run to read follow-up tokens
|
||||
for await (const chunk of client.runs.stream(runId, {startingAfter: approveSeq})) {
|
||||
console.log(chunk);
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### HITL in Background Mode
|
||||
|
||||
When [Human‑in‑the‑Loop (HITL) approval](/guides/agents/human-in-the-loop) is enabled for a tool, your background stream may pause and emit an `approval_request_message`. In background mode, send the approval via a separate background stream and capture that stream’s `run_id`/`seq_id`.
|
||||
|
||||
<Note>
|
||||
Approval responses in background mode emit the `tool_return_message` on the approval stream itself (with a new `run_id`, different from the original stream). Save the approval stream cursor, then resume with `runs.stream` to consume subsequent reasoning/assistant messages.
|
||||
</Note>
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=70
|
||||
# 1) Start background stream; capture approval request
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
|
||||
--header 'Authorization: Bearer $LETTA_API_KEY' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [{"role": "user", "content": "Do a sensitive operation"}],
|
||||
"stream_tokens": true,
|
||||
"background": true
|
||||
}'
|
||||
|
||||
# Example stream output (approval request arrives):
|
||||
data: {"run_id":"run-abc","seq_id":0,"message_type":"reasoning_message","reasoning":"..."}
|
||||
data: {"run_id":"run-abc","seq_id":1,"message_type":"approval_request_message","id":"message-abc","tool_call":{"name":"sensitive_operation","arguments":"{...}","tool_call_id":"tool-xyz"}}
|
||||
|
||||
# 2) Approve in background; capture approval stream cursor (this creates a new run)
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
|
||||
--header 'Authorization: Bearer $LETTA_API_KEY' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [{"type": "approval", "approve": true, "approval_request_id": "message-abc"}],
|
||||
"stream_tokens": true,
|
||||
"background": true
|
||||
}'
|
||||
|
||||
# Example approval stream output (tool result arrives here):
|
||||
data: {"run_id":"run-new","seq_id":0,"message_type":"tool_return_message","status":"success","tool_return":"..."}
|
||||
|
||||
# 3) Resume the approval stream's run to continue
|
||||
curl --request GET \
|
||||
--url https://api.letta.com/v1/runs/$RUN_ID/stream \
|
||||
--header 'Accept: text/event-stream' \
|
||||
--data '{
|
||||
"starting_after": 0
|
||||
}'
|
||||
```
|
||||
```python python maxLines=70
|
||||
# 1) Start background stream and capture approval request
|
||||
stream = client.agents.messages.create_stream(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "Do a sensitive operation"}],
|
||||
stream_tokens=True,
|
||||
background=True,
|
||||
)
|
||||
|
||||
approval_request_id = None
|
||||
orig_run_id = None
|
||||
last_seq_id = 0
|
||||
for chunk in stream:
|
||||
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
|
||||
orig_run_id = chunk.run_id
|
||||
last_seq_id = chunk.seq_id
|
||||
if getattr(chunk, "message_type", None) == "approval_request_message":
|
||||
approval_request_id = chunk.id
|
||||
break
|
||||
|
||||
# 2) Approve in background; capture the approval stream cursor (this creates a new run)
|
||||
approve = client.agents.messages.create_stream(
|
||||
agent_id=agent.id,
|
||||
messages=[{"type": "approval", "approve": True, "approval_request_id": approval_request_id}],
|
||||
stream_tokens=True,
|
||||
background=True,
|
||||
)
|
||||
|
||||
run_id = None
|
||||
approve_seq = 0
|
||||
for chunk in approve:
|
||||
if hasattr(chunk, "run_id") and hasattr(chunk, "seq_id"):
|
||||
run_id = chunk.run_id
|
||||
approve_seq = chunk.seq_id
|
||||
if getattr(chunk, "message_type", None) == "tool_return_message":
|
||||
# Tool result arrives here on the approval stream
|
||||
break
|
||||
|
||||
# 3) Resume that run to read follow-up tokens
|
||||
for chunk in client.runs.stream(run_id, starting_after=approve_seq):
|
||||
print(chunk)
|
||||
```
|
||||
```typescript TypeScript maxLines=70
|
||||
// 1) Start background stream and capture approval request
|
||||
const stream = await client.agents.messages.createStream({
|
||||
agentId: agent.id,
|
||||
requestBody: {
|
||||
messages: [{ role: "user", content: "Do a sensitive operation" }],
|
||||
streamTokens: true,
|
||||
background: true,
|
||||
}
|
||||
});
|
||||
|
||||
let approvalRequestId: string | null = null;
|
||||
let origRunId: string | null = null;
|
||||
let lastSeqId = 0;
|
||||
for await (const chunk of stream) {
|
||||
if (chunk.run_id && chunk.seq_id) { origRunId = chunk.run_id; lastSeqId = chunk.seq_id; }
|
||||
if (chunk.message_type === "approval_request_message") {
|
||||
approvalRequestId = chunk.id; break;
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Approve in background; capture the approval stream cursor (this creates a new run)
|
||||
const approve = await client.agents.messages.createStream({
|
||||
agentId: agent.id,
|
||||
requestBody: {
|
||||
messages: [{ type: "approval", approve: true, approvalRequestId }],
|
||||
streamTokens: true,
|
||||
background: true,
|
||||
}
|
||||
});
|
||||
|
||||
let runId: string | null = null;
|
||||
let approveSeq = 0;
|
||||
for await (const chunk of approve) {
|
||||
if (chunk.run_id && chunk.seq_id) { runId = chunk.run_id; approveSeq = chunk.seq_id; }
|
||||
if (chunk.message_type === "tool_return_message") {
|
||||
// Tool result arrives here on the approval stream
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Resume that run to read follow-up tokens
|
||||
const resume = await client.runs.stream(runId!, { startingAfter: approveSeq });
|
||||
for await (const chunk of resume) {
|
||||
console.log(chunk);
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
|
||||
### Discovering and Resuming Active Streams
|
||||
|
||||
When your application starts or recovers from a crash, you can check for any active background streams and resume them. This is particularly useful for:
|
||||
- **Application restarts**: Resume processing after deployments or crashes
|
||||
- **Load balancing**: Pick up streams started by other instances
|
||||
- **Monitoring**: Check progress of long-running operations from different clients
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
# Step 1: Find active background streams for your agents
|
||||
curl --request GET \
|
||||
--url https://api.letta.com/v1/runs/active \
|
||||
--header 'Authorization: Bearer $LETTA_API_KEY' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"agent_ids": [
|
||||
"agent-123",
|
||||
"agent-456"
|
||||
],
|
||||
"background": true
|
||||
}'
|
||||
# Returns: [{"run_id": "run-abc", "agent_id": "agent-123", "status": "processing", ...}]
|
||||
|
||||
# Step 2: Resume streaming from the beginning (or any specified seq_id)
|
||||
curl --request GET \
|
||||
--url https://api.letta.com/v1/runs/$RUN_ID/stream \
|
||||
--header 'Accept: text/event-stream' \
|
||||
--data '{
|
||||
"starting_after": 0, # Start from beginning
|
||||
"batch_size": 1000 # Fetch historical chunks in larger batches
|
||||
}'
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Find and resume active background streams
|
||||
active_runs = client.runs.active(
|
||||
agent_ids=["agent-123", "agent-456"],
|
||||
background=True,
|
||||
)
|
||||
|
||||
if active_runs:
|
||||
# Resume the first active stream from the beginning
|
||||
run = active_runs[0]
|
||||
print(f"Resuming stream for run {run.id}, status: {run.status}")
|
||||
|
||||
stream = client.runs.stream(
|
||||
run_id=run.id,
|
||||
starting_after=0, # Start from beginning
|
||||
batch_size=1000 # Fetch historical chunks in larger batches
|
||||
)
|
||||
|
||||
# Each historical chunk is streamed one at a time, followed by new chunks as they become available
|
||||
for chunk in stream:
|
||||
print(chunk)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Find and resume active background streams
|
||||
const activeRuns = await client.runs.active({
|
||||
agentIds: ["agent-123", "agent-456"],
|
||||
background: true,
|
||||
});
|
||||
|
||||
if (activeRuns.length > 0) {
|
||||
// Resume the first active stream from the beginning
|
||||
const run = activeRuns[0];
|
||||
console.log(`Resuming stream for run ${run.id}, status: ${run.status}`);
|
||||
|
||||
const stream = await client.runs.stream(run.id, {
|
||||
startingAfter: 0, // Start from beginning
|
||||
batchSize: 1000 // Fetch historical chunks in larger batches
|
||||
});
|
||||
|
||||
// Each historical chunk is streamed one at a time, followed by new chunks as they become available
|
||||
for await (const chunk of stream) {
|
||||
console.log(chunk);
|
||||
}
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Option 2: Async Operations with Polling
|
||||
|
||||
<Note>
|
||||
**Best for:** Use cases where you don't need real-time token streaming.
|
||||
</Note>
|
||||
|
||||
This option is ideal for batch processing, scheduled jobs, or any workload that doesn't need real-time updates. The [async SDK method](/api-reference/agents/messages/create-async) queues your request and returns immediately, letting you check results later:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
# Start async operation (returns immediately with run ID)
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/async \
|
||||
--header 'Authorization: Bearer $LETTA_API_KEY' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Run comprehensive analysis on this dataset"
|
||||
}
|
||||
]
|
||||
}'
|
||||
|
||||
# Poll for results using the returned run ID
|
||||
curl --request GET \
|
||||
--url https://api.letta.com/v1/runs/$RUN_ID
|
||||
```
|
||||
```python python maxLines=50
|
||||
# Start async operation (returns immediately with run ID)
|
||||
run = client.agents.messages.create_async(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Run comprehensive analysis on this dataset"
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
# Poll for completion
|
||||
import time
|
||||
while run.status != "completed":
|
||||
time.sleep(2)
|
||||
run = client.runs.retrieve(run_id=run.id)
|
||||
|
||||
# Get the messages once complete
|
||||
messages = client.runs.messages.list(run_id=run.id)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Start async operation (returns immediately with run ID)
|
||||
let run = await client.agents.createAgentMessageAsync({
|
||||
agentId: agentState.id,
|
||||
requestBody: {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "Run comprehensive analysis on this dataset"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
// Poll for completion
|
||||
while (run.status !== "completed") {
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
run = await client.runs.retrieveRun({ runId: run.id });
|
||||
}
|
||||
|
||||
// Get the messages once complete
|
||||
const messages = await client.runs.listRunMessages({ runId: run.id });
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Option 3: Configure Streaming with Keepalive Pings and Longer Timeouts
|
||||
|
||||
<Note>
|
||||
**Best for:** Use cases where you are already using the standard [streaming code](/guides/agents/streaming), but are experiencing issues with timeouts or disconnects (e.g. due to network interruptions or hanging tool executions).
|
||||
|
||||
**Trade-off:** Not as reliable as background mode, and does not support resuming a disconnected stream/request.
|
||||
</Note>
|
||||
|
||||
<Warning>
|
||||
This approach assumes a persistent HTTP connection. We highly recommend using **background mode** (or async polling) for long-running jobs, especially when:
|
||||
- Your infrastructure uses aggressive proxy timeouts
|
||||
- You need to handle network interruptions gracefully
|
||||
- Operations might exceed 10 minutes
|
||||
</Warning>
|
||||
|
||||
This option suits operations under 10 minutes that need real-time updates without the complexity of background processing. Configure keepalive pings and timeouts to maintain stable connections:
|
||||
|
||||
<CodeGroup>
|
||||
```curl curl maxLines=50
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages/stream \
|
||||
--header 'Authorization: Bearer $LETTA_API_KEY' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Execute this long-running analysis"
|
||||
}
|
||||
],
|
||||
"include_pings": true
|
||||
}'
|
||||
```
|
||||
```python python
|
||||
# Configure client with extended timeout
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
client = Letta(
|
||||
token=os.getenv("LETTA_API_KEY")
|
||||
)
|
||||
|
||||
# Enable pings to prevent timeout during long operations
|
||||
stream = client.agents.messages.create_stream(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Execute this long-running analysis"
|
||||
}
|
||||
],
|
||||
include_pings=True, # Sends periodic keepalive messages
|
||||
request_options={"timeout_in_seconds": 600} # 10 min timeout
|
||||
)
|
||||
|
||||
# Process the stream (pings will keep connection alive)
|
||||
for chunk in stream:
|
||||
if chunk.message_type == "ping":
|
||||
# Keepalive ping received, connection is still active
|
||||
continue
|
||||
print(chunk)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// Configure client with extended timeout
|
||||
import { Letta } from '@letta/sdk';
|
||||
|
||||
const client = new Letta({
|
||||
token: process.env.LETTA_API_KEY
|
||||
});
|
||||
|
||||
// Enable pings to prevent timeout during long operations
|
||||
const stream = await client.agents.createAgentMessageStream({
|
||||
agentId: agentState.id,
|
||||
requestBody: {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "Execute this long-running analysis"
|
||||
}
|
||||
],
|
||||
includePings: true // Sends periodic keepalive messages
|
||||
}, {
|
||||
timeoutInSeconds: 600 // 10 minutes timeout in seconds
|
||||
}
|
||||
});
|
||||
|
||||
// Process the stream (pings will keep connection alive)
|
||||
for await (const chunk of stream) {
|
||||
if (chunk.message_type === "ping") {
|
||||
// Keepalive ping received, connection is still active
|
||||
continue;
|
||||
}
|
||||
console.log(chunk);
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Configuration Guidelines
|
||||
|
||||
| Parameter | Purpose | When to Use |
|
||||
|-----------|---------|------------|
|
||||
| Timeout in seconds | Extends request timeout beyond 60s default | Set to 1.5x your expected max duration |
|
||||
| Include pings | Sends keepalive messages every ~30s | Enable for operations with long gaps between outputs |
|
||||
@@ -1,295 +0,0 @@
|
||||
---
|
||||
title: Agent Memory
|
||||
subtitle: How Letta agents manage and evolve their memory
|
||||
slug: guides/agents/memory
|
||||
---
|
||||
|
||||
<Tip>
|
||||
Want to dive deeper? Read our blog posts on [agent memory](https://www.letta.com/blog/agent-memory), [context engineering](https://www.letta.com/blog/guide-to-context-engineering), [memory blocks](https://www.letta.com/blog/memory-blocks), and [RAG vs agent memory](https://www.letta.com/blog/rag-vs-agent-memory).
|
||||
</Tip>
|
||||
|
||||
## What is agent memory?
|
||||
|
||||
**Agent memory in Letta is about managing what information is visible in the agent's context window.**
|
||||
|
||||
Unlike traditional LLMs that are stateless (forgetting everything between interactions), Letta agents maintain persistent, evolving memory by intelligently managing their context window over time.
|
||||
|
||||
The key insight: **the context window is a scarce resource.** You can't fit an entire conversation history or knowledge base into it. Effective memory is about:
|
||||
- **What's in context right now** (immediately visible to the LLM)
|
||||
- **What's been moved to external storage** (retrievable when needed)
|
||||
- **Who decides what stays and what goes** (the agent itself)
|
||||
|
||||
## The LLM Operating System
|
||||
|
||||
Letta is built on the [MemGPT](https://arxiv.org/abs/2310.08560) paper, which introduced the concept of an "LLM Operating System" for memory management. Just like a computer OS manages different types of memory (registers, RAM, disk), Letta agents manage different tiers of information:
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph ContextWindow["⚡ CONTEXT WINDOW (What the LLM sees)"]
|
||||
direction TB
|
||||
System[System Prompt<br/>Kernel context]
|
||||
Blocks[Memory Blocks<br/>Agent-managed context]
|
||||
Messages[Recent Messages<br/>Conversation buffer]
|
||||
end
|
||||
|
||||
subgraph External["💾 EXTERNAL STORAGE (Retrieved on-demand)"]
|
||||
direction TB
|
||||
Recall[Recall Memory<br/>Full conversation history]
|
||||
Archival[Archival Memory<br/>Explicit facts & knowledge]
|
||||
Files[Data Sources<br/>Documents & files]
|
||||
end
|
||||
|
||||
Blocks -->|Agent edits| Blocks
|
||||
Messages -->|Overflow| Recall
|
||||
ContextWindow -.->|Agent searches| External
|
||||
```
|
||||
|
||||
### Memory tiers explained
|
||||
|
||||
| Tier | Size | Speed | Managed By | Purpose |
|
||||
|------|------|-------|------------|---------|
|
||||
| **System Prompt** | ~1-2K tokens | Instant | System | Agent instructions & behavior |
|
||||
| **Memory Blocks** | ~2-4K tokens total | Instant | **Agent** | Self-editing structured memory |
|
||||
| **Message Buffer** | Variable | Instant | System | Recent conversation flow |
|
||||
| **Recall Memory** | Unlimited | 1-2 sec | Agent via search | Past conversation history |
|
||||
| **Archival Memory** | Unlimited | 1-2 sec | Agent via search | Explicit facts & knowledge |
|
||||
| **Data Sources** | Unlimited | 1-2 sec | Agent via search | Uploaded documents |
|
||||
|
||||
## Memory blocks: Units of abstraction
|
||||
|
||||
**Memory blocks are discrete, structured sections of the context window that agents can read and edit.**
|
||||
|
||||
Think of memory blocks as "variables" that persist across interactions:
|
||||
|
||||
```python
|
||||
# Traditional approach: everything is ephemeral
|
||||
messages = [
|
||||
{"role": "user", "content": "I'm Sarah, I like Python"},
|
||||
{"role": "assistant", "content": "Hi Sarah!"},
|
||||
{"role": "user", "content": "What's my name?"}, # Model only "knows" from message history
|
||||
]
|
||||
|
||||
# Letta approach: structured, persistent memory blocks
|
||||
memory_blocks = [
|
||||
{
|
||||
"label": "human",
|
||||
"value": "Name: Sarah\nPreferences: Python programming",
|
||||
"description": "Key details about the user"
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am a helpful coding assistant",
|
||||
"description": "My identity and behavior"
|
||||
}
|
||||
]
|
||||
# Agent can edit these blocks over time as it learns more
|
||||
```
|
||||
|
||||
### Why memory blocks?
|
||||
|
||||
**Memory blocks solve the fundamental challenge of context window management:**
|
||||
|
||||
1. **Consistency**: Same information is visible across all interactions (not dependent on what fits in the message buffer)
|
||||
2. **Editability**: Agents can update their understanding over time (not just accumulate)
|
||||
3. **Structure**: Organized sections instead of unstructured message history
|
||||
4. **Control**: Agents decide what's important enough to persist
|
||||
|
||||
### Default memory blocks
|
||||
|
||||
Letta agents typically start with two memory blocks:
|
||||
|
||||
**Persona Block** - Who the agent is
|
||||
```
|
||||
My name is Sam. I am a friendly, professional assistant who helps users
|
||||
with programming questions. I prefer concise explanations with code examples.
|
||||
```
|
||||
|
||||
**Human Block** - Who the user is
|
||||
```
|
||||
The user's name is Sarah. She is a Python developer working on AI applications.
|
||||
She prefers detailed technical explanations and appreciates best practices.
|
||||
```
|
||||
|
||||
You can add custom blocks for any purpose:
|
||||
- **Project context**: Current task, goals, progress
|
||||
- **Organization info**: Company policies, shared knowledge
|
||||
- **Conversation state**: Multi-step workflow tracking
|
||||
|
||||
## Agentic context engineering
|
||||
|
||||
**The key innovation in Letta: agents manage their own memory using tools.**
|
||||
|
||||
Instead of a fixed context window or simple retrieval, agents actively decide:
|
||||
- What to remember (write to memory blocks)
|
||||
- What to forget (remove outdated information)
|
||||
- What to search for (query external storage)
|
||||
- How to organize knowledge (restructure memory blocks)
|
||||
|
||||
### Memory management tools
|
||||
|
||||
Agents have access to these built-in tools:
|
||||
|
||||
- `memory_insert` - Add new information to a memory block
|
||||
- `memory_replace` - Update or rewrite part of a memory block
|
||||
- `conversation_search` - Search past messages (recall memory)
|
||||
- `archival_memory_insert` - Store facts in long-term storage
|
||||
- `archival_memory_search` - Retrieve facts from long-term storage
|
||||
|
||||
Example of an agent using memory tools:
|
||||
|
||||
```
|
||||
User: "I'm working on a Next.js app now, not Django anymore"
|
||||
|
||||
Agent thinks: "User has shifted tech stacks. I should update my memory."
|
||||
Agent calls: memory_replace(
|
||||
block_label="human",
|
||||
old_text="She is a Python developer working on Django apps",
|
||||
new_text="She is a full-stack developer currently working on Next.js apps"
|
||||
)
|
||||
Agent responds: "Got it! I've updated my notes that you're now working with Next.js."
|
||||
```
|
||||
|
||||
## RAG vs Agent Memory
|
||||
|
||||
**Traditional RAG (Retrieval-Augmented Generation):**
|
||||
- Retrieves semantically similar chunks
|
||||
- One-shot retrieval per interaction
|
||||
- Purely reactive (only searches when prompted)
|
||||
- No persistent understanding
|
||||
|
||||
**Letta Agent Memory:**
|
||||
- Maintains structured, editable memory in context
|
||||
- Multi-step retrieval (can paginate, refine searches)
|
||||
- Proactive management (updates memory as it learns)
|
||||
- Persistent understanding that improves over time
|
||||
|
||||
### When to use what
|
||||
|
||||
Use **memory blocks** for:
|
||||
- Information that should be consistently visible
|
||||
- Knowledge that evolves (user preferences, project state)
|
||||
- Structured context (persona, relationships, goals)
|
||||
|
||||
Use **external memory (RAG-style)** for:
|
||||
- Large corpora of documents
|
||||
- Historical conversation logs
|
||||
- Facts that rarely change
|
||||
- Information that's too large for context
|
||||
|
||||
**Best practice**: Combine both. Memory blocks hold the "executive summary" while external storage holds the full details.
|
||||
|
||||
## Sleep-time agents
|
||||
|
||||
<Info>
|
||||
Sleep-time agents are an advanced feature for memory management. See the [sleep-time agents guide](/guides/agents/sleep-time-agents) for details.
|
||||
</Info>
|
||||
|
||||
Letta supports **sleep-time compute**: background agents that process and optimize memory while the main agent is idle. This enables:
|
||||
|
||||
- **Lower latency**: Main agent doesn't spend time on memory management
|
||||
- **Better memory**: Dedicated agent can do deeper analysis and reorganization
|
||||
- **Consistent memory**: Sleep-time agent maintains memory quality over time
|
||||
|
||||
Think of it like how humans process memories during sleep - consolidating experiences and strengthening important connections.
|
||||
|
||||
## Memory best practices
|
||||
|
||||
### 1. Start with clear, specific memory blocks
|
||||
|
||||
```python
|
||||
# ❌ Vague
|
||||
{"label": "info", "value": "stuff about the user"}
|
||||
|
||||
# ✅ Specific
|
||||
{"label": "user_preferences", "value": "Prefers: Python, VS Code, detailed explanations\nDislikes: Java, Eclipse"}
|
||||
```
|
||||
|
||||
### 2. Write good descriptions
|
||||
|
||||
The `description` field tells the agent **when and how** to use the block:
|
||||
|
||||
```python
|
||||
# ❌ Vague description
|
||||
{
|
||||
"label": "project",
|
||||
"description": "Project info",
|
||||
"value": "Building a chatbot"
|
||||
}
|
||||
|
||||
# ✅ Clear description
|
||||
{
|
||||
"label": "project_context",
|
||||
"description": "Current project goals, status, and blockers. Update as progress is made.",
|
||||
"value": "Building a customer support chatbot. Status: MVP complete. Next: Add knowledge base integration."
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Use read-only blocks for shared knowledge
|
||||
|
||||
```python
|
||||
# Shared organizational knowledge that shouldn't change
|
||||
{
|
||||
"label": "company_policies",
|
||||
"description": "Company policies and guidelines for reference",
|
||||
"value": "Support hours: 9am-5pm PT. Escalation path: ...",
|
||||
"read_only": True # Agent can read but not edit
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Monitor memory block usage
|
||||
|
||||
- Check if blocks are hitting size limits
|
||||
- Review if agents are actually using the blocks effectively
|
||||
- Adjust descriptions if agents misuse blocks
|
||||
|
||||
## Memory in multi-agent systems
|
||||
|
||||
Memory blocks enable powerful multi-agent patterns:
|
||||
|
||||
### Shared memory
|
||||
|
||||
Multiple agents can share the same memory block:
|
||||
|
||||
```python
|
||||
# Create shared organizational knowledge
|
||||
org_block = client.blocks.create(
|
||||
label="organization",
|
||||
value="Mission: Help users build AI agents...",
|
||||
description="Shared organizational context"
|
||||
)
|
||||
|
||||
# Both agents see the same block
|
||||
agent1 = client.agents.create(block_ids=[org_block.id], ...)
|
||||
agent2 = client.agents.create(block_ids=[org_block.id], ...)
|
||||
```
|
||||
|
||||
### Cross-agent memory updates
|
||||
|
||||
Agents can update each other's memory:
|
||||
|
||||
```python
|
||||
# Supervisor agent updates worker agent's context
|
||||
supervisor_tool = """
|
||||
def update_worker_context(new_task_description: str):
|
||||
client.agents.blocks.modify(
|
||||
agent_id=worker_agent_id,
|
||||
block_label="current_task",
|
||||
value=new_task_description
|
||||
)
|
||||
"""
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
- [Memory Blocks API](/guides/agents/memory-blocks) - Creating and managing memory blocks
|
||||
- [Context Engineering](/guides/agents/context-engineering) - Advanced memory management patterns
|
||||
- [Multi-Agent Shared Memory](/guides/agents/multi-agent-memory) - Coordinating memory across agents
|
||||
- [Sleep-Time Agents](/guides/agents/sleep-time-agents) - Background memory processing
|
||||
|
||||
## Further reading
|
||||
|
||||
- [Blog: Agent Memory](https://www.letta.com/blog/agent-memory)
|
||||
- [Blog: Guide to Context Engineering](https://www.letta.com/blog/guide-to-context-engineering)
|
||||
- [Blog: Memory Blocks](https://www.letta.com/blog/memory-blocks)
|
||||
- [Blog: RAG vs Agent Memory](https://www.letta.com/blog/rag-vs-agent-memory)
|
||||
- [MemGPT Research Paper](https://arxiv.org/abs/2310.08560)
|
||||
@@ -1,114 +0,0 @@
|
||||
---
|
||||
title: Agent Memory
|
||||
subtitle: What is agent memory, and how does it work?
|
||||
slug: guides/agents/memory
|
||||
---
|
||||
|
||||
## What is agent memory?
|
||||
|
||||
**Agent memory in Letta is about managing what information is in the agent's context window.**
|
||||
|
||||
The context window is a scarce resource - you can't fit everything into it. Effective memory management is about deciding what stays in context (immediately visible) and what moves to external storage (retrieved when needed).
|
||||
|
||||
Agent memory enables AI agents to maintain persistent state, learn from interactions, and develop long-term relationships with users. Unlike traditional chatbots that treat each conversation as isolated, agents with sophisticated memory systems can build understanding over time.
|
||||
|
||||
## Types of Memory in Letta
|
||||
|
||||
Letta agents have access to multiple memory systems:
|
||||
|
||||
### Core Memory (In-Context)
|
||||
Memory blocks are structured sections of the agent's context window that persist across all interactions. They are always visible - no retrieval needed.
|
||||
|
||||
**Memory blocks are Letta's core abstraction.** You can create blocks with any descriptive label - the agent learns how to use them autonomously. This enables everything from simple user preferences to sophisticated multi-agent coordination.
|
||||
|
||||
[Learn more about memory blocks →](/guides/agents/memory-blocks)
|
||||
|
||||
### External Memory (Out-of-Context)
|
||||
External memory provides unlimited storage for information that doesn't need to be always visible. Agents retrieve from external memory on-demand using search tools.
|
||||
|
||||
Letta provides several built-in external memory systems:
|
||||
- **Conversation search** - Search past messages using full-text and semantic search
|
||||
- **Archival memory** - Agent-managed semantically searchable database for facts and knowledge
|
||||
- **Letta Filesystem** - File management system for documents and data ([learn more](/guides/agents/filesystem))
|
||||
|
||||
Agents can also access any external data source through [MCP servers](/guides/mcp/overview) or [custom tools](/guides/agents/custom-tools) - databases, APIs, vector stores, or third-party services.
|
||||
|
||||
## How Agents Manage Their Memory
|
||||
|
||||
**What makes Letta unique is that agents don't just read from memory - they actively manage it.** Unlike traditional RAG systems that passively retrieve information, Letta agents use built-in tools to decide what to remember, update, and search for.
|
||||
|
||||
When a user mentions they've switched from Python to TypeScript, the agent may choose to update its memory:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
memory_replace(
|
||||
block_label: "human",
|
||||
old_text: "Prefers Python for development",
|
||||
new_text: "Currently using TypeScript for main project"
|
||||
)
|
||||
```
|
||||
```python Python
|
||||
memory_replace(
|
||||
block_label="human",
|
||||
old_text="Prefers Python for development",
|
||||
new_text="Currently using TypeScript for main project"
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
Agents have three primary tools for editing memory blocks:
|
||||
- `memory_replace` - Search and replace for precise edits
|
||||
- `memory_insert` - Insert a line into a block
|
||||
- `memory_rethink` - Rewrite an entire block
|
||||
|
||||
These tools can be attached or detached based on your use case. Not all agents need all tools (for example, some agents may not need `memory_rethink`), and memory tools can be removed entirely from an agent if needed.
|
||||
|
||||
The agent decides what information is important enough to persist in its memory blocks, actively maintaining this information over time. This enables agents to build understanding through conversation rather than just retrieving relevant documents.
|
||||
|
||||
## Memory Blocks vs RAG
|
||||
|
||||
Traditional RAG retrieves semantically similar chunks on-demand. Letta's memory blocks are **persistent, structured context** that agents actively maintain.
|
||||
|
||||
**Use memory blocks for:**
|
||||
- Information that should always be visible (user preferences, agent persona)
|
||||
- Knowledge that evolves over time (project status, learned preferences)
|
||||
|
||||
**Use external memory (RAG-style) for:**
|
||||
- Large document collections
|
||||
- Historical conversation logs
|
||||
- Static reference material
|
||||
|
||||
**Best practice:** Use both together. Memory blocks hold the "executive summary" while external storage holds the full details.
|
||||
|
||||
## Research Background
|
||||
|
||||
Letta is built by the creators of [MemGPT](https://arxiv.org/abs/2310.08560), a research paper that introduced the concept of an "LLM Operating System" for memory management. The base agent design in Letta is a MemGPT-style agent, which inherits core principles of self-editing memory, memory hierarchy, and intelligent context window management.
|
||||
|
||||
## Next steps
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Memory Blocks Guide"
|
||||
href="/guides/agents/memory-blocks"
|
||||
>
|
||||
Learn how to implement and configure memory blocks in your agents
|
||||
</Card>
|
||||
<Card
|
||||
title="Context Engineering"
|
||||
href="/guides/agents/context-engineering"
|
||||
>
|
||||
Optimize memory performance and advanced memory management
|
||||
</Card>
|
||||
<Card
|
||||
title="Shared Memory Patterns"
|
||||
href="/guides/agents/multi-agent-memory"
|
||||
>
|
||||
Use shared memory across multiple agents
|
||||
</Card>
|
||||
<Card
|
||||
title="MemGPT Paper"
|
||||
href="https://arxiv.org/abs/2310.08560"
|
||||
>
|
||||
Read the research behind Letta's memory system
|
||||
</Card>
|
||||
</CardGroup>
|
||||
@@ -1,407 +0,0 @@
|
||||
---
|
||||
title: Memory Blocks
|
||||
subtitle: Understanding the building blocks of agent memory
|
||||
slug: guides/agents/memory-blocks
|
||||
---
|
||||
|
||||
<Info>
|
||||
Interested in learning more about the origin of memory blocks? Read our [blog post](https://www.letta.com/blog/memory-blocks).
|
||||
</Info>
|
||||
|
||||
## What are memory blocks?
|
||||
|
||||
Memory blocks are structured sections of the agent's context window that persist across all interactions. They are always visible - no retrieval needed.
|
||||
|
||||
**Memory blocks are Letta's core abstraction.** Create a block with a descriptive label and the agent learns how to use it. This simple mechanism enables capabilities impossible with traditional context management.
|
||||
|
||||
**Key properties:**
|
||||
- **Agent-managed** - Agents autonomously organize information based on block labels
|
||||
- **Flexible** - Use for any purpose: knowledge, guidelines, state tracking, scratchpad space
|
||||
- **Shareable** - Multiple agents can access the same block; update once, visible everywhere
|
||||
- **Always visible** - Blocks stay in context, never need retrieval
|
||||
|
||||
**Examples:**
|
||||
- Store tool usage guidelines so agents avoid past mistakes
|
||||
- Maintain working memory in a scratchpad block
|
||||
- Mirror external state (user's current document) for real-time awareness
|
||||
- Share read-only policies across all agents from a central source
|
||||
- Coordinate multi-agent systems: parent agents watch subagent result blocks update in real-time
|
||||
- Enable emergent behavior: add `performance_tracking` or `emotional_state` and watch agents start using them
|
||||
|
||||
Memory blocks aren't just storage - they're a coordination primitive that enables sophisticated agent behavior.
|
||||
|
||||
## Memory block structure
|
||||
|
||||
Memory blocks represent a section of an agent's context window. An agent may have multiple memory blocks, or none at all. A memory block consists of:
|
||||
* A `label`, which is a unique identifier for the block
|
||||
* A `description`, which describes the purpose of the block
|
||||
* A `value`, which is the contents/data of the block
|
||||
* A `limit`, which is the size limit (in characters) of the block
|
||||
|
||||
## The importance of the `description` field
|
||||
|
||||
When making memory blocks, it's crucial to provide a good `description` field that accurately describes what the block should be used for.
|
||||
The `description` is the main information used by the agent to determine how to read and write to that block. Without a good description, the agent may not understand how to use the block.
|
||||
|
||||
Because `persona` and `human` are two popular block labels, Letta autogenerates default descriptions for these blocks if you don't provide them. If you provide a description for a memory block labelled `persona` or `human`, the default description will be overridden.
|
||||
|
||||
For `persona`, a good default is:
|
||||
> The persona block: Stores details about your current persona, guiding how you behave and respond. This helps you to maintain consistency and personality in your interactions.
|
||||
|
||||
For `human`, a good default is:
|
||||
> The human block: Stores key details about the person you are conversing with, allowing for more personalized and friend-like conversation.
|
||||
|
||||
## Read-only blocks
|
||||
|
||||
Memory blocks are read-write by default (so the agent can update the block using memory tools), but can be set to read-only by setting the `read_only` field to `true`. When a block is read-only, the agent cannot update the block.
|
||||
|
||||
Read-only blocks are useful when you want to give an agent access to information (for example, a shared memory block about an organization), but you don't want the agent to be able to make potentially destructive changes to the block.
|
||||
|
||||
## Creating an agent with memory blocks
|
||||
|
||||
When you create an agent, you can specify memory blocks to also be created with the agent. For most chat applications, we recommend creating a `human` block (to represent memories about the user) and a `persona` block (to represent the agent's persona).
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=50
|
||||
// install letta-client with `npm install @letta-ai/letta-client`
|
||||
import { LettaClient } from '@letta-ai/letta-client'
|
||||
|
||||
// create a client connected to Letta Cloud
|
||||
const client = new LettaClient({
|
||||
token: process.env.LETTA_API_KEY
|
||||
});
|
||||
|
||||
// create an agent with two basic self-editing memory blocks
|
||||
const agentState = await client.agents.create({
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "human",
|
||||
value: "The human's name is Bob the Builder.",
|
||||
limit: 5000
|
||||
},
|
||||
{
|
||||
label: "persona",
|
||||
value: "My name is Sam, the all-knowing sentient AI.",
|
||||
limit: 5000
|
||||
}
|
||||
],
|
||||
model: "openai/gpt-4o-mini"
|
||||
});
|
||||
```
|
||||
```python title="python" maxLines=50
|
||||
# install letta_client with `pip install letta-client`
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
# create a client connected to Letta Cloud
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# create an agent with two basic self-editing memory blocks
|
||||
agent_state = client.agents.create(
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "human",
|
||||
"value": "The human's name is Bob the Builder.",
|
||||
"limit": 5000
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "My name is Sam, the all-knowing sentient AI.",
|
||||
"limit": 5000
|
||||
}
|
||||
],
|
||||
model="openai/gpt-4o-mini"
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
When the agent is created, the corresponding blocks are also created and attached to the agent, so that the block value will be in the context window.
|
||||
|
||||
## Creating and attaching memory blocks
|
||||
You can also directly create blocks and attach them to an agent. This can be useful if you want to create blocks that are shared between multiple agents. If multiple agents are attached to a block, they will all have the block data in their context windows (essentially providing shared memory).
|
||||
|
||||
Below is an example of creating a block directly, and attaching the block to two agents by specifying the `block_ids` field.
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=50
|
||||
// create a persisted block, which can be attached to agents
|
||||
const block = await client.blocks.create({
|
||||
label: "organization",
|
||||
description: "A block to store information about the organization",
|
||||
value: "Organization: Letta",
|
||||
limit: 4000,
|
||||
});
|
||||
|
||||
// create an agent with both a shared block and its own blocks
|
||||
const sharedBlockAgent1 = await client.agents.create({
|
||||
name: "shared_block_agent1",
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "persona",
|
||||
value: "I am agent 1"
|
||||
},
|
||||
],
|
||||
blockIds: [block.id],
|
||||
model: "openai/gpt-4o-mini"
|
||||
});
|
||||
|
||||
// create another agent with the same shared block
|
||||
const sharedBlockAgent2 = await client.agents.create({
|
||||
name: "shared_block_agent2",
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "persona",
|
||||
value: "I am agent 2"
|
||||
},
|
||||
],
|
||||
blockIds: [block.id],
|
||||
model: "openai/gpt-4o-mini"
|
||||
});
|
||||
```
|
||||
```python title="python" maxLines=50
|
||||
# create a persisted block, which can be attached to agents
|
||||
block = client.blocks.create(
|
||||
label="organization",
|
||||
description="A block to store information about the organization",
|
||||
value="Organization: Letta",
|
||||
limit=4000,
|
||||
)
|
||||
|
||||
# create an agent with both a shared block and its own blocks
|
||||
shared_block_agent1 = client.agents.create(
|
||||
name="shared_block_agent1",
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am agent 1"
|
||||
},
|
||||
],
|
||||
block_ids=[block.id],
|
||||
model="openai/gpt-4o-mini"
|
||||
)
|
||||
|
||||
# create another agent sharing the block
|
||||
shared_block_agent2 = client.agents.create(
|
||||
name="shared_block_agent2",
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am agent 2"
|
||||
},
|
||||
],
|
||||
block_ids=[block.id],
|
||||
model="openai/gpt-4o-mini"
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
You can also attach blocks to existing agents:
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
await client.agents.blocks.attach(agent.id, block.id);
|
||||
```
|
||||
```python Python
|
||||
client.agents.blocks.attach(agent_id=agent.id, block_id=block.id)
|
||||
```
|
||||
</CodeGroup>
|
||||
You can see all agents attached to a block by using the `block_id` field in the [blocks retrieve](/api-reference/blocks/retrieve) endpoint.
|
||||
|
||||
## Managing blocks
|
||||
|
||||
### Retrieving a block
|
||||
You can retrieve the contents of a block by ID. This is useful when blocks store finalized reports, code outputs, or other data you want to extract for use outside the agent.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
const block = await client.blocks.retrieve(block.id);
|
||||
console.log(block.value); // access the block's content
|
||||
```
|
||||
```python Python
|
||||
block = client.blocks.retrieve(block.id)
|
||||
print(block.value) # access the block's content
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Listing blocks
|
||||
You can list all blocks, optionally filtering by label or searching by label text. This is useful for finding blocks across your project.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// list all blocks
|
||||
const blocks = await client.blocks.list();
|
||||
|
||||
// filter by label
|
||||
const humanBlocks = await client.blocks.list({
|
||||
label: "human"
|
||||
});
|
||||
|
||||
// search by label text
|
||||
const searchResults = await client.blocks.list({
|
||||
labelSearch: "organization"
|
||||
});
|
||||
```
|
||||
```python Python
|
||||
# list all blocks
|
||||
blocks = client.blocks.list()
|
||||
|
||||
# filter by label
|
||||
human_blocks = client.blocks.list(label="human")
|
||||
|
||||
# search by label text
|
||||
search_results = client.blocks.list(label_search="organization")
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Modifying a block
|
||||
You can directly modify a block's content, limit, description, or other properties. This is particularly useful for:
|
||||
- External scripts that provide up-to-date information to agents (e.g., syncing a text file to a block)
|
||||
- Updating shared blocks that multiple agents reference
|
||||
- Programmatically managing block content outside of agent interactions
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// update the block's value - completely replaces the content
|
||||
await client.blocks.modify(block.id, {
|
||||
value: "Updated organization information: Letta - Building agentic AI"
|
||||
});
|
||||
|
||||
// update multiple properties
|
||||
await client.blocks.modify(block.id, {
|
||||
value: "New content",
|
||||
limit: 6000,
|
||||
description: "Updated description"
|
||||
});
|
||||
```
|
||||
```python Python
|
||||
# update the block's value - completely replaces the content
|
||||
client.blocks.modify(
|
||||
block.id,
|
||||
value="Updated organization information: Letta - Building agentic AI"
|
||||
)
|
||||
|
||||
# update multiple properties
|
||||
client.blocks.modify(
|
||||
block.id,
|
||||
value="New content",
|
||||
limit=6000,
|
||||
description="Updated description"
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
<Warning>
|
||||
**Setting `value` completely replaces the entire block content** - it is not an append operation. If multiple processes (agents or external scripts) modify the same block concurrently, the last write wins and overwrites all earlier changes. To avoid data loss:
|
||||
- Set blocks to **read-only** if you don't want agents to modify them
|
||||
- Only modify blocks directly in controlled scenarios where overwriting is acceptable
|
||||
- Ensure your application logic accounts for full replacements, not merges
|
||||
</Warning>
|
||||
|
||||
### Deleting a block
|
||||
You can delete a block when it's no longer needed. Note that deleting a block will remove it from all agents that have it attached.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
await client.blocks.delete(block.id);
|
||||
```
|
||||
```python Python
|
||||
client.blocks.delete(block_id=block.id)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Inspecting block usage
|
||||
See which agents have a block attached:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// list all agents that use this block
|
||||
const agentsWithBlock = await client.blocks.agents.list(block.id);
|
||||
console.log(`Used by ${agentsWithBlock.length} agents:`);
|
||||
for (const agent of agentsWithBlock) {
|
||||
console.log(` - ${agent.name}`);
|
||||
}
|
||||
|
||||
// with pagination
|
||||
const agentsPage = await client.blocks.agents.list(block.id, {
|
||||
limit: 10,
|
||||
order: "asc"
|
||||
});
|
||||
```
|
||||
```python Python
|
||||
# list all agents that use this block
|
||||
agents_with_block = client.blocks.agents.list(block_id=block.id)
|
||||
print(f"Used by {len(agents_with_block)} agents:")
|
||||
for agent in agents_with_block:
|
||||
print(f" - {agent.name}")
|
||||
|
||||
# with pagination
|
||||
agents_page = client.blocks.agents.list(
|
||||
block_id=block.id,
|
||||
limit=10,
|
||||
order="asc"
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Agent-scoped block operations
|
||||
|
||||
### Listing an agent's blocks
|
||||
You can retrieve all blocks attached to a specific agent. This shows you the complete memory configuration for that agent.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
const agentBlocks = await client.agents.blocks.list(agent.id);
|
||||
```
|
||||
```python Python
|
||||
agent_blocks = client.agents.blocks.list(agent_id=agent.id)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Retrieving an agent's block by label
|
||||
Instead of using a block ID, you can retrieve a block from a specific agent using its label. This is useful when you want to inspect what the agent currently knows about a specific topic.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// get the agent's current knowledge about the human
|
||||
const humanBlock = await client.agents.blocks.retrieve(
|
||||
agent.id,
|
||||
"human"
|
||||
);
|
||||
console.log(humanBlock.value);
|
||||
```
|
||||
```python Python
|
||||
# get the agent's current knowledge about the human
|
||||
human_block = client.agents.blocks.retrieve(
|
||||
agent_id=agent.id,
|
||||
block_label="human"
|
||||
)
|
||||
print(human_block.value)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Modifying an agent's block
|
||||
You can modify a block through the agent-scoped endpoint using the block's label. This is useful for updating agent-specific memory without needing to know the block ID.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// update the agent's human block
|
||||
await client.agents.blocks.modify(agent.id, "human", {
|
||||
value: "The human's name is Alice. She prefers Python over TypeScript."
|
||||
});
|
||||
```
|
||||
```python Python
|
||||
# update the agent's human block
|
||||
client.agents.blocks.modify(
|
||||
agent_id=agent.id,
|
||||
block_label="human",
|
||||
value="The human's name is Alice. She prefers Python over TypeScript."
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Detaching blocks from agents
|
||||
You can detach a block from an agent's context window. This removes the block from the agent's memory without deleting the block itself.
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
await client.agents.blocks.detach(agent.id, block.id);
|
||||
```
|
||||
```python Python
|
||||
client.agents.blocks.detach(agent_id=agent.id, block_id=block.id)
|
||||
```
|
||||
</CodeGroup>
|
||||
@@ -1,459 +0,0 @@
|
||||
---
|
||||
title: Message Types
|
||||
subtitle: Understanding message types and working with agent message history
|
||||
slug: guides/agents/message-types
|
||||
---
|
||||
|
||||
When you interact with a Letta agent and retrieve its message history using `client.agents.messages.list()`, you'll receive various types of messages that represent different aspects of the agent's execution. This guide explains all message types and how to work with them.
|
||||
|
||||
## Overview
|
||||
|
||||
Letta uses a structured message system where each message has a specific `message_type` field that indicates its purpose. Messages are returned as instances of `LettaMessageUnion`, which is a discriminated union of all possible message types.
|
||||
|
||||
## Message Type Categories
|
||||
|
||||
### User and System Messages
|
||||
|
||||
#### `user_message`
|
||||
Messages sent by the user or system events packaged as user input.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "user_message";
|
||||
content: string | Array<TextContent | ImageContent>;
|
||||
name?: string;
|
||||
otid?: string;
|
||||
sender_id?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Special User Message Subtypes:**
|
||||
User messages can contain JSON with a `type` field indicating special message subtypes:
|
||||
|
||||
- **`login`** - User login events
|
||||
```json
|
||||
{
|
||||
"type": "login",
|
||||
"last_login": "Never (first login)",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
- **`user_message`** - Standard user messages
|
||||
```json
|
||||
{
|
||||
"type": "user_message",
|
||||
"message": "Hello, agent!",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
- **`system_alert`** - System notifications and alerts
|
||||
```json
|
||||
{
|
||||
"type": "system_alert",
|
||||
"message": "System notification text",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
#### `system_message`
|
||||
Messages generated by the system, typically used for internal context.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "system_message";
|
||||
content: string;
|
||||
name?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** System messages are never streamed back in responses; they're only visible when paginating through message history.
|
||||
|
||||
### Agent Reasoning and Responses
|
||||
|
||||
#### `reasoning_message`
|
||||
Represents the agent's internal reasoning or "chain of thought."
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "reasoning_message";
|
||||
reasoning: string;
|
||||
source: "reasoner_model" | "non_reasoner_model";
|
||||
signature?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `reasoning` - The agent's internal thought process
|
||||
- `source` - Whether this was generated by a model with native reasoning (like o1) or via prompting
|
||||
- `signature` - Optional cryptographic signature for reasoning verification (for models that support it)
|
||||
|
||||
#### `hidden_reasoning_message`
|
||||
Represents reasoning that has been hidden from the response.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "hidden_reasoning_message";
|
||||
state: "redacted" | "omitted";
|
||||
hidden_reasoning?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `state: "redacted"` - The provider redacted the reasoning content
|
||||
- `state: "omitted"` - The API chose not to include reasoning (e.g., for o1/o3 models)
|
||||
|
||||
#### `assistant_message`
|
||||
The actual message content sent by the agent.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "assistant_message";
|
||||
content: string | Array<TextContent>;
|
||||
name?: string;
|
||||
}
|
||||
```
|
||||
|
||||
### Tool Execution Messages
|
||||
|
||||
#### `tool_call_message`
|
||||
A request from the agent to execute a tool.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "tool_call_message";
|
||||
tool_call: {
|
||||
name: string;
|
||||
arguments: string; // JSON string
|
||||
tool_call_id: string;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```typescript
|
||||
{
|
||||
message_type: "tool_call_message",
|
||||
tool_call: {
|
||||
name: "archival_memory_search",
|
||||
arguments: '{"query": "user preferences", "page": 0}',
|
||||
tool_call_id: "call_abc123"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `tool_return_message`
|
||||
The result of a tool execution.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "tool_return_message";
|
||||
tool_return: string;
|
||||
status: "success" | "error";
|
||||
tool_call_id: string;
|
||||
stdout?: string[];
|
||||
stderr?: string[];
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `tool_return` - The formatted return value from the tool
|
||||
- `status` - Whether the tool executed successfully
|
||||
- `stdout`/`stderr` - Captured output from the tool execution (useful for debugging)
|
||||
|
||||
### Human-in-the-Loop Messages
|
||||
|
||||
#### `approval_request_message`
|
||||
A request for human approval before executing a tool.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "approval_request_message";
|
||||
tool_call: {
|
||||
name: string;
|
||||
arguments: string;
|
||||
tool_call_id: string;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
See [Human-in-the-Loop](/guides/agents/human-in-the-loop) for more information on this experimental feature.
|
||||
|
||||
#### `approval_response_message`
|
||||
The user's response to an approval request.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "approval_response_message";
|
||||
approve: boolean;
|
||||
approval_request_id: string;
|
||||
reason?: string;
|
||||
}
|
||||
```
|
||||
|
||||
## Working with Messages
|
||||
|
||||
### Listing Messages
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from "@letta-ai/letta-client";
|
||||
|
||||
const client = new LettaClient({
|
||||
baseUrl: "https://api.letta.com",
|
||||
});
|
||||
|
||||
// List recent messages
|
||||
const messages = await client.agents.messages.list("agent-id", {
|
||||
limit: 50,
|
||||
useAssistantMessage: true,
|
||||
});
|
||||
|
||||
// Iterate through message types
|
||||
for (const message of messages) {
|
||||
switch (message.messageType) {
|
||||
case "user_message":
|
||||
console.log("User:", message.content);
|
||||
break;
|
||||
case "assistant_message":
|
||||
console.log("Agent:", message.content);
|
||||
break;
|
||||
case "reasoning_message":
|
||||
console.log("Reasoning:", message.reasoning);
|
||||
break;
|
||||
case "tool_call_message":
|
||||
console.log("Tool call:", message.toolCall.name);
|
||||
break;
|
||||
// ... handle other types
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="https://api.letta.com")
|
||||
|
||||
# List recent messages
|
||||
messages = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=50,
|
||||
use_assistant_message=True
|
||||
)
|
||||
|
||||
# Iterate through message types
|
||||
for message in messages:
|
||||
if message.message_type == "user_message":
|
||||
print(f"User: {message.content}")
|
||||
elif message.message_type == "assistant_message":
|
||||
print(f"Agent: {message.content}")
|
||||
elif message.message_type == "reasoning_message":
|
||||
print(f"Reasoning: {message.reasoning}")
|
||||
elif message.message_type == "tool_call_message":
|
||||
print(f"Tool call: {message.tool_call.name}")
|
||||
# ... handle other types
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Filtering Messages by Type
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// Get only assistant messages (what the agent said to the user)
|
||||
const agentMessages = messages.filter(
|
||||
(msg) => msg.messageType === "assistant_message"
|
||||
);
|
||||
|
||||
// Get all tool-related messages
|
||||
const toolMessages = messages.filter(
|
||||
(msg) => msg.messageType === "tool_call_message" ||
|
||||
msg.messageType === "tool_return_message"
|
||||
);
|
||||
|
||||
// Get conversation history (user + assistant messages only)
|
||||
const conversation = messages.filter(
|
||||
(msg) => msg.messageType === "user_message" ||
|
||||
msg.messageType === "assistant_message"
|
||||
);
|
||||
```
|
||||
|
||||
```python Python
|
||||
# Get only assistant messages (what the agent said to the user)
|
||||
agent_messages = [
|
||||
msg for msg in messages
|
||||
if msg.message_type == "assistant_message"
|
||||
]
|
||||
|
||||
# Get all tool-related messages
|
||||
tool_messages = [
|
||||
msg for msg in messages
|
||||
if msg.message_type in ["tool_call_message", "tool_return_message"]
|
||||
]
|
||||
|
||||
# Get conversation history (user + assistant messages only)
|
||||
conversation = [
|
||||
msg for msg in messages
|
||||
if msg.message_type in ["user_message", "assistant_message"]
|
||||
]
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
|
||||
### Pagination
|
||||
|
||||
Messages support cursor-based pagination:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// Get first page
|
||||
let messages = await client.agents.messages.list("agent-id", {
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Get next page using the last message ID
|
||||
const lastMessageId = messages[messages.length - 1].id;
|
||||
const nextPage = await client.agents.messages.list("agent-id", {
|
||||
limit: 100,
|
||||
before: lastMessageId,
|
||||
});
|
||||
```
|
||||
|
||||
```python Python
|
||||
# Get first page
|
||||
messages = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=100
|
||||
)
|
||||
|
||||
# Get next page using the last message ID
|
||||
last_message_id = messages[-1].id
|
||||
next_page = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=100,
|
||||
before=last_message_id
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Message Metadata Fields
|
||||
|
||||
All message types include these common fields:
|
||||
|
||||
- **`id`** - Unique identifier for the message
|
||||
- **`date`** - ISO 8601 timestamp of when the message was created
|
||||
- **`message_type`** - The discriminator field identifying the message type
|
||||
- **`name`** - Optional name field (varies by message type)
|
||||
- **`otid`** - Offline threading ID for message correlation
|
||||
- **`sender_id`** - The ID of the sender (identity or agent ID)
|
||||
- **`step_id`** - The step ID associated with this message
|
||||
- **`is_err`** - Whether this message is part of an error step (debugging only)
|
||||
- **`seq_id`** - Sequence ID for ordering
|
||||
- **`run_id`** - The run ID associated with this message
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use Type Discriminators
|
||||
|
||||
Always check the `message_type` field to safely access type-specific fields:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
if (message.messageType === "tool_call_message") {
|
||||
// TypeScript now knows message has a toolCall field
|
||||
console.log(message.toolCall.name);
|
||||
}
|
||||
```
|
||||
|
||||
```python Python
|
||||
if message.message_type == "tool_call_message":
|
||||
# Safe to access tool_call
|
||||
print(message.tool_call.name)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### 2. Handle Special User Messages
|
||||
|
||||
When displaying conversations to end users, filter out internal messages:
|
||||
|
||||
```python
|
||||
def is_internal_message(msg):
|
||||
"""Check if a user message is internal (login, system_alert, etc.)"""
|
||||
if msg.message_type != "user_message":
|
||||
return False
|
||||
|
||||
if not isinstance(msg.content, str):
|
||||
return False
|
||||
|
||||
try:
|
||||
parsed = json.loads(msg.content)
|
||||
return parsed.get("type") in ["login", "system_alert"]
|
||||
except:
|
||||
return False
|
||||
|
||||
# Get user-facing messages only
|
||||
display_messages = [
|
||||
msg for msg in messages
|
||||
if not is_internal_message(msg)
|
||||
]
|
||||
```
|
||||
|
||||
### 3. Track Tool Execution
|
||||
|
||||
Match tool calls with their returns using `tool_call_id`:
|
||||
|
||||
```python
|
||||
# Build a map of tool calls to their returns
|
||||
tool_calls = {
|
||||
msg.tool_call.tool_call_id: msg
|
||||
for msg in messages
|
||||
if msg.message_type == "tool_call_message"
|
||||
}
|
||||
|
||||
tool_returns = {
|
||||
msg.tool_call_id: msg
|
||||
for msg in messages
|
||||
if msg.message_type == "tool_return_message"
|
||||
}
|
||||
|
||||
# Find failed tool calls
|
||||
for call_id, call_msg in tool_calls.items():
|
||||
if call_id in tool_returns:
|
||||
return_msg = tool_returns[call_id]
|
||||
if return_msg.status == "error":
|
||||
print(f"Tool {call_msg.tool_call.name} failed:")
|
||||
print(f" {return_msg.tool_return}")
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- [Human-in-the-Loop](/guides/agents/human-in-the-loop) - Using approval messages
|
||||
- [Streaming Responses](/guides/agents/streaming) - Receiving messages in real-time
|
||||
- [API Reference](/api-reference/agents/messages/list) - Full API documentation
|
||||
@@ -1,459 +0,0 @@
|
||||
---
|
||||
title: Message Types
|
||||
subtitle: Understanding message types and working with agent message history
|
||||
slug: guides/agents/message-types
|
||||
---
|
||||
|
||||
When you interact with a Letta agent and retrieve its message history using `client.agents.messages.list()`, you'll receive various types of messages that represent different aspects of the agent's execution. This guide explains all message types and how to work with them.
|
||||
|
||||
## Overview
|
||||
|
||||
Letta uses a structured message system where each message has a specific `message_type` field that indicates its purpose. Messages are returned as instances of `LettaMessageUnion`, which is a discriminated union of all possible message types.
|
||||
|
||||
## Message Type Categories
|
||||
|
||||
### User and System Messages
|
||||
|
||||
#### `user_message`
|
||||
Messages sent by the user or system events packaged as user input.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "user_message";
|
||||
content: string | Array<TextContent | ImageContent>;
|
||||
name?: string;
|
||||
otid?: string;
|
||||
sender_id?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Special User Message Subtypes:**
|
||||
User messages can contain JSON with a `type` field indicating special message subtypes:
|
||||
|
||||
- **`login`** - User login events
|
||||
```json
|
||||
{
|
||||
"type": "login",
|
||||
"last_login": "Never (first login)",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
- **`user_message`** - Standard user messages
|
||||
```json
|
||||
{
|
||||
"type": "user_message",
|
||||
"message": "Hello, agent!",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
- **`system_alert`** - System notifications and alerts
|
||||
```json
|
||||
{
|
||||
"type": "system_alert",
|
||||
"message": "System notification text",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
#### `system_message`
|
||||
Messages generated by the system, typically used for internal context.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "system_message";
|
||||
content: string;
|
||||
name?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** System messages are never streamed back in responses; they're only visible when paginating through message history.
|
||||
|
||||
### Agent Reasoning and Responses
|
||||
|
||||
#### `reasoning_message`
|
||||
Represents the agent's internal reasoning or "chain of thought."
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "reasoning_message";
|
||||
reasoning: string;
|
||||
source: "reasoner_model" | "non_reasoner_model";
|
||||
signature?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `reasoning` - The agent's internal thought process
|
||||
- `source` - Whether this was generated by a model with native reasoning (like o1) or via prompting
|
||||
- `signature` - Optional cryptographic signature for reasoning verification (for models that support it)
|
||||
|
||||
#### `hidden_reasoning_message`
|
||||
Represents reasoning that has been hidden from the response.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "hidden_reasoning_message";
|
||||
state: "redacted" | "omitted";
|
||||
hidden_reasoning?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `state: "redacted"` - The provider redacted the reasoning content
|
||||
- `state: "omitted"` - The API chose not to include reasoning (e.g., for o1/o3 models)
|
||||
|
||||
#### `assistant_message`
|
||||
The actual message content sent by the agent.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "assistant_message";
|
||||
content: string | Array<TextContent>;
|
||||
name?: string;
|
||||
}
|
||||
```
|
||||
|
||||
### Tool Execution Messages
|
||||
|
||||
#### `tool_call_message`
|
||||
A request from the agent to execute a tool.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "tool_call_message";
|
||||
tool_call: {
|
||||
name: string;
|
||||
arguments: string; // JSON string
|
||||
tool_call_id: string;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```typescript
|
||||
{
|
||||
message_type: "tool_call_message",
|
||||
tool_call: {
|
||||
name: "archival_memory_search",
|
||||
arguments: '{"query": "user preferences", "page": 0}',
|
||||
tool_call_id: "call_abc123"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `tool_return_message`
|
||||
The result of a tool execution.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "tool_return_message";
|
||||
tool_return: string;
|
||||
status: "success" | "error";
|
||||
tool_call_id: string;
|
||||
stdout?: string[];
|
||||
stderr?: string[];
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `tool_return` - The formatted return value from the tool
|
||||
- `status` - Whether the tool executed successfully
|
||||
- `stdout`/`stderr` - Captured output from the tool execution (useful for debugging)
|
||||
|
||||
### Human-in-the-Loop Messages
|
||||
|
||||
#### `approval_request_message`
|
||||
A request for human approval before executing a tool.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "approval_request_message";
|
||||
tool_call: {
|
||||
name: string;
|
||||
arguments: string;
|
||||
tool_call_id: string;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
See [Human-in-the-Loop](/guides/agents/human-in-the-loop) for more information on this experimental feature.
|
||||
|
||||
#### `approval_response_message`
|
||||
The user's response to an approval request.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "approval_response_message";
|
||||
approve: boolean;
|
||||
approval_request_id: string;
|
||||
reason?: string;
|
||||
}
|
||||
```
|
||||
|
||||
## Working with Messages
|
||||
|
||||
### Listing Messages
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from "@letta-ai/letta-client";
|
||||
|
||||
const client = new LettaClient({
|
||||
baseUrl: "https://api.letta.com",
|
||||
});
|
||||
|
||||
// List recent messages
|
||||
const messages = await client.agents.messages.list("agent-id", {
|
||||
limit: 50,
|
||||
useAssistantMessage: true,
|
||||
});
|
||||
|
||||
// Iterate through message types
|
||||
for (const message of messages) {
|
||||
switch (message.messageType) {
|
||||
case "user_message":
|
||||
console.log("User:", message.content);
|
||||
break;
|
||||
case "assistant_message":
|
||||
console.log("Agent:", message.content);
|
||||
break;
|
||||
case "reasoning_message":
|
||||
console.log("Reasoning:", message.reasoning);
|
||||
break;
|
||||
case "tool_call_message":
|
||||
console.log("Tool call:", message.toolCall.name);
|
||||
break;
|
||||
// ... handle other types
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="https://api.letta.com")
|
||||
|
||||
# List recent messages
|
||||
messages = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=50,
|
||||
use_assistant_message=True
|
||||
)
|
||||
|
||||
# Iterate through message types
|
||||
for message in messages:
|
||||
if message.message_type == "user_message":
|
||||
print(f"User: {message.content}")
|
||||
elif message.message_type == "assistant_message":
|
||||
print(f"Agent: {message.content}")
|
||||
elif message.message_type == "reasoning_message":
|
||||
print(f"Reasoning: {message.reasoning}")
|
||||
elif message.message_type == "tool_call_message":
|
||||
print(f"Tool call: {message.tool_call.name}")
|
||||
# ... handle other types
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Filtering Messages by Type
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// Get only assistant messages (what the agent said to the user)
|
||||
const agentMessages = messages.filter(
|
||||
(msg) => msg.messageType === "assistant_message"
|
||||
);
|
||||
|
||||
// Get all tool-related messages
|
||||
const toolMessages = messages.filter(
|
||||
(msg) => msg.messageType === "tool_call_message" ||
|
||||
msg.messageType === "tool_return_message"
|
||||
);
|
||||
|
||||
// Get conversation history (user + assistant messages only)
|
||||
const conversation = messages.filter(
|
||||
(msg) => msg.messageType === "user_message" ||
|
||||
msg.messageType === "assistant_message"
|
||||
);
|
||||
```
|
||||
|
||||
```python Python
|
||||
# Get only assistant messages (what the agent said to the user)
|
||||
agent_messages = [
|
||||
msg for msg in messages
|
||||
if msg.message_type == "assistant_message"
|
||||
]
|
||||
|
||||
# Get all tool-related messages
|
||||
tool_messages = [
|
||||
msg for msg in messages
|
||||
if msg.message_type in ["tool_call_message", "tool_return_message"]
|
||||
]
|
||||
|
||||
# Get conversation history (user + assistant messages only)
|
||||
conversation = [
|
||||
msg for msg in messages
|
||||
if msg.message_type in ["user_message", "assistant_message"]
|
||||
]
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
|
||||
### Pagination
|
||||
|
||||
Messages support cursor-based pagination:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// Get first page
|
||||
let messages = await client.agents.messages.list("agent-id", {
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Get next page using the last message ID
|
||||
const lastMessageId = messages[messages.length - 1].id;
|
||||
const nextPage = await client.agents.messages.list("agent-id", {
|
||||
limit: 100,
|
||||
before: lastMessageId,
|
||||
});
|
||||
```
|
||||
|
||||
```python Python
|
||||
# Get first page
|
||||
messages = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=100
|
||||
)
|
||||
|
||||
# Get next page using the last message ID
|
||||
last_message_id = messages[-1].id
|
||||
next_page = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=100,
|
||||
before=last_message_id
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Message Metadata Fields
|
||||
|
||||
All message types include these common fields:
|
||||
|
||||
- **`id`** - Unique identifier for the message
|
||||
- **`date`** - ISO 8601 timestamp of when the message was created
|
||||
- **`message_type`** - The discriminator field identifying the message type
|
||||
- **`name`** - Optional name field (varies by message type)
|
||||
- **`otid`** - Offline threading ID for message correlation
|
||||
- **`sender_id`** - The ID of the sender (identity or agent ID)
|
||||
- **`step_id`** - The step ID associated with this message
|
||||
- **`is_err`** - Whether this message is part of an error step (debugging only)
|
||||
- **`seq_id`** - Sequence ID for ordering
|
||||
- **`run_id`** - The run ID associated with this message
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use Type Discriminators
|
||||
|
||||
Always check the `message_type` field to safely access type-specific fields:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
if (message.messageType === "tool_call_message") {
|
||||
// TypeScript now knows message has a toolCall field
|
||||
console.log(message.toolCall.name);
|
||||
}
|
||||
```
|
||||
|
||||
```python Python
|
||||
if message.message_type == "tool_call_message":
|
||||
# Safe to access tool_call
|
||||
print(message.tool_call.name)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### 2. Handle Special User Messages
|
||||
|
||||
When displaying conversations to end users, filter out internal messages:
|
||||
|
||||
```python
|
||||
def is_internal_message(msg):
|
||||
"""Check if a user message is internal (heartbeat, login, etc.)"""
|
||||
if msg.message_type != "user_message":
|
||||
return False
|
||||
|
||||
if not isinstance(msg.content, str):
|
||||
return False
|
||||
|
||||
try:
|
||||
parsed = json.loads(msg.content)
|
||||
return parsed.get("type") in ["heartbeat", "login", "system_alert"]
|
||||
except:
|
||||
return False
|
||||
|
||||
# Get user-facing messages only
|
||||
display_messages = [
|
||||
msg for msg in messages
|
||||
if not is_internal_message(msg)
|
||||
]
|
||||
```
|
||||
|
||||
### 3. Track Tool Execution
|
||||
|
||||
Match tool calls with their returns using `tool_call_id`:
|
||||
|
||||
```python
|
||||
# Build a map of tool calls to their returns
|
||||
tool_calls = {
|
||||
msg.tool_call.tool_call_id: msg
|
||||
for msg in messages
|
||||
if msg.message_type == "tool_call_message"
|
||||
}
|
||||
|
||||
tool_returns = {
|
||||
msg.tool_call_id: msg
|
||||
for msg in messages
|
||||
if msg.message_type == "tool_return_message"
|
||||
}
|
||||
|
||||
# Find failed tool calls
|
||||
for call_id, call_msg in tool_calls.items():
|
||||
if call_id in tool_returns:
|
||||
return_msg = tool_returns[call_id]
|
||||
if return_msg.status == "error":
|
||||
print(f"Tool {call_msg.tool_call.name} failed:")
|
||||
print(f" {return_msg.tool_return}")
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- [Human-in-the-Loop](/guides/agents/human_in_the_loop) - Using approval messages
|
||||
- [Streaming Responses](/guides/agents/streaming) - Receiving messages in real-time
|
||||
- [API Reference](/api-reference/agents/messages/list) - Full API documentation
|
||||
@@ -1,459 +0,0 @@
|
||||
---
|
||||
title: Message Types
|
||||
subtitle: Understanding message types and working with agent message history
|
||||
slug: guides/agents/message-types
|
||||
---
|
||||
|
||||
When you interact with a Letta agent and retrieve its message history using `client.agents.messages.list()`, you'll receive various types of messages that represent different aspects of the agent's execution. This guide explains all message types and how to work with them.
|
||||
|
||||
## Overview
|
||||
|
||||
Letta uses a structured message system where each message has a specific `message_type` field that indicates its purpose. Messages are returned as instances of `LettaMessageUnion`, which is a discriminated union of all possible message types.
|
||||
|
||||
## Message Type Categories
|
||||
|
||||
### User and System Messages
|
||||
|
||||
#### `user_message`
|
||||
Messages sent by the user or system events packaged as user input.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "user_message";
|
||||
content: string | Array<TextContent | ImageContent>;
|
||||
name?: string;
|
||||
otid?: string;
|
||||
sender_id?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Special User Message Subtypes:**
|
||||
User messages can contain JSON with a `type` field indicating special message subtypes:
|
||||
|
||||
- **`login`** - User login events
|
||||
```json
|
||||
{
|
||||
"type": "login",
|
||||
"last_login": "Never (first login)",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
- **`user_message`** - Standard user messages
|
||||
```json
|
||||
{
|
||||
"type": "user_message",
|
||||
"message": "Hello, agent!",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
- **`system_alert`** - System notifications and alerts
|
||||
```json
|
||||
{
|
||||
"type": "system_alert",
|
||||
"message": "System notification text",
|
||||
"time": "2025-10-03 12:34:56 PM PDT-0700"
|
||||
}
|
||||
```
|
||||
|
||||
#### `system_message`
|
||||
Messages generated by the system, typically used for internal context.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "system_message";
|
||||
content: string;
|
||||
name?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** System messages are never streamed back in responses; they're only visible when paginating through message history.
|
||||
|
||||
### Agent Reasoning and Responses
|
||||
|
||||
#### `reasoning_message`
|
||||
Represents the agent's internal reasoning or "chain of thought."
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "reasoning_message";
|
||||
reasoning: string;
|
||||
source: "reasoner_model" | "non_reasoner_model";
|
||||
signature?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `reasoning` - The agent's internal thought process
|
||||
- `source` - Whether this was generated by a model with native reasoning (like o1) or via prompting
|
||||
- `signature` - Optional cryptographic signature for reasoning verification (for models that support it)
|
||||
|
||||
#### `hidden_reasoning_message`
|
||||
Represents reasoning that has been hidden from the response.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "hidden_reasoning_message";
|
||||
state: "redacted" | "omitted";
|
||||
hidden_reasoning?: string;
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `state: "redacted"` - The provider redacted the reasoning content
|
||||
- `state: "omitted"` - The API chose not to include reasoning (e.g., for o1/o3 models)
|
||||
|
||||
#### `assistant_message`
|
||||
The actual message content sent by the agent.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "assistant_message";
|
||||
content: string | Array<TextContent>;
|
||||
name?: string;
|
||||
}
|
||||
```
|
||||
|
||||
### Tool Execution Messages
|
||||
|
||||
#### `tool_call_message`
|
||||
A request from the agent to execute a tool.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "tool_call_message";
|
||||
tool_call: {
|
||||
name: string;
|
||||
arguments: string; // JSON string
|
||||
tool_call_id: string;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```typescript
|
||||
{
|
||||
message_type: "tool_call_message",
|
||||
tool_call: {
|
||||
name: "archival_memory_search",
|
||||
arguments: '{"query": "user preferences", "page": 0}',
|
||||
tool_call_id: "call_abc123"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `tool_return_message`
|
||||
The result of a tool execution.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "tool_return_message";
|
||||
tool_return: string;
|
||||
status: "success" | "error";
|
||||
tool_call_id: string;
|
||||
stdout?: string[];
|
||||
stderr?: string[];
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `tool_return` - The formatted return value from the tool
|
||||
- `status` - Whether the tool executed successfully
|
||||
- `stdout`/`stderr` - Captured output from the tool execution (useful for debugging)
|
||||
|
||||
### Human-in-the-Loop Messages
|
||||
|
||||
#### `approval_request_message`
|
||||
A request for human approval before executing a tool.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "approval_request_message";
|
||||
tool_call: {
|
||||
name: string;
|
||||
arguments: string;
|
||||
tool_call_id: string;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
See [Human-in-the-Loop](/guides/agents/human_in_the_loop) for more information on this experimental feature.
|
||||
|
||||
#### `approval_response_message`
|
||||
The user's response to an approval request.
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
{
|
||||
id: string;
|
||||
date: datetime;
|
||||
message_type: "approval_response_message";
|
||||
approve: boolean;
|
||||
approval_request_id: string;
|
||||
reason?: string;
|
||||
}
|
||||
```
|
||||
|
||||
## Working with Messages
|
||||
|
||||
### Listing Messages
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from "@letta-ai/letta-client";
|
||||
|
||||
const client = new LettaClient({
|
||||
baseUrl: "https://api.letta.com",
|
||||
});
|
||||
|
||||
// List recent messages
|
||||
const messages = await client.agents.messages.list("agent-id", {
|
||||
limit: 50,
|
||||
useAssistantMessage: true,
|
||||
});
|
||||
|
||||
// Iterate through message types
|
||||
for (const message of messages) {
|
||||
switch (message.messageType) {
|
||||
case "user_message":
|
||||
console.log("User:", message.content);
|
||||
break;
|
||||
case "assistant_message":
|
||||
console.log("Agent:", message.content);
|
||||
break;
|
||||
case "reasoning_message":
|
||||
console.log("Reasoning:", message.reasoning);
|
||||
break;
|
||||
case "tool_call_message":
|
||||
console.log("Tool call:", message.toolCall.name);
|
||||
break;
|
||||
// ... handle other types
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(base_url="https://api.letta.com")
|
||||
|
||||
# List recent messages
|
||||
messages = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=50,
|
||||
use_assistant_message=True
|
||||
)
|
||||
|
||||
# Iterate through message types
|
||||
for message in messages:
|
||||
if message.message_type == "user_message":
|
||||
print(f"User: {message.content}")
|
||||
elif message.message_type == "assistant_message":
|
||||
print(f"Agent: {message.content}")
|
||||
elif message.message_type == "reasoning_message":
|
||||
print(f"Reasoning: {message.reasoning}")
|
||||
elif message.message_type == "tool_call_message":
|
||||
print(f"Tool call: {message.tool_call.name}")
|
||||
# ... handle other types
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Filtering Messages by Type
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// Get only assistant messages (what the agent said to the user)
|
||||
const agentMessages = messages.filter(
|
||||
(msg) => msg.messageType === "assistant_message"
|
||||
);
|
||||
|
||||
// Get all tool-related messages
|
||||
const toolMessages = messages.filter(
|
||||
(msg) => msg.messageType === "tool_call_message" ||
|
||||
msg.messageType === "tool_return_message"
|
||||
);
|
||||
|
||||
// Get conversation history (user + assistant messages only)
|
||||
const conversation = messages.filter(
|
||||
(msg) => msg.messageType === "user_message" ||
|
||||
msg.messageType === "assistant_message"
|
||||
);
|
||||
```
|
||||
|
||||
```python Python
|
||||
# Get only assistant messages (what the agent said to the user)
|
||||
agent_messages = [
|
||||
msg for msg in messages
|
||||
if msg.message_type == "assistant_message"
|
||||
]
|
||||
|
||||
# Get all tool-related messages
|
||||
tool_messages = [
|
||||
msg for msg in messages
|
||||
if msg.message_type in ["tool_call_message", "tool_return_message"]
|
||||
]
|
||||
|
||||
# Get conversation history (user + assistant messages only)
|
||||
conversation = [
|
||||
msg for msg in messages
|
||||
if msg.message_type in ["user_message", "assistant_message"]
|
||||
]
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
|
||||
### Pagination
|
||||
|
||||
Messages support cursor-based pagination:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// Get first page
|
||||
let messages = await client.agents.messages.list("agent-id", {
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Get next page using the last message ID
|
||||
const lastMessageId = messages[messages.length - 1].id;
|
||||
const nextPage = await client.agents.messages.list("agent-id", {
|
||||
limit: 100,
|
||||
before: lastMessageId,
|
||||
});
|
||||
```
|
||||
|
||||
```python Python
|
||||
# Get first page
|
||||
messages = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=100
|
||||
)
|
||||
|
||||
# Get next page using the last message ID
|
||||
last_message_id = messages[-1].id
|
||||
next_page = client.agents.messages.list(
|
||||
agent_id="agent-id",
|
||||
limit=100,
|
||||
before=last_message_id
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Message Metadata Fields
|
||||
|
||||
All message types include these common fields:
|
||||
|
||||
- **`id`** - Unique identifier for the message
|
||||
- **`date`** - ISO 8601 timestamp of when the message was created
|
||||
- **`message_type`** - The discriminator field identifying the message type
|
||||
- **`name`** - Optional name field (varies by message type)
|
||||
- **`otid`** - Offline threading ID for message correlation
|
||||
- **`sender_id`** - The ID of the sender (identity or agent ID)
|
||||
- **`step_id`** - The step ID associated with this message
|
||||
- **`is_err`** - Whether this message is part of an error step (debugging only)
|
||||
- **`seq_id`** - Sequence ID for ordering
|
||||
- **`run_id`** - The run ID associated with this message
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use Type Discriminators
|
||||
|
||||
Always check the `message_type` field to safely access type-specific fields:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
if (message.messageType === "tool_call_message") {
|
||||
// TypeScript now knows message has a toolCall field
|
||||
console.log(message.toolCall.name);
|
||||
}
|
||||
```
|
||||
|
||||
```python Python
|
||||
if message.message_type == "tool_call_message":
|
||||
# Safe to access tool_call
|
||||
print(message.tool_call.name)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### 2. Handle Special User Messages
|
||||
|
||||
When displaying conversations to end users, filter out internal messages:
|
||||
|
||||
```python
|
||||
def is_internal_message(msg):
|
||||
"""Check if a user message is internal (heartbeat, login, etc.)"""
|
||||
if msg.message_type != "user_message":
|
||||
return False
|
||||
|
||||
if not isinstance(msg.content, str):
|
||||
return False
|
||||
|
||||
try:
|
||||
parsed = json.loads(msg.content)
|
||||
return parsed.get("type") in ["login", "system_alert"]
|
||||
except:
|
||||
return False
|
||||
|
||||
# Get user-facing messages only
|
||||
display_messages = [
|
||||
msg for msg in messages
|
||||
if not is_internal_message(msg)
|
||||
]
|
||||
```
|
||||
|
||||
### 3. Track Tool Execution
|
||||
|
||||
Match tool calls with their returns using `tool_call_id`:
|
||||
|
||||
```python
|
||||
# Build a map of tool calls to their returns
|
||||
tool_calls = {
|
||||
msg.tool_call.tool_call_id: msg
|
||||
for msg in messages
|
||||
if msg.message_type == "tool_call_message"
|
||||
}
|
||||
|
||||
tool_returns = {
|
||||
msg.tool_call_id: msg
|
||||
for msg in messages
|
||||
if msg.message_type == "tool_return_message"
|
||||
}
|
||||
|
||||
# Find failed tool calls
|
||||
for call_id, call_msg in tool_calls.items():
|
||||
if call_id in tool_returns:
|
||||
return_msg = tool_returns[call_id]
|
||||
if return_msg.status == "error":
|
||||
print(f"Tool {call_msg.tool_call.name} failed:")
|
||||
print(f" {return_msg.tool_return}")
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- [Human-in-the-Loop](/guides/agents/human_in_the_loop) - Using approval messages
|
||||
- [Streaming Responses](/guides/agents/streaming) - Receiving messages in real-time
|
||||
- [API Reference](/api-reference/agents/messages/list) - Full API documentation
|
||||
@@ -1,120 +0,0 @@
|
||||
---
|
||||
title: Multi-Agent Systems
|
||||
slug: guides/agents/multi-agent
|
||||
---
|
||||
|
||||
<Tip>
|
||||
All agents in Letta are *stateful* - so when you build a multi-agent system in Letta, each agent can run both independently and with others via cross-agent messaging tools! The choice is yours.
|
||||
</Tip>
|
||||
|
||||
Letta provides built-in tools for supporting cross-agent communication to build multi-agent systems.
|
||||
To enable multi-agent collaboration, you should create agents that have access to the [built-in cross-agent communication tools](#built-in-multi-agent-tools) - either by attaching the tools in the ADE, or via the API or Python/TypeScript SDK.
|
||||
|
||||
Letta agents can also share state via [shared memory blocks](/guides/agents/multi-agent-shared-memory). Shared memory blocks allow agents to have shared memory (e.g. memory about an organization they are both a part of or a task they are both working on).
|
||||
|
||||
## Built-in Multi-Agent Tools
|
||||
<Tip>
|
||||
We recommend only attaching one of `send_message_to_agent_and_wait_for_reply` or `send_message_to_agent_async`, but not both.
|
||||
Attaching both tools can cause the agent to become confused and use the tool less reliably.
|
||||
</Tip>
|
||||
|
||||
Our built-in tools for multi-agent communication can be used to create both **synchronous** and **asynchronous** communication networks between agents on your Letta server.
|
||||
However, because all agents in Letta are addressable via a REST API, you can also make your own custom tools that use the [API for messaging agents](/api-reference/agents/messages/create) to design your own version of agent-to-agent communication.
|
||||
|
||||
There are three built-in tools for cross-agent communication:
|
||||
* `send_message_to_agent_async` for asynchronous multi-agent messaging,
|
||||
* `send_message_to_agent_and_wait_for_reply` for synchronous multi-agent messaging,
|
||||
* and `send_message_to_agents_matching_all_tags` for a "supervisor-worker" pattern
|
||||
|
||||
### Messaging another agent (async / no wait)
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// The function signature for the async multi-agent messaging tool
|
||||
function sendMessageToAgentAsync(
|
||||
message: string,
|
||||
otherAgentId: string
|
||||
): string
|
||||
```
|
||||
```python Python
|
||||
# The function signature for the async multi-agent messaging tool
|
||||
def send_message_to_agent_async(
|
||||
message: str,
|
||||
other_agent_id: str,
|
||||
) -> str:
|
||||
```
|
||||
</CodeGroup>
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
Agent 1->>Agent 2: "Hi Agent 2 are you there?"
|
||||
Agent 2-->>Agent 1: "Your message has been delivered."
|
||||
Note over Agent 2: Processes message: "New message from Agent 1: ..."
|
||||
Agent 2->>Agent 1: "Hi Agent 1, yes I'm here!"
|
||||
Agent 1-->>Agent 2: "Your message has been delivered."
|
||||
```
|
||||
|
||||
The `send_message_to_agent_async` tool allows one agent to send a message to another agent.
|
||||
This tool is **asynchronous**: instead of waiting for a response from the target agent, the agent will return immediately after sending the message.
|
||||
The message that is sent to the target agent contains a "message receipt", indicating which agent sent the message, which allows the target agent to reply to the sender (assuming they also have access to the `send_message_to_agent_async` tool).
|
||||
|
||||
### Messaging another agent (wait for reply)
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// The function signature for the synchronous multi-agent messaging tool
|
||||
function sendMessageToAgentAndWaitForReply(
|
||||
message: string,
|
||||
otherAgentId: string
|
||||
): string
|
||||
```
|
||||
```python Python
|
||||
# The function signature for the synchronous multi-agent messaging tool
|
||||
def send_message_to_agent_and_wait_for_reply(
|
||||
message: str,
|
||||
other_agent_id: str,
|
||||
) -> str:
|
||||
```
|
||||
</CodeGroup>
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
Agent 1->>Agent 2: "Hi Agent 2 are you there?"
|
||||
Note over Agent 2: Processes message: "New message from Agent 1: ..."
|
||||
Agent 2->>Agent 1: "Hi Agent 1, yes I'm here!"
|
||||
```
|
||||
|
||||
The `send_message_to_agent_and_wait_for_reply` tool also allows one agent to send a message to another agent.
|
||||
However, this tool is **synchronous**: the agent will wait for a response from the target agent before returning.
|
||||
The response of the target agent is returned in the tool output - if the target agent does not respond, the tool will return a default message indicating that no response was received.
|
||||
|
||||
### Messaging a group of agents (supervisor-worker pattern)
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
// The function signature for the group broadcast multi-agent messaging tool
|
||||
function sendMessageToAgentsMatchingAllTags(
|
||||
message: string,
|
||||
tags: string[]
|
||||
): string[]
|
||||
```
|
||||
```python Python
|
||||
# The function signature for the group broadcast multi-agent messaging tool
|
||||
def send_message_to_agents_matching_all_tags(
|
||||
message: str,
|
||||
tags: List[str],
|
||||
) -> List[str]:
|
||||
```
|
||||
</CodeGroup>
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
Supervisor->>Worker 1: "Let's start the task"
|
||||
Supervisor->>Worker 2: "Let's start the task"
|
||||
Supervisor->>Worker 3: "Let's start the task"
|
||||
Note over Worker 1,Worker 3: All workers process their tasks
|
||||
Worker 1->>Supervisor: "Here's my result!"
|
||||
Worker 2->>Supervisor: "This is what I have"
|
||||
Worker 3->>Supervisor: "I didn't do anything..."
|
||||
```
|
||||
|
||||
The `send_message_to_agents_matching_all_tags` tool allows one agent to send a message to a larger group of agents in a "supervisor-worker" pattern.
|
||||
For example, a supervisor agent can use this tool to send a message asking all workers in a group to begin a task.
|
||||
This tool is also **synchronous**, so the result of the tool call will be a list of the responses from each agent in the group.
|
||||
@@ -1,163 +0,0 @@
|
||||
---
|
||||
title: "Multi-modal (image inputs)"
|
||||
subtitle: "Send images to your agents"
|
||||
slug: "multimodal"
|
||||
---
|
||||
|
||||
<Note>
|
||||
Multi-modal features require compatible language models. Ensure your agent is configured with a multi-modal capable model.
|
||||
</Note>
|
||||
|
||||
Letta agents support image inputs, enabling richer conversations and more powerful agent capabilities.
|
||||
|
||||
## Model Support
|
||||
|
||||
Multi-modal capabilities depend on the underlying language model.
|
||||
You can check which models from the API providers support image inputs by checking their individual model pages:
|
||||
|
||||
- **[OpenAI](https://platform.openai.com/docs/models)**: GPT-4.1, o1/3/4, GPT-4o
|
||||
- **[Anthropic](https://docs.anthropic.com/en/docs/about-claude/models/overview)**: Claude Opus 4, Claude Sonnet 4
|
||||
- **[Gemini](https://ai.google.dev/gemini-api/docs/models)**: Gemini 2.5 Pro, Gemini 2.5 Flash
|
||||
|
||||
If the provider you're using doesn't support image inputs, your images will still appear in the context window, but as a text message telling the agent that an image exists.
|
||||
|
||||
## ADE Support
|
||||
|
||||
You can pass images to your agents by dragging and dropping them into the chat window, or by clicking the image icon to manually select a file to upload.
|
||||
|
||||
<img className="light" src="/images/ade-mm.png" />
|
||||
<img className="dark" src="/images/ade-mm-dark.png" />
|
||||
|
||||
## Usage Examples (SDK)
|
||||
|
||||
### Sending an Image via URL
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: "Describe this image."
|
||||
},
|
||||
{
|
||||
type: "image",
|
||||
source: {
|
||||
type: "url",
|
||||
url: "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
);
|
||||
```
|
||||
```python title="python" maxLines=100
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image."
|
||||
},
|
||||
{
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "url",
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Sending an Image via Base64
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=100
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
const imageUrl = "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg";
|
||||
const imageResponse = await fetch(imageUrl);
|
||||
const imageBuffer = await imageResponse.arrayBuffer();
|
||||
const imageData = Buffer.from(imageBuffer).toString('base64');
|
||||
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: "Describe this image."
|
||||
},
|
||||
{
|
||||
type: "image",
|
||||
source: {
|
||||
type: "base64",
|
||||
mediaType: "image/jpeg",
|
||||
data: imageData,
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
);
|
||||
```
|
||||
```python title="python" maxLines=100
|
||||
import base64
|
||||
import httpx
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"
|
||||
image_data = base64.standard_b64encode(httpx.get(image_url).content).decode("utf-8")
|
||||
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image."
|
||||
},
|
||||
{
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": "image/jpeg",
|
||||
"data": image_data,
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
@@ -1,177 +0,0 @@
|
||||
---
|
||||
title: User Identities
|
||||
slug: guides/agents/multi-user
|
||||
---
|
||||
|
||||
You may be building a multi-user application with Letta, in which each user is associated with a specific agent.
|
||||
In this scenario, you can use **Identities** to associate each agent with a user in your application.
|
||||
|
||||
## Using Identities
|
||||
Let's assume that you have an application with multiple users that you're building on a [self-hosted Letta server](/guides/server/docker) or [Letta Cloud](/guides/cloud).
|
||||
Each user has a unique username, starting at `user_1`, and incrementing up as you add more users to the platform.
|
||||
|
||||
To associate agents you create in Letta with your users, you can first create an **Identity** object with the user's unique ID as the `identifier_key` for your user, and then specify the **Identity** object ID when creating an agent.
|
||||
|
||||
For example, with `user_1`, we would create a new Identity object with `identifier_key="user_1"` and then pass `identity.id` into our [create agent request](/api-reference/agents/create):
|
||||
<CodeBlocks>
|
||||
```curl title="curl"
|
||||
curl -X POST https://app.letta.com/v1/identities/ \
|
||||
-H "Authorization: Bearer <token>" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"identifier_key": "user_1",
|
||||
"name": "Caren",
|
||||
"identity_type": "user"
|
||||
}'
|
||||
{"id":"identity-634d3994-5d6c-46e9-b56b-56e34fe34ca0","identifier_key":"user_1","name":"Caren","identity_type":"user","project_id":null,"agent_ids":[],"organization_id":"org-00000000-0000-4000-8000-000000000000","properties":[]}
|
||||
curl -X POST https://app.letta.com/v1/agents/ \
|
||||
-H "Authorization: Bearer <token>" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"memory_blocks": [],
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"context_window_limit": 200000,
|
||||
"embedding": "openai/text-embedding-3-small",
|
||||
"identity_ids": ["identity-634d3994-5d6c-46e9-b56b-56e34fe34ca0"]
|
||||
}'
|
||||
```
|
||||
```python title="python"
|
||||
# assumes that you already instantiated a client
|
||||
identity = client.identities.create(
|
||||
identifier_key="user_1",
|
||||
name="Caren",
|
||||
identity_type="user"
|
||||
)
|
||||
agent = client.agents.create(
|
||||
memory_blocks=[],
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
context_window_limit=200000,
|
||||
identity_ids=[identity.id]
|
||||
)
|
||||
```
|
||||
|
||||
```typescript TypeScript
|
||||
// assumes that you already instantiated a client
|
||||
const identity = await client.identities.create({
|
||||
identifierKey: "user_1",
|
||||
name: "Caren",
|
||||
identityType: "user"
|
||||
})
|
||||
const agent = await client.agents.create({
|
||||
memoryBlocks: [],
|
||||
model: "anthropic/claude-3-5-sonnet-20241022",
|
||||
contextWindowLimit: 200000,
|
||||
identityIds: [identity.id]
|
||||
});
|
||||
```
|
||||
</CodeBlocks>
|
||||
|
||||
Then, if you want to search for agents associated with a specific user (e.g. `user_1`), you can use the `identifier_keys` parameter in the [list agents request](/api-reference/agents/list):
|
||||
<CodeBlocks>
|
||||
```curl title="curl"
|
||||
curl -X GET "https://app.letta.com/v1/agents/?identifier_keys=user_1" \
|
||||
-H "Accept: application/json"
|
||||
```
|
||||
```python title="python"
|
||||
# assumes that you already instantiated a client
|
||||
user_agents = client.agents.list(
|
||||
identifier_keys=["user_1"]
|
||||
)
|
||||
```
|
||||
```typescript TypeScript
|
||||
// assumes that you already instantiated a client
|
||||
await client.agents.list({
|
||||
identifierKeys: ["user_1"]
|
||||
});
|
||||
```
|
||||
</CodeBlocks>
|
||||
|
||||
You can also create an identity object and attach it to an existing agent. This can be useful if you want to enable multiple users to interact with a single agent:
|
||||
<CodeBlocks>
|
||||
```curl title="curl"
|
||||
curl -X POST https://app.letta.com/v1/identities/ \
|
||||
-H "Authorization: Bearer <token>" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"identifier_key": "user_1",
|
||||
"name": "Sarah",
|
||||
"identity_type": "user",
|
||||
"agent_ids": ["agent-00000000-0000-4000-8000-000000000000"]
|
||||
}'
|
||||
```
|
||||
```python title="python"
|
||||
# assumes that you already instantiated a client
|
||||
identity = client.identities.create(
|
||||
identifier_key="user_1",
|
||||
name="Sarah",
|
||||
identity_type="user",
|
||||
agent_ids=["agent-00000000-0000-4000-8000-000000000000"]
|
||||
)
|
||||
```
|
||||
```typescript TypeScript
|
||||
// assumes that you already instantiated a client
|
||||
const identity = await client.identities.create({
|
||||
identifierKey: "user_1",
|
||||
name: "Sarah",
|
||||
identityType: "user",
|
||||
agentIds: ["agent-00000000-0000-4000-8000-000000000000"]
|
||||
})
|
||||
```
|
||||
</CodeBlocks>
|
||||
|
||||
### Using Agent Tags to Identify Users
|
||||
It's also possible to utilize our agent tags feature to associate agents with specific users. To associate agents you create in Letta with your users, you can specify a tag when creating an agent, and set the tag to the user's unique ID.
|
||||
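This example connects to Letta Cloud using the `LETTA_API_KEY` environment variable. If you have a self-hosted Letta server running on localhost instead (for example, by running [`docker run ...`](/guides/server/docker)), point the client at your server's base URL.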
|
||||
<Accordion title="View example SDK code">
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
// Connect to Letta Cloud
|
||||
const client = new LettaClient({token: process.env.LETTA_API_KEY});
|
||||
const userId = "my_uuid";
|
||||
|
||||
// create an agent with the userId tag
|
||||
const agent = await client.agents.create({
|
||||
memoryBlocks: [],
|
||||
model: "anthropic/claude-3-5-sonnet-20241022",
|
||||
contextWindowLimit: 200000,
|
||||
tags: [userId]
|
||||
});
|
||||
console.log(`Created agent with id ${agent.id}, tags ${agent.tags}`);
|
||||
|
||||
// list agents
|
||||
const userAgents = await client.agents.list({tags: [userId]});
|
||||
const agentIds = userAgents.map(agent => agent.id);
|
||||
console.log(`Found matching agents ${agentIds}`);
|
||||
```
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
# Connect to Letta Cloud
|
||||
import os
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
user_id = "my_uuid"
|
||||
|
||||
# create an agent with the user_id tag
|
||||
agent = client.agents.create(
|
||||
memory_blocks=[],
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
context_window_limit=200000,
|
||||
tags=[user_id]
|
||||
)
|
||||
print(f"Created agent with id {agent.id}, tags {agent.tags}")
|
||||
|
||||
# list agents
|
||||
user_agents = client.agents.list(tags=[user_id])
|
||||
agent_ids = [agent.id for agent in user_agents]
|
||||
print(f"Found matching agents {agent_ids}")
|
||||
```
|
||||
</CodeGroup>
|
||||
</Accordion>
|
||||
|
||||
## Creating and Viewing Tags in the ADE
|
||||
You can also modify tags in the ADE.
|
||||
Simply click the **Advanced Settings** tab in the top-left of the ADE to view an agent's tags.
|
||||
You can create new tags by typing the tag name in the input field and hitting enter.
|
||||
<img src="../../images/tags.png" />
|
||||
@@ -1,277 +0,0 @@
|
||||
---
|
||||
title: Building Stateful Agents with Letta
|
||||
slug: guides/agents/overview
|
||||
---
|
||||
|
||||
<Info>
|
||||
**New to Letta?** If you haven't already, read [Core Concepts](/core-concepts) to understand how Letta's stateful agents are fundamentally different from traditional LLM APIs.
|
||||
</Info>
|
||||
|
||||
Letta agents can automatically manage long-term memory, load data from external sources, and call custom tools.
|
||||
Unlike in other frameworks, Letta agents are stateful, so they keep track of historical interactions and reserve part of their context to read and write memories which evolve over time.
|
||||
<img className="light" src="/images/stateful_agents.png" />
|
||||
<img className="dark" src="/images/stateful_agents_dark.png" />
|
||||
|
||||
|
||||
|
||||
Letta manages a reasoning loop for agents. At each agent step (i.e. iteration of the loop), the state of the agent is checkpointed and persisted to the database.
|
||||
|
||||
You can interact with agents from a REST API, the ADE, and TypeScript / Python SDKs.
|
||||
As long as they are connected to the same service, all of these interfaces can be used to interact with the same agents.
|
||||
|
||||
<Tip>
|
||||
If you're interested in learning more about stateful agents, read our [blog post](https://www.letta.com/blog/stateful-agents).
|
||||
</Tip>
|
||||
|
||||
## Agents vs Threads
|
||||
|
||||
In Letta, you can think of an agent as a single entity that has a single message history which is treated as infinite.
|
||||
The sequence of interactions the agent has experienced throughout its existence makes up the agent's state (or memory).
|
||||
|
||||
One distinction between Letta and other agent frameworks is that Letta does not have the notion of message *threads* (or *sessions*).
|
||||
Instead, there are only *stateful agents*, which have a single perpetual thread (sequence of messages).
|
||||
|
||||
The reason we use the term *agent* rather than *thread* is that Letta is based on the principle that **all agent interactions should be part of the persistent memory**, as opposed to building agent applications around ephemeral, short-lived interactions (like a thread or session).
|
||||
```mermaid
|
||||
%%{init: {'flowchart': {'rankDir': 'LR'}}}%%
|
||||
flowchart LR
|
||||
subgraph Traditional["Thread-Based Agents"]
|
||||
direction TB
|
||||
llm1[LLM] --> thread1["Thread 1
|
||||
--------
|
||||
Ephemeral
|
||||
Session"]
|
||||
llm1 --> thread2["Thread 2
|
||||
--------
|
||||
Ephemeral
|
||||
Session"]
|
||||
llm1 --> thread3["Thread 3
|
||||
--------
|
||||
Ephemeral
|
||||
Session"]
|
||||
end
|
||||
|
||||
Traditional ~~~ Letta
|
||||
|
||||
subgraph Letta["Letta Stateful Agents"]
|
||||
direction TB
|
||||
llm2[LLM] --> agent["Single Agent
|
||||
--------
|
||||
Persistent Memory"]
|
||||
agent --> db[(PostgreSQL)]
|
||||
db -->|"Learn & Update"| agent
|
||||
end
|
||||
|
||||
class thread1,thread2,thread3 session
|
||||
class agent agent
|
||||
```
|
||||
|
||||
If you would like to create common starting points for new conversation "threads", we recommend using [agent templates](/guides/templates/overview) to create new agents for each conversation, or directly copying agent state from an existing agent.
|
||||
|
||||
For multi-user applications, we recommend creating an agent per user, though you can also have multiple users message a single agent (but it will be a single shared message history).
|
||||
|
||||
## Create an agent
|
||||
<Note>
|
||||
To start creating agents with Letta Cloud, [create an API key](https://app.letta.com/api-keys) and set it as `LETTA_API_KEY` in your environment. For self-hosted deployments, see our [self-hosting guide](/guides/selfhosting/overview).
|
||||
</Note>
|
||||
|
||||
You can create a new agent via the REST API, Python SDK, or TypeScript SDK:
|
||||
<CodeGroup>
|
||||
```curl curl
|
||||
curl -X POST https://api.letta.com/v1/agents \
|
||||
-H "Authorization: Bearer $LETTA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"memory_blocks": [
|
||||
{
|
||||
"value": "The human'\''s name is Bob the Builder.",
|
||||
"label": "human"
|
||||
},
|
||||
{
|
||||
"value": "My name is Sam, the all-knowing sentient AI.",
|
||||
"label": "persona"
|
||||
}
|
||||
],
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"context_window_limit": 16000
|
||||
}'
|
||||
```
|
||||
```python title="python" maxLines=50
|
||||
# install letta_client with `pip install letta-client`
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
# create a client connected to Letta Cloud (uses api.letta.com by default)
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# create an agent with two basic self-editing memory blocks
|
||||
agent_state = client.agents.create(
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "human",
|
||||
"value": "The human's name is Bob the Builder."
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "My name is Sam, the all-knowing sentient AI."
|
||||
}
|
||||
],
|
||||
model="openai/gpt-4o-mini",
|
||||
context_window_limit=16000
|
||||
)
|
||||
|
||||
# the AgentState object contains all the information about the agent
|
||||
print(agent_state)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// install letta-client with `npm install @letta-ai/letta-client`
|
||||
import { LettaClient } from '@letta-ai/letta-client'
|
||||
|
||||
// create a client connected to Letta Cloud (uses api.letta.com by default)
|
||||
const client = new LettaClient({
|
||||
token: process.env.LETTA_API_KEY
|
||||
});
|
||||
|
||||
// create an agent with two basic self-editing memory blocks
|
||||
const agentState = await client.agents.create({
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "human",
|
||||
value: "The human's name is Bob the Builder."
|
||||
},
|
||||
{
|
||||
label: "persona",
|
||||
value: "My name is Sam, the all-knowing sentient AI."
|
||||
}
|
||||
],
|
||||
model: "openai/gpt-4o-mini",
|
||||
contextWindowLimit: 16000
|
||||
});
|
||||
|
||||
// the AgentState object contains all the information about the agent
|
||||
console.log(agentState);
|
||||
```
|
||||
</CodeGroup>
|
||||
You can also create an agent without any code using the [Agent Development Environment (ADE)](/agent-development-environment).
|
||||
All Letta agents are stored in a database on the Letta server, so you can access the same agents from the ADE, the REST API, the Python SDK, and the TypeScript SDK.
|
||||
|
||||
The response will include information about the agent, including its `id`:
|
||||
```json
|
||||
{
|
||||
"id": "agent-43f8e098-1021-4545-9395-446f788d7389",
|
||||
"name": "GracefulFirefly",
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
Once an agent is created, you can message it:
|
||||
<CodeGroup>
|
||||
```curl curl
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hows it going????"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
```python title="python" maxLines=50
|
||||
# send a message to the agent
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hows it going????"
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# the response object contains the messages and usage statistics
|
||||
print(response)
|
||||
|
||||
# if we want to print the usage stats
|
||||
print(response.usage)
|
||||
|
||||
# if we want to print the messages
|
||||
for message in response.messages:
|
||||
print(message)
|
||||
```
|
||||
```typescript TypeScript maxLines=50
|
||||
// send a message to the agent
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "hows it going????"
|
||||
}
|
||||
]
|
||||
}
|
||||
);
|
||||
|
||||
// the response object contains the messages and usage statistics
|
||||
console.log(response);
|
||||
|
||||
// if we want to print the usage stats
|
||||
console.log(response.usage)
|
||||
|
||||
// if we want to print the messages
|
||||
for (const message of response.messages) {
|
||||
console.log(message);
|
||||
}
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Message Types
|
||||
The `response` object contains the following attributes:
|
||||
* `usage`: The usage of the agent after the message was sent (the prompt tokens, completion tokens, and total tokens)
|
||||
* `messages`: A list of `LettaMessage` objects generated by the agent
|
||||
|
||||
#### `LettaMessage`
|
||||
The `LettaMessage` object is a simplified version of the `Message` object stored in the database backend.
|
||||
Since a `Message` can include multiple events like a chain-of-thought and function calls, `LettaMessage` simplifies messages to have the following types:
|
||||
* `reasoning_message`: The inner monologue (chain-of-thought) of the agent
|
||||
* `tool_call_message`: An agent's tool (function) call
|
||||
* `tool_call_return`: The result of executing an agent's tool (function) call
|
||||
* `assistant_message`: An agent's response message (direct response in current architecture, or `send_message` tool call in legacy architectures)
|
||||
* `system_message`: A system message (for example, an alert about the user logging in)
|
||||
* `user_message`: A user message
|
||||
|
||||
<Note>
|
||||
In current Letta agents, `assistant_message` represents the agent's direct response. In legacy architectures (`memgpt_agent`, `memgpt_v2_agent`), it wraps the `send_message` tool call.
|
||||
|
||||
If you prefer to see the raw tool call format in legacy agents, you can set `use_assistant_message` to `false` in the request `config` (see the [endpoint documentation](/api-reference/agents/messages/create)).
|
||||
</Note>
|
||||
|
||||
## Common agent operations
|
||||
For a more in-depth guide to the full set of Letta agent operations, check out our [API reference](/api-reference/overview), our extended [Python SDK](https://github.com/letta-ai/letta/blob/main/examples/docs/example.py) and [TypeScript SDK](https://github.com/letta-ai/letta/blob/main/examples/docs/node/example.ts) examples, as well as our other [cookbooks](/cookbooks).
|
||||
|
||||
If you're using a self-hosted Letta server, you should set the **base URL** (`base_url` in Python, `baseUrl` in TypeScript) to the Letta server's URL (e.g. `http://localhost:8283`) when you create your client. See an example [here](/api-reference/overview).
|
||||
|
||||
If you're using a self-hosted server, you can omit the token if you're not using [password protection](/guides/server/docker#password-protection-advanced).
|
||||
If you are using password protection, set your **token** to the **password**.
|
||||
If you're using Letta Cloud, you should set the **token** to your **Letta Cloud API key**.
|
||||
|
||||
### Retrieving an agent's state
|
||||
The agent's state is always persisted, so you can retrieve an agent's state by its ID.
|
||||
<EndpointRequestSnippet endpoint="GET /v1/agents/:agent_id" />
|
||||
|
||||
The result of the call is an `AgentState` object:
|
||||
<EndpointResponseSnippet endpoint="GET /v1/agents/:agent_id" />
|
||||
|
||||
### List agents
|
||||
Replace `agent_id` with your actual agent ID.
|
||||
<EndpointRequestSnippet endpoint="GET /v1/agents/" />
|
||||
|
||||
The result of the call is a list of `AgentState` objects:
|
||||
<EndpointResponseSnippet endpoint="GET /v1/agents/" />
|
||||
|
||||
### Delete an agent
|
||||
To delete an agent, you can use the `DELETE` endpoint with your `agent_id`:
|
||||
<EndpointRequestSnippet endpoint="DELETE /v1/agents/:agent_id" />
|
||||
@@ -1,102 +0,0 @@
|
||||
---
|
||||
title: Parallel Tool Calling
|
||||
slug: guides/agents/parallel-tool-calling
|
||||
---
|
||||
|
||||
When an agent calls multiple tools, Letta can execute them concurrently instead of sequentially.
|
||||
|
||||
Parallel tool calling has two configuration levels:
|
||||
- **Agent LLM config**: Controls whether the LLM can request multiple tool calls at once
|
||||
- **Individual tool settings**: Controls whether requested tools actually execute in parallel or sequentially
|
||||
|
||||
## Model Support
|
||||
|
||||
Parallel tool calling is supported for OpenAI and Anthropic models.
|
||||
|
||||
## Enabling Parallel Tool Calling
|
||||
|
||||
### Agent Configuration
|
||||
|
||||
Set `parallel_tool_calls: true` in the agent's LLM config:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
const agent = await client.agents.create({
|
||||
llm_config: {
|
||||
model: "anthropic/claude-sonnet-4-20250514",
|
||||
parallel_tool_calls: true
|
||||
}
|
||||
});
|
||||
```
|
||||
```python Python
|
||||
agent = client.agents.create(
|
||||
llm_config={
|
||||
"model": "anthropic/claude-sonnet-4-20250514",
|
||||
"parallel_tool_calls": True
|
||||
}
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Tool Configuration
|
||||
|
||||
Individual tools must opt-in to parallel execution:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
await client.tools.update(toolId, {
|
||||
enable_parallel_execution: true
|
||||
});
|
||||
```
|
||||
```python Python
|
||||
client.tools.update(
|
||||
tool_id=tool_id,
|
||||
enable_parallel_execution=True
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
By default, tools execute sequentially (`enable_parallel_execution=False`).
|
||||
|
||||
<Warning>
|
||||
Only enable parallel execution for tools safe to run concurrently. Tools that modify shared state or have ordering dependencies should remain sequential.
|
||||
</Warning>
|
||||
|
||||
## ADE Configuration
|
||||
|
||||
### Agent Toggle
|
||||
|
||||
1. Open **Settings** → **LLM Config**
|
||||
2. Enable **"Parallel tool calls"**
|
||||
|
||||
### Tool Toggle
|
||||
|
||||
1. Open the **Tools** panel
|
||||
2. Click a tool to open it
|
||||
3. Go to the **Settings** tab
|
||||
4. Enable **"Enable parallel execution"**
|
||||
|
||||
## Execution Behavior
|
||||
|
||||
When the agent calls multiple tools:
|
||||
- Sequential tools execute one-by-one
|
||||
- Parallel-enabled tools execute concurrently
|
||||
- Mixed: sequential tools complete first, then parallel tools execute together
|
||||
|
||||
Example:
|
||||
```
|
||||
Agent calls:
|
||||
- search_web (parallel: true)
|
||||
- search_database (parallel: true)
|
||||
- send_message (parallel: false)
|
||||
|
||||
Execution:
|
||||
1. send_message executes
|
||||
2. search_web AND search_database execute concurrently
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Parallel execution is automatically disabled when [tool rules](/guides/agents/tool-rules) are configured
|
||||
- Only enable for tools safe to run concurrently (e.g., read-only operations)
|
||||
- Tools that modify shared state should remain sequential
|
||||
@@ -1,253 +0,0 @@
|
||||
---
|
||||
title: Code Interpreter
|
||||
subtitle: Execute code in a secure sandbox with full network access
|
||||
slug: guides/agents/run-code
|
||||
---
|
||||
|
||||
The `run_code` tool enables Letta agents to execute code in a secure sandboxed environment. Useful for data analysis, calculations, API calls, and programmatic computation.
|
||||
|
||||
<Info>
|
||||
On [Letta Cloud](/guides/cloud/overview), this tool works out of the box. For self-hosted deployments, you'll need to [configure an E2B API key](#self-hosted-setup).
|
||||
</Info>
|
||||
|
||||
<Warning>
|
||||
Each execution runs in a **fresh environment** - variables, files, and state do not persist between runs.
|
||||
</Warning>
|
||||
|
||||
## Quick Start
|
||||
|
||||
<CodeGroup>
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["run_code"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I can run Python code for data analysis and API calls."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
const agent = await client.agents.create({
|
||||
model: "openai/gpt-4o",
|
||||
tools: ["run_code"],
|
||||
memoryBlocks: [{
|
||||
label: "persona",
|
||||
value: "I can run Python code for data analysis and API calls."
|
||||
}]
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Tool Parameters
|
||||
|
||||
| Parameter | Type | Options | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `code` | `str` | Required | The code to execute |
|
||||
| `language` | `str` | `python`, `js`, `ts`, `r`, `java` | Programming language |
|
||||
|
||||
## Return Format
|
||||
|
||||
```json
|
||||
{
|
||||
"results": ["Last expression value"],
|
||||
"logs": {
|
||||
"stdout": ["Print statements"],
|
||||
"stderr": ["Error output"]
|
||||
},
|
||||
"error": "Error details if execution failed"
|
||||
}
|
||||
```
|
||||
|
||||
**Output types:**
|
||||
- `results[]`: Last expression value (Jupyter-style)
|
||||
- `logs.stdout`: Print statements and standard output
|
||||
- `logs.stderr`: Error messages
|
||||
- `error`: Present if execution failed
|
||||
|
||||
## Supported Languages
|
||||
|
||||
| Language | Key Limitations |
|
||||
|----------|-----------------|
|
||||
| **Python** | None - full ecosystem available |
|
||||
| **JavaScript** | No npm packages - built-in Node modules only |
|
||||
| **TypeScript** | No npm packages - built-in Node modules only |
|
||||
| **R** | No tidyverse - base R only |
|
||||
| **Java** | JShell-style execution - no traditional class definitions |
|
||||
|
||||
### Python
|
||||
|
||||
Full Python ecosystem with common packages pre-installed:
|
||||
|
||||
- **Data**: numpy, pandas, scipy, scikit-learn
|
||||
- **Web**: requests, aiohttp, beautifulsoup4
|
||||
- **Utilities**: matplotlib, PyYAML, Pillow
|
||||
|
||||
Check available packages:
|
||||
```python
|
||||
import pkg_resources
|
||||
print([d.project_name for d in pkg_resources.working_set])
|
||||
```
|
||||
|
||||
### JavaScript & TypeScript
|
||||
|
||||
No npm packages available - only built-in Node modules.
|
||||
|
||||
```javascript
|
||||
// Works
|
||||
const fs = require('fs');
|
||||
const http = require('http');
|
||||
|
||||
// Fails
|
||||
const axios = require('axios');
|
||||
```
|
||||
|
||||
### R
|
||||
|
||||
Base R only - no tidyverse packages.
|
||||
|
||||
```r
|
||||
# Works
|
||||
mean(c(1, 2, 3))
|
||||
|
||||
# Fails
|
||||
library(ggplot2)
|
||||
```
|
||||
|
||||
### Java
|
||||
|
||||
JShell-style execution - statement-level only.
|
||||
|
||||
```java
|
||||
// Works
|
||||
System.out.println("Hello");
|
||||
int x = 42;
|
||||
|
||||
// Fails
|
||||
public class Main {
|
||||
public static void main(String[] args) { }
|
||||
}
|
||||
```
|
||||
|
||||
## Network Access
|
||||
|
||||
The sandbox has full network access for HTTP requests, API calls, and DNS resolution.
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.get('https://api.github.com/repos/letta-ai/letta')
|
||||
data = response.json()
|
||||
print(f"Stars: {data['stargazers_count']}")
|
||||
```
|
||||
|
||||
## No State Persistence
|
||||
|
||||
Variables, files, and state do not carry over between executions. Each `run_code` call is completely isolated.
|
||||
|
||||
```python
|
||||
# First execution
|
||||
x = 42
|
||||
|
||||
# Second execution (separate run_code call)
|
||||
print(x) # Error: NameError: name 'x' is not defined
|
||||
```
|
||||
|
||||
**Implications:**
|
||||
- Must re-import libraries each time
|
||||
- Files written to disk are lost
|
||||
- Cannot build up state across executions
|
||||
|
||||
## Self-Hosted Setup
|
||||
|
||||
For self-hosted servers, configure an E2B API key. [E2B](https://e2b.dev) provides the sandbox infrastructure.
|
||||
|
||||
<CodeGroup>
|
||||
```bash Docker
|
||||
docker run \
|
||||
-e E2B_API_KEY="your_e2b_api_key" \
|
||||
letta/letta:latest
|
||||
```
|
||||
|
||||
```yaml Docker Compose
|
||||
services:
|
||||
letta:
|
||||
environment:
|
||||
- E2B_API_KEY=your_e2b_api_key
|
||||
```
|
||||
|
||||
```python Per-Agent
|
||||
agent = client.agents.create(
|
||||
tools=["run_code"],
|
||||
tool_env_vars={
|
||||
"E2B_API_KEY": "your_e2b_api_key"
|
||||
}
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Data Analysis
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["run_code"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I use Python with pandas and numpy for data analysis."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### API Integration
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["run_code", "web_search"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I fetch data from APIs using run_code and search docs with web_search."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### Statistical Analysis
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["run_code"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I perform statistical analysis using scipy and numpy."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
## When to Use
|
||||
|
||||
| Use Case | Tool | Why |
|
||||
|----------|------|-----|
|
||||
| Data analysis | `run_code` | Full Python data stack |
|
||||
| Math calculations | `run_code` | Programmatic computation |
|
||||
| Live API data | `run_code` | Network + processing |
|
||||
| Web scraping | `run_code` | requests + BeautifulSoup |
|
||||
| Simple search | `web_search` | Purpose-built |
|
||||
| Persistent data | Archival memory | State persistence |
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [Utilities Overview](/guides/agents/prebuilt-tools)
|
||||
- [Web Search](/guides/agents/web-search)
|
||||
- [Fetch Webpage](/guides/agents/fetch-webpage)
|
||||
- [Custom Tools](/guides/agents/custom-tools)
|
||||
- [Tool Variables](/guides/agents/tool-variables)
|
||||
@@ -1,213 +0,0 @@
|
||||
# Scheduling
|
||||
|
||||
**Scheduling** is a technique for triggering Letta agents at regular intervals.
|
||||
Many real-world applications require proactive behavior, such as checking emails every few hours or scraping news sites.
|
||||
Scheduling can support autonomous agents with the capability to manage ongoing processes.
|
||||
|
||||
<Note>
|
||||
Native scheduling functionality is on the Letta Cloud roadmap. The approaches described in this guide are temporary solutions that work with both self-hosted and cloud deployments.
|
||||
</Note>
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
When building autonomous agents with Letta, you often need to trigger them at regular intervals for tasks like:
|
||||
|
||||
- **System Monitoring**: Health checks that adapt based on historical patterns
|
||||
- **Data Processing**: Intelligent ETL processes that handle edge cases contextually
|
||||
- **Memory Maintenance**: Agents that optimize their own knowledge base over time
|
||||
- **Proactive Notifications**: Context-aware alerts that consider user preferences and timing
|
||||
- **Continuous Learning**: Agents that regularly ingest new information and update their understanding
|
||||
|
||||
This guide covers simple approaches to implement scheduled agent interactions.
|
||||
|
||||
## Option 1: Simple Loop
|
||||
|
||||
The most straightforward approach for development and testing:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
|
||||
const agentId = "your_agent_id";
|
||||
|
||||
while (true) {
|
||||
const response = await client.agents.messages.create(agentId, {
|
||||
messages: [{
|
||||
role: "user",
|
||||
content: `Scheduled check at ${new Date()}`
|
||||
}]
|
||||
});
|
||||
console.log(`[${new Date()}] Agent responded`);
|
||||
await new Promise(resolve => setTimeout(resolve, 300000)); // 5 minutes
|
||||
}
|
||||
```
|
||||
|
||||
```python title="python"
|
||||
import time
|
||||
import os
|
||||
from letta_client import Letta
|
||||
from datetime import datetime
|
||||
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
agent_id = "your_agent_id"
|
||||
|
||||
while True:
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_id,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": f"Scheduled check at {datetime.now()}"
|
||||
}]
|
||||
)
|
||||
print(f"[{datetime.now()}] Agent responded")
|
||||
time.sleep(300) # 5 minutes
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
**Pros:** Simple, easy to debug
|
||||
**Cons:** Blocks terminal, stops if process dies
|
||||
|
||||
## Option 2: System Cron Jobs
|
||||
|
||||
For production deployments, use cron for reliability:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
#!/usr/bin/env node
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
async function sendMessage() {
|
||||
try {
|
||||
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
|
||||
const response = await client.agents.messages.create("your_agent_id", {
|
||||
messages: [{
|
||||
role: "user",
|
||||
content: "Scheduled maintenance check"
|
||||
}]
|
||||
});
|
||||
console.log(`[${new Date()}] Success`);
|
||||
} catch (error) {
|
||||
console.error(`[${new Date()}] Error:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
sendMessage();
|
||||
```
|
||||
|
||||
```python title="python"
|
||||
#!/usr/bin/env python3
|
||||
from letta_client import Letta
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import os
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
response = client.agents.messages.create(
|
||||
agent_id="your_agent_id",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Scheduled maintenance check"
|
||||
}]
|
||||
)
|
||||
print(f"[{datetime.now()}] Success")
|
||||
except Exception as e:
|
||||
print(f"[{datetime.now()}] Error: {e}")
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
Add to crontab with `crontab -e`:
|
||||
```bash
|
||||
*/5 * * * * /usr/bin/python3 /path/to/send_message.py >> /var/log/letta_cron.log 2>&1
|
||||
# or for Node.js:
|
||||
*/5 * * * * /usr/bin/node /path/to/send_message.js >> /var/log/letta_cron.log 2>&1
|
||||
```
|
||||
|
||||
**Pros:** System-managed, survives reboots
|
||||
**Cons:** Requires cron access
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Error Handling**: Always wrap API calls in try-catch blocks
|
||||
2. **Logging**: Log both successes and failures for debugging
|
||||
3. **Environment Variables**: Store credentials securely
|
||||
4. **Rate Limiting**: Respect API limits and add backoff for failures
|
||||
|
||||
## Example: Memory Maintenance Bot
|
||||
|
||||
Complete example that performs periodic memory cleanup:
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript
|
||||
#!/usr/bin/env node
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
async function runMaintenance() {
|
||||
try {
|
||||
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
|
||||
const agentId = "your_agent_id";
|
||||
|
||||
const response = await client.agents.messages.create(agentId, {
|
||||
messages: [{
|
||||
role: "user",
|
||||
content: "Please review your memory blocks for outdated information and clean up as needed."
|
||||
}]
|
||||
});
|
||||
|
||||
// Print any assistant messages
|
||||
for (const message of response.messages) {
|
||||
if (message.messageType === "assistant_message") {
|
||||
console.log(`Agent response: ${message.content?.substring(0, 100)}...`);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error("Maintenance failed:", error);
|
||||
}
|
||||
}
|
||||
|
||||
// Run if called directly
|
||||
if (import.meta.url === `file://${process.argv[1]}`) {
|
||||
runMaintenance();
|
||||
}
|
||||
```
|
||||
|
||||
```python title="python"
|
||||
#!/usr/bin/env python3
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from letta_client import Letta
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
def run_maintenance():
|
||||
try:
|
||||
import os
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
agent_id = "your_agent_id"
|
||||
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_id,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Please review your memory blocks for outdated information and clean up as needed."
|
||||
}]
|
||||
)
|
||||
|
||||
# Print any assistant messages
|
||||
for message in response.messages:
|
||||
if message.message_type == "assistant_message":
|
||||
logging.info(f"Agent response: {message.content[:100]}...")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Maintenance failed: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_maintenance()
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
Choose the scheduling method that best fits your deployment environment. For production systems, cron offers the best reliability, while simple loops are perfect for development and testing.
|
||||
@@ -1,53 +0,0 @@
|
||||
---
|
||||
title: Using Tool Variables
|
||||
slug: guides/agents/tool-variables
|
||||
---
|
||||
|
||||
You can use **tool variables** to specify environment variables available to your custom tools.
|
||||
For example, if you set a tool variable `PASSWORD` to `banana`, then write a custom function that prints `os.getenv('PASSWORD')` in the tool, the function will print `banana`.
|
||||
|
||||
## Assigning tool variables in the ADE
|
||||
|
||||
To assign tool variables in the Agent Development Environment (ADE), click on **Env Vars** to open the **Environment Variables** viewer:
|
||||
|
||||
<img src="../../images/env_vars_button.png" />
|
||||
|
||||
Once in the **Environment Variables** viewer, click **+** to add a new tool variable if one does not exist.
|
||||
|
||||
<img src="../../images/tool_variables.png" />
|
||||
|
||||
## Assigning tool variables in the API / SDK
|
||||
|
||||
You can also assign tool variables on agent creation in the API with the `tool_exec_environment_variables` parameter:
|
||||
<CodeGroup>
|
||||
```curl title="curl" {7-9}
|
||||
curl -X POST https://api.letta.com/v1/agents \
|
||||
-H "Authorization: Bearer $LETTA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"memory_blocks": [],
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"tool_exec_environment_variables": {
|
||||
"API_KEY": "your-api-key-here"
|
||||
}
|
||||
}'
|
||||
```
|
||||
```python title="python" {5-7}
|
||||
agent_state = client.agents.create(
|
||||
memory_blocks=[],
|
||||
model="openai/gpt-4o-mini",
|
||||
tool_exec_environment_variables={
|
||||
"API_KEY": "your-api-key-here"
|
||||
}
|
||||
)
|
||||
```
|
||||
```typescript TypeScript {5-7}
|
||||
const agentState = await client.agents.create({
|
||||
memoryBlocks: [],
|
||||
model: "openai/gpt-4o-mini",
|
||||
toolExecEnvironmentVariables: {
|
||||
"API_KEY": "your-api-key-here"
|
||||
}
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
@@ -1,480 +0,0 @@
|
||||
---
|
||||
title: Web Search
|
||||
subtitle: Search the internet in real-time with AI-powered search
|
||||
slug: guides/agents/web-search
|
||||
---
|
||||
|
||||
The `web_search` and `fetch_webpage` tools enable Letta agents to search the internet for current information, research, and general knowledge using [Exa](https://exa.ai)'s AI-powered search engine.
|
||||
|
||||
<Info>
|
||||
On [Letta Cloud](/guides/cloud/overview), these tools work out of the box. For self-hosted deployments, you'll need to [configure an Exa API key](#self-hosted-setup).
|
||||
</Info>
|
||||
|
||||
## Web Search
|
||||
|
||||
### Adding Web Search to an Agent
|
||||
|
||||
<CodeGroup>
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
tools=["web_search"],
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I'm a research assistant who uses web search to find current information and cite sources."
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
const agent = await client.agents.create({
|
||||
model: "openai/gpt-4o",
|
||||
embedding: "openai/text-embedding-3-small",
|
||||
tools: ["web_search"],
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "persona",
|
||||
value: "I'm a research assistant who uses web search to find current information and cite sources."
|
||||
}
|
||||
]
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
### Usage Example
|
||||
|
||||
```python
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What are the latest developments in agent-based AI systems?"
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Your agent can now choose to use `web_search` when it needs current information.
|
||||
|
||||
## Self-Hosted Setup
|
||||
|
||||
For self-hosted Letta servers, you'll need an Exa API key.
|
||||
|
||||
### Get an API Key
|
||||
|
||||
1. Sign up at [dashboard.exa.ai](https://dashboard.exa.ai/)
|
||||
2. Copy your API key
|
||||
3. See [Exa pricing](https://docs.exa.ai) for rate limits and costs
|
||||
|
||||
### Configuration Options
|
||||
|
||||
<CodeGroup>
|
||||
```bash Docker
|
||||
docker run \
|
||||
-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data \
|
||||
-p 8283:8283 \
|
||||
-e OPENAI_API_KEY="your_openai_key" \
|
||||
-e EXA_API_KEY="your_exa_api_key" \
|
||||
letta/letta:latest
|
||||
```
|
||||
|
||||
```yaml Docker Compose
|
||||
version: '3.8'
|
||||
services:
|
||||
letta:
|
||||
image: letta/letta:latest
|
||||
ports:
|
||||
- "8283:8283"
|
||||
environment:
|
||||
- OPENAI_API_KEY=your_openai_key
|
||||
- EXA_API_KEY=your_exa_api_key
|
||||
volumes:
|
||||
- ~/.letta/.persist/pgdata:/var/lib/postgresql/data
|
||||
```
|
||||
|
||||
```python Per-Agent Configuration
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
tools=["web_search"],
|
||||
tool_exec_environment_variables={
|
||||
"EXA_API_KEY": "your_exa_api_key"
|
||||
}
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Tool Parameters
|
||||
|
||||
The `web_search` tool supports advanced filtering and search customization:
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `query` | `str` | Required | The search query to find relevant web content |
|
||||
| `num_results` | `int` | 10 | Number of results to return (1-100) |
|
||||
| `category` | `str` | None | Focus search on specific content types (see below) |
|
||||
| `include_text` | `bool` | False | Whether to retrieve full page content (usually overflows context) |
|
||||
| `include_domains` | `List[str]` | None | List of domains to include in search results |
|
||||
| `exclude_domains` | `List[str]` | None | List of domains to exclude from search results |
|
||||
| `start_published_date` | `str` | None | Only return content published after this date (ISO format) |
|
||||
| `end_published_date` | `str` | None | Only return content published before this date (ISO format) |
|
||||
| `user_location` | `str` | None | Two-letter country code for localized results (e.g., "US") |
|
||||
|
||||
### Available Categories
|
||||
|
||||
Use the `category` parameter to focus your search on specific content types:
|
||||
|
||||
| Category | Best For | Example Query |
|
||||
|----------|----------|---------------|
|
||||
| `company` | Corporate information, company websites | "Tesla energy storage solutions" |
|
||||
| `research paper` | Academic papers, arXiv, research publications | "transformer architecture improvements 2025" |
|
||||
| `news` | News articles, current events | "latest AI policy developments" |
|
||||
| `pdf` | PDF documents, reports, whitepapers | "climate change impact assessment" |
|
||||
| `github` | GitHub repositories, open source projects | "python async web scraping libraries" |
|
||||
| `tweet` | Twitter/X posts, social media discussions | "reactions to new GPT release" |
|
||||
| `personal site` | Blogs, personal websites, portfolios | "machine learning tutorial blogs" |
|
||||
| `linkedin profile` | LinkedIn profiles, professional bios | "AI research engineers at Google" |
|
||||
| `financial report` | Earnings reports, financial statements | "Apple Q4 2024 earnings" |
|
||||
|
||||
### Return Format
|
||||
|
||||
The tool returns a JSON-encoded string containing:
|
||||
|
||||
```json
|
||||
{
|
||||
"query": "search query",
|
||||
"results": [
|
||||
{
|
||||
"title": "Page title",
|
||||
"url": "https://example.com",
|
||||
"published_date": "2025-01-15",
|
||||
"author": "Author name",
|
||||
"highlights": ["Key excerpt 1", "Key excerpt 2"],
|
||||
"summary": "AI-generated summary of the content",
|
||||
"text": "Full page content (only if include_text=true)"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Guide When to Search
|
||||
|
||||
Provide clear instructions to your agent about when web search is appropriate:
|
||||
|
||||
```python
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I'm a helpful assistant. I use web_search for current events, recent news, and topics requiring up-to-date information. I cite my sources."
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### 2. Combine with Archival Memory
|
||||
|
||||
Use web search for external/current information, and archival memory for your organization's internal data:
|
||||
|
||||
```python
|
||||
# Create agent with both web_search and archival memory tools
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
tools=["web_search", "archival_memory_search", "archival_memory_insert"],
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I use web_search for current events and external research. I use archival_memory_search for company-specific information and internal documents."
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
See the [Archival Memory documentation](/guides/agents/archival-memory) for more information.
|
||||
|
||||
### 3. Craft Effective Search Queries
|
||||
|
||||
Exa uses neural search that understands semantic meaning. Your agent will generally form good queries naturally, but you can improve results by guiding it to:
|
||||
|
||||
- **Be descriptive and specific**: "Latest research on RLHF techniques for language models" is better than "RLHF research"
|
||||
- **Focus on topics, not keywords**: "How companies are deploying AI agents in customer service" works better than "AI agents customer service deployment"
|
||||
- **Use natural language**: The search engine understands conversational queries like "What are the environmental impacts of Bitcoin mining?"
|
||||
- **Specify time ranges when relevant**: Guide your agent to use date filters for time-sensitive queries
|
||||
|
||||
Example instruction in memory:
|
||||
|
||||
```python
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "search_strategy",
|
||||
"value": "When searching, I craft clear, descriptive queries that focus on topics rather than keywords. I use the category and date filters when appropriate to narrow results."
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### 4. Manage Context Window
|
||||
|
||||
By default, `include_text` is `False` to avoid context overflow. The tool returns highlights and AI-generated summaries instead, which are more concise:
|
||||
|
||||
```python
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "search_guidelines",
|
||||
"value": "I avoid setting include_text=true unless specifically needed, as full text usually overflows the context window. Highlights and summaries are usually sufficient."
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Research Assistant
|
||||
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["web_search"],
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I'm a research assistant. I search for relevant information, synthesize findings from multiple sources, and provide citations."
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### News Monitor
|
||||
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o-mini",
|
||||
tools=["web_search"],
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I monitor news and provide briefings on AI industry developments."
|
||||
},
|
||||
{
|
||||
"label": "topics",
|
||||
"value": "Focus: AI/ML, agent systems, LLM advancements"
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Customer Support
|
||||
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["web_search"],
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I help customers by checking documentation, service status pages, and community discussions for solutions."
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Agent Not Using Web Search
|
||||
|
||||
Check:
|
||||
1. Tool is attached: `"web_search"` in agent's tools list
|
||||
2. Instructions are clear about when to search
|
||||
3. Model has good tool-calling capabilities (GPT-4, Claude 3+)
|
||||
|
||||
```python
|
||||
# Verify tools
|
||||
agent = client.agents.retrieve(agent_id=agent.id)
|
||||
print([tool.name for tool in agent.tools])
|
||||
```
|
||||
|
||||
### Missing EXA_API_KEY
|
||||
|
||||
If you see errors about missing API keys on self-hosted deployments:
|
||||
|
||||
```bash
|
||||
# Check if set
|
||||
echo $EXA_API_KEY
|
||||
|
||||
# Set for session
|
||||
export EXA_API_KEY="your_exa_api_key"
|
||||
|
||||
# Docker example
|
||||
docker run -e EXA_API_KEY="your_exa_api_key" letta/letta:latest
|
||||
```
|
||||
|
||||
## When to Use Web Search
|
||||
|
||||
| Use Case | Tool | Why |
|
||||
|----------|------|-----|
|
||||
| Current events, news | `web_search` | Real-time information |
|
||||
| External research | `web_search` | Broad internet access |
|
||||
| Internal documents | Archival memory | Fast, static data |
|
||||
| User preferences | Memory blocks | In-context, instant |
|
||||
| General knowledge | Pre-trained model | No search needed |
|
||||
|
||||
## Fetch Webpage
|
||||
|
||||
<CodeGroup>
|
||||
```python Python
|
||||
from letta_client import Letta
|
||||
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I can fetch and read webpages to answer questions about online content."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
```typescript TypeScript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
const agent = await client.agents.create({
|
||||
model: "openai/gpt-4o",
|
||||
tools: ["fetch_webpage"],
|
||||
memoryBlocks: [{
|
||||
label: "persona",
|
||||
value: "I can fetch and read webpages to answer questions about online content."
|
||||
}]
|
||||
});
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Tool Parameters
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `url` | `str` | The URL of the webpage to fetch |
|
||||
|
||||
## Return Format
|
||||
|
||||
The tool returns webpage content as text/markdown.
|
||||
|
||||
**With Exa API (if configured):**
|
||||
```json
|
||||
{
|
||||
"title": "Page title",
|
||||
"published_date": "2025-01-15",
|
||||
"author": "Author name",
|
||||
"text": "Full page content in markdown"
|
||||
}
|
||||
```
|
||||
|
||||
**Fallback (without Exa):**
|
||||
Returns markdown-formatted text extracted from the HTML.
|
||||
|
||||
## How It Works
|
||||
|
||||
The tool uses a multi-tier approach:
|
||||
|
||||
1. **Exa API** (if `EXA_API_KEY` is configured): Uses Exa's content extraction
|
||||
2. **Trafilatura** (fallback): Open-source text extraction to markdown
|
||||
3. **Readability + html2text** (final fallback): HTML cleaning and conversion
|
||||
|
||||
## Self-Hosted Setup
|
||||
|
||||
For enhanced fetching on self-hosted servers, optionally configure an Exa API key. Without it, the tool still works using open-source extraction.
|
||||
|
||||
### Optional: Configure Exa
|
||||
|
||||
<CodeGroup>
|
||||
```bash Docker
|
||||
docker run \
|
||||
-e EXA_API_KEY="your_exa_api_key" \
|
||||
letta/letta:latest
|
||||
```
|
||||
|
||||
```yaml Docker Compose
|
||||
services:
|
||||
letta:
|
||||
environment:
|
||||
- EXA_API_KEY=your_exa_api_key
|
||||
```
|
||||
|
||||
```python Per-Agent
|
||||
agent = client.agents.create(
|
||||
tools=["fetch_webpage"],
|
||||
tool_exec_environment_variables={
|
||||
"EXA_API_KEY": "your_exa_api_key"
|
||||
}
|
||||
)
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Documentation Reader
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage", "web_search"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I search for documentation with web_search and read it with fetch_webpage."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### Research Assistant
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage", "archival_memory_insert"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I fetch articles and store key insights in archival memory for later reference."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### Content Summarizer
|
||||
```python
|
||||
agent = client.agents.create(
|
||||
model="openai/gpt-4o",
|
||||
tools=["fetch_webpage"],
|
||||
memory_blocks=[{
|
||||
"label": "persona",
|
||||
"value": "I fetch webpages and provide summaries of their content."
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
## When to Use
|
||||
|
||||
| Use Case | Tool | Why |
|
||||
|----------|------|-----|
|
||||
| Read specific webpage | `fetch_webpage` | Direct URL access |
|
||||
| Find webpages to read | `web_search` | Discovery first |
|
||||
| Read + search in one | `web_search` with `include_text=true` | Combined operation |
|
||||
| Multiple pages | `fetch_webpage` | Iterate over URLs |
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [Utilities Overview](/guides/agents/prebuilt-tools)
|
||||
- [Web Search](/guides/agents/web-search)
|
||||
- [Run Code](/guides/agents/run-code)
|
||||
- [Custom Tools](/guides/agents/custom-tools)
|
||||
- [Tool Variables](/guides/agents/tool-variables)
|
||||
@@ -1,28 +0,0 @@
|
||||
---
|
||||
title: Bring-Your-Own API Keys
|
||||
subtitle: Connect your own API keys for supported model providers (OpenAI, Anthropic, etc.)
|
||||
slug: guides/cloud/custom-keys
|
||||
---
|
||||
|
||||
<Note>
|
||||
To generate a **Letta API key** (which you use to interact with your agents on Letta Cloud), visit your [account settings](https://app.letta.com/settings/profile) page.
|
||||
</Note>
|
||||
|
||||
<Warning>
|
||||
Letta Cloud only supports bring-your-own-key for enterprise customers. To learn more about enterprise plans and pricing, visit our [pricing page](https://www.letta.com/pricing) or [contact us](https://forms.letta.com/request-demo) to request a demo.
|
||||
</Warning>
|
||||
|
||||
## Using Your Own API Keys
|
||||
|
||||
Connect your own API keys for supported providers (OpenAI, Anthropic, Gemini) to Letta Cloud through the [models page](https://app.letta.com/models). When you have a custom API key (successfully) registered, you will see additional models listed in the ADE model dropdown.
|
||||
|
||||
### Selecting Your Custom Provider
|
||||
|
||||
After you connect your own OpenAI / Anthropic / Gemini API key, make sure to select your custom provider in the ADE under "Your models".
|
||||
For example, after connecting your own OpenAI API key, you will see multiple OpenAI models but with different providers ("Letta hosted" vs "Your models") - if you want to use your own OpenAI API key, you need to select the copy of the model associated with your custom provider.
|
||||
|
||||
### Billing and Quotas
|
||||
|
||||
Requests made using your custom API keys **do not count** towards your monthly request quotas or usage-based billing. Instead, you'll be billed directly by the provider (OpenAI, Anthropic, etc.) according to their pricing for your personal account.
|
||||
|
||||
Note that direct provider pricing will likely differ from Letta Cloud rates, and requests through your own API key may cost more than those made through Letta Cloud's managed services.
|
||||
@@ -1,31 +0,0 @@
|
||||
---
|
||||
title: "Observability Overview"
|
||||
subtitle: "Monitor and trace your agents in Letta Cloud"
|
||||
slug: "guides/observability"
|
||||
---
|
||||
|
||||
<Note>
|
||||
All observability features are available in real-time for every Letta Cloud project.
|
||||
</Note>
|
||||
|
||||
Letta Cloud's observability tools help you monitor performance and debug issues. Each project you create in Letta Cloud has two main observability dashboards:
|
||||
|
||||
## [Monitoring](/guides/cloud/monitoring)
|
||||
|
||||
<img className="light" src="/images/observability_graph.png" />
|
||||
<img className="dark" src="/images/observability_graph_dark.png" />
|
||||
|
||||
Track key metrics across four dashboards:
|
||||
- **Overview**: Message count, API/tool errors, LLM/tool latency
|
||||
- **Activity & Usage**: Usage patterns and resource consumption
|
||||
- **Performance**: Response times and throughput
|
||||
- **Errors**: Detailed error analysis and debugging info
|
||||
|
||||
## [Responses & Tracing](/guides/observability/responses)
|
||||
|
||||
<img className="light" src="/images/observability_responses.png" />
|
||||
<img className="dark" src="/images/observability_responses_dark.png" />
|
||||
|
||||
Inspect API responses and agent execution:
|
||||
- **API Responses**: List of all responses with duration and status
|
||||
- **Message Inspection**: Click "Inspect Message" to see the full POST request and agent loop execution sequence
|
||||
@@ -1,63 +0,0 @@
|
||||
---
|
||||
title: Plans & Pricing
|
||||
subtitle: Guide to pricing and model usage for Free, Pro, and Enterprise plans
|
||||
slug: guides/cloud/plans
|
||||
---
|
||||
|
||||
<Note>
|
||||
Upgrade your plan and view your usage on [your account page](https://app.letta.com/settings/organization/usage)
|
||||
</Note>
|
||||
|
||||
## Available Plans
|
||||
|
||||
<CardGroup>
|
||||
<Card
|
||||
title="Free"
|
||||
subtitle="For getting started"
|
||||
>
|
||||
- **5,000** monthly credits
|
||||
- Access the Letta API
|
||||
- Edit agents visually in the ADE
|
||||
- **2** agent templates
|
||||
- **1 GB** of storage
|
||||
</Card>
|
||||
<Card
|
||||
title="Pro ($20 / month)"
|
||||
subtitle="For shipping agents in production"
|
||||
>
|
||||
- **20,000** monthly credits
|
||||
- Pay-as-you-go credit overage
|
||||
- Unlimited agents
|
||||
- **20** agent templates
|
||||
- **10 GB** of storage
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
<Note>
|
||||
For organizations with higher volume needs, our Enterprise plan offers increased quotas, dedicated support, role-based access control (RBAC), SSO (SAML, OIDC), and private model deployment options.
|
||||
[Contact our team](https://forms.letta.com/request-demo) to learn more.
|
||||
</Note>
|
||||
|
||||
## What are credits?
|
||||
|
||||
Credits are a standard cost unit for resources in Letta, such as LLM inference and CPU cycles. When agents run on Letta, they make LLM model requests and execute tools. Model requests consume credits at a rate depending on the model tier (standard vs. premium) and whether Max Mode is enabled for longer context sizes. Tool executions that run in Letta are charged at a flat rate per second of execution. See up-to-date credit pricing for available models [here](https://app.letta.com/settings/organization/models).
|
||||
|
||||
## What tools are executed by Letta?
|
||||
|
||||
Sandbox code execution and custom tool execution run inside of Letta, so they incur a credit cost for CPU time. Remote MCP tools are executed by the MCP tool provider, so they do not have a credit cost. Letta built-in tools are executed for free.
|
||||
|
||||
## How do monthly credits work?
|
||||
|
||||
Your Letta agents use large language models (LLMs) to reason and take actions. These model requests consume credits from your monthly balance (or additional purchased credits). Your balance of monthly credits refreshes every month.
|
||||
|
||||
## What is Max Mode?
|
||||
|
||||
Certain models have the ability to run with extended context windows. Turning on Max Mode extends the context window of the model driving your Letta agent beyond the 100k default, which may help when working with large files or codebases, but will increase cost (credit use) and latency.
|
||||
|
||||
## What's the difference between the Letta API and open source Letta?
|
||||
|
||||
The Letta API Platform is our fully-managed service for stateful agents, handling all agent infrastructure and state management to create scalable agent services. The Letta API Platform also has additional features beyond the open source such as durable execution for long-running agents, built-in sandboxing, agent templates, optimized vector search, message indexing, and observability.
|
||||
|
||||
## Can I transfer my agents between open source and cloud?
|
||||
|
||||
Yes, the Letta API Platform supports [agent file](https://docs.letta.com/guides/agents/agent-file), which allows you to move your agents freely between self-hosted instances of the Letta open source and the Letta platform.
|
||||
@@ -1,54 +0,0 @@
|
||||
---
|
||||
title: Memory Variables
|
||||
slug: guides/templates/variables
|
||||
---
|
||||
|
||||
<Note>
|
||||
Memory variables are a feature in [agent templates](/guides/cloud/templates) (part of [Letta Cloud](/guides/cloud)).
|
||||
To use memory variables, you must be using an agent template, not an agent.
|
||||
</Note>
|
||||
|
||||
Memory variables allow you to dynamically define parts of your agent memory at the time of agent creation (when a [template](/guides/cloud/templates) is used to create a new agent).
|
||||
|
||||
## Defining variables in memory blocks
|
||||
|
||||
To use memory variables in your agent templates, you can define variables in your memory blocks by wrapping them in `{{ }}`.
|
||||
For example, if you have an agent template called `customer-service-template` designed to handle customer support issues, you might have a block of memory that stores information about the user:
|
||||
```handlebars
|
||||
The user is contacting me to resolve a customer support issue.
|
||||
Their name is {{name}} and the ticket number for this request is {{ticket}}.
|
||||
```
|
||||
|
||||
Once variables have been defined inside of your memory block, they will dynamically appear as variables in the **ADE variables window** (click the "\{\} Variables" button at the top of the chat window to expand the dropdown).
|
||||
|
||||
## Simulating variable values in the ADE
|
||||
|
||||
<Tip>
|
||||
Reset the state of the simulated agent by clicking the "Flush Simulation" 🔄 button.
|
||||
</Tip>
|
||||
|
||||
While designing agent templates in the ADE, you can interact with a simulated agent.
|
||||
The ADE variables window allows you to specify the values of the variables for the simulated agent.
|
||||
|
||||
You can see the current state of the simulated agent's memory by clicking the "Simulated" tab in the "Core Memory" panel in the ADE.
|
||||
If you're using memory variables and do not specify values for the variables in the ADE variables window, the simulated agent will use empty values.
|
||||
|
||||
In this prior example, the `name` and `ticket` variables are memory variables that we will specify when we create a new agent - information that we expect to have available at that time.
|
||||
While designing the agent template, we will likely want to experiment with different values for these variables to make sure that the agent is behaving as expected.
|
||||
For example, if we change the name of the user from "Alice" to "Bob", the simulated agent should respond accordingly.
|
||||
|
||||
## Defining variables during agent creation
|
||||
|
||||
When we're ready to create an agent from our template, we can specify the values for the variables using the `variables` parameter in the [create agents from template endpoint](/api-reference/templates/agents/create):
|
||||
```sh
|
||||
curl -X POST https://app.letta.com/v1/templates/{project_slug}/{template_name}:{template_version} \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"from_template": "customer-service-template:latest",
|
||||
"variables": {
|
||||
"name": "Bob",
|
||||
"ticket": "TX-123"
|
||||
}
|
||||
}'
|
||||
```
|
||||
@@ -1,41 +0,0 @@
|
||||
---
|
||||
title: Versioning Agent Templates
|
||||
slug: guides/templates/versioning
|
||||
---
|
||||
|
||||
<Note>
|
||||
Versioning is a feature in [agent templates](/guides/cloud/templates) (part of [Letta Cloud](/guides/cloud/overview)).
|
||||
To use versioning, you must be using an agent template, not an agent.
|
||||
</Note>
|
||||
|
||||
Versions allow you to keep track of the changes you've made to your template over time.
|
||||
Agent templates follow the versioning convention of `template-name:version-number`.
|
||||
|
||||
Similar to [Docker tags](https://docs.docker.com/get-started/docker-concepts/building-images/build-tag-and-publish-an-image/#tagging-images), you can specify the latest version of a template using the `latest` keyword (`template-name:latest`).
|
||||
|
||||
## Creating a new template version
|
||||
When you create a template, it starts off at version 1.
|
||||
Once you've made edits to your template in the ADE, you can create a new version of the template by clicking the "Template" button in the ADE (top right), then clicking "Save new template version".
|
||||
Version numbers are incremented automatically (e.g. version 1 becomes version 2).
|
||||
|
||||
## Migrating existing agents to a new template version
|
||||
If you've deployed agents on a previous version of the template, you'll be asked if you want to migrate your existing agents to the new version of the template.
|
||||
When you migrate existing agents to a new template version, Letta Cloud will re-create your existing agents using the new template information, but keeping prior agent state such as the conversation history, and injecting memory variables as needed.
|
||||
|
||||
### When should I migrate (or not migrate) my agents?
|
||||
One reason you might want to migrate your agents is if you've added new tools to your agent template: migrating existing agents to the new version of the template will give them access to the new tools, while retaining all of their prior state.
|
||||
Another example use case is if you make modifications to your prompts to tune your agent behavior - if you find a modification works well, you can save a new version with the prompt edits, and migrate all deployed agents to the new version.
|
||||
|
||||
### Forking an agent template
|
||||
If you decide to make significant changes to your agent and would prefer to make a new template to track your changes, you can easily create a new agent template from an existing template by **forking** your template (click the settings button ⚙️ in the ADE, then click "Fork Template").
|
||||
|
||||
## Specifying a version when creating an agent
|
||||
|
||||
You can specify a template version when creating an agent using the [create agents from template endpoint](/api-reference/templates/agents/create).
|
||||
For example, to deploy an agent from a template called `template-name` at version 2, you would use `:2` as the template tag:
|
||||
```sh
|
||||
curl -X POST https://app.letta.com/v1/templates/{project_slug}/{template_name}:2 \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{}'
|
||||
```
|
||||
@@ -1,124 +0,0 @@
|
||||
---
|
||||
title: Research Background
|
||||
subtitle: The academic foundations of Letta
|
||||
slug: concepts/letta
|
||||
---
|
||||
|
||||
<Info>
|
||||
**Looking for practical concepts?** See [Core Concepts](/core-concepts) for understanding how to build with Letta's stateful agents.
|
||||
</Info>
|
||||
|
||||
## Letta and MemGPT
|
||||
|
||||
**[Letta](https://letta.com)** was created by the same team that created **[MemGPT](https://research.memgpt.ai)**.
|
||||
|
||||
### MemGPT: The Research Paper
|
||||
|
||||
**MemGPT is a research paper** ([arXiv:2310.08560](https://arxiv.org/abs/2310.08560)) that introduced foundational concepts for building stateful LLM agents:
|
||||
|
||||
- **Self-editing memory** - LLMs using tools to edit their own context window and external storage
|
||||
- **LLM Operating System** - Infrastructure layer managing agent state, memory, and execution
|
||||
- **Memory hierarchy** - Distinguishing between in-context memory (core) and out-of-context memory (archival)
|
||||
- **Context window management** - Intelligent paging and memory consolidation techniques
|
||||
|
||||
The paper demonstrated that LLMs could maintain coherent conversations far beyond their context window limits by actively managing their own memory through tool calling.
|
||||
|
||||
[Read the full MemGPT paper →](https://arxiv.org/abs/2310.08560)
|
||||
|
||||
### MemGPT: The Agent Architecture
|
||||
|
||||
MemGPT also refers to a **specific agent architecture** popularized by the research paper. A MemGPT agent has:
|
||||
- Memory editing tools (`memory_replace`, `memory_insert`, `memory_rethink`)
|
||||
- Archival memory tools (`archival_memory_insert`, `archival_memory_search`)
|
||||
- Conversation search tools (`conversation_search`, `conversation_search_date`)
|
||||
- A structured context window with persona and human memory blocks
|
||||
|
||||
This architecture makes MemGPT agents particularly effective for long-range chat applications, document search, and personalized assistants.
|
||||
|
||||
[Learn more about MemGPT agents →](/guides/legacy/memgpt-agents-legacy)
|
||||
|
||||
### Letta: The Framework
|
||||
|
||||
**Letta is a production framework** that allows you to build and deploy agents with MemGPT-style memory systems (and beyond) as **services** behind REST APIs.
|
||||
|
||||
While the MemGPT research focused on the agent architecture and memory system, Letta provides:
|
||||
- **Production infrastructure** - Database backends, persistence, state management
|
||||
- **Agent runtime** - Tool execution, reasoning loops, multi-agent orchestration
|
||||
- **Developer tools** - Agent Development Environment (ADE), SDKs, monitoring
|
||||
- **Deployment options** - Letta Cloud for managed hosting, or self-hosted with Docker
|
||||
- **Flexibility** - Build MemGPT agents, or design custom agent architectures with different memory systems
|
||||
|
||||
**In short:**
|
||||
- **MemGPT (research)** = Ideas about how agents should manage memory
|
||||
- **MemGPT (architecture)** = Specific agent design with memory tools
|
||||
- **Letta (framework)** = Production system for building and deploying stateful agents
|
||||
|
||||
## Agents in Context
|
||||
|
||||
The concept of "agents" has a long history across multiple fields:
|
||||
|
||||
**In reinforcement learning and AI**, agents are entities that:
|
||||
1. Perceive their environment through sensors
|
||||
2. Make decisions based on internal state
|
||||
3. Take actions that affect their environment
|
||||
4. Learn from outcomes to improve future decisions
|
||||
|
||||
**In economics and game theory**, agents are autonomous decision-makers with their own objectives and strategies.
|
||||
|
||||
**In LLMs**, agents extend these concepts by using language models for reasoning and tool calling for actions. Letta's approach emphasizes:
|
||||
- **Statefulness** - Persistent memory and identity across sessions
|
||||
- **Autonomy** - Self-directed memory management and multi-step reasoning
|
||||
- **Tool use** - Modifying internal state and accessing external resources
|
||||
|
||||
## LLM Operating System
|
||||
|
||||
The **LLM OS** is the infrastructure layer that manages agent execution and state. This concept, introduced in the MemGPT paper, draws an analogy to traditional operating systems:
|
||||
|
||||
Just as an OS manages memory, processes, and I/O for programs, the LLM OS manages:
|
||||
- **Memory layer** - Context window management, paging, and persistence
|
||||
- **Agent runtime** - Tool execution and the reasoning loop
|
||||
- **Stateful layer** - Coordination across database, cache, and execution
|
||||
|
||||
Letta implements this LLM OS architecture, providing the infrastructure for stateful agent services.
|
||||
|
||||
## Self-Editing Memory
|
||||
|
||||
A key innovation from the MemGPT research is **self-editing memory** - agents that actively manage their own memory using tools.
|
||||
|
||||
Traditional RAG systems passively retrieve documents. Letta agents actively:
|
||||
- **Edit in-context memory** - Update memory blocks based on learned information
|
||||
- **Manage archival storage** - Decide what facts to persist long-term
|
||||
- **Search strategically** - Query their memory when relevant context is needed
|
||||
|
||||
This active memory management enables agents to learn and evolve through interactions rather than requiring retraining or prompt engineering.
|
||||
|
||||
[Learn more about Letta's memory system →](/guides/agents/memory)
|
||||
|
||||
## Further Reading
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Core Concepts"
|
||||
href="/core-concepts"
|
||||
>
|
||||
Practical guide to building with stateful agents
|
||||
</Card>
|
||||
<Card
|
||||
title="MemGPT Research Details"
|
||||
href="/concepts/memgpt"
|
||||
>
|
||||
Deep dive into the MemGPT paper's technical contributions
|
||||
</Card>
|
||||
<Card
|
||||
title="Agent Memory System"
|
||||
href="/guides/agents/memory"
|
||||
>
|
||||
How agents manage memory in Letta
|
||||
</Card>
|
||||
<Card
|
||||
title="MemGPT Agents"
|
||||
href="/guides/legacy/memgpt-agents-legacy"
|
||||
>
|
||||
Build agents with the MemGPT architecture
|
||||
</Card>
|
||||
</CardGroup>
|
||||
@@ -1,37 +0,0 @@
|
||||
---
|
||||
title: MemGPT
|
||||
subtitle: Learn about the key ideas behind MemGPT
|
||||
slug: concepts/memgpt
|
||||
---
|
||||
|
||||
|
||||
<Tip>The MemGPT open source framework / package was renamed to _Letta_. You can read about the difference between Letta and MemGPT [here](/concepts/letta), or read more about the change on our [blog post](https://www.letta.com/blog/memgpt-and-letta).</Tip>
|
||||
|
||||
## MemGPT - the research paper
|
||||
|
||||
<Frame caption="Figure 1 from the MemGPT paper showing the system architecture. Note that 'working context' from the paper is referred to as 'core memory' in the codebase. To read the paper, visit https://arxiv.org/abs/2310.08560.">
|
||||
<img src="/images/memgpt-system-diagram.png" />
|
||||
</Frame>
|
||||
|
||||
**MemGPT** is the name of a [**research paper**](https://arxiv.org/abs/2310.08560) that popularized several of the key concepts behind the "LLM Operating System (OS)":
|
||||
1. **Memory management**: In MemGPT, an LLM OS moves data in and out of the context window of the LLM to manage its memory.
|
||||
2. **Memory hierarchy**: The "LLM OS" divides the LLM's memory (aka its "virtual context", similar to "[virtual memory](https://en.wikipedia.org/wiki/Virtual_memory)" in computer systems) into two parts: the in-context memory, and out-of-context memory.
|
||||
3. **Self-editing memory via tool calling**: In MemGPT, the "OS" that manages memory is itself an LLM. The LLM moves data in and out of the context window using designated memory-editing tools.
|
||||
4. **Multi-step reasoning using heartbeats**: MemGPT supports multi-step reasoning (allowing the agent to take multiple steps in sequence) via the concept of "heartbeats". Whenever the LLM outputs a tool call, it has the option to request a heartbeat by setting the keyword argument `request_heartbeat` to `true`. If the LLM requests a heartbeat, the LLM OS continues execution in a loop, allowing the LLM to "think" again.
|
||||
|
||||
You can read more about the MemGPT memory hierarchy and memory management system in our [memory concepts guide](/advanced/memory_management).
|
||||
|
||||
## MemGPT - the agent architecture
|
||||
|
||||
**MemGPT** also refers to a particular **agent architecture** that was popularized by the paper and adopted widely by other LLM chatbots:
|
||||
1. **Chat-focused core memory**: The core memory of a MemGPT agent is split into two parts - the agent's own persona, and the user information. Because the MemGPT agent has self-editing memory, it can update its own personality over time, as well as update the user information as it learns new facts about the user.
|
||||
2. **Vector database archival memory**: By default, the archival memory connected to a MemGPT agent is backed by a vector database, such as [Chroma](https://www.trychroma.com/) or [pgvector](https://github.com/pgvector/pgvector). Because in MemGPT all connections to memory are driven by tools, it's simple to exchange archival memory to be powered by a more traditional database (you can even make archival memory a flatfile if you want!).
|
||||
|
||||
## Creating MemGPT agents in the Letta framework
|
||||
|
||||
Because **Letta** was created out of the original MemGPT open source project, it's extremely easy to make MemGPT agents inside of Letta (the default Letta agent architecture is a MemGPT agent).
|
||||
See our [agents overview](/guides/agents/overview) for a tutorial on how to create MemGPT agents with Letta.
|
||||
|
||||
**The Letta framework also allows you to make agent architectures beyond MemGPT** that differ significantly from the architecture proposed in the research paper - for example, agents with multiple logical threads (e.g. a "conscious" and a "subconscious"), or agents with more advanced memory types (e.g. task memory).
|
||||
|
||||
Additionally, **the Letta framework also allows you to expose your agents as *services*** (over REST APIs) - so you can use the Letta framework to power your AI applications.
|
||||
@@ -1,59 +0,0 @@
|
||||
---
|
||||
title: RAG with Letta
|
||||
subtitle: Connect your custom RAG pipeline to Letta agents
|
||||
slug: guides/rag/overview
|
||||
---
|
||||
|
||||
If you have an existing Retrieval-Augmented Generation (RAG) pipeline, you can connect it to your Letta agents. While Letta provides built-in features like archival memory, you can integrate your own RAG pipeline just as you would with any LLM API. This gives you full control over your data and retrieval methods.
|
||||
|
||||
## What is RAG?
|
||||
|
||||
Retrieval-Augmented Generation (RAG) enhances LLM responses by retrieving relevant information from external data sources before generating an answer. Instead of relying on the model's training data, a RAG system:
|
||||
|
||||
1. Takes a user query.
|
||||
2. Searches a vector database for relevant documents.
|
||||
3. Includes those documents in the LLM's context.
|
||||
4. Generates an informed response based on the retrieved information.
|
||||
|
||||
## Choosing Your RAG Approach
|
||||
|
||||
Letta supports two approaches for integrating RAG, depending on how much control you want over the retrieval process.
|
||||
|
||||
| Aspect | Simple RAG | Agentic RAG |
|
||||
|--------|------------|-------------|
|
||||
| **Who Controls Retrieval** | Your application controls when retrieval happens and what the retrieval query is. | The agent decides when to retrieve and what query to use. |
|
||||
| **Context Inclusion** | You can always include retrieval results in the context. | Retrieval happens only when the agent determines it's needed. |
|
||||
| **Latency** | Lower – typically single-hop, as the agent doesn't need to do a tool call. | Higher – requires tool calls for retrieval. |
|
||||
| **Client Code** | More complex, as it handles retrieval logic. | Simpler, as it just sends the user query. |
|
||||
| **Customization** | You have full control via your retrieval function. | You have full control via your custom tool definition. |
|
||||
|
||||
Both approaches work with any vector database. Our tutorials include examples for **ChromaDB**, **MongoDB Atlas**, and **Qdrant**.
|
||||
|
||||
## Next Steps
|
||||
|
||||
Ready to integrate RAG with your Letta agents?
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Simple RAG Tutorial"
|
||||
icon="fa-sharp fa-light fa-magnifying-glass"
|
||||
href="/guides/rag/simple"
|
||||
iconPosition="left"
|
||||
>
|
||||
Learn how to manage retrieval on the client-side and inject context directly into your agent's messages.
|
||||
</Card>
|
||||
<Card
|
||||
title="Agentic RAG Tutorial"
|
||||
icon="fa-sharp fa-light fa-robot"
|
||||
href="/guides/rag/agentic"
|
||||
iconPosition="left"
|
||||
>
|
||||
Learn how to empower your agent with custom search tools for autonomous retrieval.
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Custom Tools](/guides/agents/custom-tools) - Learn more about creating custom tools for your agents.
|
||||
- [Memory Management](/guides/agents/memory) - Understand how Letta's built-in memory works.
|
||||
- [Agent Development Environment](/guides/ade) - Configure and test your agents in the web interface.
|
||||
@@ -1,274 +0,0 @@
|
||||
---
|
||||
title: Examples & Tutorials
|
||||
slug: cookbooks
|
||||
---
|
||||
|
||||
Build powerful AI agents with persistent memory. Explore tutorials, ready-to-use templates, and community projects to get started.
|
||||
|
||||
<Info>
|
||||
**New to Letta?**
|
||||
|
||||
- Start with our [Quickstart Guide](/quickstart)
|
||||
- Take the free [DeepLearning.AI Course](https://www.deeplearning.ai/short-courses/llms-as-operating-systems-agent-memory/)
|
||||
- Explore [Awesome Letta](https://github.com/letta-ai/awesome-letta) for more resources
|
||||
</Info>
|
||||
|
||||
## Getting Started Tutorials
|
||||
|
||||
Step-by-step guides to learn Letta fundamentals.
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Your First Agent"
|
||||
icon="fa-sharp fa-light fa-rocket"
|
||||
href="/examples/hello-world"
|
||||
iconPosition="left"
|
||||
>
|
||||
Build your first Letta agent in minutes
|
||||
</Card>
|
||||
<Card
|
||||
title="Talk to Your PDF"
|
||||
icon="fa-sharp fa-light fa-file-pdf"
|
||||
href="/tutorials/pdf-chat"
|
||||
iconPosition="left"
|
||||
>
|
||||
Create an agent that can answer questions about PDF documents
|
||||
</Card>
|
||||
<Card
|
||||
title="Attaching & Detaching Memory Blocks"
|
||||
icon="fa-sharp fa-light fa-memory"
|
||||
href="/examples/attaching-detaching-blocks"
|
||||
iconPosition="left"
|
||||
>
|
||||
Learn how to dynamically manage agent memory
|
||||
</Card>
|
||||
<Card
|
||||
title="Shared Memory Blocks"
|
||||
icon="fa-sharp fa-light fa-share-nodes"
|
||||
href="/tutorials/shared-memory-blocks"
|
||||
iconPosition="left"
|
||||
>
|
||||
Share memory between multiple agents for coordination
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Ready-to-Deploy Applications
|
||||
|
||||
Production-ready templates you can clone and customize.
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Next.js Chatbot"
|
||||
icon="fa-sharp fa-light fa-messages"
|
||||
href="https://github.com/letta-ai/letta-chatbot-example"
|
||||
iconPosition="left"
|
||||
>
|
||||
Full-stack chatbot with per-user agent memory (Next.js + TypeScript)
|
||||
</Card>
|
||||
<Card
|
||||
title="Discord Bot"
|
||||
icon="fa-brands fa-discord"
|
||||
href="https://github.com/letta-ai/letta-discord-bot-example"
|
||||
iconPosition="left"
|
||||
>
|
||||
Discord bot with persistent memory for each server and user
|
||||
</Card>
|
||||
<Card
|
||||
title="Character.AI Clone"
|
||||
icon="fa-sharp fa-light fa-user-robot"
|
||||
href="https://github.com/letta-ai/characterai-memory"
|
||||
iconPosition="left"
|
||||
>
|
||||
Create AI characters with memory that persists across conversations
|
||||
</Card>
|
||||
<Card
|
||||
title="Deep Research Agent"
|
||||
icon="fa-sharp fa-light fa-magnifying-glass"
|
||||
href="https://github.com/letta-ai/deep-research"
|
||||
iconPosition="left"
|
||||
>
|
||||
Research agent that gathers and synthesizes information over time
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Multi-Agent Systems
|
||||
|
||||
Build coordinated teams of specialized agents.
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Async Multi-Agent"
|
||||
icon="fa-sharp fa-light fa-user-group"
|
||||
href="/cookbooks/multi-agent-async"
|
||||
iconPosition="left"
|
||||
>
|
||||
Connect agents to chat with each other and users simultaneously
|
||||
</Card>
|
||||
<Card
|
||||
title="Customer-Specific Agents"
|
||||
icon="fa-sharp fa-light fa-users"
|
||||
href="/cookbooks/customer-specific-agents"
|
||||
iconPosition="left"
|
||||
>
|
||||
Template for building relationship-aware agents for each customer
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Tools & Integrations
|
||||
|
||||
Connect Letta to your favorite platforms and tools.
|
||||
|
||||
<CardGroup cols={3}>
|
||||
<Card
|
||||
title="Vercel AI SDK"
|
||||
icon="fa-sharp fa-light fa-triangle"
|
||||
href="https://github.com/letta-ai/vercel-ai-sdk-provider"
|
||||
iconPosition="left"
|
||||
>
|
||||
Use Letta with Vercel AI SDK v5
|
||||
</Card>
|
||||
<Card
|
||||
title="Zapier"
|
||||
icon="fa-sharp fa-light fa-bolt"
|
||||
href="https://zapier.com/apps/letta/integrations"
|
||||
iconPosition="left"
|
||||
>
|
||||
Connect agents to 7,000+ apps
|
||||
</Card>
|
||||
<Card
|
||||
title="n8n Workflows"
|
||||
icon="fa-sharp fa-light fa-diagram-project"
|
||||
href="https://github.com/letta-ai/n8n-nodes-letta"
|
||||
iconPosition="left"
|
||||
>
|
||||
Integrate with n8n automation workflows
|
||||
</Card>
|
||||
<Card
|
||||
title="Telegram Bot"
|
||||
icon="fa-brands fa-telegram"
|
||||
href="https://github.com/letta-ai/letta-telegram"
|
||||
iconPosition="left"
|
||||
>
|
||||
Deploy agents on Telegram
|
||||
</Card>
|
||||
<Card
|
||||
title="Obsidian Plugin"
|
||||
icon="fa-sharp fa-light fa-note-sticky"
|
||||
href="https://github.com/letta-ai/letta-obsidian"
|
||||
iconPosition="left"
|
||||
>
|
||||
Add Letta agents to your knowledge base
|
||||
</Card>
|
||||
<Card
|
||||
title="DuckDB Agent"
|
||||
icon="fa-sharp fa-light fa-database"
|
||||
href="https://github.com/letta-ai/letta-duckdb-agent"
|
||||
iconPosition="left"
|
||||
>
|
||||
SQL-powered data analysis agent
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## SDK Examples
|
||||
|
||||
Learn the basics with minimal code examples.
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="TypeScript SDK"
|
||||
icon="fa-brands fa-js"
|
||||
href="https://github.com/letta-ai/letta/tree/main/examples/docs/node/example.ts"
|
||||
iconPosition="left"
|
||||
>
|
||||
Basic TypeScript/Node.js SDK example
|
||||
</Card>
|
||||
<Card
|
||||
title="Python SDK"
|
||||
icon="fa-brands fa-python"
|
||||
href="https://github.com/letta-ai/letta/tree/main/examples/docs/example.py"
|
||||
iconPosition="left"
|
||||
>
|
||||
Basic Python SDK example
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Community Projects
|
||||
|
||||
Amazing projects built by the Letta community.
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Thought Stream"
|
||||
icon="fa-sharp fa-light fa-comments"
|
||||
href="https://tangled.sh/@cameron.pfiffer.org/thought-stream"
|
||||
iconPosition="left"
|
||||
>
|
||||
Deploy Letta agents to an ATProto-powered multi-agent chatroom
|
||||
</Card>
|
||||
<Card
|
||||
title="Thought Stream CLI"
|
||||
icon="fa-sharp fa-light fa-terminal"
|
||||
href="https://tangled.org/@cameron.pfiffer.org/thought-stream-cli"
|
||||
iconPosition="left"
|
||||
>
|
||||
IRC-style CLI for the Thought Stream
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Learning Resources
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="DeepLearning.AI Course"
|
||||
icon="fa-sharp fa-light fa-graduation-cap"
|
||||
href="https://www.deeplearning.ai/short-courses/llms-as-operating-systems-agent-memory/"
|
||||
iconPosition="left"
|
||||
>
|
||||
Free course: LLMs as Operating Systems - Building Agents with Memory
|
||||
</Card>
|
||||
<Card
|
||||
title="Core Concepts"
|
||||
icon="fa-sharp fa-light fa-book"
|
||||
href="/overview"
|
||||
iconPosition="left"
|
||||
>
|
||||
Understand how Letta agents work
|
||||
</Card>
|
||||
<Card
|
||||
title="API Reference"
|
||||
icon="fa-sharp fa-light fa-code"
|
||||
href="/api-reference/overview"
|
||||
iconPosition="left"
|
||||
>
|
||||
Complete API documentation
|
||||
</Card>
|
||||
<Card
|
||||
title="Research Papers"
|
||||
icon="fa-sharp fa-light fa-flask"
|
||||
href="https://www.letta.com/blog"
|
||||
iconPosition="left"
|
||||
>
|
||||
Read about the research behind Letta
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## More Resources
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Awesome Letta"
|
||||
icon="fa-sharp fa-light fa-star"
|
||||
href="https://github.com/letta-ai/awesome-letta"
|
||||
iconPosition="left"
|
||||
>
|
||||
Comprehensive curated list of Letta resources, tools, and community projects
|
||||
</Card>
|
||||
<Card
|
||||
title="Join Discord"
|
||||
icon="fa-brands fa-discord"
|
||||
href="https://discord.gg/letta"
|
||||
iconPosition="left"
|
||||
>
|
||||
Get help and share your projects with the community
|
||||
</Card>
|
||||
</CardGroup>
|
||||
@@ -1,68 +0,0 @@
|
||||
# Letta Evals Documentation
|
||||
|
||||
Welcome to the comprehensive documentation for Letta Evals Kit - a framework for evaluating Letta AI agents.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
### Getting Started
|
||||
- [Getting Started](./getting-started.md) - Installation, first evaluation, and core concepts
|
||||
|
||||
### Core Concepts
|
||||
- [Overview](./concepts/overview.md) - Understanding the evaluation framework
|
||||
- [Suites](./concepts/suites.md) - Evaluation suite configuration
|
||||
- [Datasets](./concepts/datasets.md) - Creating and managing test datasets
|
||||
- [Targets](./concepts/targets.md) - What you're evaluating
|
||||
- [Graders](./concepts/graders.md) - How responses are scored
|
||||
- [Extractors](./concepts/extractors.md) - Extracting submissions from agent output
|
||||
- [Gates](./concepts/gates.md) - Pass/fail criteria
|
||||
|
||||
### Graders
|
||||
- [Grader Overview](./graders/overview.md) - Understanding grader types
|
||||
- [Tool Graders](./graders/tool-graders.md) - Built-in and custom function graders
|
||||
- [Rubric Graders](./graders/rubric-graders.md) - LLM-as-judge evaluation
|
||||
- [Multi-Metric Grading](./graders/multi-metric.md) - Evaluating with multiple metrics
|
||||
|
||||
### Extractors
|
||||
- [Extractor Overview](./extractors/overview.md) - Understanding extractors
|
||||
- [Built-in Extractors](./extractors/builtin.md) - All available extractors
|
||||
- [Custom Extractors](./extractors/custom.md) - Writing your own extractors
|
||||
|
||||
### Configuration
|
||||
- [Suite YAML Reference](./configuration/suite-yaml.md) - Complete YAML schema
|
||||
- [Target Configuration](./configuration/targets.md) - Target setup options
|
||||
- [Grader Configuration](./configuration/graders.md) - Grader parameters
|
||||
- [Environment Variables](./configuration/environment.md) - Environment setup
|
||||
|
||||
### Advanced Usage
|
||||
- [Custom Graders](./advanced/custom-graders.md) - Writing custom grading functions
|
||||
- [Multi-Turn Conversations](./advanced/multi-turn-conversations.md) - Testing conversational memory and state
|
||||
- [Agent Factories](./advanced/agent-factories.md) - Programmatic agent creation
|
||||
- [Multi-Model Evaluation](./advanced/multi-model.md) - Testing across models
|
||||
- [Setup Scripts](./advanced/setup-scripts.md) - Pre-evaluation setup
|
||||
- [Memory Block Testing](./advanced/memory-blocks.md) - Testing agent memory
|
||||
- [Result Streaming](./advanced/streaming.md) - Real-time results and caching
|
||||
|
||||
### Results & Metrics
|
||||
- [Understanding Results](./results/overview.md) - Result structure and interpretation
|
||||
- [Metrics](./results/metrics.md) - Aggregate statistics
|
||||
- [Output Formats](./results/output-formats.md) - JSON, JSONL, and console output
|
||||
|
||||
### CLI Reference
|
||||
- [Commands](./cli/commands.md) - All CLI commands
|
||||
- [Options](./cli/options.md) - Command-line options
|
||||
|
||||
### Examples
|
||||
- [Example Walkthroughs](./examples/README.md) - Detailed example explanations
|
||||
|
||||
### API Reference
|
||||
- [Data Models](./api/models.md) - Pydantic models reference
|
||||
- [Decorators](./api/decorators.md) - @grader and @extractor decorators
|
||||
|
||||
### Best Practices
|
||||
- [Writing Effective Tests](./best-practices/writing-tests.md)
|
||||
- [Designing Rubrics](./best-practices/rubrics.md)
|
||||
- [Performance Optimization](./best-practices/performance.md)
|
||||
|
||||
### Troubleshooting
|
||||
- [Common Issues](./troubleshooting.md)
|
||||
- [FAQ](./faq.md)
|
||||
@@ -1,425 +0,0 @@
|
||||
# Custom Graders
|
||||
|
||||
Write your own grading functions to implement custom evaluation logic.
|
||||
|
||||
## Overview
|
||||
|
||||
Custom graders let you:
|
||||
- Implement domain-specific evaluation
|
||||
- Parse and validate complex formats
|
||||
- Apply custom scoring algorithms
|
||||
- Combine multiple checks in one grader
|
||||
|
||||
## Basic Structure
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import grader
|
||||
from letta_evals.models import GradeResult, Sample
|
||||
|
||||
@grader
|
||||
def my_custom_grader(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Your custom grading logic."""
|
||||
# Evaluate the submission
|
||||
score = calculate_score(submission, sample)
|
||||
|
||||
return GradeResult(
|
||||
score=score, # Must be 0.0 to 1.0
|
||||
rationale="Explanation of the score",
|
||||
metadata={"extra": "information"}
|
||||
)
|
||||
```
|
||||
|
||||
## The @grader Decorator
|
||||
|
||||
The `@grader` decorator registers your function so it can be used in suite YAML:
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import grader
|
||||
|
||||
@grader # Makes this function available as "my_function"
|
||||
def my_function(sample: Sample, submission: str) -> GradeResult:
|
||||
...
|
||||
```
|
||||
|
||||
Without the decorator, your function won't be discovered.
|
||||
|
||||
## Function Signature
|
||||
|
||||
Your grader must have this signature:
|
||||
|
||||
```python
|
||||
def grader_name(sample: Sample, submission: str) -> GradeResult:
|
||||
...
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
- `sample`: The dataset sample being evaluated (includes `input`, `ground_truth`, `metadata`, etc.)
|
||||
- `submission`: The extracted text from the agent's response
|
||||
|
||||
### Return Value
|
||||
|
||||
Must return a `GradeResult`:
|
||||
|
||||
```python
|
||||
from letta_evals.models import GradeResult
|
||||
|
||||
return GradeResult(
|
||||
score=0.85, # Required: 0.0 to 1.0
|
||||
rationale="Explanation", # Optional but recommended
|
||||
metadata={"key": "value"} # Optional: any extra data
|
||||
)
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
```python
|
||||
# custom_graders.py
|
||||
import json
|
||||
from letta_evals.decorators import grader
|
||||
from letta_evals.models import GradeResult, Sample
|
||||
|
||||
@grader
|
||||
def json_field_validator(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Validates JSON and checks for required fields."""
|
||||
required_fields = sample.ground_truth.split(",") # e.g., "name,age,email"
|
||||
|
||||
try:
|
||||
data = json.loads(submission)
|
||||
except json.JSONDecodeError as e:
|
||||
return GradeResult(
|
||||
score=0.0,
|
||||
rationale=f"Invalid JSON: {e}",
|
||||
metadata={"error": "json_decode"}
|
||||
)
|
||||
|
||||
missing = [f for f in required_fields if f not in data]
|
||||
|
||||
if missing:
|
||||
score = 1.0 - (len(missing) / len(required_fields))
|
||||
return GradeResult(
|
||||
score=score,
|
||||
rationale=f"Missing fields: {', '.join(missing)}",
|
||||
metadata={"missing_fields": missing}
|
||||
)
|
||||
|
||||
return GradeResult(
|
||||
score=1.0,
|
||||
rationale="All required fields present",
|
||||
metadata={"fields_found": required_fields}
|
||||
)
|
||||
```
|
||||
|
||||
Dataset:
|
||||
```jsonl
|
||||
{"input": "Return user info as JSON", "ground_truth": "name,age,email"}
|
||||
```
|
||||
|
||||
Suite:
|
||||
```yaml
|
||||
graders:
|
||||
json_check:
|
||||
kind: tool
|
||||
function: json_field_validator
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## Using Custom Graders
|
||||
|
||||
### Method 1: Custom Evaluators File
|
||||
|
||||
Create a file with your graders (e.g., `custom_evaluators.py`) in your project:
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import grader
|
||||
from letta_evals.models import GradeResult, Sample
|
||||
|
||||
@grader
|
||||
def my_grader(sample: Sample, submission: str) -> GradeResult:
|
||||
...
|
||||
```
|
||||
|
||||
Reference it in your suite:
|
||||
|
||||
```yaml
|
||||
# The file will be automatically discovered if it's in the same directory
|
||||
# or use Python path imports
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: my_grader
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Method 2: Setup Script
|
||||
|
||||
Import your graders in a setup script:
|
||||
|
||||
```python
|
||||
# setup.py
|
||||
from letta_evals.models import SuiteSpec
|
||||
import custom_evaluators # This imports and registers graders
|
||||
|
||||
def prepare_environment(suite: SuiteSpec) -> None:
|
||||
pass # Graders are registered via import
|
||||
```
|
||||
|
||||
```yaml
|
||||
setup_script: setup.py:prepare_environment
|
||||
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: my_grader
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## Real-World Examples
|
||||
|
||||
### Length Check
|
||||
|
||||
```python
|
||||
@grader
|
||||
def appropriate_length(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Check if response length is within expected range."""
|
||||
min_len = 50
|
||||
max_len = 500
|
||||
length = len(submission)
|
||||
|
||||
if min_len <= length <= max_len:
|
||||
score = 1.0
|
||||
rationale = f"Length {length} is appropriate"
|
||||
elif length < min_len:
|
||||
score = max(0.0, length / min_len)
|
||||
rationale = f"Too short: {length} chars (min {min_len})"
|
||||
else:
|
||||
score = max(0.0, 1.0 - (length - max_len) / max_len)
|
||||
rationale = f"Too long: {length} chars (max {max_len})"
|
||||
|
||||
return GradeResult(score=score, rationale=rationale)
|
||||
```
|
||||
|
||||
### Keyword Coverage
|
||||
|
||||
```python
|
||||
@grader
|
||||
def keyword_coverage(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Check what percentage of required keywords are present."""
|
||||
keywords = sample.ground_truth.split(",")
|
||||
submission_lower = submission.lower()
|
||||
|
||||
found = [kw for kw in keywords if kw.lower() in submission_lower]
|
||||
score = len(found) / len(keywords) if keywords else 0.0
|
||||
|
||||
return GradeResult(
|
||||
score=score,
|
||||
rationale=f"Found {len(found)}/{len(keywords)} keywords: {', '.join(found)}",
|
||||
metadata={"found": found, "missing": list(set(keywords) - set(found))}
|
||||
)
|
||||
```
|
||||
|
||||
Dataset:
|
||||
```jsonl
|
||||
{"input": "Explain photosynthesis", "ground_truth": "light,energy,chlorophyll,oxygen,carbon dioxide"}
|
||||
```
|
||||
|
||||
### Tool Call Validation
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
@grader
|
||||
def correct_tool_arguments(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Validate tool was called with correct arguments."""
|
||||
try:
|
||||
args = json.loads(submission)
|
||||
except json.JSONDecodeError:
|
||||
return GradeResult(score=0.0, rationale="No valid tool call found")
|
||||
|
||||
expected_tool = sample.metadata.get("expected_tool")
|
||||
if args.get("tool_name") != expected_tool:
|
||||
return GradeResult(
|
||||
score=0.0,
|
||||
rationale=f"Wrong tool: expected {expected_tool}, got {args.get('tool_name')}"
|
||||
)
|
||||
|
||||
# Check arguments
|
||||
expected_args = json.loads(sample.ground_truth)
|
||||
matches = all(args.get(k) == v for k, v in expected_args.items())
|
||||
|
||||
if matches:
|
||||
return GradeResult(score=1.0, rationale="Tool called with correct arguments")
|
||||
else:
|
||||
return GradeResult(score=0.5, rationale="Tool correct but arguments differ")
|
||||
```
|
||||
|
||||
### Numeric Range Check
|
||||
|
||||
```python
|
||||
@grader
|
||||
def numeric_range(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Check if extracted number is within expected range."""
|
||||
try:
|
||||
value = float(submission.strip())
|
||||
min_val, max_val = map(float, sample.ground_truth.split(","))
|
||||
|
||||
if min_val <= value <= max_val:
|
||||
return GradeResult(
|
||||
score=1.0,
|
||||
rationale=f"Value {value} is within range [{min_val}, {max_val}]"
|
||||
)
|
||||
else:
|
||||
# Partial credit based on distance
|
||||
if value < min_val:
|
||||
distance = min_val - value
|
||||
else:
|
||||
distance = value - max_val
|
||||
|
||||
score = max(0.0, 1.0 - (distance / max_val))
|
||||
return GradeResult(
|
||||
score=score,
|
||||
rationale=f"Value {value} outside range [{min_val}, {max_val}]"
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
return GradeResult(score=0.0, rationale=f"Invalid numeric value: {e}")
|
||||
```
|
||||
|
||||
### Multi-Criteria
|
||||
|
||||
```python
|
||||
@grader
|
||||
def comprehensive_check(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Multiple checks with weighted scoring."""
|
||||
points = 0.0
|
||||
issues = []
|
||||
|
||||
# Check 1: Contains answer (40%)
|
||||
if sample.ground_truth.lower() in submission.lower():
|
||||
points += 0.4
|
||||
else:
|
||||
issues.append("Missing expected answer")
|
||||
|
||||
# Check 2: Appropriate length (20%)
|
||||
if 100 <= len(submission) <= 500:
|
||||
points += 0.2
|
||||
else:
|
||||
issues.append(f"Length {len(submission)} not in range [100, 500]")
|
||||
|
||||
# Check 3: Starts with capital letter (10%)
|
||||
if submission and submission[0].isupper():
|
||||
points += 0.1
|
||||
else:
|
||||
issues.append("Doesn't start with capital letter")
|
||||
|
||||
# Check 4: Ends with punctuation (10%)
|
||||
if submission and submission[-1] in ".!?":
|
||||
points += 0.1
|
||||
else:
|
||||
issues.append("Doesn't end with punctuation")
|
||||
|
||||
# Check 5: No profanity (20%)
|
||||
profanity = ["badword1", "badword2"]
|
||||
if not any(word in submission.lower() for word in profanity):
|
||||
points += 0.2
|
||||
else:
|
||||
issues.append("Contains inappropriate language")
|
||||
|
||||
rationale = f"Score: {points:.2f}. " + (
|
||||
"All checks passed!" if not issues else f"Issues: {'; '.join(issues)}"
|
||||
)
|
||||
|
||||
return GradeResult(
|
||||
score=points,
|
||||
rationale=rationale,
|
||||
metadata={"issues": issues}
|
||||
)
|
||||
```
|
||||
|
||||
## Accessing Sample Data
|
||||
|
||||
The `Sample` object provides:
|
||||
|
||||
```python
|
||||
sample.id # Sample ID
|
||||
sample.input # Input (str or List[str])
|
||||
sample.ground_truth # Expected answer (optional)
|
||||
sample.metadata # Dict with custom data (optional)
|
||||
sample.agent_args # Agent creation args (optional)
|
||||
```
|
||||
|
||||
Use these for flexible grading logic:
|
||||
|
||||
```python
|
||||
@grader
|
||||
def context_aware_grader(sample: Sample, submission: str) -> GradeResult:
|
||||
category = sample.metadata.get("category", "general")
|
||||
|
||||
if category == "math":
|
||||
# Strict for math
|
||||
return exact_math_check(sample, submission)
|
||||
elif category == "creative":
|
||||
# Lenient for creative
|
||||
return length_and_relevance_check(sample, submission)
|
||||
else:
|
||||
return default_check(sample, submission)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
Always handle exceptions:
|
||||
|
||||
```python
|
||||
@grader
|
||||
def safe_grader(sample: Sample, submission: str) -> GradeResult:
|
||||
try:
|
||||
# Your logic here
|
||||
score = complex_calculation(submission)
|
||||
return GradeResult(score=score, rationale="Success")
|
||||
|
||||
except Exception as e:
|
||||
# Return 0.0 with error message
|
||||
return GradeResult(
|
||||
score=0.0,
|
||||
rationale=f"Error during grading: {str(e)}",
|
||||
metadata={"error": str(e), "error_type": type(e).__name__}
|
||||
)
|
||||
```
|
||||
|
||||
This ensures evaluation continues even if individual samples fail.
|
||||
|
||||
## Testing Your Grader
|
||||
|
||||
Test your grader with sample data:
|
||||
|
||||
```python
|
||||
from letta_evals.models import Sample, GradeResult
|
||||
|
||||
# Test case
|
||||
sample = Sample(
|
||||
id=0,
|
||||
input="What is 2+2?",
|
||||
ground_truth="4"
|
||||
)
|
||||
|
||||
submission = "The answer is 4"
|
||||
|
||||
result = my_grader(sample, submission)
|
||||
print(f"Score: {result.score}, Rationale: {result.rationale}")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Validate input**: Check for edge cases (empty strings, malformed data)
|
||||
2. **Use meaningful rationales**: Explain why a score was given
|
||||
3. **Handle errors gracefully**: Return 0.0 with error message rather than crashing
|
||||
4. **Keep it fast**: Custom graders run for every sample
|
||||
5. **Use metadata**: Store extra information for debugging
|
||||
6. **Normalize scores**: Always return a score between 0.0 and 1.0
|
||||
7. **Document your grader**: Add docstrings explaining criteria
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Custom Extractors](../extractors/custom.md)
|
||||
- [Tool Graders](../graders/tool-graders.md)
|
||||
- [Examples](../examples/README.md)
|
||||
@@ -1,216 +0,0 @@
|
||||
# Multi-Turn Conversations
|
||||
|
||||
Multi-turn conversations allow you to test how agents handle context across multiple exchanges - a key capability for stateful agents.
|
||||
|
||||
## Why Use Multi-Turn?
|
||||
|
||||
Multi-turn conversations enable testing that single-turn prompts cannot:
|
||||
|
||||
- **Memory storage**: Verify agents persist information to memory blocks across turns
|
||||
- **Tool call sequences**: Test multi-step workflows (e.g., search → analyze → summarize)
|
||||
- **Context retention**: Ensure agents remember details from earlier in the conversation
|
||||
- **State evolution**: Track how agent state changes across interactions
|
||||
- **Conversational coherence**: Test if agents maintain context appropriately
|
||||
|
||||
This is essential for stateful agents where behavior depends on conversation history.
|
||||
|
||||
## Single vs Multi-Turn Format
|
||||
|
||||
### Single-Turn (Default)
|
||||
|
||||
Most evaluations use a single prompt:
|
||||
|
||||
```jsonl
|
||||
{"input": "What is the capital of France?", "ground_truth": "Paris"}
|
||||
```
|
||||
|
||||
The agent receives one message and responds. Single-turn conversations are useful for simpler agents and for testing next-step behavior.
|
||||
|
||||
### Multi-Turn
|
||||
|
||||
For testing conversational memory, use an array of messages:
|
||||
|
||||
```jsonl
|
||||
{"input": ["My name is Alice", "What's my name?"], "ground_truth": "Alice"}
|
||||
```
|
||||
|
||||
The agent receives multiple messages in sequence:
|
||||
1. Turn 1: "My name is Alice"
|
||||
2. Turn 2: "What's my name?"
|
||||
|
||||
See the [built-in extractors](../extractors/builtin.md) for more information on how to use the agent's response from a multi-turn conversation for grading.
|
||||
|
||||
## How It Works
|
||||
|
||||
When you provide an array for `input`, the framework:
|
||||
1. Sends the first message to the agent
|
||||
2. Waits for the agent's response
|
||||
3. Sends the second message
|
||||
4. Continues until all messages are sent
|
||||
5. Extracts and grades the agent's response using the specified extractor and grader.
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Testing Memory Persistence
|
||||
|
||||
```jsonl
|
||||
{"input": ["I live in Paris", "Where do I live?"], "ground_truth": "Paris"}
|
||||
```
|
||||
|
||||
Use the `memory_block` extractor with this sample to test whether the agent stores the information correctly.
|
||||
|
||||
### Testing Tool Call Sequences
|
||||
|
||||
```jsonl
|
||||
{"input": ["Search for pandas", "What did you find about their diet?"], "ground_truth": "bamboo"}
|
||||
```
|
||||
|
||||
Verifies the agent calls tools in the right order and uses results appropriately.
|
||||
|
||||
### Testing Context Retention
|
||||
|
||||
```jsonl
|
||||
{"input": ["My favorite color is blue", "What color do I prefer?"], "ground_truth": "blue"}
|
||||
```
|
||||
|
||||
Ensures the agent recalls details from earlier in the conversation.
|
||||
|
||||
### Testing Long-Term Memory
|
||||
|
||||
```jsonl
|
||||
{"input": ["My name is Alice", "Tell me a joke", "What's my name again?"], "ground_truth": "Alice"}
|
||||
```
|
||||
|
||||
Checks if the agent remembers information even after intervening exchanges.
|
||||
|
||||
## Example Configuration
|
||||
|
||||
```yaml
|
||||
name: multi-turn-test
|
||||
dataset: conversations.jsonl
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
base_url: http://localhost:8283
|
||||
|
||||
graders:
|
||||
recall:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: recall
|
||||
op: gte
|
||||
value: 0.8
|
||||
```
|
||||
|
||||
The grader evaluates the agent's final response (after all turns).
|
||||
|
||||
## Testing Both Response and Memory
|
||||
|
||||
Multi-turn evaluations become especially powerful when combined with the `memory_block` extractor:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
response_accuracy:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
memory_storage:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block
|
||||
extractor_config:
|
||||
block_label: human
|
||||
```
|
||||
|
||||
This tests two things:
|
||||
1. **Did the agent respond correctly?** (using conversation context)
|
||||
2. **Did the agent persist the information?** (to its memory blocks)
|
||||
|
||||
An agent might pass the first test by keeping information in working memory, but fail the second by not properly storing it for long-term recall.
|
||||
|
||||
## Context vs Persistence
|
||||
|
||||
Consider this result:
|
||||
|
||||
```
|
||||
Results by metric:
|
||||
response_accuracy - Avg: 1.00, Pass: 100.0%
|
||||
memory_storage - Avg: 0.00, Pass: 0.0%
|
||||
```
|
||||
|
||||
The agent answered correctly (100%) but didn't store anything in memory (0%). This reveals important agent behavior:
|
||||
|
||||
- **Working memory**: Agent kept information in conversation context
|
||||
- **Persistent memory**: Agent didn't update its memory blocks
|
||||
|
||||
For short conversations, working memory is sufficient. For long-term interactions, persistent memory is crucial.
|
||||
|
||||
## Complete Example
|
||||
|
||||
See [`examples/multi-turn-memory/`](https://github.com/letta-ai/letta-evals/tree/main/examples/multi-turn-memory) for a working example that demonstrates:
|
||||
- Multi-turn conversation format
|
||||
- Dual metric evaluation (response + memory)
|
||||
- The difference between context-based recall and true persistence
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Keep Turns Focused
|
||||
|
||||
Each turn should test one aspect of memory or context:
|
||||
|
||||
```jsonl
|
||||
{"input": ["I'm allergic to peanuts", "Can I eat this cookie?"], "ground_truth": "peanut"}
|
||||
```
|
||||
|
||||
### 2. Test Realistic Scenarios
|
||||
|
||||
Design conversations that mirror real user interactions:
|
||||
|
||||
```jsonl
|
||||
{"input": ["Set a reminder for tomorrow at 2pm", "What reminders do I have?"], "ground_truth": "2pm"}
|
||||
```
|
||||
|
||||
### 3. Use Tags for Organization
|
||||
|
||||
Tag multi-turn samples to distinguish them:
|
||||
|
||||
```jsonl
|
||||
{"input": ["Hello", "How are you?"], "tags": ["multi-turn", "greeting"]}
|
||||
```
|
||||
|
||||
### 4. Test Memory Limits
|
||||
|
||||
See how far back agents can recall:
|
||||
|
||||
```jsonl
|
||||
{"input": ["My name is Alice", "message 2", "message 3", "message 4", "What's my name?"], "ground_truth": "Alice"}
|
||||
```
|
||||
|
||||
### 5. Combine with Memory Extractors
|
||||
|
||||
Always verify both response and internal state for memory tests.
|
||||
|
||||
## Limitations
|
||||
|
||||
### Turn Count
|
||||
|
||||
Very long conversations may exceed context windows. Monitor token usage for conversations with many turns.
|
||||
|
||||
### State Isolation
|
||||
|
||||
Each sample starts with a fresh agent (or a fresh conversation if using `agent_id`). A multi-turn sample tests memory within a single conversation, not across separate conversations.
|
||||
|
||||
### Extraction
|
||||
|
||||
Most extractors work on the final state. If you need to check intermediate turns, consider using custom extractors.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Built-in Extractors](../extractors/builtin.md) - Using memory_block extractor
|
||||
- [Custom Extractors](../extractors/custom.md) - Build extractors for complex scenarios
|
||||
- [Multi-Metric Evaluation](../graders/multi-metric.md) - Combine multiple checks
|
||||
|
Before Width: | Height: | Size: 1.3 MiB |
@@ -1,389 +0,0 @@
|
||||
# CLI Commands
|
||||
|
||||
The **letta-evals** command-line interface lets you run evaluations, validate configurations, and inspect available components.
|
||||
|
||||
**Quick overview:**
|
||||
- **`run`** - Execute an evaluation suite (most common)
|
||||
- **`validate`** - Check suite configuration without running
|
||||
- **`list-extractors`** - Show available extractors
|
||||
- **`list-graders`** - Show available grader functions
|
||||
- **Exit codes** - 0 for pass, 1 for fail (perfect for CI/CD)
|
||||
|
||||
**Typical workflow:**
|
||||
1. Validate your suite: `letta-evals validate suite.yaml`
|
||||
2. Run evaluation: `letta-evals run suite.yaml --output results/`
|
||||
3. Check exit code: `echo $?` (0 = passed, 1 = failed)
|
||||
|
||||
Letta Evals provides a command-line interface for running evaluations and managing configurations.
|
||||
|
||||
## run
|
||||
|
||||
Run an evaluation suite.
|
||||
|
||||
```bash
|
||||
letta-evals run <suite.yaml> [options]
|
||||
```
|
||||
|
||||
### Arguments
|
||||
|
||||
- `suite.yaml`: Path to the suite configuration file (required)
|
||||
|
||||
### Options
|
||||
|
||||
#### --output, -o
|
||||
Save results to a directory.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output results/
|
||||
```
|
||||
|
||||
Creates:
|
||||
- `results/header.json`: Evaluation metadata
|
||||
- `results/summary.json`: Aggregate metrics and configuration
|
||||
- `results/results.jsonl`: Per-sample results (one JSON per line)
|
||||
|
||||
#### --quiet, -q
|
||||
Quiet mode - only show pass/fail result.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --quiet
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
✓ PASSED
|
||||
```
|
||||
|
||||
#### --max-concurrent
|
||||
Maximum concurrent sample evaluations.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --max-concurrent 10
|
||||
```
|
||||
|
||||
Default: 15
|
||||
|
||||
Higher values make the evaluation faster but use more resources.
|
||||
|
||||
#### --api-key
|
||||
Letta API key (overrides LETTA_API_KEY environment variable).
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --api-key your-key
|
||||
```
|
||||
|
||||
#### --base-url
|
||||
Letta server base URL (overrides suite config and environment variable).
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --base-url http://localhost:8283
|
||||
```
|
||||
|
||||
#### --project-id
|
||||
Letta project ID for cloud deployments.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --project-id proj_abc123
|
||||
```
|
||||
|
||||
#### --cached, -c
|
||||
Path to cached results (JSONL) for re-grading trajectories without re-running the agent.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --cached previous_results.jsonl
|
||||
```
|
||||
|
||||
Use this to test different graders on the same agent trajectories.
|
||||
|
||||
#### --num-runs
|
||||
Run the evaluation multiple times to measure consistency and get aggregate statistics.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 10
|
||||
```
|
||||
|
||||
Default: 1 (single run)
|
||||
|
||||
**Output with multiple runs:**
|
||||
- Each run creates a separate `run_N/` directory with individual results
|
||||
- An `aggregate_stats.json` file contains statistics across all runs (mean, standard deviation, pass rate)
|
||||
|
||||
**Use cases:**
|
||||
- Measuring consistency of non-deterministic agents
|
||||
- Getting confidence intervals for evaluation metrics
|
||||
- Testing agent variability across multiple runs
|
||||
|
||||
See [Results - Multiple Runs](../results/overview.md#multiple-runs-statistics) for details on the statistics output.
|
||||
|
||||
### Examples
|
||||
|
||||
Basic run:
|
||||
```bash
|
||||
letta-evals run suite.yaml # Run evaluation, show results in terminal
|
||||
```
|
||||
|
||||
Save results:
|
||||
```bash
|
||||
letta-evals run suite.yaml --output evaluation-results/ # Save to directory
|
||||
```
|
||||
|
||||
High concurrency:
|
||||
```bash
|
||||
letta-evals run suite.yaml --max-concurrent 20 # Run 20 samples in parallel
|
||||
```
|
||||
|
||||
Letta Cloud:
|
||||
```bash
|
||||
# --base-url: cloud endpoint; --api-key: your API key; --project-id: your project
|
||||
letta-evals run suite.yaml \
|
||||
--base-url https://api.letta.com \
|
||||
--api-key $LETTA_API_KEY \
|
||||
--project-id proj_abc123
|
||||
```
|
||||
|
||||
Quiet CI mode:
|
||||
```bash
|
||||
letta-evals run suite.yaml --quiet # Only show pass/fail
|
||||
if [ $? -eq 0 ]; then # Check exit code
|
||||
echo "Evaluation passed"
|
||||
else
|
||||
echo "Evaluation failed"
|
||||
exit 1 # Fail the CI build
|
||||
fi
|
||||
```
|
||||
|
||||
Multiple runs with statistics:
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 10 --output results/
|
||||
# Creates results/run_1/, results/run_2/, ..., results/run_10/
|
||||
# Plus results/aggregate_stats.json with mean, stddev, and pass rate
|
||||
```
|
||||
|
||||
### Exit Codes
|
||||
|
||||
- `0`: Evaluation passed (gate criteria met)
|
||||
- `1`: Evaluation failed (gate criteria not met or error)
|
||||
|
||||
## validate
|
||||
|
||||
Validate a suite configuration without running it.
|
||||
|
||||
```bash
|
||||
letta-evals validate <suite.yaml>
|
||||
```
|
||||
|
||||
Checks:
|
||||
- YAML syntax is valid
|
||||
- Required fields are present
|
||||
- Paths exist
|
||||
- Configuration is consistent
|
||||
- Grader/extractor combinations are valid
|
||||
|
||||
### Examples
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
Output on success:
|
||||
```
|
||||
✓ Suite configuration is valid
|
||||
```
|
||||
|
||||
Output on error:
|
||||
```
|
||||
✗ Validation failed:
|
||||
- Agent file not found: agent.af
|
||||
- Grader 'my_metric' references unknown function
|
||||
```
|
||||
|
||||
## list-extractors
|
||||
|
||||
List all available extractors.
|
||||
|
||||
```bash
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
Shows:
|
||||
- Built-in extractors
|
||||
- Custom extractors (if registered)
|
||||
- Brief description of each
|
||||
|
||||
Output:
|
||||
```
|
||||
Available extractors:
|
||||
last_assistant - Extract the last assistant message
|
||||
first_assistant - Extract the first assistant message
|
||||
all_assistant - Concatenate all assistant messages
|
||||
pattern - Extract content matching regex
|
||||
tool_arguments - Extract tool call arguments
|
||||
tool_output - Extract tool return value
|
||||
after_marker - Extract content after a marker
|
||||
memory_block - Extract from memory block (requires agent_state)
|
||||
```
|
||||
|
||||
## list-graders
|
||||
|
||||
List all available grader functions.
|
||||
|
||||
```bash
|
||||
letta-evals list-graders
|
||||
```
|
||||
|
||||
Shows:
|
||||
- Built-in tool graders
|
||||
- Custom graders (if registered)
|
||||
- Brief description of each
|
||||
|
||||
Output:
|
||||
```
|
||||
Available graders:
|
||||
exact_match - Exact string match with ground_truth
|
||||
contains - Check if contains ground_truth
|
||||
regex_match - Match regex pattern
|
||||
ascii_printable_only - Validate ASCII-only content
|
||||
```
|
||||
|
||||
## help
|
||||
|
||||
Show help information.
|
||||
|
||||
```bash
|
||||
letta-evals --help
|
||||
```
|
||||
|
||||
Show help for a specific command:
|
||||
|
||||
```bash
|
||||
letta-evals run --help
|
||||
letta-evals validate --help
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
These environment variables affect CLI behavior:
|
||||
|
||||
### LETTA_API_KEY
|
||||
API key for Letta authentication.
|
||||
|
||||
```bash
|
||||
export LETTA_API_KEY=your-key-here
|
||||
```
|
||||
|
||||
### LETTA_BASE_URL
|
||||
Letta server base URL.
|
||||
|
||||
```bash
|
||||
export LETTA_BASE_URL=http://localhost:8283
|
||||
```
|
||||
|
||||
### LETTA_PROJECT_ID
|
||||
Letta project ID (for cloud).
|
||||
|
||||
```bash
|
||||
export LETTA_PROJECT_ID=proj_abc123
|
||||
```
|
||||
|
||||
### OPENAI_API_KEY
|
||||
OpenAI API key (for rubric graders).
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY=your-openai-key
|
||||
```
|
||||
|
||||
### OPENAI_BASE_URL
|
||||
Custom OpenAI-compatible endpoint (optional).
|
||||
|
||||
```bash
|
||||
export OPENAI_BASE_URL=https://your-endpoint.com/v1
|
||||
```
|
||||
|
||||
## Configuration Priority
|
||||
|
||||
Configuration values are resolved in this order (highest to lowest priority):
|
||||
|
||||
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
|
||||
2. Suite YAML configuration
|
||||
3. Environment variables
|
||||
|
||||
## Using in CI/CD
|
||||
|
||||
### GitHub Actions
|
||||
|
||||
```yaml
|
||||
name: Run Evals
|
||||
on: [push]
|
||||
|
||||
jobs:
|
||||
evaluate:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install letta-evals
|
||||
|
||||
- name: Run evaluation
|
||||
env:
|
||||
LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
run: |
|
||||
letta-evals run suite.yaml --quiet --output results/
|
||||
|
||||
- name: Upload results
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: eval-results
|
||||
path: results/
|
||||
```
|
||||
|
||||
### GitLab CI
|
||||
|
||||
```yaml
|
||||
evaluate:
|
||||
script:
|
||||
- pip install letta-evals
|
||||
- letta-evals run suite.yaml --quiet --output results/
|
||||
artifacts:
|
||||
paths:
|
||||
- results/
|
||||
variables:
|
||||
LETTA_API_KEY: $LETTA_API_KEY
|
||||
OPENAI_API_KEY: $OPENAI_API_KEY
|
||||
```
|
||||
|
||||
## Debugging
|
||||
|
||||
### Verbose Output
|
||||
|
||||
Currently, the CLI uses standard verbosity. For debugging:
|
||||
|
||||
1. Check the output directory for detailed results
|
||||
2. Examine `summary.json` for aggregate metrics
|
||||
3. Check `results.jsonl` for per-sample details
|
||||
|
||||
### Common Issues
|
||||
|
||||
**"Agent file not found"**
|
||||
```bash
|
||||
# Check file exists relative to suite YAML location
|
||||
ls -la path/to/agent.af
|
||||
```
|
||||
|
||||
**"Connection refused"**
|
||||
```bash
|
||||
# Verify Letta server is running
|
||||
curl http://localhost:8283/v1/health
|
||||
```
|
||||
|
||||
**"Invalid API key"**
|
||||
```bash
|
||||
# Check environment variable is set
|
||||
echo $LETTA_API_KEY
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Understanding Results](../results/overview.md) - Interpreting evaluation output
|
||||
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete configuration options
|
||||
- [Getting Started](../getting-started.md) - Complete tutorial with examples
|
||||
@@ -1,342 +0,0 @@
|
||||
# CLI Commands
|
||||
|
||||
The **letta-evals** command-line interface lets you run evaluations, validate configurations, and inspect available components.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **`run`** - Execute an evaluation suite (most common)
|
||||
- **`validate`** - Check suite configuration without running
|
||||
- **`list-extractors`** - Show available extractors
|
||||
- **`list-graders`** - Show available grader functions
|
||||
- **Exit codes** - 0 for pass, 1 for fail (perfect for CI/CD)
|
||||
</Note>
|
||||
|
||||
**Typical workflow:**
|
||||
1. Validate your suite: `letta-evals validate suite.yaml`
|
||||
2. Run evaluation: `letta-evals run suite.yaml --output results/`
|
||||
3. Check exit code: `echo $?` (0 = passed, 1 = failed)
|
||||
|
||||
## run
|
||||
|
||||
Run an evaluation suite.
|
||||
|
||||
```bash
|
||||
letta-evals run <suite.yaml> [options]
|
||||
```
|
||||
|
||||
### Arguments
|
||||
|
||||
- `suite.yaml`: Path to the suite configuration file (required)
|
||||
|
||||
### Options
|
||||
|
||||
#### --output, -o
|
||||
Save results to a directory.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output results/
|
||||
```
|
||||
|
||||
Creates:
|
||||
- `results/header.json`: Evaluation metadata
|
||||
- `results/summary.json`: Aggregate metrics and configuration
|
||||
- `results/results.jsonl`: Per-sample results (one JSON per line)
|
||||
|
||||
#### --quiet, -q
|
||||
Quiet mode - only show pass/fail result.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --quiet
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
✓ PASSED
|
||||
```
|
||||
|
||||
#### --max-concurrent
|
||||
Maximum concurrent sample evaluations. **Default**: 15
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --max-concurrent 10
|
||||
```
|
||||
|
||||
Higher values make the evaluation faster but use more resources.
|
||||
|
||||
#### --api-key
|
||||
Letta API key (overrides LETTA_API_KEY environment variable).
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --api-key your-key
|
||||
```
|
||||
|
||||
#### --base-url
|
||||
Letta server base URL (overrides suite config and environment variable).
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --base-url https://api.letta.com
|
||||
```
|
||||
|
||||
#### --project-id
|
||||
Letta project ID for cloud deployments.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --project-id proj_abc123
|
||||
```
|
||||
|
||||
#### --cached, -c
|
||||
Path to cached results (JSONL) for re-grading trajectories without re-running the agent.
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --cached previous_results.jsonl
|
||||
```
|
||||
|
||||
Use this to test different graders on the same agent trajectories.
|
||||
|
||||
#### --num-runs
|
||||
Run the evaluation multiple times to measure consistency. **Default**: 1
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 10
|
||||
```
|
||||
|
||||
**Output with multiple runs:**
|
||||
- Each run creates a separate `run_N/` directory with individual results
|
||||
- An `aggregate_stats.json` file contains statistics across all runs (mean, standard deviation, pass rate)
|
||||
|
||||
### Examples
|
||||
|
||||
Basic run:
|
||||
```bash
|
||||
letta-evals run suite.yaml # Run evaluation, show results in terminal
|
||||
```
|
||||
|
||||
Save results:
|
||||
```bash
|
||||
letta-evals run suite.yaml --output evaluation-results/ # Save to directory
|
||||
```
|
||||
|
||||
Letta Cloud:
|
||||
```bash
|
||||
letta-evals run suite.yaml \
|
||||
--base-url https://api.letta.com \
|
||||
--api-key $LETTA_API_KEY \
|
||||
--project-id proj_abc123
|
||||
```
|
||||
|
||||
Quiet CI mode:
|
||||
```bash
|
||||
letta-evals run suite.yaml --quiet
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "Evaluation passed"
|
||||
else
|
||||
echo "Evaluation failed"
|
||||
exit 1
|
||||
fi
|
||||
```
|
||||
|
||||
### Exit Codes
|
||||
|
||||
- `0`: Evaluation passed (gate criteria met)
|
||||
- `1`: Evaluation failed (gate criteria not met or error)
|
||||
|
||||
## validate
|
||||
|
||||
Validate a suite configuration without running it.
|
||||
|
||||
```bash
|
||||
letta-evals validate <suite.yaml>
|
||||
```
|
||||
|
||||
Checks:
|
||||
- YAML syntax is valid
|
||||
- Required fields are present
|
||||
- Paths exist
|
||||
- Configuration is consistent
|
||||
- Grader/extractor combinations are valid
|
||||
|
||||
Output on success:
|
||||
```
|
||||
✓ Suite configuration is valid
|
||||
```
|
||||
|
||||
Output on error:
|
||||
```
|
||||
✗ Validation failed:
|
||||
- Agent file not found: agent.af
|
||||
- Grader 'my_metric' references unknown function
|
||||
```
|
||||
|
||||
## list-extractors
|
||||
|
||||
List all available extractors.
|
||||
|
||||
```bash
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
Available extractors:
|
||||
last_assistant - Extract the last assistant message
|
||||
first_assistant - Extract the first assistant message
|
||||
all_assistant - Concatenate all assistant messages
|
||||
pattern - Extract content matching regex
|
||||
tool_arguments - Extract tool call arguments
|
||||
tool_output - Extract tool return value
|
||||
after_marker - Extract content after a marker
|
||||
memory_block - Extract from memory block (requires agent_state)
|
||||
```
|
||||
|
||||
## list-graders
|
||||
|
||||
List all available grader functions.
|
||||
|
||||
```bash
|
||||
letta-evals list-graders
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
Available graders:
|
||||
exact_match - Exact string match with ground_truth
|
||||
contains - Check if contains ground_truth
|
||||
regex_match - Match regex pattern
|
||||
ascii_printable_only - Validate ASCII-only content
|
||||
```
|
||||
|
||||
## help
|
||||
|
||||
Show help information.
|
||||
|
||||
```bash
|
||||
letta-evals --help
|
||||
```
|
||||
|
||||
Show help for a specific command:
|
||||
|
||||
```bash
|
||||
letta-evals run --help
|
||||
letta-evals validate --help
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### LETTA_API_KEY
|
||||
API key for Letta authentication.
|
||||
|
||||
```bash
|
||||
export LETTA_API_KEY=your-key-here
|
||||
```
|
||||
|
||||
### LETTA_BASE_URL
|
||||
Letta server base URL.
|
||||
|
||||
```bash
|
||||
export LETTA_BASE_URL=https://api.letta.com
|
||||
```
|
||||
|
||||
### LETTA_PROJECT_ID
|
||||
Letta project ID (for cloud).
|
||||
|
||||
```bash
|
||||
export LETTA_PROJECT_ID=proj_abc123
|
||||
```
|
||||
|
||||
### OPENAI_API_KEY
|
||||
OpenAI API key (for rubric graders).
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY=your-openai-key
|
||||
```
|
||||
|
||||
## Configuration Priority
|
||||
|
||||
Configuration values are resolved in this order (highest to lowest priority):
|
||||
|
||||
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
|
||||
2. Suite YAML configuration
|
||||
3. Environment variables
|
||||
|
||||
## Using in CI/CD
|
||||
|
||||
### GitHub Actions
|
||||
|
||||
```yaml
|
||||
name: Run Evals
|
||||
on: [push]
|
||||
|
||||
jobs:
|
||||
evaluate:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install letta-evals
|
||||
|
||||
- name: Run evaluation
|
||||
env:
|
||||
LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
run: |
|
||||
letta-evals run suite.yaml --quiet --output results/
|
||||
|
||||
- name: Upload results
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: eval-results
|
||||
path: results/
|
||||
```
|
||||
|
||||
### GitLab CI
|
||||
|
||||
```yaml
|
||||
evaluate:
|
||||
script:
|
||||
- pip install letta-evals
|
||||
- letta-evals run suite.yaml --quiet --output results/
|
||||
artifacts:
|
||||
paths:
|
||||
- results/
|
||||
variables:
|
||||
LETTA_API_KEY: $LETTA_API_KEY
|
||||
OPENAI_API_KEY: $OPENAI_API_KEY
|
||||
```
|
||||
|
||||
## Debugging
|
||||
|
||||
### Common Issues
|
||||
|
||||
<Warning>
|
||||
**"Agent file not found"**
|
||||
|
||||
```bash
|
||||
# Check file exists relative to suite YAML location
|
||||
ls -la path/to/agent.af
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Connection refused"**
|
||||
|
||||
```bash
|
||||
# Verify Letta server is running
|
||||
curl https://api.letta.com/v1/health
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Invalid API key"**
|
||||
|
||||
```bash
|
||||
# Check environment variable is set
|
||||
echo $LETTA_API_KEY
|
||||
```
|
||||
</Warning>
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Understanding Results](/evals/results-metrics/understanding-results) - Interpreting evaluation output
|
||||
- [Suite YAML Reference](/evals/configuration/suite-yaml-reference) - Complete configuration options
|
||||
- [Getting Started](/evals/get-started/getting-started) - Complete tutorial with examples
|
||||
@@ -1,418 +0,0 @@
|
||||
# Datasets
|
||||
|
||||
**Datasets** are the test cases that define what your agent will be evaluated on. Each sample in your dataset represents one evaluation scenario.
|
||||
|
||||
**Quick overview:**
|
||||
- **Two formats**: JSONL (flexible, powerful) or CSV (simple, spreadsheet-friendly)
|
||||
- **Required field**: `input` - the prompt(s) to send to the agent
|
||||
- **Common fields**: `ground_truth` (expected answer), `tags` (for filtering), `metadata` (extra info)
|
||||
- **Advanced fields**: `agent_args` (customize agent per sample), `rubric_vars` (per-sample rubric context)
|
||||
- **Multi-turn support**: Send multiple messages in sequence using arrays
|
||||
|
||||
**Typical workflow:**
|
||||
1. Create a JSONL or CSV file with test cases
|
||||
2. Reference it in your suite YAML: `dataset: test_cases.jsonl`
|
||||
3. Run evaluation - each sample is tested independently
|
||||
4. Results show per-sample and aggregate scores
|
||||
|
||||
Datasets can be created in two formats: **JSONL** or **CSV**. Choose based on your team's workflow and complexity needs.
|
||||
|
||||
## Dataset Formats
|
||||
|
||||
### JSONL Format
|
||||
|
||||
Each line is a JSON object representing one test case:
|
||||
|
||||
```jsonl
|
||||
{"input": "What's the capital of France?", "ground_truth": "Paris"}
|
||||
{"input": "Calculate 2+2", "ground_truth": "4"}
|
||||
{"input": "What color is the sky?", "ground_truth": "blue"}
|
||||
```
|
||||
|
||||
**Best for:**
|
||||
- Complex data structures (nested objects, arrays)
|
||||
- Multi-turn conversations
|
||||
- Advanced features (agent_args, rubric_vars)
|
||||
- Teams comfortable with JSON/code
|
||||
- Version control (clean line-by-line diffs)
|
||||
|
||||
### CSV Format
|
||||
|
||||
Standard CSV with headers:
|
||||
|
||||
```csv
|
||||
input,ground_truth
|
||||
"What's the capital of France?","Paris"
|
||||
"Calculate 2+2","4"
|
||||
"What color is the sky?","blue"
|
||||
```
|
||||
|
||||
**Best for:**
|
||||
- Simple question-answer pairs
|
||||
- Teams that prefer spreadsheets (Excel, Google Sheets)
|
||||
- Non-technical collaborators creating test cases
|
||||
- Quick dataset creation and editing
|
||||
- Easy sharing with non-developers
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Field | Required | Type | Purpose |
|
||||
|-------|----------|------|---------|
|
||||
| `input` | ✅ | string or array | Prompt(s) to send to agent |
|
||||
| `ground_truth` | ❌ | string | Expected answer (for tool graders) |
|
||||
| `tags` | ❌ | array of strings | For filtering samples |
|
||||
| `agent_args` | ❌ | object | Per-sample agent customization |
|
||||
| `rubric_vars` | ❌ | object | Per-sample rubric variables |
|
||||
| `metadata` | ❌ | object | Arbitrary extra data |
|
||||
| `id` | ❌ | integer | Sample ID (auto-assigned if omitted) |
|
||||
|
||||
## Field Reference
|
||||
|
||||
### Required Fields
|
||||
|
||||
#### input
|
||||
The prompt(s) to send to the agent. Can be a string or array of strings:
|
||||
|
||||
Single message:
|
||||
```json
|
||||
{"input": "Hello, who are you?"}
|
||||
```
|
||||
|
||||
Multi-turn conversation:
|
||||
```json
|
||||
{"input": ["Hello", "What's your name?", "Tell me about yourself"]}
|
||||
```
|
||||
|
||||
### Optional Fields
|
||||
|
||||
#### ground_truth
|
||||
The expected answer or content to check against. Required for most tool graders (exact_match, contains, etc.):
|
||||
|
||||
```json
|
||||
{"input": "What is 2+2?", "ground_truth": "4"}
|
||||
```
|
||||
|
||||
#### metadata
|
||||
Arbitrary additional data about the sample:
|
||||
|
||||
```json
|
||||
{
|
||||
"input": "What is photosynthesis?",
|
||||
"ground_truth": "process where plants convert light into energy",
|
||||
"metadata": {
|
||||
"category": "biology",
|
||||
"difficulty": "medium"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### tags
|
||||
List of tags for filtering samples:
|
||||
|
||||
```json
|
||||
{"input": "Solve x^2 = 16", "ground_truth": "4", "tags": ["math", "algebra"]}
|
||||
```
|
||||
|
||||
Filter by tags in your suite:
|
||||
```yaml
|
||||
sample_tags: [math] # Only samples tagged "math" will be evaluated
|
||||
```
|
||||
|
||||
#### agent_args
|
||||
|
||||
Custom arguments passed to programmatic agent creation when using `agent_script`. Allows per-sample agent customization.
|
||||
|
||||
JSONL:
|
||||
```json
|
||||
{
|
||||
"input": "What items do we have?",
|
||||
"agent_args": {
|
||||
"item": {"sku": "SKU-123", "name": "Widget A", "price": 19.99}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,agent_args
|
||||
"What items do we have?","{""item"": {""sku"": ""SKU-123"", ""name"": ""Widget A"", ""price"": 19.99}}"
|
||||
```
|
||||
|
||||
Your agent factory function can access these values via `sample.agent_args` to customize agent configuration.
|
||||
|
||||
See [Targets - agent_script](./targets.md#agent_script) for details on programmatic agent creation.
|
||||
|
||||
#### rubric_vars
|
||||
|
||||
Variables to inject into rubric templates when using rubric graders. This allows you to provide per-sample context or examples to the LLM judge.
|
||||
|
||||
**Example:** Evaluating code quality against a reference implementation.
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "Write a function to calculate fibonacci numbers", "rubric_vars": {"reference_code": "def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)", "required_features": "recursion, base case"}}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,rubric_vars
|
||||
"Write a function to calculate fibonacci numbers","{""reference_code"": ""def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)"", ""required_features"": ""recursion, base case""}"
|
||||
```
|
||||
|
||||
In your rubric template file, reference variables with `{variable_name}`:
|
||||
|
||||
**rubric.txt:**
|
||||
```
|
||||
Evaluate the submitted code against this reference implementation:
|
||||
|
||||
{reference_code}
|
||||
|
||||
Required features: {required_features}
|
||||
|
||||
Score on correctness (0.6) and code quality (0.4).
|
||||
```
|
||||
|
||||
When the rubric grader runs, variables are replaced with values from `rubric_vars`:
|
||||
|
||||
**Final formatted prompt sent to LLM:**
|
||||
```
|
||||
Evaluate the submitted code against this reference implementation:
|
||||
|
||||
def fib(n):
|
||||
if n <= 1: return n
|
||||
return fib(n-1) + fib(n-2)
|
||||
|
||||
Required features: recursion, base case
|
||||
|
||||
Score on correctness (0.6) and code quality (0.4).
|
||||
```
|
||||
|
||||
This lets you customize evaluation criteria per sample using the same rubric template.
|
||||
|
||||
See [Rubric Graders](../graders/rubric-graders.md) for details on rubric templates.
|
||||
|
||||
#### id
|
||||
Sample ID is automatically assigned (0-based index) if not provided. You can override it by setting `id` explicitly:
|
||||
|
||||
```json
|
||||
{"id": 42, "input": "Test case 42"}
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
```jsonl
|
||||
{"id": 1, "input": "What is the capital of France?", "ground_truth": "Paris", "tags": ["geography", "easy"], "metadata": {"region": "Europe"}}
|
||||
{"id": 2, "input": "Calculate the square root of 144", "ground_truth": "12", "tags": ["math", "medium"]}
|
||||
{"id": 3, "input": ["Hello", "What can you help me with?"], "tags": ["conversation"]}
|
||||
```
|
||||
|
||||
## Dataset Best Practices
|
||||
|
||||
### 1. Clear Ground Truth
|
||||
|
||||
Make ground truth specific enough to grade but flexible enough to match valid responses:
|
||||
|
||||
Good:
|
||||
```json
|
||||
{"input": "What's the largest planet?", "ground_truth": "Jupiter"}
|
||||
```
|
||||
|
||||
Too strict (might miss valid answers):
|
||||
```json
|
||||
{"input": "What's the largest planet?", "ground_truth": "Jupiter is the largest planet in our solar system."}
|
||||
```
|
||||
|
||||
### 2. Diverse Test Cases
|
||||
|
||||
Include edge cases and variations:
|
||||
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
|
||||
{"input": "What is 0.1 + 0.2?", "ground_truth": "0.3", "tags": ["math", "floating_point"]}
|
||||
{"input": "What is 999999999 + 1?", "ground_truth": "1000000000", "tags": ["math", "large_numbers"]}
|
||||
```
|
||||
|
||||
### 3. Use Tags for Organization
|
||||
|
||||
Organize samples by type, difficulty, or feature:
|
||||
|
||||
```jsonl
|
||||
{"tags": ["tool_usage", "search"]}
|
||||
{"tags": ["memory", "recall"]}
|
||||
{"tags": ["reasoning", "multi_step"]}
|
||||
```
|
||||
|
||||
### 4. Multi-Turn Conversations
|
||||
|
||||
Test conversational context:
|
||||
|
||||
```json
|
||||
{
|
||||
"input": [
|
||||
"My name is Alice",
|
||||
"What's my name?"
|
||||
],
|
||||
"ground_truth": "Alice",
|
||||
"tags": ["memory", "context"]
|
||||
}
|
||||
```
|
||||
|
||||
### 5. No Ground Truth for LLM Judges
|
||||
|
||||
If using rubric graders, ground truth is optional:
|
||||
|
||||
```jsonl
|
||||
{"input": "Write a creative story about a robot", "tags": ["creative"]}
|
||||
{"input": "Explain quantum computing simply", "tags": ["explanation"]}
|
||||
```
|
||||
|
||||
The LLM judge evaluates based on the rubric, not ground truth.
|
||||
|
||||
## Loading Datasets
|
||||
|
||||
Datasets are automatically loaded by the runner:
|
||||
|
||||
```yaml
|
||||
dataset: path/to/dataset.jsonl # Path to your test cases (JSONL or CSV)
|
||||
```
|
||||
|
||||
Paths are relative to the suite YAML file location.
|
||||
|
||||
## Dataset Filtering
|
||||
|
||||
### Limit Sample Count
|
||||
|
||||
```yaml
|
||||
max_samples: 10 # Only evaluate first 10 samples (useful for testing)
|
||||
```
|
||||
|
||||
### Filter by Tags
|
||||
|
||||
```yaml
|
||||
sample_tags: [math, medium] # Only samples with ALL these tags
|
||||
```
|
||||
|
||||
## Creating Datasets Programmatically
|
||||
|
||||
You can generate datasets with Python:
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
samples = []
|
||||
for i in range(100):
|
||||
samples.append({
|
||||
"input": f"What is {i} + {i}?",
|
||||
"ground_truth": str(i + i),
|
||||
"tags": ["math", "addition"]
|
||||
})
|
||||
|
||||
with open("dataset.jsonl", "w") as f:
|
||||
for sample in samples:
|
||||
f.write(json.dumps(sample) + "\n")
|
||||
```
|
||||
|
||||
## Dataset Format Validation
|
||||
|
||||
The runner validates:
|
||||
- Each line is valid JSON (for JSONL datasets)
|
||||
- Required fields are present
|
||||
- Field types are correct
|
||||
|
||||
Validation errors will be reported with line numbers.
|
||||
|
||||
## Examples by Use Case
|
||||
|
||||
### Question Answering
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "What is the capital of France?", "ground_truth": "Paris"}
|
||||
{"input": "Who wrote Romeo and Juliet?", "ground_truth": "Shakespeare"}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,ground_truth
|
||||
"What is the capital of France?","Paris"
|
||||
"Who wrote Romeo and Juliet?","Shakespeare"
|
||||
```
|
||||
|
||||
### Tool Usage Testing
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "Search for information about pandas", "ground_truth": "search"}
|
||||
{"input": "Calculate 15 * 23", "ground_truth": "calculator"}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,ground_truth
|
||||
"Search for information about pandas","search"
|
||||
"Calculate 15 * 23","calculator"
|
||||
```
|
||||
|
||||
In these samples, the ground truth is the name of the tool the agent is expected to call.
|
||||
|
||||
### Memory Testing (Multi-turn)
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": ["Remember that my favorite color is blue", "What's my favorite color?"], "ground_truth": "blue"}
|
||||
{"input": ["I live in Tokyo", "Where do I live?"], "ground_truth": "Tokyo"}
|
||||
```
|
||||
|
||||
CSV (using JSON array strings):
|
||||
```csv
|
||||
input,ground_truth
|
||||
"[""Remember that my favorite color is blue"", ""What's my favorite color?""]","blue"
|
||||
"[""I live in Tokyo"", ""Where do I live?""]","Tokyo"
|
||||
```
|
||||
|
||||
### Code Generation
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "Write a function to reverse a string in Python"}
|
||||
{"input": "Create a SQL query to find users older than 21"}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input
|
||||
"Write a function to reverse a string in Python"
|
||||
"Create a SQL query to find users older than 21"
|
||||
```
|
||||
|
||||
Use rubric graders to evaluate code quality.
|
||||
|
||||
## CSV Advanced Features
|
||||
|
||||
CSV supports all the same features as JSONL by encoding complex data as JSON strings in cells:
|
||||
|
||||
**Multi-turn conversations** (requires escaped JSON array string):
|
||||
```csv
|
||||
input,ground_truth
|
||||
"[""Hello"", ""What's your name?""]","Alice"
|
||||
```
|
||||
|
||||
**Agent arguments** (requires escaped JSON object string):
|
||||
```csv
|
||||
input,agent_args
|
||||
"What items do we have?","{""initial_inventory"": [""apple"", ""banana""]}"
|
||||
```
|
||||
|
||||
**Rubric variables** (requires escaped JSON object string):
|
||||
```csv
|
||||
input,rubric_vars
|
||||
"Write a story","{""max_length"": 500, ""genre"": ""sci-fi""}"
|
||||
```
|
||||
|
||||
**Note:** Complex data structures require JSON encoding in CSV. If you're frequently using these advanced features, JSONL may be easier to read and maintain.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete configuration options including filtering
|
||||
- [Graders](./graders.md) - How to evaluate agent responses
|
||||
- [Multi-Turn Conversations](../advanced/multi-turn-conversations.md) - Testing conversational flows
|
||||
@@ -1,425 +0,0 @@
|
||||
# Datasets
|
||||
|
||||
**Datasets** are the test cases that define what your agent will be evaluated on. Each sample in your dataset represents one evaluation scenario.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **Two formats**: JSONL (flexible, powerful) or CSV (simple, spreadsheet-friendly)
|
||||
- **Required field**: `input` - the prompt(s) to send to the agent
|
||||
- **Common fields**: `ground_truth` (expected answer), `tags` (for filtering), `metadata` (extra info)
|
||||
- **Advanced fields**: `agent_args` (customize agent per sample), `rubric_vars` (per-sample rubric context)
|
||||
- **Multi-turn support**: Send multiple messages in sequence using arrays
|
||||
</Note>
|
||||
|
||||
**Typical workflow:**
|
||||
1. Create a JSONL or CSV file with test cases
|
||||
2. Reference it in your suite YAML: `dataset: test_cases.jsonl`
|
||||
3. Run evaluation - each sample is tested independently
|
||||
4. Results show per-sample and aggregate scores
|
||||
|
||||
Datasets can be created in two formats: **JSONL** or **CSV**. Choose based on your team's workflow and complexity needs.
|
||||
|
||||
## Dataset Formats
|
||||
|
||||
### JSONL Format
|
||||
|
||||
Each line is a JSON object representing one test case:
|
||||
|
||||
```jsonl
|
||||
{"input": "What's the capital of France?", "ground_truth": "Paris"}
|
||||
{"input": "Calculate 2+2", "ground_truth": "4"}
|
||||
{"input": "What color is the sky?", "ground_truth": "blue"}
|
||||
```
|
||||
|
||||
**Best for:**
|
||||
- Complex data structures (nested objects, arrays)
|
||||
- Multi-turn conversations
|
||||
- Advanced features (agent_args, rubric_vars)
|
||||
- Teams comfortable with JSON/code
|
||||
- Version control (clean line-by-line diffs)
|
||||
|
||||
### CSV Format
|
||||
|
||||
Standard CSV with headers:
|
||||
|
||||
```csv
|
||||
input,ground_truth
|
||||
"What's the capital of France?","Paris"
|
||||
"Calculate 2+2","4"
|
||||
"What color is the sky?","blue"
|
||||
```
|
||||
|
||||
**Best for:**
|
||||
- Simple question-answer pairs
|
||||
- Teams that prefer spreadsheets (Excel, Google Sheets)
|
||||
- Non-technical collaborators creating test cases
|
||||
- Quick dataset creation and editing
|
||||
- Easy sharing with non-developers
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Field | Required | Type | Purpose |
|
||||
|-------|----------|------|---------|
|
||||
| `input` | ✅ | string or array | Prompt(s) to send to agent |
|
||||
| `ground_truth` | ❌ | string | Expected answer (for tool graders) |
|
||||
| `tags` | ❌ | array of strings | For filtering samples |
|
||||
| `agent_args` | ❌ | object | Per-sample agent customization |
|
||||
| `rubric_vars` | ❌ | object | Per-sample rubric variables |
|
||||
| `metadata` | ❌ | object | Arbitrary extra data |
|
||||
| `id` | ❌ | integer | Sample ID (auto-assigned if omitted) |
|
||||
|
||||
## Field Reference
|
||||
|
||||
### Required Fields
|
||||
|
||||
#### input
|
||||
The prompt(s) to send to the agent. Can be a string or array of strings:
|
||||
|
||||
Single message:
|
||||
```json
|
||||
{"input": "Hello, who are you?"}
|
||||
```
|
||||
|
||||
Multi-turn conversation:
|
||||
```json
|
||||
{"input": ["Hello", "What's your name?", "Tell me about yourself"]}
|
||||
```
|
||||
|
||||
### Optional Fields
|
||||
|
||||
#### ground_truth
|
||||
The expected answer or content to check against. Required for most tool graders (exact_match, contains, etc.):
|
||||
|
||||
```json
|
||||
{"input": "What is 2+2?", "ground_truth": "4"}
|
||||
```
|
||||
|
||||
#### metadata
|
||||
Arbitrary additional data about the sample:
|
||||
|
||||
```json
|
||||
{
|
||||
"input": "What is photosynthesis?",
|
||||
"ground_truth": "process where plants convert light into energy",
|
||||
"metadata": {
|
||||
"category": "biology",
|
||||
"difficulty": "medium"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### tags
|
||||
List of tags for filtering samples:
|
||||
|
||||
```json
|
||||
{"input": "Solve x^2 = 16", "ground_truth": "4", "tags": ["math", "algebra"]}
|
||||
```
|
||||
|
||||
Filter by tags in your suite:
|
||||
```yaml
|
||||
sample_tags: [math] # Only samples tagged "math" will be evaluated
|
||||
```
|
||||
|
||||
#### agent_args
|
||||
|
||||
Custom arguments passed to programmatic agent creation when using `agent_script`. Allows per-sample agent customization.
|
||||
|
||||
JSONL:
|
||||
```json
|
||||
{
|
||||
"input": "What items do we have?",
|
||||
"agent_args": {
|
||||
"item": {"sku": "SKU-123", "name": "Widget A", "price": 19.99}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,agent_args
|
||||
"What items do we have?","{""item"": {""sku"": ""SKU-123"", ""name"": ""Widget A"", ""price"": 19.99}}"
|
||||
```
|
||||
|
||||
Your agent factory function can access these values via `sample.agent_args` to customize agent configuration.
|
||||
|
||||
See [Targets - agent_script](/guides/evals/concepts/targets#agent_script) for details on programmatic agent creation.
|
||||
|
||||
#### rubric_vars
|
||||
|
||||
Variables to inject into rubric templates when using rubric graders. This allows you to provide per-sample context or examples to the LLM judge.
|
||||
|
||||
**Example:** Evaluating code quality against a reference implementation.
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "Write a function to calculate fibonacci numbers", "rubric_vars": {"reference_code": "def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)", "required_features": "recursion, base case"}}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,rubric_vars
|
||||
"Write a function to calculate fibonacci numbers","{""reference_code"": ""def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)"", ""required_features"": ""recursion, base case""}"
|
||||
```
|
||||
|
||||
In your rubric template file, reference variables with `{variable_name}`:
|
||||
|
||||
**rubric.txt:**
|
||||
```
|
||||
Evaluate the submitted code against this reference implementation:
|
||||
|
||||
{reference_code}
|
||||
|
||||
Required features: {required_features}
|
||||
|
||||
Score on correctness (0.6) and code quality (0.4).
|
||||
```
|
||||
|
||||
When the rubric grader runs, variables are replaced with values from `rubric_vars`:
|
||||
|
||||
**Final formatted prompt sent to LLM:**
|
||||
```
|
||||
Evaluate the submitted code against this reference implementation:
|
||||
|
||||
def fib(n):
|
||||
if n <= 1: return n
|
||||
return fib(n-1) + fib(n-2)
|
||||
|
||||
Required features: recursion, base case
|
||||
|
||||
Score on correctness (0.6) and code quality (0.4).
|
||||
```
|
||||
|
||||
This lets you customize evaluation criteria per sample using the same rubric template.
|
||||
|
||||
See [Rubric Graders](/guides/evals/graders/rubric-graders) for details on rubric templates.
|
||||
|
||||
#### id
|
||||
Sample ID is automatically assigned (0-based index) if not provided. You can override it by setting `id` explicitly:
|
||||
|
||||
```json
|
||||
{"id": 42, "input": "Test case 42"}
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
```jsonl
|
||||
{"id": 1, "input": "What is the capital of France?", "ground_truth": "Paris", "tags": ["geography", "easy"], "metadata": {"region": "Europe"}}
|
||||
{"id": 2, "input": "Calculate the square root of 144", "ground_truth": "12", "tags": ["math", "medium"]}
|
||||
{"id": 3, "input": ["Hello", "What can you help me with?"], "tags": ["conversation"]}
|
||||
```
|
||||
|
||||
## Dataset Best Practices
|
||||
|
||||
### 1. Clear Ground Truth
|
||||
|
||||
Make ground truth specific enough to grade but flexible enough to match valid responses:
|
||||
|
||||
<Tip>
|
||||
Good:
|
||||
```json
|
||||
{"input": "What's the largest planet?", "ground_truth": "Jupiter"}
|
||||
```
|
||||
</Tip>
|
||||
|
||||
<Warning>
|
||||
Too strict (might miss valid answers):
|
||||
```json
|
||||
{"input": "What's the largest planet?", "ground_truth": "Jupiter is the largest planet in our solar system."}
|
||||
```
|
||||
</Warning>
|
||||
|
||||
### 2. Diverse Test Cases
|
||||
|
||||
Include edge cases and variations:
|
||||
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
|
||||
{"input": "What is 0.1 + 0.2?", "ground_truth": "0.3", "tags": ["math", "floating_point"]}
|
||||
{"input": "What is 999999999 + 1?", "ground_truth": "1000000000", "tags": ["math", "large_numbers"]}
|
||||
```
|
||||
|
||||
### 3. Use Tags for Organization
|
||||
|
||||
Organize samples by type, difficulty, or feature:
|
||||
|
||||
```jsonl
|
||||
{"tags": ["tool_usage", "search"]}
|
||||
{"tags": ["memory", "recall"]}
|
||||
{"tags": ["reasoning", "multi_step"]}
|
||||
```
|
||||
|
||||
### 4. Multi-Turn Conversations
|
||||
|
||||
Test conversational context and memory updates:
|
||||
|
||||
```jsonl
|
||||
{"input": ["My name is Alice", "What's my name?"], "ground_truth": "Alice", "tags": ["memory", "recall"]}
|
||||
{"input": ["Please remember that I like bananas.", "Actually, sorry, I meant I like apples."], "ground_truth": "apples", "tags": ["memory", "correction"]}
|
||||
{"input": ["I work at Google", "Update my workplace to Microsoft", "Where do I work?"], "ground_truth": "Microsoft", "tags": ["memory", "multi_step"]}
|
||||
```
|
||||
|
||||
<Tip>
|
||||
**Testing memory corrections:** Use multi-turn inputs to test if agents properly update memory when users correct themselves. Combine with the `memory_block` extractor to verify the final memory state, not just the response.
|
||||
</Tip>
|
||||
|
||||
### 5. No Ground Truth for LLM Judges
|
||||
|
||||
If using rubric graders, ground truth is optional:
|
||||
|
||||
```jsonl
|
||||
{"input": "Write a creative story about a robot", "tags": ["creative"]}
|
||||
{"input": "Explain quantum computing simply", "tags": ["explanation"]}
|
||||
```
|
||||
|
||||
The LLM judge evaluates based on the rubric, not ground truth.
|
||||
|
||||
## Loading Datasets
|
||||
|
||||
Datasets are automatically loaded by the runner:
|
||||
|
||||
```yaml
|
||||
dataset: path/to/dataset.jsonl # Path to your test cases (JSONL or CSV)
|
||||
```
|
||||
|
||||
Paths are relative to the suite YAML file location.
|
||||
|
||||
## Dataset Filtering
|
||||
|
||||
### Limit Sample Count
|
||||
|
||||
```yaml
|
||||
max_samples: 10 # Only evaluate first 10 samples (useful for testing)
|
||||
```
|
||||
|
||||
### Filter by Tags
|
||||
|
||||
```yaml
|
||||
sample_tags: [math, medium] # Only samples with ALL these tags
|
||||
```
|
||||
|
||||
## Creating Datasets Programmatically
|
||||
|
||||
You can generate datasets with Python:
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
samples = []
|
||||
for i in range(100):
|
||||
samples.append({
|
||||
"input": f"What is {i} + {i}?",
|
||||
"ground_truth": str(i + i),
|
||||
"tags": ["math", "addition"]
|
||||
})
|
||||
|
||||
with open("dataset.jsonl", "w") as f:
|
||||
for sample in samples:
|
||||
f.write(json.dumps(sample) + "\n")
|
||||
```
|
||||
|
||||
## Dataset Format Validation
|
||||
|
||||
The runner validates:
|
||||
- Each line is valid JSON (for JSONL datasets)
|
||||
- Required fields are present
|
||||
- Field types are correct
|
||||
|
||||
Validation errors will be reported with line numbers.
|
||||
|
||||
## Examples by Use Case
|
||||
|
||||
### Question Answering
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "What is the capital of France?", "ground_truth": "Paris"}
|
||||
{"input": "Who wrote Romeo and Juliet?", "ground_truth": "Shakespeare"}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,ground_truth
|
||||
"What is the capital of France?","Paris"
|
||||
"Who wrote Romeo and Juliet?","Shakespeare"
|
||||
```
|
||||
|
||||
### Tool Usage Testing
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "Search for information about pandas", "ground_truth": "search"}
|
||||
{"input": "Calculate 15 * 23", "ground_truth": "calculator"}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input,ground_truth
|
||||
"Search for information about pandas","search"
|
||||
"Calculate 15 * 23","calculator"
|
||||
```
|
||||
|
||||
In these samples, the ground truth is the name of the tool the agent is expected to call.
|
||||
|
||||
### Memory Testing (Multi-turn)
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": ["Remember that my favorite color is blue", "What's my favorite color?"], "ground_truth": "blue"}
|
||||
{"input": ["I live in Tokyo", "Where do I live?"], "ground_truth": "Tokyo"}
|
||||
```
|
||||
|
||||
CSV (using JSON array strings):
|
||||
```csv
|
||||
input,ground_truth
|
||||
"[""Remember that my favorite color is blue"", ""What's my favorite color?""]","blue"
|
||||
"[""I live in Tokyo"", ""Where do I live?""]","Tokyo"
|
||||
```
|
||||
|
||||
### Code Generation
|
||||
|
||||
JSONL:
|
||||
```jsonl
|
||||
{"input": "Write a function to reverse a string in Python"}
|
||||
{"input": "Create a SQL query to find users older than 21"}
|
||||
```
|
||||
|
||||
CSV:
|
||||
```csv
|
||||
input
|
||||
"Write a function to reverse a string in Python"
|
||||
"Create a SQL query to find users older than 21"
|
||||
```
|
||||
|
||||
Use rubric graders to evaluate code quality.
|
||||
|
||||
## CSV Advanced Features
|
||||
|
||||
CSV supports all the same features as JSONL by encoding complex data as JSON strings in cells:
|
||||
|
||||
**Multi-turn conversations** (requires escaped JSON array string):
|
||||
```csv
|
||||
input,ground_truth
|
||||
"[""Hello"", ""What's your name?""]","Alice"
|
||||
```
|
||||
|
||||
**Agent arguments** (requires escaped JSON object string):
|
||||
```csv
|
||||
input,agent_args
|
||||
"What items do we have?","{""initial_inventory"": [""apple"", ""banana""]}"
|
||||
```
|
||||
|
||||
**Rubric variables** (requires escaped JSON object string):
|
||||
```csv
|
||||
input,rubric_vars
|
||||
"Write a story","{""max_length"": 500, ""genre"": ""sci-fi""}"
|
||||
```
|
||||
|
||||
<Note>
|
||||
Complex data structures require JSON encoding in CSV. If you're frequently using these advanced features, JSONL may be easier to read and maintain.
|
||||
</Note>
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Suite YAML Reference](/guides/evals/configuration/suite-yaml) - Complete configuration options including filtering
|
||||
- [Graders](/guides/evals/concepts/graders) - How to evaluate agent responses
|
||||
- [Multi-Turn Conversations](/guides/evals/advanced/multi-turn-conversations) - Testing conversational flows
|
||||
@@ -1,394 +0,0 @@
|
||||
# Extractors
|
||||
|
||||
**Extractors** select what content to evaluate from an agent's response. They navigate the conversation trajectory and extract the specific piece you want to grade.
|
||||
|
||||
**Quick overview:**
|
||||
- **Purpose**: Agent responses are complex (messages, tool calls, memory) - extractors isolate what to grade
|
||||
- **Built-in options**: last_assistant, tool_arguments, memory_block, pattern, and more
|
||||
- **Flexible**: Different graders can use different extractors in the same suite
|
||||
- **Automatic**: No setup needed - just specify in your grader config
|
||||
|
||||
**Common patterns:**
|
||||
- `last_assistant` - Most common, gets the agent's final message (90% of use cases)
|
||||
- `tool_arguments` - Verify agent called the right tool with correct args
|
||||
- `memory_block` - Check if agent updated memory correctly
|
||||
- `pattern` - Extract structured data with regex
|
||||
|
||||
Extractors determine what part of the agent's response gets graded. They pull out specific content from the conversation trajectory.
|
||||
|
||||
## Why Extractors?
|
||||
|
||||
An agent's response is complex - it includes assistant messages, tool calls, tool returns, memory updates, etc. Extractors let you focus on exactly what you want to evaluate.
|
||||
|
||||
**The evaluation flow:**
|
||||
```
|
||||
Agent Response → Extractor → Submission Text → Grader → Score
|
||||
```
|
||||
|
||||
For example:
|
||||
```
|
||||
Full trajectory:
|
||||
UserMessage: "What's the capital of France?"
|
||||
ToolCallMessage: search(query="capital of france")
|
||||
ToolReturnMessage: "Paris is the capital..."
|
||||
AssistantMessage: "The capital of France is Paris."
|
||||
|
||||
↓ extractor: last_assistant ↓
|
||||
|
||||
Extracted: "The capital of France is Paris."
|
||||
|
||||
↓ grader: contains (ground_truth="Paris") ↓
|
||||
|
||||
Score: 1.0
|
||||
```
|
||||
|
||||
## Trajectory Structure
|
||||
|
||||
A trajectory is a list of turns, where each turn is a list of Letta messages:
|
||||
|
||||
```python
|
||||
[
|
||||
[UserMessage(...), AssistantMessage(...), ToolCallMessage(...), ToolReturnMessage(...)], # Turn 1
|
||||
[AssistantMessage(...)] # Turn 2
|
||||
]
|
||||
```
|
||||
|
||||
Extractors navigate this structure to pull out the submission text.
|
||||
|
||||
## Built-in Extractors
|
||||
|
||||
### last_assistant
|
||||
|
||||
Extracts the last assistant message content.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant # Extract final agent message
|
||||
```
|
||||
|
||||
Most common extractor - gets the agent's final response.
|
||||
|
||||
### first_assistant
|
||||
|
||||
Extracts the first assistant message content.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
initial_response:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: first_assistant # Extract first agent message
|
||||
```
|
||||
|
||||
Useful for testing immediate responses before tool usage.
|
||||
|
||||
### all_assistant
|
||||
|
||||
Concatenates all assistant messages with a separator.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
complete_response:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
extractor: all_assistant # Concatenate all agent messages
|
||||
extractor_config:
|
||||
separator: "\n\n" # Join messages with double newline
|
||||
```
|
||||
|
||||
Use when you need every assistant message from the conversation, not just the final one.
|
||||
|
||||
### last_turn
|
||||
|
||||
Extracts all assistant messages from the last turn only.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
final_turn:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_turn # Messages from final turn only
|
||||
extractor_config:
|
||||
separator: " " # Join with spaces
|
||||
```
|
||||
|
||||
Useful when the agent makes multiple statements in the final turn.
|
||||
|
||||
### pattern
|
||||
|
||||
Extracts content matching a regex pattern from assistant messages.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
extract_number:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: pattern # Extract using regex
|
||||
extractor_config:
|
||||
pattern: 'Result: (\d+)' # Regex pattern to match
|
||||
group: 1 # Extract capture group 1
|
||||
search_all: false # Only find first match
|
||||
```
|
||||
|
||||
Example: Extract "42" from "The answer is Result: 42"
|
||||
|
||||
### tool_arguments
|
||||
|
||||
Extracts arguments from a specific tool call.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
search_query:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
extractor_config:
|
||||
tool_name: search # Which tool to extract from
|
||||
```
|
||||
|
||||
Returns the JSON arguments as a string.
|
||||
|
||||
Example: If agent calls `search(query="pandas", limit=10)`, extracts:
|
||||
```json
|
||||
{"query": "pandas", "limit": 10}
|
||||
```
|
||||
|
||||
### tool_output
|
||||
|
||||
Extracts the return value from a specific tool call.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
search_results:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_output # Extract tool return value
|
||||
extractor_config:
|
||||
tool_name: search # Which tool's output to extract
|
||||
```
|
||||
|
||||
Finds the tool call and its corresponding return message.
|
||||
|
||||
### after_marker
|
||||
|
||||
Extracts content after a specific marker string.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
answer_section:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: after_marker # Extract content after marker
|
||||
extractor_config:
|
||||
marker: "ANSWER:" # Marker string to find
|
||||
include_marker: false # Don't include "ANSWER:" in output
|
||||
```
|
||||
|
||||
Example: From "Here's my analysis... ANSWER: Paris", extracts "Paris"
|
||||
|
||||
### memory_block
|
||||
|
||||
Extracts content from a specific memory block (requires agent_state).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
human_memory:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: memory_block # Extract from agent memory
|
||||
extractor_config:
|
||||
block_label: human # Which memory block to extract
|
||||
```
|
||||
|
||||
**Important**: This extractor requires the agent's final state, which adds overhead. The runner automatically fetches agent_state when this extractor is used.
|
||||
|
||||
Example use case: Verify the agent correctly updated its memory about the user.
|
||||
|
||||
## Extractor Configuration
|
||||
|
||||
Some extractors accept additional configuration via `extractor_config`:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: pattern # Use pattern extractor
|
||||
extractor_config: # Configuration for this extractor
|
||||
pattern: 'Answer: (.*)' # Regex pattern
|
||||
group: 1 # Extract capture group 1
|
||||
```
|
||||
|
||||
## Choosing an Extractor
|
||||
|
||||
| Use Case | Recommended Extractor |
|
||||
|----------|---------------------|
|
||||
| Final agent response | `last_assistant` |
|
||||
| First response before tools | `first_assistant` |
|
||||
| Complete conversation | `all_assistant` |
|
||||
| Specific format extraction | `pattern` |
|
||||
| Tool usage validation | `tool_arguments` |
|
||||
| Tool result checking | `tool_output` |
|
||||
| Memory validation | `memory_block` |
|
||||
| Structured output | `after_marker` |
|
||||
|
||||
## Content Flattening
|
||||
|
||||
Assistant messages can contain multiple content parts. Extractors automatically flatten complex content to plain text.
|
||||
|
||||
## Empty Extraction
|
||||
|
||||
If an extractor finds no matching content, it returns an empty string `""`. This typically results in a score of 0.0 from the grader.
|
||||
|
||||
## Custom Extractors
|
||||
|
||||
You can write custom extractors. See [Custom Extractors](../extractors/custom.md) for details.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import extractor
|
||||
from letta_client import LettaMessageUnion
|
||||
|
||||
@extractor
|
||||
def my_extractor(trajectory: List[List[LettaMessageUnion]], config: dict) -> str:
|
||||
# Custom extraction logic
|
||||
return extracted_text
|
||||
```
|
||||
|
||||
Register by importing in your suite's setup script or custom evaluators file.
|
||||
|
||||
## Multi-Metric Extraction
|
||||
|
||||
Different graders can use different extractors:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
response_quality: # Evaluate final message quality
|
||||
kind: rubric
|
||||
prompt_path: quality.txt
|
||||
extractor: last_assistant # Extract final response
|
||||
|
||||
tool_usage: # Check tool was called correctly
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: tool_arguments # Extract tool args
|
||||
extractor_config:
|
||||
tool_name: search # From search tool
|
||||
|
||||
memory_update: # Verify memory updated
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block # Extract from memory
|
||||
extractor_config:
|
||||
block_label: human # Human memory block
|
||||
```
|
||||
|
||||
Each grader independently extracts and evaluates different aspects.
|
||||
|
||||
## Listing Extractors
|
||||
|
||||
See all available extractors:
|
||||
|
||||
```bash
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Extract Final Answer
|
||||
|
||||
```yaml
|
||||
extractor: last_assistant # Get final agent message
|
||||
```
|
||||
|
||||
Agent: "Let me search... *uses tool* ... The answer is Paris."
|
||||
Extracted: "The answer is Paris."
|
||||
|
||||
### Extract Tool Arguments
|
||||
|
||||
```yaml
|
||||
extractor: tool_arguments # Get tool call args
|
||||
extractor_config:
|
||||
tool_name: search # From search tool
|
||||
```
|
||||
|
||||
Agent calls: `search(query="pandas", limit=5)`
|
||||
Extracted: `{"query": "pandas", "limit": 5}`
|
||||
|
||||
### Extract Pattern
|
||||
|
||||
```yaml
|
||||
extractor: pattern # Extract with regex
|
||||
extractor_config:
|
||||
pattern: 'RESULT: (\w+)' # Match pattern
|
||||
group: 1 # Extract capture group 1
|
||||
```
|
||||
|
||||
Agent: "After calculation... RESULT: SUCCESS"
|
||||
Extracted: "SUCCESS"
|
||||
|
||||
### Extract Memory
|
||||
|
||||
```yaml
|
||||
extractor: memory_block # Extract from agent memory
|
||||
extractor_config:
|
||||
block_label: human # Human memory block
|
||||
```
|
||||
|
||||
Agent updates memory block "human" to: "User's name is Alice"
|
||||
Extracted: "User's name is Alice"
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Extractor returns empty string
|
||||
|
||||
**Problem**: Grader always gives score 0.0 because extractor finds nothing.
|
||||
|
||||
**Common causes**:
|
||||
- **Wrong extractor**: Using `first_assistant` but agent doesn't respond until after tool use → use `last_assistant`
|
||||
- **Wrong tool name**: `tool_arguments` with `tool_name: "search"` but agent calls `"web_search"` → check actual tool name
|
||||
- **Wrong memory block**: `memory_block` with `block_label: "user"` but block is actually labeled `"human"` → check block labels
|
||||
- **Pattern doesn't match**: `pattern: "Answer: (.*)"` but agent says "The answer is..." → adjust regex
|
||||
|
||||
**Debug tips**:
|
||||
1. Check the trajectory in results JSON to see actual agent output
|
||||
2. Use `last_assistant` first to see what's there
|
||||
3. Verify extractor names with `letta-evals list-extractors`, and check tool names against the tool calls recorded in the trajectory
|
||||
|
||||
### Pattern extractor not working
|
||||
|
||||
**Problem**: Pattern extractor returns empty or wrong content.
|
||||
|
||||
**Solutions**:
|
||||
- Test your regex separately first
|
||||
- Remember to escape special characters: `\.`, `\(`, `\)`
|
||||
- Use `group: 0` to see the full match (default)
|
||||
- Use `group: 1` to extract first capture group
|
||||
- Set `search_all: true` if you need all matches
|
||||
|
||||
### Memory block extractor fails
|
||||
|
||||
**Problem**: `memory_block` extractor causes errors or returns nothing.
|
||||
|
||||
**Solutions**:
|
||||
- Verify the block label exactly matches (case-sensitive)
|
||||
- Check that agent actually has this memory block
|
||||
- Remember: this adds overhead by fetching agent state
|
||||
|
||||
### Tool extractor finds wrong tool
|
||||
|
||||
**Problem**: Multiple tool calls, but extractor gets the wrong one.
|
||||
|
||||
**Current behavior**: Extractors get the **first** matching tool call.
|
||||
|
||||
**Workaround**: Use custom extractor to implement more specific logic.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Built-in Extractors Reference](../extractors/builtin.md) - Complete extractor documentation
|
||||
- [Custom Extractors Guide](../extractors/custom.md) - Write your own extractors
|
||||
- [Graders](./graders.md) - How to use extractors with graders
|
||||
@@ -1,374 +0,0 @@
|
||||
# Extractors
|
||||
|
||||
**Extractors** select what content to evaluate from an agent's response. They navigate the conversation trajectory and extract the specific piece you want to grade.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **Purpose**: Agent responses are complex (messages, tool calls, memory) - extractors isolate what to grade
|
||||
- **Built-in options**: last_assistant, tool_arguments, memory_block, pattern, and more
|
||||
- **Flexible**: Different graders can use different extractors in the same suite
|
||||
- **Automatic**: No setup needed - just specify in your grader config
|
||||
</Note>
|
||||
|
||||
**Common patterns:**
|
||||
- `last_assistant` - Most common, gets the agent's final message (90% of use cases)
|
||||
- `tool_arguments` - Verify agent called the right tool with correct args
|
||||
- `memory_block` - Check if agent updated memory correctly
|
||||
- `pattern` - Extract structured data with regex
|
||||
|
||||
Extractors determine what part of the agent's response gets graded. They pull out specific content from the conversation trajectory.
|
||||
|
||||
## Why Extractors?
|
||||
|
||||
An agent's response is complex - it includes assistant messages, tool calls, tool returns, memory updates, etc. Extractors let you focus on exactly what you want to evaluate.
|
||||
|
||||
**The evaluation flow:**
|
||||
```
|
||||
Agent Response → Extractor → Submission Text → Grader → Score
|
||||
```
|
||||
|
||||
For example:
|
||||
```
|
||||
Full trajectory:
|
||||
UserMessage: "What's the capital of France?"
|
||||
ToolCallMessage: search(query="capital of france")
|
||||
ToolReturnMessage: "Paris is the capital..."
|
||||
AssistantMessage: "The capital of France is Paris."
|
||||
|
||||
↓ extractor: last_assistant ↓
|
||||
|
||||
Extracted: "The capital of France is Paris."
|
||||
|
||||
↓ grader: contains (ground_truth="Paris") ↓
|
||||
|
||||
Score: 1.0
|
||||
```
|
||||
|
||||
## Trajectory Structure
|
||||
|
||||
A trajectory is a list of turns, where each turn is a list of Letta messages:
|
||||
|
||||
```python
|
||||
[
|
||||
[UserMessage(...), AssistantMessage(...), ToolCallMessage(...), ToolReturnMessage(...)], # Turn 1
|
||||
[AssistantMessage(...)] # Turn 2
|
||||
]
|
||||
```
|
||||
|
||||
Extractors navigate this structure to pull out the submission text.
|
||||
|
||||
## Built-in Extractors
|
||||
|
||||
### last_assistant
|
||||
|
||||
Extracts the last assistant message content.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant # Extract final agent message
|
||||
```
|
||||
|
||||
Most common extractor - gets the agent's final response.
|
||||
|
||||
### first_assistant
|
||||
|
||||
Extracts the first assistant message content.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
initial_response:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: first_assistant # Extract first agent message
|
||||
```
|
||||
|
||||
Useful for testing immediate responses before tool usage.
|
||||
|
||||
### all_assistant
|
||||
|
||||
Concatenates all assistant messages with a separator.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
complete_response:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
extractor: all_assistant # Concatenate all agent messages
|
||||
extractor_config:
|
||||
separator: "\n\n" # Join messages with double newline
|
||||
```
|
||||
|
||||
Use when you need every assistant message from the conversation, not just the final one.
|
||||
|
||||
### last_turn
|
||||
|
||||
Extracts all assistant messages from the last turn only.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
final_turn:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_turn # Messages from final turn only
|
||||
extractor_config:
|
||||
separator: " " # Join with spaces
|
||||
```
|
||||
|
||||
Useful when the agent makes multiple statements in the final turn.
|
||||
|
||||
### pattern
|
||||
|
||||
Extracts content matching a regex pattern from assistant messages.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
extract_number:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: pattern # Extract using regex
|
||||
extractor_config:
|
||||
pattern: 'Result: (\d+)' # Regex pattern to match
|
||||
group: 1 # Extract capture group 1
|
||||
search_all: false # Only find first match
|
||||
```
|
||||
|
||||
Example: Extract "42" from "The answer is Result: 42"
|
||||
|
||||
### tool_arguments
|
||||
|
||||
Extracts arguments from a specific tool call.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
search_query:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
extractor_config:
|
||||
tool_name: search # Which tool to extract from
|
||||
```
|
||||
|
||||
Returns the JSON arguments as a string.
|
||||
|
||||
Example: If agent calls `search(query="pandas", limit=10)`, extracts:
|
||||
```json
|
||||
{"query": "pandas", "limit": 10}
|
||||
```
|
||||
|
||||
### tool_output
|
||||
|
||||
Extracts the return value from a specific tool call.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
search_results:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_output # Extract tool return value
|
||||
extractor_config:
|
||||
tool_name: search # Which tool's output to extract
|
||||
```
|
||||
|
||||
Finds the tool call and its corresponding return message.
|
||||
|
||||
### after_marker
|
||||
|
||||
Extracts content after a specific marker string.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
answer_section:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: after_marker # Extract content after marker
|
||||
extractor_config:
|
||||
marker: "ANSWER:" # Marker string to find
|
||||
include_marker: false # Don't include "ANSWER:" in output
|
||||
```
|
||||
|
||||
Example: From "Here's my analysis... ANSWER: Paris", extracts "Paris"
|
||||
|
||||
### memory_block
|
||||
|
||||
Extracts content from a specific memory block (requires agent_state).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
human_memory:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: memory_block # Extract from agent memory
|
||||
extractor_config:
|
||||
block_label: human # Which memory block to extract
|
||||
```
|
||||
|
||||
<Warning>
|
||||
**Important**: This extractor requires the agent's final state, which adds overhead. The runner automatically fetches agent_state when this extractor is used.
|
||||
</Warning>
|
||||
|
||||
Example use case: Verify the agent correctly updated its memory about the user.
|
||||
|
||||
## Extractor Configuration
|
||||
|
||||
Some extractors accept additional configuration via `extractor_config`:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: pattern # Use pattern extractor
|
||||
extractor_config: # Configuration for this extractor
|
||||
pattern: 'Answer: (.*)' # Regex pattern
|
||||
group: 1 # Extract capture group 1
|
||||
```
|
||||
|
||||
## Choosing an Extractor
|
||||
|
||||
| Use Case | Recommended Extractor |
|
||||
|----------|---------------------|
|
||||
| Final agent response | `last_assistant` |
|
||||
| First response before tools | `first_assistant` |
|
||||
| Complete conversation | `all_assistant` |
|
||||
| Specific format extraction | `pattern` |
|
||||
| Tool usage validation | `tool_arguments` |
|
||||
| Tool result checking | `tool_output` |
|
||||
| Memory validation | `memory_block` |
|
||||
| Structured output | `after_marker` |
|
||||
|
||||
## Content Flattening
|
||||
|
||||
Assistant messages can contain multiple content parts. Extractors automatically flatten complex content to plain text.
|
||||
|
||||
## Empty Extraction
|
||||
|
||||
If an extractor finds no matching content, it returns an empty string `""`. This typically results in a score of 0.0 from the grader.
|
||||
|
||||
## Custom Extractors
|
||||
|
||||
You can write custom extractors. See [Custom Extractors](/guides/evals/extractors/custom-extractors) for details.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import extractor
|
||||
from letta_client import LettaMessageUnion
|
||||
|
||||
@extractor
|
||||
def my_extractor(trajectory: List[List[LettaMessageUnion]], config: dict) -> str:
|
||||
# Custom extraction logic
|
||||
return extracted_text
|
||||
```
|
||||
|
||||
Register by importing in your suite's setup script or custom evaluators file.
|
||||
|
||||
## Multi-Metric Extraction
|
||||
|
||||
Different graders can use different extractors:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
response_quality: # Evaluate final message quality
|
||||
kind: rubric
|
||||
prompt_path: quality.txt
|
||||
extractor: last_assistant # Extract final response
|
||||
|
||||
tool_usage: # Check tool was called correctly
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: tool_arguments # Extract tool args
|
||||
extractor_config:
|
||||
tool_name: search # From search tool
|
||||
|
||||
memory_update: # Verify memory updated
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block # Extract from memory
|
||||
extractor_config:
|
||||
block_label: human # Human memory block
|
||||
```
|
||||
|
||||
Each grader independently extracts and evaluates different aspects.
|
||||
|
||||
## Listing Extractors
|
||||
|
||||
See all available extractors:
|
||||
|
||||
```bash
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Extract Final Answer
|
||||
|
||||
```yaml
|
||||
extractor: last_assistant # Get final agent message
|
||||
```
|
||||
|
||||
Agent: "Let me search... *uses tool* ... The answer is Paris."
|
||||
Extracted: "The answer is Paris."
|
||||
|
||||
### Extract Tool Arguments
|
||||
|
||||
```yaml
|
||||
extractor: tool_arguments # Get tool call args
|
||||
extractor_config:
|
||||
tool_name: search # From search tool
|
||||
```
|
||||
|
||||
Agent calls: `search(query="pandas", limit=5)`
|
||||
Extracted: `{"query": "pandas", "limit": 5}`
|
||||
|
||||
### Extract Pattern
|
||||
|
||||
```yaml
|
||||
extractor: pattern # Extract with regex
|
||||
extractor_config:
|
||||
pattern: 'RESULT: (\w+)' # Match pattern
|
||||
group: 1 # Extract capture group 1
|
||||
```
|
||||
|
||||
Agent: "After calculation... RESULT: SUCCESS"
|
||||
Extracted: "SUCCESS"
|
||||
|
||||
### Extract Memory
|
||||
|
||||
```yaml
|
||||
extractor: memory_block # Extract from agent memory
|
||||
extractor_config:
|
||||
block_label: human # Human memory block
|
||||
```
|
||||
|
||||
Agent updates memory block "human" to: "User's name is Alice"
|
||||
Extracted: "User's name is Alice"
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
<Warning>
|
||||
**Extractor returns empty string**
|
||||
|
||||
**Problem**: Grader always gives score 0.0 because extractor finds nothing.
|
||||
|
||||
**Common causes**:
|
||||
- **Wrong extractor**: Using `first_assistant` but agent doesn't respond until after tool use → use `last_assistant`
|
||||
- **Wrong tool name**: `tool_arguments` with `tool_name: "search"` but agent calls `"web_search"` → check actual tool name
|
||||
- **Wrong memory block**: `memory_block` with `block_label: "user"` but block is actually labeled `"human"` → check block labels
|
||||
- **Pattern doesn't match**: `pattern: "Answer: (.*)"` but agent says "The answer is..." → adjust regex
|
||||
</Warning>
|
||||
|
||||
<Tip>
|
||||
**Debug tips**:
|
||||
1. Check the trajectory in results JSON to see actual agent output
|
||||
2. Use `last_assistant` first to see what's there
|
||||
3. Verify extractor names with `letta-evals list-extractors`, and check tool names against the tool calls recorded in the trajectory
|
||||
</Tip>
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Built-in Extractors Reference](/guides/evals/extractors/built-in-extractors) - Complete extractor documentation
|
||||
- [Custom Extractors Guide](/guides/evals/extractors/custom-extractors) - Write your own extractors
|
||||
- [Graders](/guides/evals/concepts/graders) - How to use extractors with graders
|
||||
@@ -1,375 +0,0 @@
|
||||
# Gates
|
||||
|
||||
**Gates** are the pass/fail criteria for your evaluation. They determine whether your agent meets the required performance threshold by checking aggregate metrics.
|
||||
|
||||
**Quick overview:**
|
||||
- **Single decision**: One gate per suite determines pass/fail
|
||||
- **Two metrics**: `avg_score` (average of all scores) or `accuracy` (percentage passing threshold)
|
||||
- **Flexible operators**: >=, >, <=, <, == for threshold comparison
|
||||
- **Customizable pass criteria**: Define what counts as "passing" for accuracy calculations
|
||||
- **Exit codes**: Suite exits 0 for pass, 1 for fail
|
||||
|
||||
**Common patterns:**
|
||||
- `avg_score >= 0.8` - Average score must be 80%+
|
||||
- `accuracy >= 0.9` - 90%+ of samples must pass
|
||||
- Custom threshold - Define per-sample pass criteria with `pass_value`
|
||||
|
||||
Gates define the pass/fail criteria for your evaluation. They check if aggregate metrics meet a threshold.
|
||||
|
||||
## Basic Structure
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Which grader to evaluate
|
||||
metric: avg_score # Use average score (default)
|
||||
op: gte # Greater than or equal
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
## Why Use Gates?
|
||||
|
||||
Gates provide **automated pass/fail decisions** for your evaluations, which is essential for:
|
||||
|
||||
**CI/CD Integration**: Gates let you block deployments if agent performance drops:
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
# Exit code 0 = pass (continue deployment)
|
||||
# Exit code 1 = fail (block deployment)
|
||||
```
|
||||
|
||||
**Regression Testing**: Set a baseline threshold and ensure new changes don't degrade performance:
|
||||
```yaml
|
||||
gate:
|
||||
metric: avg_score
|
||||
op: gte
|
||||
value: 0.85 # Must maintain 85%+ to pass
|
||||
```
|
||||
|
||||
**Quality Enforcement**: Require that agents meet minimum standards before production:
|
||||
```yaml
|
||||
gate:
|
||||
metric: accuracy
|
||||
op: gte
|
||||
value: 0.95 # 95% of test cases must pass
|
||||
```
|
||||
|
||||
### What Happens When Gates Fail?
|
||||
|
||||
When a gate condition is not met:
|
||||
|
||||
1. **Console output** shows failure message:
|
||||
```
|
||||
✗ FAILED (0.72/1.00 avg, 72.0% pass rate)
|
||||
Gate check failed: avg_score (0.72) not >= 0.80
|
||||
```
|
||||
|
||||
2. **Exit code** is 1 (non-zero indicates failure):
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
echo $? # Prints 1 if gate failed
|
||||
```
|
||||
|
||||
3. **Results JSON** includes `gate_passed: false`:
|
||||
```json
|
||||
{
|
||||
"gate_passed": false,
|
||||
"gate_check": {
|
||||
"metric": "avg_score",
|
||||
"value": 0.72,
|
||||
"threshold": 0.80,
|
||||
"operator": "gte",
|
||||
"passed": false
|
||||
},
|
||||
"metrics": { ... }
|
||||
}
|
||||
```
|
||||
|
||||
4. **All other data is preserved** - you still get full results, scores, and trajectories even when gating fails
|
||||
|
||||
**Common use case in CI**:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
letta-evals run suite.yaml --output results.json
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "❌ Agent evaluation failed - blocking merge"
|
||||
exit 1
|
||||
else
|
||||
echo "✅ Agent evaluation passed - safe to merge"
|
||||
fi
|
||||
```
|
||||
|
||||
## Required Fields
|
||||
|
||||
### metric_key
|
||||
|
||||
Which grader to evaluate. Must match a key in your `graders` section:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Grader name
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Must match grader name above
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
If you only have one grader, `metric_key` can be omitted - it will default to your single grader.
|
||||
|
||||
### metric
|
||||
|
||||
Which aggregate statistic to compare. Two options:
|
||||
|
||||
#### avg_score
|
||||
|
||||
Average score across all samples (0.0 to 1.0):
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: avg_score # Use average of all scores
|
||||
op: gte # >=
|
||||
value: 0.7 # Must average 70%+
|
||||
```
|
||||
|
||||
Example: If scores are [0.8, 0.9, 0.6], avg_score = 0.77
|
||||
|
||||
#### accuracy
|
||||
|
||||
Pass rate expressed as a fraction from 0.0 to 1.0 (for example, 0.8 means 80% of samples passed):
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Check accuracy grader
|
||||
metric: accuracy # Use pass rate, not average
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% of samples must pass
|
||||
```
|
||||
|
||||
By default, samples with score >= 1.0 are considered "passing".
|
||||
|
||||
You can customize the per-sample threshold with `pass_op` and `pass_value` (see below).
|
||||
|
||||
**Note**: The default `metric` is `avg_score`, so you can omit it if that's what you want:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
op: gte # >=
|
||||
value: 0.7 # 70% threshold (defaults to avg_score)
|
||||
```
|
||||
|
||||
### op
|
||||
|
||||
Comparison operator:
|
||||
|
||||
- `gte`: Greater than or equal (>=)
|
||||
- `gt`: Greater than (>)
|
||||
- `lte`: Less than or equal (<=)
|
||||
- `lt`: Less than (<)
|
||||
- `eq`: Equal (==)
|
||||
|
||||
Most common: `gte` (at least X)
|
||||
|
||||
### value
|
||||
|
||||
Threshold value for comparison:
|
||||
|
||||
- For `avg_score`: 0.0 to 1.0
|
||||
- For `accuracy`: 0.0 to 1.0 (the fraction of samples that pass, e.g. 0.9 = 90%)
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric: avg_score # Average score
|
||||
op: gte # >=
|
||||
value: 0.75 # 75% threshold
|
||||
```
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric: accuracy # Pass rate
|
||||
op: gte # >=
|
||||
value: 0.9 # 90% must pass
|
||||
```
|
||||
|
||||
## Optional Fields
|
||||
|
||||
### pass_op and pass_value
|
||||
|
||||
Customize when individual samples are considered "passing" (used for accuracy calculation):
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% must pass
|
||||
pass_op: gte # Sample passes if >=
|
||||
pass_value: 0.7 # This threshold (70%)
|
||||
```
|
||||
|
||||
Default behavior:
|
||||
- If `metric` is `avg_score`: samples pass if score >= the gate value
|
||||
- If `metric` is `accuracy`: samples pass if score >= 1.0 (perfect)
|
||||
|
||||
## Examples
|
||||
|
||||
### Require 80% Average Score
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: avg_score # Use average
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% average
|
||||
```
|
||||
|
||||
Passes if the average score across all samples is >= 0.8
|
||||
|
||||
### Require 90% Pass Rate (Perfect Scores)
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Check accuracy grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.9 # 90% must pass (default: score >= 1.0 to pass)
|
||||
```
|
||||
|
||||
Passes if at least 90% of samples have a score of 1.0.
|
||||
|
||||
### Require 75% Pass Rate (Score >= 0.7)
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.75 # 75% must pass
|
||||
pass_op: gte # Sample passes if >=
|
||||
pass_value: 0.7 # 70% threshold per sample
|
||||
```
|
||||
|
||||
Passes if at least 75% of samples have a score >= 0.7.
|
||||
|
||||
### Maximum Error Rate
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.95 # 95% must pass (allows 5% failures)
|
||||
pass_op: gt # Sample passes if >
|
||||
pass_value: 0.0 # 0.0 (any non-zero score)
|
||||
```
|
||||
|
||||
Passes if at least 95% of samples have a non-zero score, i.e. allows up to 5% of samples to fail with a score of 0.0.
|
||||
|
||||
### Exact Pass Rate
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: eq # Exactly equal
|
||||
value: 1.0 # 100% (all samples must pass)
|
||||
```
|
||||
|
||||
All samples must pass.
|
||||
|
||||
## Multi-Metric Gating
|
||||
|
||||
When you have multiple graders, you can only gate on one metric:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # First metric
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
completeness: # Second metric
|
||||
kind: rubric
|
||||
prompt_path: completeness.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Only gate on accuracy (completeness still computed)
|
||||
metric: avg_score # Use average
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
The evaluation passes/fails based on the gated metric, but results include scores for all metrics.
|
||||
|
||||
## Understanding avg_score vs accuracy
|
||||
|
||||
### avg_score
|
||||
- Arithmetic mean of all scores
|
||||
- Sensitive to partial credit
|
||||
- Good for continuous evaluation
|
||||
|
||||
Example:
|
||||
- Scores: [1.0, 0.8, 0.6]
|
||||
- avg_score = (1.0 + 0.8 + 0.6) / 3 = 0.8
|
||||
|
||||
### accuracy
|
||||
- Percentage of samples meeting a threshold
|
||||
- Binary pass/fail per sample
|
||||
- Good for strict requirements
|
||||
|
||||
Example:
|
||||
- Scores: [1.0, 0.8, 0.6]
|
||||
- pass_value: 0.7
|
||||
- Passing: [1.0, 0.8] = 2 out of 3
|
||||
- accuracy = 2/3 = 0.667 (66.7%)
|
||||
|
||||
## Errors and Attempted Samples
|
||||
|
||||
If a sample fails (error during evaluation), it:
|
||||
- Gets a score of 0.0
|
||||
- Counts toward `total` but not `total_attempted`
|
||||
- Is included in `avg_score_total` but not `avg_score_attempted`
|
||||
|
||||
You can gate on either:
|
||||
- `avg_score_total`: Includes errors as 0.0
|
||||
- `avg_score_attempted`: Excludes errors (only successfully attempted samples)
|
||||
|
||||
**Note**: The `metric` field currently only supports `avg_score` and `accuracy`. By default, gates use `avg_score_attempted`.
|
||||
|
||||
## Gate Results
|
||||
|
||||
After evaluation, you'll see:
|
||||
|
||||
```
|
||||
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
✗ FAILED (1.80/3.00 avg, 60.0% pass rate)
|
||||
```
|
||||
|
||||
The evaluation exit code reflects the gate result:
|
||||
- 0: Passed
|
||||
- 1: Failed
|
||||
|
||||
## Advanced Gating
|
||||
|
||||
For complex gating logic (e.g., "pass if accuracy >= 80% OR avg_score >= 0.9"), you'll need to:
|
||||
1. Run evaluation with one gate
|
||||
2. Examine the results JSON
|
||||
3. Apply custom logic in a post-processing script
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Understanding Results](../results/overview.md) - Interpreting evaluation output
|
||||
- [Multi-Metric Evaluation](../graders/multi-metric.md) - Using multiple graders
|
||||
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete gate configuration
|
||||
@@ -1,384 +0,0 @@
|
||||
# Gates
|
||||
|
||||
**Gates** are the pass/fail criteria for your evaluation. They determine whether your agent meets the required performance threshold by checking aggregate metrics.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **Single decision**: One gate per suite determines pass/fail
|
||||
- **Two metrics**: `avg_score` (average of all scores) or `accuracy` (percentage passing threshold)
|
||||
- **Flexible operators**: `>=`, `>`, `<=`, `<`, `==` for threshold comparison
|
||||
- **Customizable pass criteria**: Define what counts as "passing" for accuracy calculations
|
||||
- **Exit codes**: Suite exits 0 for pass, 1 for fail
|
||||
</Note>
|
||||
|
||||
**Common patterns:**
|
||||
- Average score must be 80%+: `avg_score >= 0.8`
|
||||
- 90%+ of samples must pass: `accuracy >= 0.9`
|
||||
- Custom threshold: Define per-sample pass criteria with `pass_value`
|
||||
|
||||
Gates define the pass/fail criteria for your evaluation. They check if aggregate metrics meet a threshold.
|
||||
|
||||
## Basic Structure
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Which grader to evaluate
|
||||
metric: avg_score # Use average score (default)
|
||||
op: gte # Greater than or equal
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
## Why Use Gates?
|
||||
|
||||
Gates provide **automated pass/fail decisions** for your evaluations, which is essential for:
|
||||
|
||||
**CI/CD Integration**: Gates let you block deployments if agent performance drops:
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
# Exit code 0 = pass (continue deployment)
|
||||
# Exit code 1 = fail (block deployment)
|
||||
```
|
||||
|
||||
**Regression Testing**: Set a baseline threshold and ensure new changes don't degrade performance:
|
||||
```yaml
|
||||
gate:
|
||||
metric: avg_score
|
||||
op: gte
|
||||
value: 0.85 # Must maintain 85%+ to pass
|
||||
```
|
||||
|
||||
**Quality Enforcement**: Require agents to meet minimum standards before production:
|
||||
```yaml
|
||||
gate:
|
||||
metric: accuracy
|
||||
op: gte
|
||||
value: 0.95 # 95% of test cases must pass
|
||||
```
|
||||
|
||||
### What Happens When Gates Fail?
|
||||
|
||||
When a gate condition is not met:
|
||||
|
||||
1. **Console output** shows failure message:
|
||||
```text
|
||||
✗ FAILED (0.72/1.00 avg, 72.0% pass rate)
|
||||
Gate check failed: avg_score (0.72) not >= 0.80
|
||||
```
|
||||
|
||||
2. **Exit code** is 1 (non-zero indicates failure):
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
echo $? # Prints 1 if gate failed
|
||||
```
|
||||
|
||||
3. **Results JSON** includes `gate_passed: false`:
|
||||
```json
|
||||
{
|
||||
"gate_passed": false,
|
||||
"gate_check": {
|
||||
"metric": "avg_score",
|
||||
"value": 0.72,
|
||||
"threshold": 0.80,
|
||||
"operator": "gte",
|
||||
"passed": false
|
||||
},
|
||||
"metrics": { ... }
|
||||
}
|
||||
```
|
||||
|
||||
4. **All other data is preserved** - you still get full results, scores, and trajectories even when gating fails
|
||||
|
||||
<Tip>
|
||||
**Common use case in CI**:
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
letta-evals run suite.yaml --output results.json
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "❌ Agent evaluation failed - blocking merge"
|
||||
exit 1
|
||||
else
|
||||
echo "✅ Agent evaluation passed - safe to merge"
|
||||
fi
|
||||
```
|
||||
</Tip>
|
||||
|
||||
## Required Fields
|
||||
|
||||
### metric_key
|
||||
|
||||
Which grader to evaluate. Must match a key in your `graders` section:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Grader name
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Must match grader name above
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
If you only have one grader, `metric_key` can be omitted - it will default to your single grader.
|
||||
|
||||
### metric
|
||||
|
||||
Which aggregate statistic to compare. Two options:
|
||||
|
||||
#### avg_score
|
||||
|
||||
Average score across all samples (0.0 to 1.0):
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: avg_score # Use average of all scores
|
||||
op: gte # >=
|
||||
value: 0.7 # Must average 70%+
|
||||
```
|
||||
|
||||
Example: If scores are [0.8, 0.9, 0.6], avg_score = 0.77
|
||||
|
||||
#### accuracy
|
||||
|
||||
Pass rate as a fraction of samples (0.0 to 1.0, where 1.0 means 100% of samples pass):
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Check accuracy grader
|
||||
metric: accuracy # Use pass rate, not average
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% of samples must pass
|
||||
```
|
||||
|
||||
By default, samples with score `>= 1.0` are considered "passing".
|
||||
|
||||
You can customize the per-sample threshold with `pass_op` and `pass_value` (see below).
|
||||
|
||||
<Note>
|
||||
**Note**: The default `metric` is `avg_score`, so you can omit it if that's what you want:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
op: gte # >=
|
||||
value: 0.7 # 70% threshold (defaults to avg_score)
|
||||
```
|
||||
</Note>
|
||||
|
||||
### op
|
||||
|
||||
Comparison operator:
|
||||
|
||||
- `gte`: Greater than or equal (`>=`)
|
||||
- `gt`: Greater than (`>`)
|
||||
- `lte`: Less than or equal (`<=`)
|
||||
- `lt`: Less than (`<`)
|
||||
- `eq`: Equal (`==`)
|
||||
|
||||
Most common: `gte` (at least X)
|
||||
|
||||
### value
|
||||
|
||||
Threshold value for comparison:
|
||||
|
||||
- For `avg_score`: 0.0 to 1.0
|
||||
- For `accuracy`: 0.0 to 1.0 (representing percentage)
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric: avg_score # Average score
|
||||
op: gte # >=
|
||||
value: 0.75 # 75% threshold
|
||||
```
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric: accuracy # Pass rate
|
||||
op: gte # >=
|
||||
value: 0.9 # 90% must pass
|
||||
```
|
||||
|
||||
## Optional Fields
|
||||
|
||||
### pass_op and pass_value
|
||||
|
||||
Customize when individual samples are considered "passing" (used for accuracy calculation):
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% must pass
|
||||
pass_op: gte # Sample passes if >=
|
||||
pass_value: 0.7 # This threshold (70%)
|
||||
```
|
||||
|
||||
Default behavior:
|
||||
- If `metric` is `avg_score`: samples pass if score `>=` the gate value
|
||||
- If `metric` is `accuracy`: samples pass if score `>= 1.0` (perfect)
|
||||
|
||||
## Examples
|
||||
|
||||
### Require 80% Average Score
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: avg_score # Use average
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% average
|
||||
```
|
||||
|
||||
Passes if the average score across all samples is `>= 0.8`
|
||||
|
||||
### Require 90% Pass Rate (Perfect Scores)
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Check accuracy grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.9 # 90% must pass (default: score >= 1.0 to pass)
|
||||
```
|
||||
|
||||
Passes if at least 90% of samples have a score of 1.0.
|
||||
|
||||
### Require 75% Pass Rate (Score `>= 0.7`)
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.75 # 75% must pass
|
||||
pass_op: gte # Sample passes if >=
|
||||
pass_value: 0.7 # 70% threshold per sample
|
||||
```
|
||||
|
||||
Passes if at least 75% of samples have a score `>= 0.7`.
|
||||
|
||||
### Maximum Error Rate
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: gte # >=
|
||||
value: 0.95 # 95% must pass (allows 5% failures)
|
||||
pass_op: gt # Sample passes if >
|
||||
pass_value: 0.0 # 0.0 (any non-zero score)
|
||||
```
|
||||
|
||||
Passes if at least 95% of samples have a non-zero score, i.e. allows up to 5% of samples to fail with a score of 0.0.
|
||||
|
||||
### Exact Pass Rate
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: quality # Check quality grader
|
||||
metric: accuracy # Use pass rate
|
||||
op: eq # Exactly equal
|
||||
value: 1.0 # 100% (all samples must pass)
|
||||
```
|
||||
|
||||
All samples must pass.
|
||||
|
||||
## Multi-Metric Gating
|
||||
|
||||
When you have multiple graders, you can only gate on one metric:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # First metric
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
completeness: # Second metric
|
||||
kind: rubric
|
||||
prompt_path: completeness.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Only gate on accuracy (completeness still computed)
|
||||
metric: avg_score # Use average
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
The evaluation passes/fails based on the gated metric, but results include scores for all metrics.
|
||||
|
||||
## Understanding avg_score vs accuracy
|
||||
|
||||
### avg_score
|
||||
- Arithmetic mean of all scores
|
||||
- Sensitive to partial credit
|
||||
- Good for continuous evaluation
|
||||
|
||||
Example:
|
||||
- Scores: [1.0, 0.8, 0.6]
|
||||
- avg_score = (1.0 + 0.8 + 0.6) / 3 = 0.8
|
||||
|
||||
### accuracy
|
||||
- Percentage of samples meeting a threshold
|
||||
- Binary pass/fail per sample
|
||||
- Good for strict requirements
|
||||
|
||||
Example:
|
||||
- Scores: [1.0, 0.8, 0.6]
|
||||
- pass_value: 0.7
|
||||
- Passing: [1.0, 0.8] = 2 out of 3
|
||||
- accuracy = 2/3 = 0.667 (66.7%)
|
||||
|
||||
## Errors and Attempted Samples
|
||||
|
||||
If a sample fails (error during evaluation), it:
|
||||
- Gets a score of 0.0
|
||||
- Counts toward `total` but not `total_attempted`
|
||||
- Is included in `avg_score_total` but not `avg_score_attempted`
|
||||
|
||||
You can gate on either:
|
||||
- `avg_score_total`: Includes errors as 0.0
|
||||
- `avg_score_attempted`: Excludes errors (only successfully attempted samples)
|
||||
|
||||
<Note>
|
||||
**Note**: The `metric` field currently only supports `avg_score` and `accuracy`. By default, gates use `avg_score_attempted`.
|
||||
</Note>
|
||||
|
||||
## Gate Results
|
||||
|
||||
After evaluation, you'll see:
|
||||
|
||||
```text
|
||||
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```text
|
||||
✗ FAILED (1.80/3.00 avg, 60.0% pass rate)
|
||||
```
|
||||
|
||||
The evaluation exit code reflects the gate result:
|
||||
- 0: Passed
|
||||
- 1: Failed
|
||||
|
||||
## Advanced Gating
|
||||
|
||||
For complex gating logic (e.g., "pass if accuracy `>= 80%` OR avg_score `>= 0.9`"), you'll need to:
|
||||
1. Run evaluation with one gate
|
||||
2. Examine the results JSON
|
||||
3. Apply custom logic in a post-processing script
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Understanding Results](/evals/results-metrics/understanding-results) - Interpreting evaluation output
|
||||
- [Multi-Metric Evaluation](/guides/evals/graders/multi-metric) - Using multiple graders
|
||||
- [Suite YAML Reference](/guides/evals/configuration/suite-yaml) - Complete gate configuration
|
||||
@@ -1,328 +0,0 @@
|
||||
# Graders
|
||||
|
||||
**Graders** are the scoring functions that evaluate agent responses. They take the extracted submission (from an extractor) and assign a score between 0.0 (complete failure) and 1.0 (perfect success).
|
||||
|
||||
**Quick overview:**
|
||||
- **Two types**: Tool graders (deterministic Python functions) and Rubric graders (LLM-as-judge)
|
||||
- **Built-in functions**: exact_match, contains, regex_match, ascii_printable_only
|
||||
- **Custom graders**: Write your own grading logic
|
||||
- **Multi-metric**: Combine multiple graders in one suite
|
||||
- **Flexible extraction**: Each grader can use a different extractor
|
||||
|
||||
**When to use each:**
|
||||
- **Tool graders**: Fast, deterministic, free - perfect for exact matching, patterns, tool validation
|
||||
- **Rubric graders**: Flexible, subjective, costs API calls - ideal for quality, creativity, nuanced evaluation
|
||||
|
||||
Graders evaluate agent responses and assign scores between 0.0 (complete failure) and 1.0 (perfect success).
|
||||
|
||||
## Grader Types
|
||||
|
||||
There are two types of graders:
|
||||
|
||||
### Tool Graders
|
||||
|
||||
Python functions that programmatically compare the submission to ground truth or apply deterministic checks.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool # Deterministic grading
|
||||
function: exact_match # Built-in grading function
|
||||
extractor: last_assistant # Use final agent response
|
||||
```
|
||||
|
||||
Best for:
|
||||
- Exact matching
|
||||
- Pattern checking
|
||||
- Tool call validation
|
||||
- Deterministic criteria
|
||||
|
||||
### Rubric Graders
|
||||
|
||||
LLM-as-judge evaluation using custom prompts and criteria. Can use either direct LLM API calls or a Letta agent as the judge.
|
||||
|
||||
**Standard rubric grading (LLM API):**
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # LLM-as-judge
|
||||
prompt_path: rubric.txt # Custom evaluation criteria
|
||||
model: gpt-4o-mini # Judge model
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
**Agent-as-judge (Letta agent):**
|
||||
```yaml
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric # Still "rubric" kind
|
||||
agent_file: judge.af # Judge agent with submit_grade tool
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
Best for:
|
||||
- Subjective quality assessment
|
||||
- Open-ended responses
|
||||
- Nuanced evaluation
|
||||
- Complex criteria
|
||||
- Judges that need tools (when using agent-as-judge)
|
||||
|
||||
## Built-in Tool Graders
|
||||
|
||||
### exact_match
|
||||
|
||||
Checks if submission exactly matches ground truth (case-sensitive, whitespace-trimmed).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match # Case-sensitive, whitespace-trimmed
|
||||
extractor: last_assistant # Extract final response
|
||||
```
|
||||
|
||||
Requires: `ground_truth` in dataset
|
||||
|
||||
Score: 1.0 if exact match, 0.0 otherwise
|
||||
|
||||
### contains
|
||||
|
||||
Checks if submission contains ground truth (case-insensitive).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
contains_answer:
|
||||
kind: tool
|
||||
function: contains # Case-insensitive substring match
|
||||
extractor: last_assistant # Search in final response
|
||||
```
|
||||
|
||||
Requires: `ground_truth` in dataset
|
||||
|
||||
Score: 1.0 if found, 0.0 otherwise
|
||||
|
||||
### regex_match
|
||||
|
||||
Checks if the submission matches the regex pattern given in the sample's `ground_truth`.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
pattern:
|
||||
kind: tool
|
||||
function: regex_match # Pattern matching
|
||||
extractor: last_assistant # Check final response
|
||||
```
|
||||
|
||||
Dataset sample:
|
||||
```json
|
||||
{"input": "Generate a UUID", "ground_truth": "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"}
|
||||
```
|
||||
|
||||
Score: 1.0 if pattern matches, 0.0 otherwise
|
||||
|
||||
### ascii_printable_only
|
||||
|
||||
Validates that all characters are printable ASCII (useful for ASCII art, formatted output).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
ascii_check:
|
||||
kind: tool
|
||||
function: ascii_printable_only # Validate ASCII characters
|
||||
extractor: last_assistant # Check final response
|
||||
```
|
||||
|
||||
Does not require ground truth.
|
||||
|
||||
Score: 1.0 if all characters are printable ASCII, 0.0 if any non-printable characters are found
|
||||
|
||||
## Rubric Graders
|
||||
|
||||
Rubric graders use an LLM to evaluate responses based on custom criteria.
|
||||
|
||||
### Basic Configuration
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # LLM-as-judge
|
||||
prompt_path: quality_rubric.txt # Evaluation criteria
|
||||
model: gpt-4o-mini # Judge model
|
||||
temperature: 0.0 # Deterministic
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
### Rubric Prompt Format
|
||||
|
||||
Your rubric file should describe the evaluation criteria. Use placeholders:
|
||||
|
||||
- `{input}`: The original input from the dataset
|
||||
- `{submission}`: The extracted agent response
|
||||
- `{ground_truth}`: Ground truth from dataset (if available)
|
||||
|
||||
Example `quality_rubric.txt`:
|
||||
```
|
||||
Evaluate the response for:
|
||||
1. Accuracy: Does it correctly answer the question?
|
||||
2. Completeness: Is the answer thorough?
|
||||
3. Clarity: Is it well-explained?
|
||||
|
||||
Input: {input}
|
||||
Expected: {ground_truth}
|
||||
Response: {submission}
|
||||
|
||||
Score from 0.0 to 1.0 where:
|
||||
- 1.0: Perfect response
|
||||
- 0.75: Good with minor issues
|
||||
- 0.5: Acceptable but incomplete
|
||||
- 0.25: Poor quality
|
||||
- 0.0: Completely wrong
|
||||
```
|
||||
|
||||
### Inline Prompt
|
||||
|
||||
Instead of a file, you can include the prompt inline:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # LLM-as-judge
|
||||
prompt: | # Inline prompt instead of file
|
||||
Evaluate the creativity and originality of the response.
|
||||
Score 1.0 for highly creative, 0.0 for generic or unoriginal.
|
||||
model: gpt-4o-mini # Judge model
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
### Model Configuration
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
model: gpt-4o-mini # Judge model
|
||||
temperature: 0.0 # Deterministic (0.0-2.0)
|
||||
provider: openai # LLM provider (default: openai)
|
||||
max_retries: 5 # API retry attempts
|
||||
timeout: 120.0 # Request timeout in seconds
|
||||
```
|
||||
|
||||
Supported providers:
|
||||
- `openai` (default)
|
||||
|
||||
Models:
|
||||
- Any OpenAI-compatible model
|
||||
- Special handling for reasoning models (o1, o3) - temperature automatically adjusted to 1.0
|
||||
|
||||
### Structured Output
|
||||
|
||||
Rubric graders use JSON mode to get structured responses:
|
||||
|
||||
```json
|
||||
{
|
||||
"score": 0.85,
|
||||
"rationale": "The response is accurate and complete but could be more concise."
|
||||
}
|
||||
```
|
||||
|
||||
The score is validated to be between 0.0 and 1.0.
|
||||
|
||||
## Multi-Metric Configuration
|
||||
|
||||
Evaluate multiple aspects in one suite:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Tool grader for factual correctness
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
completeness: # Rubric grader for thoroughness
|
||||
kind: rubric
|
||||
prompt_path: completeness_rubric.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
tool_usage: # Tool grader for tool call validation
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: tool_arguments # Extract tool call args
|
||||
extractor_config:
|
||||
tool_name: search # Which tool to check
|
||||
```
|
||||
|
||||
Each grader can use a different extractor.
|
||||
|
||||
## Extractor Configuration
|
||||
|
||||
Every grader must specify an `extractor` to select what to grade:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: contains # Grading function
|
||||
extractor: last_assistant # What to extract and grade
|
||||
```
|
||||
|
||||
Some extractors need additional configuration:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
tool_check:
|
||||
kind: tool
|
||||
function: contains # Check if ground truth in tool args
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
extractor_config: # Configuration for this extractor
|
||||
tool_name: search # Which tool to extract from
|
||||
```
|
||||
|
||||
See [Extractors](./extractors.md) for all available extractors.
|
||||
|
||||
## Custom Graders
|
||||
|
||||
You can write custom grading functions. See [Custom Graders](../advanced/custom-graders.md) for details.
|
||||
|
||||
## Grader Selection Guide
|
||||
|
||||
| Use Case | Recommended Grader |
|
||||
|----------|-------------------|
|
||||
| Exact answer matching | `exact_match` |
|
||||
| Keyword checking | `contains` |
|
||||
| Pattern validation | `regex_match` |
|
||||
| Tool call validation | `exact_match` with `tool_arguments` extractor |
|
||||
| Quality assessment | Rubric grader |
|
||||
| Creativity evaluation | Rubric grader |
|
||||
| Format checking | Custom tool grader |
|
||||
| Multi-criteria evaluation | Multiple graders |
|
||||
|
||||
## Score Interpretation
|
||||
|
||||
All scores are between 0.0 and 1.0:
|
||||
|
||||
- **1.0**: Perfect - meets all criteria
|
||||
- **0.75-0.99**: Good - minor issues
|
||||
- **0.5-0.74**: Acceptable - notable gaps
|
||||
- **0.25-0.49**: Poor - major problems
|
||||
- **0.0-0.24**: Failed - did not meet criteria
|
||||
|
||||
Tool graders typically return binary scores (0.0 or 1.0), while rubric graders can return any value in the range.
|
||||
|
||||
## Error Handling
|
||||
|
||||
If grading fails (e.g., network error, invalid format):
|
||||
- Score is set to 0.0
|
||||
- Rationale includes error message
|
||||
- Metadata includes error details
|
||||
|
||||
This ensures evaluations can continue even with individual failures.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Tool Graders](../graders/tool-graders.md) - Built-in and custom functions
|
||||
- [Rubric Graders](../graders/rubric-graders.md) - LLM-as-judge details
|
||||
- [Multi-Metric Evaluation](../graders/multi-metric.md) - Using multiple graders
|
||||
- [Extractors](./extractors.md) - Selecting what to grade
|
||||
@@ -1,330 +0,0 @@
|
||||
# Graders
|
||||
|
||||
**Graders** are the scoring functions that evaluate agent responses. They take the extracted submission (from an extractor) and assign a score between 0.0 (complete failure) and 1.0 (perfect success).
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **Two types**: Tool graders (deterministic Python functions) and Rubric graders (LLM-as-judge)
|
||||
- **Built-in functions**: exact_match, contains, regex_match, ascii_printable_only
|
||||
- **Custom graders**: Write your own grading logic
|
||||
- **Multi-metric**: Combine multiple graders in one suite
|
||||
- **Flexible extraction**: Each grader can use a different extractor
|
||||
</Note>
|
||||
|
||||
**When to use each:**
|
||||
- **Tool graders**: Fast, deterministic, free - perfect for exact matching, patterns, tool validation
|
||||
- **Rubric graders**: Flexible, subjective, costs API calls - ideal for quality, creativity, nuanced evaluation
|
||||
|
||||
Graders evaluate agent responses and assign scores between 0.0 (complete failure) and 1.0 (perfect success).
|
||||
|
||||
## Grader Types
|
||||
|
||||
There are two types of graders:
|
||||
|
||||
### Tool Graders
|
||||
|
||||
Python functions that programmatically compare the submission to ground truth or apply deterministic checks.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool # Deterministic grading
|
||||
function: exact_match # Built-in grading function
|
||||
extractor: last_assistant # Use final agent response
|
||||
```
|
||||
|
||||
Best for:
|
||||
- Exact matching
|
||||
- Pattern checking
|
||||
- Tool call validation
|
||||
- Deterministic criteria
|
||||
|
||||
### Rubric Graders
|
||||
|
||||
LLM-as-judge evaluation using custom prompts and criteria. Can use either direct LLM API calls or a Letta agent as the judge.
|
||||
|
||||
**Standard rubric grading (LLM API):**
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # LLM-as-judge
|
||||
prompt_path: rubric.txt # Custom evaluation criteria
|
||||
model: gpt-4o-mini # Judge model
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
**Agent-as-judge (Letta agent):**
|
||||
```yaml
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric # Still "rubric" kind
|
||||
agent_file: judge.af # Judge agent with submit_grade tool
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
Best for:
|
||||
- Subjective quality assessment
|
||||
- Open-ended responses
|
||||
- Nuanced evaluation
|
||||
- Complex criteria
|
||||
- Judges that need tools (when using agent-as-judge)
|
||||
|
||||
## Built-in Tool Graders
|
||||
|
||||
### exact_match
|
||||
|
||||
Checks if submission exactly matches ground truth (case-sensitive, whitespace-trimmed).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match # Case-sensitive, whitespace-trimmed
|
||||
extractor: last_assistant # Extract final response
|
||||
```
|
||||
|
||||
Requires: `ground_truth` in dataset
|
||||
|
||||
Score: 1.0 if exact match, 0.0 otherwise
|
||||
|
||||
### contains
|
||||
|
||||
Checks if submission contains ground truth (case-insensitive).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
contains_answer:
|
||||
kind: tool
|
||||
function: contains # Case-insensitive substring match
|
||||
extractor: last_assistant # Search in final response
|
||||
```
|
||||
|
||||
Requires: `ground_truth` in dataset
|
||||
|
||||
Score: 1.0 if found, 0.0 otherwise
|
||||
|
||||
### regex_match
|
||||
|
||||
Checks if the submission matches the regex pattern given in the sample's `ground_truth`.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
pattern:
|
||||
kind: tool
|
||||
function: regex_match # Pattern matching
|
||||
extractor: last_assistant # Check final response
|
||||
```
|
||||
|
||||
Dataset sample:
|
||||
```json
|
||||
{"input": "Generate a UUID", "ground_truth": "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"}
|
||||
```
|
||||
|
||||
Score: 1.0 if pattern matches, 0.0 otherwise
|
||||
|
||||
### ascii_printable_only
|
||||
|
||||
Validates that all characters are printable ASCII (useful for ASCII art, formatted output).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
ascii_check:
|
||||
kind: tool
|
||||
function: ascii_printable_only # Validate ASCII characters
|
||||
extractor: last_assistant # Check final response
|
||||
```
|
||||
|
||||
Does not require ground truth.
|
||||
|
||||
Score: 1.0 if all characters are printable ASCII, 0.0 if any non-printable characters are found
|
||||
|
||||
## Rubric Graders
|
||||
|
||||
Rubric graders use an LLM to evaluate responses based on custom criteria.
|
||||
|
||||
### Basic Configuration
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # LLM-as-judge
|
||||
prompt_path: quality_rubric.txt # Evaluation criteria
|
||||
model: gpt-4o-mini # Judge model
|
||||
temperature: 0.0 # Deterministic
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
### Rubric Prompt Format
|
||||
|
||||
Your rubric file should describe the evaluation criteria. Use placeholders:
|
||||
|
||||
- `{input}`: The original input from the dataset
|
||||
- `{submission}`: The extracted agent response
|
||||
- `{ground_truth}`: Ground truth from dataset (if available)
|
||||
|
||||
Example `quality_rubric.txt`:
|
||||
```
|
||||
Evaluate the response for:
|
||||
1. Accuracy: Does it correctly answer the question?
|
||||
2. Completeness: Is the answer thorough?
|
||||
3. Clarity: Is it well-explained?
|
||||
|
||||
Input: {input}
|
||||
Expected: {ground_truth}
|
||||
Response: {submission}
|
||||
|
||||
Score from 0.0 to 1.0 where:
|
||||
- 1.0: Perfect response
|
||||
- 0.75: Good with minor issues
|
||||
- 0.5: Acceptable but incomplete
|
||||
- 0.25: Poor quality
|
||||
- 0.0: Completely wrong
|
||||
```
|
||||
|
||||
### Inline Prompt
|
||||
|
||||
Instead of a file, you can include the prompt inline:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # LLM-as-judge
|
||||
prompt: | # Inline prompt instead of file
|
||||
Evaluate the creativity and originality of the response.
|
||||
Score 1.0 for highly creative, 0.0 for generic or unoriginal.
|
||||
model: gpt-4o-mini # Judge model
|
||||
extractor: last_assistant # What to evaluate
|
||||
```
|
||||
|
||||
### Model Configuration
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
model: gpt-4o-mini # Judge model
|
||||
temperature: 0.0 # Deterministic (0.0-2.0)
|
||||
provider: openai # LLM provider (default: openai)
|
||||
max_retries: 5 # API retry attempts
|
||||
timeout: 120.0 # Request timeout in seconds
|
||||
```
|
||||
|
||||
Supported providers:
|
||||
- `openai` (default)
|
||||
|
||||
Models:
|
||||
- Any OpenAI-compatible model
|
||||
- Special handling for reasoning models (o1, o3) - temperature automatically adjusted to 1.0
|
||||
|
||||
### Structured Output
|
||||
|
||||
Rubric graders use JSON mode to get structured responses:
|
||||
|
||||
```json
|
||||
{
|
||||
"score": 0.85,
|
||||
"rationale": "The response is accurate and complete but could be more concise."
|
||||
}
|
||||
```
|
||||
|
||||
The score is validated to be between 0.0 and 1.0.
|
||||
|
||||
## Multi-Metric Configuration
|
||||
|
||||
Evaluate multiple aspects in one suite:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Tool grader for factual correctness
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
completeness: # Rubric grader for thoroughness
|
||||
kind: rubric
|
||||
prompt_path: completeness_rubric.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
tool_usage: # Tool grader for tool call validation
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: tool_arguments # Extract tool call args
|
||||
extractor_config:
|
||||
tool_name: search # Which tool to check
|
||||
```
|
||||
|
||||
Each grader can use a different extractor.
|
||||
|
||||
## Extractor Configuration
|
||||
|
||||
Every grader must specify an `extractor` to select what to grade:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: contains # Grading function
|
||||
extractor: last_assistant # What to extract and grade
|
||||
```
|
||||
|
||||
Some extractors need additional configuration:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
tool_check:
|
||||
kind: tool
|
||||
function: contains # Check if ground truth in tool args
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
extractor_config: # Configuration for this extractor
|
||||
tool_name: search # Which tool to extract from
|
||||
```
|
||||
|
||||
See [Extractors](/guides/evals/concepts/extractors) for all available extractors.
|
||||
|
||||
## Custom Graders
|
||||
|
||||
You can write custom grading functions. See [Custom Graders](/guides/evals/advanced/custom-graders) for details.
|
||||
|
||||
## Grader Selection Guide
|
||||
|
||||
| Use Case | Recommended Grader |
|
||||
|----------|-------------------|
|
||||
| Exact answer matching | `exact_match` |
|
||||
| Keyword checking | `contains` |
|
||||
| Pattern validation | `regex_match` |
|
||||
| Tool call validation | `exact_match` with `tool_arguments` extractor |
|
||||
| Quality assessment | Rubric grader |
|
||||
| Creativity evaluation | Rubric grader |
|
||||
| Format checking | Custom tool grader |
|
||||
| Multi-criteria evaluation | Multiple graders |
|
||||
|
||||
## Score Interpretation
|
||||
|
||||
All scores are between 0.0 and 1.0:
|
||||
|
||||
- **1.0**: Perfect - meets all criteria
|
||||
- **0.75-0.99**: Good - minor issues
|
||||
- **0.5-0.74**: Acceptable - notable gaps
|
||||
- **0.25-0.49**: Poor - major problems
|
||||
- **0.0-0.24**: Failed - did not meet criteria
|
||||
|
||||
Tool graders typically return binary scores (0.0 or 1.0), while rubric graders can return any value in the range.
|
||||
|
||||
## Error Handling
|
||||
|
||||
If grading fails (e.g., network error, invalid format):
|
||||
- Score is set to 0.0
|
||||
- Rationale includes error message
|
||||
- Metadata includes error details
|
||||
|
||||
This ensures evaluations can continue even with individual failures.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Tool Graders](/guides/evals/graders/tool-graders) - Built-in and custom functions
|
||||
- [Rubric Graders](/guides/evals/graders/rubric-graders) - LLM-as-judge details
|
||||
- [Multi-Metric Evaluation](/guides/evals/graders/multi-metric) - Using multiple graders
|
||||
- [Extractors](/guides/evals/concepts/extractors) - Selecting what to grade
|
||||
@@ -1,205 +0,0 @@
|
||||
# Core Concepts Overview
|
||||
|
||||
## What is Letta Evals?
|
||||
|
||||
Letta Evals is a framework for systematically testing and measuring the performance of Letta AI agents. It provides a structured way to:
|
||||
|
||||
- Define test cases and expected behaviors
|
||||
- Run agents against those tests automatically
|
||||
- Score agent responses using deterministic rules or LLM judges
|
||||
- Track performance over time and across different configurations
|
||||
|
||||
Think of it as a testing framework specifically designed for stateful agents.
|
||||
|
||||
## The Evaluation Flow
|
||||
|
||||
Every evaluation follows this flow:
|
||||
|
||||
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
|
||||
|
||||
1. **Dataset**: Your test cases (questions, scenarios, expected outputs)
|
||||
2. **Target**: The agent being evaluated
|
||||
3. **Extractor**: Pulls out the relevant information from the agent's response
|
||||
4. **Grader**: Scores the extracted information
|
||||
5. **Gate**: Pass/fail criteria for the overall evaluation
|
||||
6. **Result**: Metrics, scores, and detailed results
|
||||
|
||||
### Built for Stateful Agents
|
||||
|
||||
Unlike most evaluation frameworks designed for simple input-output models, Letta Evals is purpose-built for **stateful agents** - agents that:
|
||||
- Maintain memory across conversations
|
||||
- Use tools and external functions
|
||||
- Evolve their behavior based on interactions
|
||||
- Have persistent context and state
|
||||
|
||||
This means you can test:
|
||||
- **Memory updates**: Did the agent correctly remember the user's name?
|
||||
- **Multi-turn conversations**: Can the agent maintain context across multiple exchanges?
|
||||
- **Tool usage**: Does the agent call the right tools with the right arguments?
|
||||
- **State evolution**: How does the agent's internal state change over time?
|
||||
|
||||
Traditional eval frameworks treat each test as independent. Letta Evals understands that agent state matters.
|
||||
|
||||
**Example: Testing Memory Updates**
|
||||
```yaml
|
||||
graders:
|
||||
memory_check:
|
||||
kind: tool # Deterministic grading
|
||||
function: contains # Check if ground_truth in extracted content
|
||||
extractor: memory_block # Extract from agent memory (not just response!)
|
||||
extractor_config:
|
||||
block_label: human # Which memory block to check
|
||||
```
|
||||
|
||||
Dataset:
|
||||
```jsonl
|
||||
{"input": "Please remember that I like bananas.", "ground_truth": "bananas"}
|
||||
```
|
||||
|
||||
This doesn't just check if the agent responded correctly - it verifies the agent actually stored "bananas" in its memory block. Traditional eval frameworks can't inspect agent state like this.
|
||||
|
||||
## Why Evals Matter
|
||||
|
||||
AI agents are complex systems that can behave unpredictably. Without systematic evaluation, you can't:
|
||||
- **Know if changes improve or break your agent** - Did that prompt tweak help or hurt?
|
||||
- **Prevent regressions** - Catch when "fixes" break existing functionality
|
||||
- **Compare approaches objectively** - Which model works better for your use case?
|
||||
- **Build confidence before deployment** - Ensure quality before shipping to users
|
||||
- **Track improvement over time** - Measure progress as you iterate
|
||||
|
||||
Manual testing doesn't scale. Evals let you test hundreds of scenarios in minutes.
|
||||
|
||||
## What Evals Are Useful For
|
||||
|
||||
### 1. Development & Iteration
|
||||
- Test prompt changes instantly across your entire test suite
|
||||
- Experiment with different models and compare results
|
||||
- Validate that new features work as expected
|
||||
|
||||
### 2. Quality Assurance
|
||||
- Prevent regressions when modifying agent behavior
|
||||
- Ensure agents handle edge cases correctly
|
||||
- Verify tool usage and memory updates
|
||||
|
||||
### 3. Model Selection
|
||||
- Compare GPT-4 vs Claude vs other models on your specific use case
|
||||
- Test different model configurations (temperature, system prompts, etc.)
|
||||
- Find the right cost/performance tradeoff
|
||||
|
||||
### 4. Benchmarking
|
||||
- Measure agent performance on standard tasks
|
||||
- Track improvements over time
|
||||
- Share reproducible results with your team
|
||||
|
||||
### 5. Production Readiness
|
||||
- Validate agents meet quality thresholds before deployment
|
||||
- Run continuous evaluation in CI/CD pipelines
|
||||
- Monitor production agent quality
|
||||
|
||||
## How Letta Evals Works
|
||||
|
||||
Letta Evals is built around a few key concepts that work together to create a flexible evaluation framework.
|
||||
|
||||
## Key Components
|
||||
|
||||
### Suite
|
||||
|
||||
An **evaluation suite** is a complete test configuration defined in a YAML file. It ties together:
|
||||
- Which dataset to use
|
||||
- Which agent to test
|
||||
- How to grade responses
|
||||
- What criteria determine pass/fail
|
||||
|
||||
Think of a suite as a reusable test specification.
|
||||
|
||||
### Dataset
|
||||
|
||||
A **dataset** is a JSONL (or CSV) file where each line represents one test case. Each sample has:
|
||||
- An input (what to ask the agent)
|
||||
- Optional ground truth (the expected answer)
|
||||
- Optional metadata (tags, custom fields)
|
||||
|
||||
### Target
|
||||
|
||||
The **target** is what you're evaluating. Currently, this is a Letta agent, specified by:
|
||||
- An agent file (.af)
|
||||
- An existing agent ID
|
||||
- A Python script that creates agents programmatically
|
||||
|
||||
### Trajectory
|
||||
|
||||
A **trajectory** is the complete conversation history from one test case. It's a list of turns, where each turn contains a list of Letta messages (assistant messages, tool calls, tool returns, etc.).
|
||||
|
||||
### Extractor
|
||||
|
||||
An **extractor** determines what part of the trajectory to evaluate. For example:
|
||||
- The last thing the agent said
|
||||
- All tool calls made
|
||||
- Content from agent memory
|
||||
- Text matching a pattern
|
||||
|
||||
### Grader
|
||||
|
||||
A **grader** scores how well the agent performed. There are two types:
|
||||
- **Tool graders**: Python functions that compare submission to ground truth
|
||||
- **Rubric graders**: LLM judges that evaluate based on custom criteria
|
||||
|
||||
### Gate
|
||||
|
||||
A **gate** is the pass/fail threshold for your evaluation. It compares aggregate metrics (like average score or pass rate) against a target value.
|
||||
|
||||
## Multi-Metric Evaluation
|
||||
|
||||
You can define multiple graders in one suite to evaluate different aspects:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Check if answer is correct
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant # Use final response
|
||||
|
||||
tool_usage: # Check if agent called the right tool
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Extract tool call args
|
||||
extractor_config:
|
||||
tool_name: search # From search tool
|
||||
```
|
||||
|
||||
The gate can check any of these metrics:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Gate on accuracy (tool_usage still computed)
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
## Score Normalization
|
||||
|
||||
All scores are normalized to the range [0.0, 1.0]:
|
||||
- 0.0 = complete failure
|
||||
- 1.0 = perfect success
|
||||
- Values in between = partial credit
|
||||
|
||||
This allows different grader types to be compared and combined.
|
||||
|
||||
## Aggregate Metrics
|
||||
|
||||
Individual sample scores are aggregated in two ways:
|
||||
|
||||
1. **Average Score**: Mean of all scores (0.0 to 1.0)
|
||||
2. **Accuracy/Pass Rate**: Percentage of samples passing a threshold
|
||||
|
||||
You can gate on either metric type.
|
||||
|
||||
## Next Steps
|
||||
|
||||
Dive deeper into each concept:
|
||||
- [Suites](./suites.md) - Suite configuration in detail
|
||||
- [Datasets](./datasets.md) - Creating effective test datasets
|
||||
- [Targets](./targets.md) - Agent configuration options
|
||||
- [Graders](./graders.md) - Understanding grader types
|
||||
- [Extractors](./extractors.md) - Extraction strategies
|
||||
- [Gates](./gates.md) - Setting pass/fail criteria
|
||||
@@ -1,207 +0,0 @@
|
||||
# Core Concepts
|
||||
|
||||
Understanding how Letta Evals works and what makes it different.
|
||||
|
||||
<Note>
|
||||
**Just want to run an eval?** Skip to [Getting Started](/guides/evals/getting-started) for a hands-on quickstart.
|
||||
</Note>
|
||||
|
||||
## Built for Stateful Agents
|
||||
|
||||
Letta Evals is a testing framework specifically designed for agents that maintain state. Unlike traditional eval frameworks built for simple input-output models, Letta Evals understands that agents:
|
||||
|
||||
- Maintain memory across conversations
|
||||
- Use tools and external functions
|
||||
- Evolve their behavior based on interactions
|
||||
- Have persistent context and state
|
||||
|
||||
This means you can test aspects of your agent that other frameworks can't: memory updates, multi-turn conversations, tool usage patterns, and state evolution over time.
|
||||
|
||||
## The Evaluation Flow
|
||||
|
||||
Every evaluation follows this flow:
|
||||
|
||||
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
|
||||
|
||||
1. **Dataset**: Your test cases (questions, scenarios, expected outputs)
|
||||
2. **Target**: The agent being evaluated
|
||||
3. **Extractor**: Pulls out the relevant information from the agent's response
|
||||
4. **Grader**: Scores the extracted information
|
||||
5. **Gate**: Pass/fail criteria for the overall evaluation
|
||||
6. **Result**: Metrics, scores, and detailed results
|
||||
|
||||
### What You Can Test
|
||||
|
||||
With Letta Evals, you can test aspects of agents that traditional frameworks can't:
|
||||
|
||||
- **Memory updates**: Did the agent correctly remember the user's name?
|
||||
- **Multi-turn conversations**: Can the agent maintain context across multiple exchanges?
|
||||
- **Tool usage**: Does the agent call the right tools with the right arguments?
|
||||
- **State evolution**: How does the agent's internal state change over time?
|
||||
|
||||
<Note>
|
||||
**Example: Testing Memory Updates**
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
memory_check:
|
||||
kind: tool # Deterministic grading
|
||||
function: contains # Check if ground_truth in extracted content
|
||||
extractor: memory_block # Extract from agent memory (not just response!)
|
||||
extractor_config:
|
||||
block_label: human # Which memory block to check
|
||||
```
|
||||
|
||||
Dataset:
|
||||
```jsonl
|
||||
{"input": "Please remember that I like bananas.", "ground_truth": "bananas"}
|
||||
```
|
||||
|
||||
This doesn't just check if the agent responded correctly - it verifies the agent actually stored "bananas" in its memory block. Traditional eval frameworks can't inspect agent state like this.
|
||||
</Note>
|
||||
|
||||
## Why Evals Matter
|
||||
|
||||
AI agents are complex systems that can behave unpredictably. Without systematic evaluation, you can't:
|
||||
- **Know if changes improve or break your agent** - Did that prompt tweak help or hurt?
|
||||
- **Prevent regressions** - Catch when "fixes" break existing functionality
|
||||
- **Compare approaches objectively** - Which model works better for your use case?
|
||||
- **Build confidence before deployment** - Ensure quality before shipping to users
|
||||
- **Track improvement over time** - Measure progress as you iterate
|
||||
|
||||
Manual testing doesn't scale. Evals let you test hundreds of scenarios in minutes.
|
||||
|
||||
## What Evals Are Useful For
|
||||
|
||||
### 1. Development & Iteration
|
||||
- Test prompt changes instantly across your entire test suite
|
||||
- Experiment with different models and compare results
|
||||
- Validate that new features work as expected
|
||||
|
||||
### 2. Quality Assurance
|
||||
- Prevent regressions when modifying agent behavior
|
||||
- Ensure agents handle edge cases correctly
|
||||
- Verify tool usage and memory updates
|
||||
|
||||
### 3. Model Selection
|
||||
- Compare GPT-4 vs Claude vs other models on your specific use case
|
||||
- Test different model configurations (temperature, system prompts, etc.)
|
||||
- Find the right cost/performance tradeoff
|
||||
|
||||
### 4. Benchmarking
|
||||
- Measure agent performance on standard tasks
|
||||
- Track improvements over time
|
||||
- Share reproducible results with your team
|
||||
|
||||
### 5. Production Readiness
|
||||
- Validate agents meet quality thresholds before deployment
|
||||
- Run continuous evaluation in CI/CD pipelines
|
||||
- Monitor production agent quality
|
||||
|
||||
## How Letta Evals Works
|
||||
|
||||
Letta Evals is built around a few key concepts that work together to create a flexible evaluation framework.
|
||||
|
||||
## Key Components
|
||||
|
||||
### Suite
|
||||
|
||||
An **evaluation suite** is a complete test configuration defined in a YAML file. It ties together:
|
||||
- Which dataset to use
|
||||
- Which agent to test
|
||||
- How to grade responses
|
||||
- What criteria determine pass/fail
|
||||
|
||||
Think of a suite as a reusable test specification.
|
||||
|
||||
### Dataset
|
||||
|
||||
A **dataset** is a JSONL (or CSV) file where each line represents one test case. Each sample has:
|
||||
- An input (what to ask the agent)
|
||||
- Optional ground truth (the expected answer)
|
||||
- Optional metadata (tags, custom fields)
|
||||
|
||||
### Target
|
||||
|
||||
The **target** is what you're evaluating. Currently, this is a Letta agent, specified by:
|
||||
- An agent file (.af)
|
||||
- An existing agent ID
|
||||
- A Python script that creates agents programmatically
|
||||
|
||||
### Trajectory
|
||||
|
||||
A **trajectory** is the complete conversation history from one test case. It's a list of turns, where each turn contains a list of Letta messages (assistant messages, tool calls, tool returns, etc.).
|
||||
|
||||
### Extractor
|
||||
|
||||
An **extractor** determines what part of the trajectory to evaluate. For example:
|
||||
- The last thing the agent said
|
||||
- All tool calls made
|
||||
- Content from agent memory
|
||||
- Text matching a pattern
|
||||
|
||||
### Grader
|
||||
|
||||
A **grader** scores how well the agent performed. There are two types:
|
||||
- **Tool graders**: Python functions that compare submission to ground truth
|
||||
- **Rubric graders**: LLM judges that evaluate based on custom criteria
|
||||
|
||||
### Gate
|
||||
|
||||
A **gate** is the pass/fail threshold for your evaluation. It compares aggregate metrics (like average score or pass rate) against a target value.
|
||||
|
||||
## Multi-Metric Evaluation
|
||||
|
||||
You can define multiple graders in one suite to evaluate different aspects:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Check if answer is correct
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant # Use final response
|
||||
|
||||
tool_usage: # Check if agent called the right tool
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Extract tool call args
|
||||
extractor_config:
|
||||
tool_name: search # From search tool
|
||||
```
|
||||
|
||||
The gate can check any of these metrics:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Gate on accuracy (tool_usage still computed)
|
||||
op: gte # >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
## Score Normalization
|
||||
|
||||
All scores are normalized to the range [0.0, 1.0]:
|
||||
- 0.0 = complete failure
|
||||
- 1.0 = perfect success
|
||||
- Values in between = partial credit
|
||||
|
||||
This allows different grader types to be compared and combined.
|
||||
|
||||
## Aggregate Metrics
|
||||
|
||||
Individual sample scores are aggregated in two ways:
|
||||
|
||||
1. **Average Score**: Mean of all scores (0.0 to 1.0)
|
||||
2. **Accuracy/Pass Rate**: Percentage of samples passing a threshold
|
||||
|
||||
You can gate on either metric type.
|
||||
|
||||
## Next Steps
|
||||
|
||||
Dive deeper into each concept:
|
||||
- [Suites](/guides/evals/concepts/suites) - Suite configuration in detail
|
||||
- [Datasets](/guides/evals/concepts/datasets) - Creating effective test datasets
|
||||
- [Targets](/guides/evals/concepts/targets) - Agent configuration options
|
||||
- [Graders](/guides/evals/concepts/graders) - Understanding grader types
|
||||
- [Extractors](/guides/evals/concepts/extractors) - Extraction strategies
|
||||
- [Gates](/guides/evals/concepts/gates) - Setting pass/fail criteria
|
||||
@@ -1,273 +0,0 @@
|
||||
# Suites
|
||||
|
||||
A **suite** is a YAML configuration file that defines a complete evaluation specification. It's the central piece that ties together your dataset, target agent, grading criteria, and pass/fail thresholds.
|
||||
|
||||
**Quick overview:**
|
||||
- **Single file defines everything**: Dataset, agent, graders, and success criteria all in one YAML
|
||||
- **Reusable and shareable**: Version control your evaluation specs alongside your code
|
||||
- **Multi-metric support**: Evaluate multiple aspects (accuracy, quality, tool usage) in one run
|
||||
- **Multi-model testing**: Run the same suite across different LLM models
|
||||
- **Flexible filtering**: Test subsets using tags or sample limits
|
||||
|
||||
**Typical workflow:**
|
||||
1. Create a suite YAML defining what and how to test
|
||||
2. Run `letta-evals run suite.yaml`
|
||||
3. Review results showing scores for each metric
|
||||
4. Suite passes or fails based on gate criteria
|
||||
|
||||
An evaluation suite is a YAML configuration file that defines a complete test specification; the sections below describe its structure and each of its fields.
|
||||
|
||||
## Basic Structure
|
||||
|
||||
```yaml
|
||||
name: my-evaluation # Suite identifier
|
||||
description: Optional description of what this tests # Human-readable explanation
|
||||
dataset: path/to/dataset.jsonl # Test cases
|
||||
|
||||
target: # What agent to evaluate
|
||||
kind: agent
|
||||
agent_file: agent.af # Agent to test
|
||||
base_url: http://localhost:8283 # Letta server
|
||||
|
||||
graders: # How to evaluate responses
|
||||
my_metric:
|
||||
kind: tool # Deterministic grading
|
||||
function: exact_match # Grading function
|
||||
extractor: last_assistant # What to extract from agent response
|
||||
|
||||
gate: # Pass/fail criteria
|
||||
metric_key: my_metric # Which metric to check
|
||||
op: gte # Greater than or equal
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
## Required Fields
|
||||
|
||||
### name
|
||||
The name of your evaluation suite. Used in output and results.
|
||||
|
||||
```yaml
|
||||
name: question-answering-eval
|
||||
```
|
||||
|
||||
### dataset
|
||||
Path to the JSONL or CSV dataset file. Can be relative (to the suite YAML) or absolute.
|
||||
|
||||
```yaml
|
||||
dataset: ./datasets/qa.jsonl # Relative to suite YAML location
|
||||
```
|
||||
|
||||
### target
|
||||
Specifies what agent to evaluate. See [Targets](./targets.md) for details.
|
||||
|
||||
### graders
|
||||
One or more graders to evaluate agent performance. See [Graders](./graders.md) for details.
|
||||
|
||||
### gate
|
||||
Pass/fail criteria. See [Gates](./gates.md) for details.
|
||||
|
||||
## Optional Fields
|
||||
|
||||
### description
|
||||
A human-readable description of what this suite tests:
|
||||
|
||||
```yaml
|
||||
description: Tests the agent's ability to answer factual questions accurately
|
||||
```
|
||||
|
||||
### max_samples
|
||||
Limit the number of samples to evaluate (useful for quick tests):
|
||||
|
||||
```yaml
|
||||
max_samples: 10 # Only evaluate first 10 samples
|
||||
```
|
||||
|
||||
### sample_tags
|
||||
Filter samples by tags (only evaluate samples with these tags):
|
||||
|
||||
```yaml
|
||||
sample_tags: [math, easy] # Only samples tagged with "math" AND "easy"
|
||||
```
|
||||
|
||||
Dataset samples can include tags:
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
|
||||
```
|
||||
|
||||
### num_runs
|
||||
Number of times to run the entire evaluation suite (useful for testing non-deterministic behavior):
|
||||
|
||||
```yaml
|
||||
num_runs: 5 # Run the evaluation 5 times
|
||||
```
|
||||
|
||||
Default: 1
|
||||
|
||||
### setup_script
|
||||
Path to a Python script with a setup function to run before evaluation:
|
||||
|
||||
```yaml
|
||||
setup_script: setup.py:prepare_environment # script.py:function_name
|
||||
```
|
||||
|
||||
The setup function should have this signature:
|
||||
```python
|
||||
def prepare_environment(suite: SuiteSpec) -> None:
|
||||
# Setup code here
|
||||
pass
|
||||
```
|
||||
|
||||
## Path Resolution
|
||||
|
||||
Paths in the suite YAML are resolved relative to the YAML file location:
|
||||
|
||||
```
|
||||
project/
|
||||
├── suite.yaml
|
||||
├── dataset.jsonl
|
||||
└── agents/
|
||||
└── my_agent.af
|
||||
```
|
||||
|
||||
```yaml
|
||||
# In suite.yaml
|
||||
dataset: dataset.jsonl # Resolves to project/dataset.jsonl
|
||||
target:
|
||||
agent_file: agents/my_agent.af # Resolves to project/agents/my_agent.af
|
||||
```
|
||||
|
||||
Absolute paths are used as-is.
|
||||
|
||||
## Multi-Grader Suites
|
||||
|
||||
You can evaluate multiple metrics in one suite:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Check if answer is correct
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
completeness: # LLM judges response quality
|
||||
kind: rubric
|
||||
prompt_path: rubrics/completeness.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
tool_usage: # Verify correct tool was called
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
```
|
||||
|
||||
The gate can check any of these metrics:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Gate on accuracy metric (others still computed)
|
||||
op: gte # Greater than or equal
|
||||
value: 0.9 # 90% threshold
|
||||
```
|
||||
|
||||
Results will include scores for all graders, even if you only gate on one.
|
||||
|
||||
## Examples
|
||||
|
||||
### Simple Tool Grader Suite
|
||||
|
||||
```yaml
|
||||
name: basic-qa # Suite name
|
||||
dataset: questions.jsonl # Test questions
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: qa_agent.af # Pre-configured agent
|
||||
base_url: http://localhost:8283 # Local server
|
||||
|
||||
graders:
|
||||
accuracy: # Single metric
|
||||
kind: tool # Deterministic grading
|
||||
function: contains # Check if ground truth is in response
|
||||
extractor: last_assistant # Use final agent message
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Gate on this metric
|
||||
op: gte # Must be >=
|
||||
value: 0.75 # 75% to pass
|
||||
```
|
||||
|
||||
### Rubric Grader Suite
|
||||
|
||||
```yaml
|
||||
name: quality-eval # Quality evaluation
|
||||
dataset: prompts.jsonl # Test prompts
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: existing-agent-123 # Use existing agent
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
|
||||
graders:
|
||||
quality: # LLM-as-judge metric
|
||||
kind: rubric # Subjective evaluation
|
||||
prompt_path: quality_rubric.txt # Rubric template
|
||||
model: gpt-4o-mini # Judge model
|
||||
temperature: 0.0 # Deterministic
|
||||
extractor: last_assistant # Evaluate final response
|
||||
|
||||
gate:
|
||||
metric_key: quality # Gate on this metric
|
||||
metric: avg_score # Use average score
|
||||
op: gte # Must be >=
|
||||
value: 0.7 # 70% to pass
|
||||
```
|
||||
|
||||
### Multi-Model Suite
|
||||
|
||||
Test the same agent configuration across different models:
|
||||
|
||||
```yaml
|
||||
name: model-comparison # Compare model performance
|
||||
dataset: test.jsonl # Same test for all models
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af # Same agent configuration
|
||||
base_url: http://localhost:8283 # Local server
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test both models
|
||||
|
||||
graders:
|
||||
accuracy: # Single metric for comparison
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Both models must pass this
|
||||
op: gte # Must be >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
Results will show per-model metrics.
|
||||
|
||||
## Validation
|
||||
|
||||
Validate your suite configuration before running:
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
This checks:
|
||||
- Required fields are present
|
||||
- Paths exist
|
||||
- Configuration is valid
|
||||
- Grader/extractor combinations are compatible
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Dataset Configuration](./datasets.md)
|
||||
- [Target Configuration](./targets.md)
|
||||
- [Grader Configuration](./graders.md)
|
||||
- [Gate Configuration](./gates.md)
|
||||
@@ -1,275 +0,0 @@
|
||||
# Suites
|
||||
|
||||
A **suite** is a YAML configuration file that defines a complete evaluation specification. It's the central piece that ties together your dataset, target agent, grading criteria, and pass/fail thresholds.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **Single file defines everything**: Dataset, agent, graders, and success criteria all in one YAML
|
||||
- **Reusable and shareable**: Version control your evaluation specs alongside your code
|
||||
- **Multi-metric support**: Evaluate multiple aspects (accuracy, quality, tool usage) in one run
|
||||
- **Multi-model testing**: Run the same suite across different LLM models
|
||||
- **Flexible filtering**: Test subsets using tags or sample limits
|
||||
</Note>
|
||||
|
||||
**Typical workflow:**
|
||||
1. Create a suite YAML defining what and how to test
|
||||
2. Run `letta-evals run suite.yaml`
|
||||
3. Review results showing scores for each metric
|
||||
4. Suite passes or fails based on gate criteria
|
||||
|
||||
An evaluation suite is a YAML configuration file that defines a complete test specification.
|
||||
|
||||
## Basic Structure
|
||||
|
||||
```yaml
|
||||
name: my-evaluation # Suite identifier
|
||||
description: Optional description of what this tests # Human-readable explanation
|
||||
dataset: path/to/dataset.jsonl # Test cases
|
||||
|
||||
target: # What agent to evaluate
|
||||
kind: agent
|
||||
agent_file: agent.af # Agent to test
|
||||
base_url: https://api.letta.com # Letta server
|
||||
|
||||
graders: # How to evaluate responses
|
||||
my_metric:
|
||||
kind: tool # Deterministic grading
|
||||
function: exact_match # Grading function
|
||||
extractor: last_assistant # What to extract from agent response
|
||||
|
||||
gate: # Pass/fail criteria
|
||||
metric_key: my_metric # Which metric to check
|
||||
op: gte # Greater than or equal
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
## Required Fields
|
||||
|
||||
### name
|
||||
The name of your evaluation suite. Used in output and results.
|
||||
|
||||
```yaml
|
||||
name: question-answering-eval
|
||||
```
|
||||
|
||||
### dataset
|
||||
Path to the JSONL or CSV dataset file. Can be relative (to the suite YAML) or absolute.
|
||||
|
||||
```yaml
|
||||
dataset: ./datasets/qa.jsonl # Relative to suite YAML location
|
||||
```
|
||||
|
||||
### target
|
||||
Specifies what agent to evaluate. See [Targets](/guides/evals/concepts/targets) for details.
|
||||
|
||||
### graders
|
||||
One or more graders to evaluate agent performance. See [Graders](/guides/evals/concepts/graders) for details.
|
||||
|
||||
### gate
|
||||
Pass/fail criteria. See [Gates](/guides/evals/concepts/gates) for details.
|
||||
|
||||
## Optional Fields
|
||||
|
||||
### description
|
||||
A human-readable description of what this suite tests:
|
||||
|
||||
```yaml
|
||||
description: Tests the agent's ability to answer factual questions accurately
|
||||
```
|
||||
|
||||
### max_samples
|
||||
Limit the number of samples to evaluate (useful for quick tests):
|
||||
|
||||
```yaml
|
||||
max_samples: 10 # Only evaluate first 10 samples
|
||||
```
|
||||
|
||||
### sample_tags
|
||||
Filter samples by tags (only evaluate samples with these tags):
|
||||
|
||||
```yaml
|
||||
sample_tags: [math, easy] # Only samples tagged with "math" AND "easy"
|
||||
```
|
||||
|
||||
Dataset samples can include tags:
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
|
||||
```
|
||||
|
||||
### num_runs
|
||||
Number of times to run the entire evaluation suite (useful for testing non-deterministic behavior):
|
||||
|
||||
```yaml
|
||||
num_runs: 5 # Run the evaluation 5 times
|
||||
```
|
||||
|
||||
Default: 1
|
||||
|
||||
### setup_script
|
||||
Path to a Python script with a setup function to run before evaluation:
|
||||
|
||||
```yaml
|
||||
setup_script: setup.py:prepare_environment # script.py:function_name
|
||||
```
|
||||
|
||||
The setup function should have this signature:
|
||||
```python
|
||||
def prepare_environment(suite: SuiteSpec) -> None:
|
||||
# Setup code here
|
||||
pass
|
||||
```
|
||||
|
||||
## Path Resolution
|
||||
|
||||
Paths in the suite YAML are resolved relative to the YAML file location:
|
||||
|
||||
```
|
||||
project/
|
||||
├── suite.yaml
|
||||
├── dataset.jsonl
|
||||
└── agents/
|
||||
└── my_agent.af
|
||||
```
|
||||
|
||||
```yaml
|
||||
# In suite.yaml
|
||||
dataset: dataset.jsonl # Resolves to project/dataset.jsonl
|
||||
target:
|
||||
agent_file: agents/my_agent.af # Resolves to project/agents/my_agent.af
|
||||
```
|
||||
|
||||
Absolute paths are used as-is.
|
||||
|
||||
## Multi-Grader Suites
|
||||
|
||||
You can evaluate multiple metrics in one suite:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # Check if answer is correct
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
completeness: # LLM judges response quality
|
||||
kind: rubric
|
||||
prompt_path: rubrics/completeness.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
tool_usage: # Verify correct tool was called
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
```
|
||||
|
||||
The gate can check any of these metrics:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Gate on accuracy metric (others still computed)
|
||||
op: gte # Greater than or equal
|
||||
value: 0.9 # 90% threshold
|
||||
```
|
||||
|
||||
Results will include scores for all graders, even if you only gate on one.
|
||||
|
||||
## Examples
|
||||
|
||||
### Simple Tool Grader Suite
|
||||
|
||||
```yaml
|
||||
name: basic-qa # Suite name
|
||||
dataset: questions.jsonl # Test questions
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: qa_agent.af # Pre-configured agent
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
|
||||
graders:
|
||||
accuracy: # Single metric
|
||||
kind: tool # Deterministic grading
|
||||
function: contains # Check if ground truth is in response
|
||||
extractor: last_assistant # Use final agent message
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Gate on this metric
|
||||
op: gte # Must be >=
|
||||
value: 0.75 # 75% to pass
|
||||
```
|
||||
|
||||
### Rubric Grader Suite
|
||||
|
||||
```yaml
|
||||
name: quality-eval # Quality evaluation
|
||||
dataset: prompts.jsonl # Test prompts
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: existing-agent-123 # Use existing agent
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
|
||||
graders:
|
||||
quality: # LLM-as-judge metric
|
||||
kind: rubric # Subjective evaluation
|
||||
prompt_path: quality_rubric.txt # Rubric template
|
||||
model: gpt-4o-mini # Judge model
|
||||
temperature: 0.0 # Deterministic
|
||||
extractor: last_assistant # Evaluate final response
|
||||
|
||||
gate:
|
||||
metric_key: quality # Gate on this metric
|
||||
metric: avg_score # Use average score
|
||||
op: gte # Must be >=
|
||||
value: 0.7 # 70% to pass
|
||||
```
|
||||
|
||||
### Multi-Model Suite
|
||||
|
||||
Test the same agent configuration across different models:
|
||||
|
||||
```yaml
|
||||
name: model-comparison # Compare model performance
|
||||
dataset: test.jsonl # Same test for all models
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af # Same agent configuration
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test both models
|
||||
|
||||
graders:
|
||||
accuracy: # Single metric for comparison
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Both models must pass this
|
||||
op: gte # Must be >=
|
||||
value: 0.8 # 80% threshold
|
||||
```
|
||||
|
||||
Results will show per-model metrics.
|
||||
|
||||
## Validation
|
||||
|
||||
Validate your suite configuration before running:
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
This checks:
|
||||
- Required fields are present
|
||||
- Paths exist
|
||||
- Configuration is valid
|
||||
- Grader/extractor combinations are compatible
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Dataset Configuration](/guides/evals/concepts/datasets)
|
||||
- [Target Configuration](/guides/evals/concepts/targets)
|
||||
- [Grader Configuration](/guides/evals/concepts/graders)
|
||||
- [Gate Configuration](/guides/evals/concepts/gates)
|
||||
@@ -1,319 +0,0 @@
|
||||
# Targets
|
||||
|
||||
A **target** is the agent you're evaluating. In Letta Evals, the target configuration determines how agents are created, accessed, and tested.
|
||||
|
||||
**Quick overview:**
|
||||
- **Three ways to specify agents**: agent file (`.af`), existing agent ID, or programmatic creation script
|
||||
- **Critical distinction**: `agent_file`/`agent_script` create fresh agents per sample (isolated tests), while `agent_id` uses one agent for all samples (stateful conversation)
|
||||
- **Multi-model support**: Test the same agent configuration across different LLM models
|
||||
- **Flexible connection**: Connect to local Letta servers or Letta Cloud
|
||||
|
||||
**When to use each approach:**
|
||||
- `agent_file` - Pre-configured agents saved as `.af` files (most common)
|
||||
- `agent_id` - Testing existing agents or multi-turn conversations with state
|
||||
- `agent_script` - Dynamic agent creation with per-sample customization
|
||||
|
||||
The target configuration specifies how to create or access the agent for evaluation.
|
||||
|
||||
## Target Configuration
|
||||
|
||||
All targets have a `kind` field (currently only `agent` is supported):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent # Currently only "agent" is supported
|
||||
# ... agent-specific configuration
|
||||
```
|
||||
|
||||
## Agent Sources
|
||||
|
||||
You must specify exactly ONE of these:
|
||||
|
||||
### agent_file
|
||||
|
||||
Path to a `.af` (Agent File) to upload:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: path/to/agent.af # Path to .af file
|
||||
base_url: http://localhost:8283 # Letta server URL
|
||||
```
|
||||
|
||||
The agent file will be uploaded to the Letta server and a new agent created for the evaluation.
|
||||
|
||||
### agent_id
|
||||
|
||||
ID of an existing agent on the server:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: agent-123-abc # ID of existing agent
|
||||
base_url: http://localhost:8283 # Letta server URL
|
||||
```
|
||||
|
||||
The existing agent will be used directly. Note: this agent's memory will be modified during evaluation.
|
||||
|
||||
### agent_script
|
||||
|
||||
Path to a Python script with an agent factory function for programmatic agent creation:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_script: create_agent.py:create_inventory_agent # script.py:function_name
|
||||
base_url: http://localhost:8283 # Letta server URL
|
||||
```
|
||||
|
||||
Format: `path/to/script.py:function_name`
|
||||
|
||||
The function must be decorated with `@agent_factory` and have the signature `async (client: AsyncLetta, sample: Sample) -> str`:
|
||||
|
||||
```python
|
||||
from letta_client import AsyncLetta, CreateBlock
|
||||
from letta_evals.decorators import agent_factory
|
||||
from letta_evals.models import Sample
|
||||
|
||||
@agent_factory
|
||||
async def create_inventory_agent(client: AsyncLetta, sample: Sample) -> str:
|
||||
"""Create and return agent ID for this sample."""
|
||||
# Access custom arguments from the dataset
|
||||
item = sample.agent_args.get("item", {})
|
||||
|
||||
# Create agent with sample-specific configuration
|
||||
agent = await client.agents.create(
|
||||
name="inventory-assistant",
|
||||
memory_blocks=[
|
||||
CreateBlock(
|
||||
label="item_context",
|
||||
value=f"Item: {item.get('name', 'Unknown')}"
|
||||
)
|
||||
],
|
||||
agent_type="letta_v1_agent",
|
||||
model="openai/gpt-4.1-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
)
|
||||
|
||||
return agent.id
|
||||
```
|
||||
|
||||
**Key features:**
|
||||
- Creates a fresh agent for each sample
|
||||
- Can customize agents using `sample.agent_args` from the dataset
|
||||
- Allows testing agent creation logic itself
|
||||
- Useful when you don't have pre-saved agent files
|
||||
|
||||
**When to use:**
|
||||
- Testing agent creation workflows
|
||||
- Dynamic per-sample agent configuration
|
||||
- Agents that need sample-specific memory or tools
|
||||
- Programmatic agent testing
|
||||
|
||||
See [`examples/programmatic-agent-creation/`](https://github.com/letta-ai/letta-evals/tree/main/examples/programmatic-agent-creation) for a complete working example.
|
||||
|
||||
## Connection Configuration
|
||||
|
||||
### base_url
|
||||
|
||||
Letta server URL:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
base_url: http://localhost:8283 # Local Letta server
|
||||
# or
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
```
|
||||
|
||||
Default: `http://localhost:8283`
|
||||
|
||||
### api_key
|
||||
|
||||
API key for authentication (required for Letta Cloud):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
api_key: your-api-key-here # Required for Letta Cloud
|
||||
```
|
||||
|
||||
Or set via environment variable:
|
||||
```bash
|
||||
export LETTA_API_KEY=your-api-key-here
|
||||
```
|
||||
|
||||
### project_id
|
||||
|
||||
Letta project ID (for Letta Cloud):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
project_id: proj_abc123 # Letta Cloud project
|
||||
```
|
||||
|
||||
Or set via environment variable:
|
||||
```bash
|
||||
export LETTA_PROJECT_ID=proj_abc123
|
||||
```
|
||||
|
||||
### timeout
|
||||
|
||||
Request timeout in seconds:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
timeout: 300.0 # Request timeout (5 minutes)
|
||||
```
|
||||
|
||||
Default: 300 seconds
|
||||
|
||||
## Multi-Model Evaluation
|
||||
|
||||
Test the same agent across different models:
|
||||
|
||||
### model_configs
|
||||
|
||||
List of model configuration names from JSON files:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test with both models
|
||||
```
|
||||
|
||||
The evaluation will run once for each model config. Model configs are JSON files in `letta_evals/llm_model_configs/`.
|
||||
|
||||
### model_handles
|
||||
|
||||
List of model handles (cloud-compatible identifiers):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"] # Cloud model identifiers
|
||||
```
|
||||
|
||||
Use this for Letta Cloud deployments.
|
||||
|
||||
**Note**: You cannot specify both `model_configs` and `model_handles`.
|
||||
|
||||
## Complete Examples
|
||||
|
||||
### Local Development
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: ./agents/my_agent.af # Pre-configured agent
|
||||
base_url: http://localhost:8283 # Local server
|
||||
```
|
||||
|
||||
### Letta Cloud
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: agent-cloud-123 # Existing cloud agent
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
api_key: ${LETTA_API_KEY} # From environment variable
|
||||
project_id: proj_abc # Your project ID
|
||||
```
|
||||
|
||||
### Multi-Model Testing
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af # Same agent configuration
|
||||
base_url: http://localhost:8283 # Local server
|
||||
model_configs: [gpt-4o-mini, gpt-4o, claude-3-5-sonnet] # Test 3 models
|
||||
```
|
||||
|
||||
Results will include per-model metrics:
|
||||
```
|
||||
Model: gpt-4o-mini - Avg: 0.85, Pass: 85.0%
|
||||
Model: gpt-4o - Avg: 0.92, Pass: 92.0%
|
||||
Model: claude-3-5-sonnet - Avg: 0.88, Pass: 88.0%
|
||||
```
|
||||
|
||||
### Programmatic Agent Creation
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_script: setup.py:CustomAgentFactory # Programmatic creation
|
||||
base_url: http://localhost:8283 # Local server
|
||||
```
|
||||
|
||||
## Environment Variable Precedence
|
||||
|
||||
Configuration values are resolved in this order (highest priority first):
|
||||
|
||||
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
|
||||
2. Suite YAML configuration
|
||||
3. Environment variables (`LETTA_API_KEY`, `LETTA_BASE_URL`, `LETTA_PROJECT_ID`)
|
||||
|
||||
## Agent Lifecycle and Testing Behavior
|
||||
|
||||
The way your agent is specified fundamentally changes how the evaluation runs:
|
||||
|
||||
### With agent_file or agent_script: Independent Testing
|
||||
|
||||
**Agent lifecycle:**
|
||||
1. A fresh agent instance is created for each sample
|
||||
2. Agent processes the sample input(s)
|
||||
3. Agent remains on the server after the sample completes
|
||||
|
||||
**Testing behavior:** Each sample is an independent, isolated test. Agent state (memory, message history) does not carry over between samples. This enables parallel execution and ensures reproducible results.
|
||||
|
||||
**Use cases:**
|
||||
- Testing how the agent responds to various independent inputs
|
||||
- Ensuring consistent behavior across different scenarios
|
||||
- Regression testing where each case should be isolated
|
||||
- Evaluating agent responses without prior context
|
||||
|
||||
**Example:** If you have 10 test cases, 10 separate agent instances will be created (one per test case), and they can run in parallel.
|
||||
|
||||
### With agent_id: Sequential Script Testing
|
||||
|
||||
**Agent lifecycle:**
|
||||
1. The same agent instance is used for all samples
|
||||
2. Agent processes each sample in sequence
|
||||
3. Agent state persists throughout the entire evaluation
|
||||
|
||||
**Testing behavior:** The dataset becomes a conversation script where each sample builds on previous ones. Agent memory and message history accumulate, and earlier interactions affect later responses. Samples must execute sequentially.
|
||||
|
||||
**Use cases:**
|
||||
- Testing multi-turn conversations with context
|
||||
- Evaluating how agent memory evolves over time
|
||||
- Simulating a single user session with multiple interactions
|
||||
- Testing scenarios where context should accumulate
|
||||
|
||||
**Example:** If you have 10 test cases, they all run against the same agent instance in order, with state carrying over between each test.
|
||||
|
||||
### Critical Differences
|
||||
|
||||
| Aspect | agent_file / agent_script | agent_id |
|
||||
|--------|---------------------------|----------|
|
||||
| **Agent instances** | New agent per sample | Same agent for all samples |
|
||||
| **State isolation** | Fully isolated | State carries over |
|
||||
| **Execution** | Can run in parallel | Must run sequentially |
|
||||
| **Memory** | Fresh for each sample | Accumulates across samples |
|
||||
| **Use case** | Independent test cases | Conversation scripts |
|
||||
| **Reproducibility** | Highly reproducible | Depends on execution order |
|
||||
|
||||
**Best practice:** Use `agent_file` or `agent_script` for most evaluations to ensure reproducible, isolated tests. Use `agent_id` only when you specifically need to test how agent state evolves across multiple interactions.
|
||||
|
||||
## Validation
|
||||
|
||||
The runner validates:
|
||||
- Exactly one of `agent_file`, `agent_id`, or `agent_script` is specified
|
||||
- Agent files have `.af` extension
|
||||
- Agent script paths are valid
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Suite YAML Reference](../configuration/suite-yaml.md) - Complete target configuration options
|
||||
- [Datasets](./datasets.md) - Using agent_args for sample-specific configuration
|
||||
- [Getting Started](../getting-started.md) - Complete tutorial with target examples
|
||||
@@ -1,329 +0,0 @@
|
||||
# Targets
|
||||
|
||||
A **target** is the agent you're evaluating. In Letta Evals, the target configuration determines how agents are created, accessed, and tested.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **Three ways to specify agents**: agent file (`.af`), existing agent ID, or programmatic creation script
|
||||
- **Critical distinction**: `agent_file`/`agent_script` create fresh agents per sample (isolated tests), while `agent_id` uses one agent for all samples (stateful conversation)
|
||||
- **Multi-model support**: Test the same agent configuration across different LLM models
|
||||
- **Flexible connection**: Connect to local Letta servers or Letta Cloud
|
||||
</Note>
|
||||
|
||||
**When to use each approach:**
|
||||
- `agent_file` - Pre-configured agents saved as `.af` files (most common)
|
||||
- `agent_id` - Testing existing agents or multi-turn conversations with state
|
||||
- `agent_script` - Dynamic agent creation with per-sample customization
|
||||
|
||||
The target configuration specifies how to create or access the agent for evaluation.
|
||||
|
||||
## Target Configuration
|
||||
|
||||
All targets have a `kind` field (currently only `agent` is supported):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent # Currently only "agent" is supported
|
||||
# ... agent-specific configuration
|
||||
```
|
||||
|
||||
## Agent Sources
|
||||
|
||||
You must specify exactly ONE of these:
|
||||
|
||||
### agent_file
|
||||
|
||||
Path to a `.af` (Agent File) to upload:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: path/to/agent.af # Path to .af file
|
||||
base_url: https://api.letta.com # Letta server URL
|
||||
```
|
||||
|
||||
The agent file will be uploaded to the Letta server and a new agent created for the evaluation.
|
||||
|
||||
### agent_id
|
||||
|
||||
ID of an existing agent on the server:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: agent-123-abc # ID of existing agent
|
||||
base_url: https://api.letta.com # Letta server URL
|
||||
```
|
||||
|
||||
<Warning>
|
||||
**Modifies agent in-place:** Using `agent_id` will modify your agent's state, memory, and message history during evaluation. The same agent instance is used for all samples, processing them sequentially. **Do not use production agents or agents you don't want to modify.** Use `agent_file` or `agent_script` for reproducible, isolated testing.
|
||||
</Warning>
|
||||
|
||||
### agent_script
|
||||
|
||||
Path to a Python script with an agent factory function for programmatic agent creation:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_script: create_agent.py:create_inventory_agent # script.py:function_name
|
||||
base_url: https://api.letta.com # Letta server URL
|
||||
```
|
||||
|
||||
Format: `path/to/script.py:function_name`
|
||||
|
||||
The function must be decorated with `@agent_factory` and have the signature `async (client: AsyncLetta, sample: Sample) -> str`:
|
||||
|
||||
```python
|
||||
from letta_client import AsyncLetta, CreateBlock
|
||||
from letta_evals.decorators import agent_factory
|
||||
from letta_evals.models import Sample
|
||||
|
||||
@agent_factory
|
||||
async def create_inventory_agent(client: AsyncLetta, sample: Sample) -> str:
|
||||
"""Create and return agent ID for this sample."""
|
||||
# Access custom arguments from the dataset
|
||||
item = sample.agent_args.get("item", {})
|
||||
|
||||
# Create agent with sample-specific configuration
|
||||
agent = await client.agents.create(
|
||||
name="inventory-assistant",
|
||||
memory_blocks=[
|
||||
CreateBlock(
|
||||
label="item_context",
|
||||
value=f"Item: {item.get('name', 'Unknown')}"
|
||||
)
|
||||
],
|
||||
agent_type="letta_v1_agent",
|
||||
model="openai/gpt-4.1-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
)
|
||||
|
||||
return agent.id
|
||||
```
|
||||
|
||||
**Key features:**
|
||||
- Creates a fresh agent for each sample
|
||||
- Can customize agents using `sample.agent_args` from the dataset
|
||||
- Allows testing agent creation logic itself
|
||||
- Useful when you don't have pre-saved agent files
|
||||
|
||||
**When to use:**
|
||||
- Testing agent creation workflows
|
||||
- Dynamic per-sample agent configuration
|
||||
- Agents that need sample-specific memory or tools
|
||||
- Programmatic agent testing
|
||||
|
||||
## Connection Configuration
|
||||
|
||||
### base_url
|
||||
|
||||
Letta server URL:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
base_url: http://localhost:8283 # Local Letta server
|
||||
# or
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
```
|
||||
|
||||
Default: `https://api.letta.com`
|
||||
|
||||
### api_key
|
||||
|
||||
API key for authentication (required for Letta Cloud):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
api_key: your-api-key-here # Required for Letta Cloud
|
||||
```
|
||||
|
||||
Or set via environment variable:
|
||||
```bash
|
||||
export LETTA_API_KEY=your-api-key-here
|
||||
```
|
||||
|
||||
### project_id
|
||||
|
||||
Letta project ID (for Letta Cloud):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
project_id: proj_abc123 # Letta Cloud project
|
||||
```
|
||||
|
||||
Or set via environment variable:
|
||||
```bash
|
||||
export LETTA_PROJECT_ID=proj_abc123
|
||||
```
|
||||
|
||||
### timeout
|
||||
|
||||
Request timeout in seconds:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
timeout: 300.0 # Request timeout (5 minutes)
|
||||
```
|
||||
|
||||
Default: 300 seconds
|
||||
|
||||
## Multi-Model Evaluation
|
||||
|
||||
Test the same agent across different models:
|
||||
|
||||
### model_configs
|
||||
|
||||
List of model configuration names from JSON files:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet] # Test with both models
|
||||
```
|
||||
|
||||
The evaluation will run once for each model config. Model configs are JSON files in `letta_evals/llm_model_configs/`.
|
||||
|
||||
### model_handles
|
||||
|
||||
List of model handles (cloud-compatible identifiers):
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"] # Cloud model identifiers
|
||||
```
|
||||
|
||||
Use this for Letta Cloud deployments.
|
||||
|
||||
<Warning>
|
||||
**Note**: You cannot specify both `model_configs` and `model_handles`.
|
||||
</Warning>
|
||||
|
||||
## Complete Examples
|
||||
|
||||
### Local Development
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: ./agents/my_agent.af # Pre-configured agent
|
||||
base_url: http://localhost:8283 # Local server
|
||||
```
|
||||
|
||||
### Letta Cloud
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: agent-cloud-123 # Existing cloud agent
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
api_key: ${LETTA_API_KEY} # From environment variable
|
||||
project_id: proj_abc # Your project ID
|
||||
```
|
||||
|
||||
### Multi-Model Testing
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af # Same agent configuration
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
model_configs: [gpt-4o-mini, gpt-4o, claude-3-5-sonnet] # Test 3 models
|
||||
```
|
||||
|
||||
Results will include per-model metrics:
|
||||
```
|
||||
Model: gpt-4o-mini - Avg: 0.85, Pass: 85.0%
|
||||
Model: gpt-4o - Avg: 0.92, Pass: 92.0%
|
||||
Model: claude-3-5-sonnet - Avg: 0.88, Pass: 88.0%
|
||||
```
|
||||
|
||||
### Programmatic Agent Creation
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_script: setup.py:CustomAgentFactory # Programmatic creation
|
||||
base_url: https://api.letta.com # Letta Cloud
|
||||
```
|
||||
|
||||
## Environment Variable Precedence
|
||||
|
||||
Configuration values are resolved in this order (highest priority first):
|
||||
|
||||
1. CLI arguments (`--api-key`, `--base-url`, `--project-id`)
|
||||
2. Suite YAML configuration
|
||||
3. Environment variables (`LETTA_API_KEY`, `LETTA_BASE_URL`, `LETTA_PROJECT_ID`)
|
||||
|
||||
## Agent Lifecycle and Testing Behavior
|
||||
|
||||
The way your agent is specified fundamentally changes how the evaluation runs:
|
||||
|
||||
### With agent_file or agent_script: Independent Testing
|
||||
|
||||
**Agent lifecycle:**
|
||||
1. A fresh agent instance is created for each sample
|
||||
2. Agent processes the sample input(s)
|
||||
3. Agent remains on the server after the sample completes
|
||||
|
||||
**Testing behavior:** Each sample is an independent, isolated test. Agent state (memory, message history) does not carry over between samples. This enables parallel execution and ensures reproducible results.
|
||||
|
||||
**Use cases:**
|
||||
- Testing how the agent responds to various independent inputs
|
||||
- Ensuring consistent behavior across different scenarios
|
||||
- Regression testing where each case should be isolated
|
||||
- Evaluating agent responses without prior context
|
||||
|
||||
<Note>
|
||||
**Example:** If you have 10 test cases, 10 separate agent instances will be created (one per test case), and they can run in parallel.
|
||||
</Note>
|
||||
|
||||
### With agent_id: Sequential Script Testing
|
||||
|
||||
**Agent lifecycle:**
|
||||
1. The same agent instance is used for all samples
|
||||
2. Agent processes each sample in sequence
|
||||
3. Agent state persists throughout the entire evaluation
|
||||
|
||||
**Testing behavior:** The dataset becomes a conversation script where each sample builds on previous ones. Agent memory and message history accumulate, and earlier interactions affect later responses. Samples must execute sequentially.
|
||||
|
||||
**Use cases:**
|
||||
- Testing multi-turn conversations with context
|
||||
- Evaluating how agent memory evolves over time
|
||||
- Simulating a single user session with multiple interactions
|
||||
- Testing scenarios where context should accumulate
|
||||
|
||||
<Note>
|
||||
**Example:** If you have 10 test cases, they all run against the same agent instance in order, with state carrying over between each test.
|
||||
</Note>
|
||||
|
||||
### Critical Differences
|
||||
|
||||
| Aspect | agent_file / agent_script | agent_id |
|
||||
|--------|---------------------------|----------|
|
||||
| **Agent instances** | New agent per sample | Same agent for all samples |
|
||||
| **State isolation** | Fully isolated | State carries over |
|
||||
| **Execution** | Can run in parallel | Must run sequentially |
|
||||
| **Memory** | Fresh for each sample | Accumulates across samples |
|
||||
| **Use case** | Independent test cases | Conversation scripts |
|
||||
| **Reproducibility** | Highly reproducible | Depends on execution order |
|
||||
|
||||
<Tip>
|
||||
**Best practice:** Use `agent_file` or `agent_script` for most evaluations to ensure reproducible, isolated tests. Use `agent_id` only when you specifically need to test how agent state evolves across multiple interactions.
|
||||
</Tip>
|
||||
|
||||
## Validation
|
||||
|
||||
The runner validates:
|
||||
- Exactly one of `agent_file`, `agent_id`, or `agent_script` is specified
|
||||
- Agent files have `.af` extension
|
||||
- Agent script paths are valid
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Suite YAML Reference](/guides/evals/configuration/suite-yaml) - Complete target configuration options
|
||||
- [Datasets](/guides/evals/concepts/datasets) - Using agent_args for sample-specific configuration
|
||||
- [Getting Started](/guides/evals/getting-started) - Complete tutorial with target examples
|
||||
@@ -1,783 +0,0 @@
|
||||
# Suite YAML Reference
|
||||
|
||||
Complete reference for suite configuration files.
|
||||
|
||||
A **suite** is a YAML file that defines an evaluation: what agent to test, what dataset to use, how to grade responses, and what criteria determine pass/fail. This is your evaluation specification.
|
||||
|
||||
**Quick overview:**
|
||||
- **name**: Identifier for your evaluation
|
||||
- **dataset**: JSONL file with test cases
|
||||
- **target**: Which agent to evaluate (via file, ID, or script)
|
||||
- **graders**: How to score responses (tool or rubric graders)
|
||||
- **gate**: Pass/fail criteria
|
||||
|
||||
See [Getting Started](../getting-started.md) for a tutorial, or [Core Concepts](../concepts/suites.md) for a conceptual overview.
|
||||
|
||||
## File Structure
|
||||
|
||||
```yaml
|
||||
name: string (required)
|
||||
description: string (optional)
|
||||
dataset: path (required)
|
||||
max_samples: integer (optional)
|
||||
sample_tags: array (optional)
|
||||
num_runs: integer (optional)
|
||||
setup_script: string (optional)
|
||||
|
||||
target: object (required)
|
||||
kind: "agent"
|
||||
base_url: string
|
||||
api_key: string
|
||||
timeout: float
|
||||
project_id: string
|
||||
agent_id: string (one of: agent_id, agent_file, agent_script)
|
||||
agent_file: path
|
||||
agent_script: string
|
||||
model_configs: array
|
||||
model_handles: array
|
||||
|
||||
graders: object (required)
|
||||
<metric_key>: object
|
||||
kind: "tool" | "rubric"
|
||||
display_name: string
|
||||
extractor: string
|
||||
extractor_config: object
|
||||
# Tool grader fields
|
||||
function: string
|
||||
# Rubric grader fields (LLM API)
|
||||
prompt: string
|
||||
prompt_path: path
|
||||
model: string
|
||||
temperature: float
|
||||
provider: string
|
||||
max_retries: integer
|
||||
timeout: float
|
||||
rubric_vars: array
|
||||
# Rubric grader fields (agent-as-judge)
|
||||
agent_file: path
|
||||
judge_tool_name: string
|
||||
|
||||
gate: object (required)
|
||||
metric_key: string
|
||||
metric: "avg_score" | "accuracy"
|
||||
op: "gte" | "gt" | "lte" | "lt" | "eq"
|
||||
value: float
|
||||
pass_op: "gte" | "gt" | "lte" | "lt" | "eq"
|
||||
pass_value: float
|
||||
```
|
||||
|
||||
## Top-Level Fields
|
||||
|
||||
### name (required)
|
||||
|
||||
Suite name, used in output and results.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
name: question-answering-eval
|
||||
```
|
||||
|
||||
### description (optional)
|
||||
|
||||
Human-readable description of what the suite tests.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
description: Tests agent's ability to answer factual questions accurately
|
||||
```
|
||||
|
||||
### dataset (required)
|
||||
|
||||
Path to JSONL dataset file. Relative paths are resolved from the suite YAML location.
|
||||
|
||||
**Type**: path (string)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
dataset: ./datasets/qa.jsonl
|
||||
dataset: /absolute/path/to/dataset.jsonl  # or, alternatively, an absolute path
|
||||
```
|
||||
|
||||
### max_samples (optional)
|
||||
|
||||
Limit the number of samples to evaluate. Useful for quick tests.
|
||||
|
||||
**Type**: integer
|
||||
|
||||
**Default**: All samples
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
max_samples: 10 # Only evaluate first 10 samples
|
||||
```
|
||||
|
||||
### sample_tags (optional)
|
||||
|
||||
Filter samples by tags. Only samples with ALL specified tags are evaluated.
|
||||
|
||||
**Type**: array of strings
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
sample_tags: [math, easy] # Only samples tagged with both
|
||||
```
|
||||
|
||||
Dataset samples need tags:
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4", "tags": ["math", "easy"]}
|
||||
```
|
||||
|
||||
### num_runs (optional)
|
||||
|
||||
Number of times to run the evaluation suite. Useful for testing non-deterministic behavior or collecting multiple runs for statistical analysis.
|
||||
|
||||
**Type**: integer
|
||||
|
||||
**Default**: 1
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
num_runs: 5 # Run the evaluation 5 times
|
||||
```
|
||||
|
||||
### setup_script (optional)
|
||||
|
||||
Path to a Python script containing a setup function.
|
||||
|
||||
**Type**: string (format: `path/to/script.py:function_name`)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
setup_script: setup.py:prepare_environment
|
||||
```
|
||||
|
||||
The function signature:
|
||||
```python
|
||||
def prepare_environment(suite: SuiteSpec) -> None:
|
||||
# Setup code
|
||||
pass
|
||||
```
|
||||
|
||||
## target (required)
|
||||
|
||||
Configuration for the agent being evaluated.
|
||||
|
||||
### kind (required)
|
||||
|
||||
Type of target. Currently only `"agent"` is supported.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
```
|
||||
|
||||
### base_url (optional)
|
||||
|
||||
Letta server URL.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Default**: `http://localhost:8283`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
base_url: http://localhost:8283
|
||||
# or
|
||||
base_url: https://api.letta.com
|
||||
```
|
||||
|
||||
### api_key (optional)
|
||||
|
||||
API key for Letta authentication. Can also be set via `LETTA_API_KEY` environment variable.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
api_key: your-api-key-here
|
||||
```
|
||||
|
||||
### timeout (optional)
|
||||
|
||||
Request timeout in seconds.
|
||||
|
||||
**Type**: float
|
||||
|
||||
**Default**: 300.0
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
timeout: 600.0 # 10 minutes
|
||||
```
|
||||
|
||||
### project_id (optional)
|
||||
|
||||
Letta project ID (for Letta Cloud).
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
project_id: proj_abc123
|
||||
```
|
||||
|
||||
### Agent Source (required, pick one)
|
||||
|
||||
Exactly one of these must be specified:
|
||||
|
||||
#### agent_id
|
||||
|
||||
ID of an existing agent on the server.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
agent_id: agent-123-abc
|
||||
```
|
||||
|
||||
#### agent_file
|
||||
|
||||
Path to `.af` agent file.
|
||||
|
||||
**Type**: path (string, must end in `.af`)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
agent_file: ./agents/my_agent.af
|
||||
```
|
||||
|
||||
#### agent_script
|
||||
|
||||
Path to Python script with agent factory.
|
||||
|
||||
**Type**: string (format: `path/to/script.py:ClassName`)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
agent_script: factory.py:MyAgentFactory
|
||||
```
|
||||
|
||||
See [Targets](../concepts/targets.md) for details on agent sources.
|
||||
|
||||
### model_configs (optional)
|
||||
|
||||
List of model configuration names to test. Cannot be used with `model_handles`.
|
||||
|
||||
**Type**: array of strings
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet]
|
||||
```
|
||||
|
||||
### model_handles (optional)
|
||||
|
||||
List of model handles for cloud deployments. Cannot be used with `model_configs`.
|
||||
|
||||
**Type**: array of strings
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
target:
|
||||
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"]
|
||||
```
|
||||
|
||||
## graders (required)
|
||||
|
||||
One or more graders, each with a unique key.
|
||||
|
||||
### Grader Key
|
||||
|
||||
The key becomes the metric name:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy: # This is the metric_key
|
||||
kind: tool
|
||||
...
|
||||
quality: # Another metric_key
|
||||
kind: rubric
|
||||
...
|
||||
```
|
||||
|
||||
### kind (required)
|
||||
|
||||
Grader type: `"tool"` or `"rubric"`.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
```
|
||||
|
||||
### display_name (optional)
|
||||
|
||||
Human-friendly name for CLI/UI output.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
acc:
|
||||
display_name: "Answer Accuracy"
|
||||
kind: tool
|
||||
...
|
||||
```
|
||||
|
||||
### extractor (required)
|
||||
|
||||
Name of the extractor to use.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### extractor_config (optional)
|
||||
|
||||
Configuration passed to the extractor.
|
||||
|
||||
**Type**: object
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
extractor: pattern
|
||||
extractor_config:
|
||||
pattern: 'Answer: (.*)'
|
||||
group: 1
|
||||
```
|
||||
|
||||
### Tool Grader Fields
|
||||
|
||||
#### function (required for tool graders)
|
||||
|
||||
Name of the grading function.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
```
|
||||
|
||||
### Rubric Grader Fields
|
||||
|
||||
#### prompt (required if no prompt_path)
|
||||
|
||||
Inline rubric prompt.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Evaluate response quality from 0.0 to 1.0.
|
||||
Input: {input}
|
||||
Response: {submission}
|
||||
```
|
||||
|
||||
#### prompt_path (required if no prompt)
|
||||
|
||||
Path to rubric file. Cannot use both `prompt` and `prompt_path`.
|
||||
|
||||
**Type**: path (string)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubrics/quality.txt
|
||||
```
|
||||
|
||||
#### model (optional)
|
||||
|
||||
LLM model for judging.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Default**: `gpt-4o-mini`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
model: gpt-4o
|
||||
```
|
||||
|
||||
#### temperature (optional)
|
||||
|
||||
Temperature for LLM generation.
|
||||
|
||||
**Type**: float (0.0 to 2.0)
|
||||
|
||||
**Default**: 0.0
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
temperature: 0.0
|
||||
```
|
||||
|
||||
#### provider (optional)
|
||||
|
||||
LLM provider.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Default**: `openai`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
provider: openai
|
||||
```
|
||||
|
||||
#### max_retries (optional)
|
||||
|
||||
Maximum retry attempts for API calls.
|
||||
|
||||
**Type**: integer
|
||||
|
||||
**Default**: 5
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
max_retries: 3
|
||||
```
|
||||
|
||||
#### timeout (optional)
|
||||
|
||||
Timeout for API calls in seconds.
|
||||
|
||||
**Type**: float
|
||||
|
||||
**Default**: 120.0
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
timeout: 60.0
|
||||
```
|
||||
|
||||
#### rubric_vars (optional)
|
||||
|
||||
List of custom variable names that must be provided in the dataset for rubric template substitution. When specified, the grader validates that each sample includes these variables in its `rubric_vars` field.
|
||||
|
||||
**Type**: array of strings
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
code_quality:
|
||||
kind: rubric
|
||||
rubric_vars: [reference_code, required_features] # Require these variables in dataset
|
||||
prompt: |
|
||||
Compare the submission to this reference:
|
||||
{reference_code}
|
||||
|
||||
Required features: {required_features}
|
||||
```
|
||||
|
||||
Dataset sample must provide these variables:
|
||||
```jsonl
|
||||
{"input": "Write a fibonacci function", "rubric_vars": {"reference_code": "def fib(n):\n if n <= 1: return n\n return fib(n-1) + fib(n-2)", "required_features": "recursion, base case"}}
|
||||
```
|
||||
|
||||
See [Datasets - rubric_vars](../concepts/datasets.md#rubric_vars) for details.
|
||||
|
||||
#### agent_file (required for agent-as-judge)
|
||||
|
||||
Path to `.af` agent file to use as judge for rubric grading. Use this instead of `model` when you want a Letta agent to act as the evaluator.
|
||||
|
||||
**Type**: path (string)
|
||||
|
||||
**Mutually exclusive with**: `model`, `temperature`, `provider`, `max_retries`, `timeout`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric
|
||||
agent_file: judge.af # Judge agent with submit_grade tool
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Requirements**: The judge agent must have a tool with signature `submit_grade(score: float, rationale: str)`. The framework validates this on initialization.
|
||||
|
||||
See [Rubric Graders - Agent-as-Judge](../graders/rubric-graders.md#agent-as-judge) for complete documentation.
|
||||
|
||||
#### judge_tool_name (optional, for agent-as-judge)
|
||||
|
||||
Name of the tool that the judge agent uses to submit scores. Only applicable when using `agent_file`.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Default**: `submit_grade`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric
|
||||
agent_file: judge.af
|
||||
judge_tool_name: submit_grade # Default, can be omitted
|
||||
prompt_path: rubric.txt
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Tool requirements**: The tool must have exactly two parameters:
|
||||
- `score: float` - Score between 0.0 and 1.0
|
||||
- `rationale: str` - Explanation of the score
|
||||
|
||||
## gate (required)
|
||||
|
||||
Pass/fail criteria for the evaluation.
|
||||
|
||||
### metric_key (optional)
|
||||
|
||||
Which grader to evaluate. If there is only one grader, this can be omitted.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Must match a key in graders
|
||||
```
|
||||
|
||||
### metric (optional)
|
||||
|
||||
Which aggregate to compare: `avg_score` or `accuracy`.
|
||||
|
||||
**Type**: string
|
||||
|
||||
**Default**: `avg_score`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
gate:
|
||||
metric: avg_score
|
||||
# or
|
||||
metric: accuracy
|
||||
```
|
||||
|
||||
### op (required)
|
||||
|
||||
Comparison operator.
|
||||
|
||||
**Type**: string (one of: `gte`, `gt`, `lte`, `lt`, `eq`)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
gate:
|
||||
op: gte # Greater than or equal
|
||||
```
|
||||
|
||||
### value (required)
|
||||
|
||||
Threshold value for comparison.
|
||||
|
||||
**Type**: float (0.0 to 1.0)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
gate:
|
||||
value: 0.8 # Require >= 0.8
|
||||
```
|
||||
|
||||
### pass_op (optional)
|
||||
|
||||
Comparison operator for per-sample pass criteria.
|
||||
|
||||
**Type**: string (one of: `gte`, `gt`, `lte`, `lt`, `eq`)
|
||||
|
||||
**Default**: Same as `op`
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
gate:
|
||||
metric: accuracy
|
||||
pass_op: gte # Sample passes if...
|
||||
pass_value: 0.7 # ...score >= 0.7
|
||||
```
|
||||
|
||||
### pass_value (optional)
|
||||
|
||||
Threshold for per-sample pass.
|
||||
|
||||
**Type**: float (0.0 to 1.0)
|
||||
|
||||
**Default**: Same as `value` (or 1.0 for accuracy metric)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
gate:
|
||||
metric: accuracy
|
||||
op: gte
|
||||
value: 0.8 # 80% must pass
|
||||
pass_op: gte
|
||||
pass_value: 0.7 # Sample passes if score >= 0.7
|
||||
```
|
||||
|
||||
## Complete Examples
|
||||
|
||||
### Minimal Suite
|
||||
|
||||
```yaml
|
||||
name: basic-eval
|
||||
dataset: dataset.jsonl
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
op: gte
|
||||
value: 0.8
|
||||
```
|
||||
|
||||
### Multi-Metric Suite
|
||||
|
||||
```yaml
|
||||
name: comprehensive-eval
|
||||
description: Tests accuracy and quality
|
||||
dataset: test_data.jsonl
|
||||
max_samples: 100
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
base_url: http://localhost:8283
|
||||
|
||||
graders:
|
||||
accuracy:
|
||||
display_name: "Answer Accuracy"
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
quality:
|
||||
display_name: "Response Quality"
|
||||
kind: rubric
|
||||
prompt_path: rubrics/quality.txt
|
||||
model: gpt-4o-mini
|
||||
temperature: 0.0
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy
|
||||
metric: avg_score
|
||||
op: gte
|
||||
value: 0.85
|
||||
```
|
||||
|
||||
### Advanced Suite
|
||||
|
||||
```yaml
|
||||
name: advanced-eval
|
||||
description: Multi-model, multi-metric evaluation
|
||||
dataset: comprehensive_tests.jsonl
|
||||
sample_tags: [production]
|
||||
setup_script: setup.py:prepare
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_script: factory.py:CustomFactory
|
||||
base_url: https://api.letta.com
|
||||
api_key: ${LETTA_API_KEY}
|
||||
project_id: proj_abc123
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet]
|
||||
|
||||
graders:
|
||||
answer:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
tool_usage:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
|
||||
memory:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block
|
||||
extractor_config:
|
||||
block_label: human
|
||||
|
||||
gate:
|
||||
metric_key: answer
|
||||
metric: accuracy
|
||||
op: gte
|
||||
value: 0.9
|
||||
pass_op: gte
|
||||
pass_value: 1.0
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
Validate your suite before running:
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Targets](../concepts/targets.md) - Understanding agent sources and configuration
|
||||
- [Graders](../concepts/graders.md) - Tool graders vs rubric graders
|
||||
- [Extractors](../concepts/extractors.md) - What to extract from agent responses
|
||||
- [Gates](../concepts/gates.md) - Setting pass/fail criteria
|
||||
@@ -1,427 +0,0 @@
|
||||
# Suite YAML Reference
|
||||
|
||||
Complete reference for suite configuration files.
|
||||
|
||||
A **suite** is a YAML file that defines an evaluation: what agent to test, what dataset to use, how to grade responses, and what criteria determine pass/fail. This is your evaluation specification.
|
||||
|
||||
<Note>
|
||||
**Quick overview:**
|
||||
- **name**: Identifier for your evaluation
|
||||
- **dataset**: JSONL file with test cases
|
||||
- **target**: Which agent to evaluate (via file, ID, or script)
|
||||
- **graders**: How to score responses (tool or rubric graders)
|
||||
- **gate**: Pass/fail criteria
|
||||
</Note>
|
||||
|
||||
See [Getting Started](/guides/evals/getting-started) for a tutorial, or [Core Concepts](/guides/evals/concepts/suites) for a conceptual overview.
|
||||
|
||||
## File Structure
|
||||
|
||||
```yaml
|
||||
name: string (required)
|
||||
description: string (optional)
|
||||
dataset: path (required)
|
||||
max_samples: integer (optional)
|
||||
sample_tags: array (optional)
|
||||
num_runs: integer (optional)
|
||||
setup_script: string (optional)
|
||||
|
||||
target: object (required)
|
||||
kind: "agent"
|
||||
base_url: string
|
||||
api_key: string
|
||||
timeout: float
|
||||
project_id: string
|
||||
agent_id: string (one of: agent_id, agent_file, agent_script)
|
||||
agent_file: path
|
||||
agent_script: string
|
||||
model_configs: array
|
||||
model_handles: array
|
||||
|
||||
graders: object (required)
|
||||
<metric_key>: object
|
||||
kind: "tool" | "rubric"
|
||||
display_name: string
|
||||
extractor: string
|
||||
extractor_config: object
|
||||
# Tool grader fields
|
||||
function: string
|
||||
# Rubric grader fields (LLM API)
|
||||
prompt: string
|
||||
prompt_path: path
|
||||
model: string
|
||||
temperature: float
|
||||
provider: string
|
||||
max_retries: integer
|
||||
timeout: float
|
||||
rubric_vars: array
|
||||
# Rubric grader fields (agent-as-judge)
|
||||
agent_file: path
|
||||
judge_tool_name: string
|
||||
|
||||
gate: object (required)
|
||||
metric_key: string
|
||||
metric: "avg_score" | "accuracy"
|
||||
op: "gte" | "gt" | "lte" | "lt" | "eq"
|
||||
value: float
|
||||
pass_op: "gte" | "gt" | "lte" | "lt" | "eq"
|
||||
pass_value: float
|
||||
```
|
||||
|
||||
## Top-Level Fields
|
||||
|
||||
### name (required)
|
||||
|
||||
Suite name, used in output and results.
|
||||
|
||||
**Type**: string
|
||||
|
||||
```yaml
|
||||
name: question-answering-eval
|
||||
```
|
||||
|
||||
### description (optional)
|
||||
|
||||
Human-readable description of what the suite tests.
|
||||
|
||||
**Type**: string
|
||||
|
||||
```yaml
|
||||
description: Tests agent's ability to answer factual questions accurately
|
||||
```
|
||||
|
||||
### dataset (required)
|
||||
|
||||
Path to JSONL dataset file. Relative paths are resolved from the suite YAML location.
|
||||
|
||||
**Type**: path (string)
|
||||
|
||||
```yaml
|
||||
dataset: ./datasets/qa.jsonl
|
||||
dataset: /absolute/path/to/dataset.jsonl  # or, alternatively, an absolute path
|
||||
```
|
||||
|
||||
### max_samples (optional)
|
||||
|
||||
Limit the number of samples to evaluate. Useful for quick tests.
|
||||
|
||||
**Type**: integer | **Default**: All samples
|
||||
|
||||
```yaml
|
||||
max_samples: 10 # Only evaluate first 10 samples
|
||||
```
|
||||
|
||||
### sample_tags (optional)
|
||||
|
||||
Filter samples by tags. Only samples with ALL specified tags are evaluated.
|
||||
|
||||
**Type**: array of strings
|
||||
|
||||
```yaml
|
||||
sample_tags: [math, easy] # Only samples tagged with both
|
||||
```
|
||||
|
||||
### num_runs (optional)
|
||||
|
||||
Number of times to run the evaluation suite.
|
||||
|
||||
**Type**: integer | **Default**: 1
|
||||
|
||||
```yaml
|
||||
num_runs: 5 # Run the evaluation 5 times
|
||||
```
|
||||
|
||||
### setup_script (optional)
|
||||
|
||||
Path to a Python script containing a setup function.
|
||||
|
||||
**Type**: string (format: `path/to/script.py:function_name`)
|
||||
|
||||
```yaml
|
||||
setup_script: setup.py:prepare_environment
|
||||
```
|
||||
|
||||
## target (required)
|
||||
|
||||
Configuration for the agent being evaluated.
|
||||
|
||||
### kind (required)
|
||||
|
||||
Type of target. Currently only `"agent"` is supported.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
```
|
||||
|
||||
### base_url (optional)
|
||||
|
||||
Letta server URL. **Default**: `https://api.letta.com`
|
||||
|
||||
```yaml
|
||||
target:
|
||||
base_url: https://api.letta.com
|
||||
# or
|
||||
base_url: http://localhost:8283
|
||||
```
|
||||
|
||||
### api_key (optional)
|
||||
|
||||
API key for Letta authentication. Can also be set via `LETTA_API_KEY` environment variable.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
api_key: your-api-key-here
|
||||
```
|
||||
|
||||
### timeout (optional)
|
||||
|
||||
Request timeout in seconds. **Default**: 300.0
|
||||
|
||||
```yaml
|
||||
target:
|
||||
timeout: 600.0 # 10 minutes
|
||||
```
|
||||
|
||||
### Agent Source (required, pick one)
|
||||
|
||||
Exactly one of these must be specified:
|
||||
|
||||
#### agent_id
|
||||
|
||||
ID of an existing agent on the server.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
agent_id: agent-123-abc
|
||||
```
|
||||
|
||||
#### agent_file
|
||||
|
||||
Path to `.af` agent file.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
agent_file: ./agents/my_agent.af
|
||||
```
|
||||
|
||||
#### agent_script
|
||||
|
||||
Path to Python script with agent factory.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
agent_script: factory.py:MyAgentFactory
|
||||
```
|
||||
|
||||
See [Targets](/guides/evals/concepts/targets) for details on agent sources.
|
||||
|
||||
### model_configs (optional)
|
||||
|
||||
List of model configuration names to test. Cannot be used with `model_handles`.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
model_configs: [gpt-4o-mini, claude-3-5-sonnet]
|
||||
```
|
||||
|
||||
### model_handles (optional)
|
||||
|
||||
List of model handles for cloud deployments. Cannot be used with `model_configs`.
|
||||
|
||||
```yaml
|
||||
target:
|
||||
model_handles: ["openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet"]
|
||||
```
|
||||
|
||||
## graders (required)
|
||||
|
||||
One or more graders, each with a unique key.
|
||||
|
||||
### kind (required)
|
||||
|
||||
Grader type: `"tool"` or `"rubric"`.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
```
|
||||
|
||||
### extractor (required)
|
||||
|
||||
Name of the extractor to use.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Tool Grader Fields
|
||||
|
||||
#### function (required for tool graders)
|
||||
|
||||
Name of the grading function.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
```
|
||||
|
||||
### Rubric Grader Fields
|
||||
|
||||
#### prompt or prompt_path (required)
|
||||
|
||||
Inline rubric prompt or path to rubric file.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Evaluate response quality from 0.0 to 1.0.
|
||||
```
|
||||
|
||||
#### model (optional)
|
||||
|
||||
LLM model for judging. **Default**: `gpt-4o-mini`
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
model: gpt-4o
|
||||
```
|
||||
|
||||
#### temperature (optional)
|
||||
|
||||
Temperature for LLM generation. **Default**: 0.0
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
temperature: 0.0
|
||||
```
|
||||
|
||||
#### agent_file (agent-as-judge)
|
||||
|
||||
Path to `.af` agent file to use as judge.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric
|
||||
agent_file: judge.af
|
||||
prompt_path: rubric.txt
|
||||
```
|
||||
|
||||
## gate (required)
|
||||
|
||||
Pass/fail criteria for the evaluation.
|
||||
|
||||
### metric_key (optional)
|
||||
|
||||
Which grader to evaluate. If there is only one grader, this can be omitted.
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy
|
||||
```
|
||||
|
||||
### metric (optional)
|
||||
|
||||
Which aggregate to compare: `avg_score` or `accuracy`. **Default**: `avg_score`
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric: avg_score
|
||||
```
|
||||
|
||||
### op (required)
|
||||
|
||||
Comparison operator: `gte`, `gt`, `lte`, `lt`, `eq`
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
op: gte # Greater than or equal
|
||||
```
|
||||
|
||||
### value (required)
|
||||
|
||||
Threshold value for comparison (0.0 to 1.0).
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
value: 0.8 # Require >= 0.8
|
||||
```
|
||||
|
||||
## Complete Examples
|
||||
|
||||
### Minimal Suite
|
||||
|
||||
```yaml
|
||||
name: basic-eval
|
||||
dataset: dataset.jsonl
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
op: gte
|
||||
value: 0.8
|
||||
```
|
||||
|
||||
### Multi-Metric Suite
|
||||
|
||||
```yaml
|
||||
name: comprehensive-eval
|
||||
description: Tests accuracy and quality
|
||||
dataset: test_data.jsonl
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubrics/quality.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy
|
||||
op: gte
|
||||
value: 0.85
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
Validate your suite before running:
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Targets](/guides/evals/concepts/targets) - Understanding agent sources and configuration
|
||||
- [Graders](/guides/evals/concepts/graders) - Tool graders vs rubric graders
|
||||
- [Extractors](/guides/evals/concepts/extractors) - What to extract from agent responses
|
||||
- [Gates](/guides/evals/concepts/gates) - Setting pass/fail criteria
|
||||
@@ -1,218 +0,0 @@
|
||||
# Built-in Extractors Reference
|
||||
|
||||
Letta Evals provides a set of built-in extractors that cover the most common extraction needs. These extractors let you pull specific content from agent conversations without writing any custom code.
|
||||
|
||||
**What are extractors?** Extractors determine what part of an agent's response gets evaluated. They take the full conversation trajectory (all messages, tool calls, and state changes) and extract just the piece you want to grade.
|
||||
|
||||
**Common use cases:**
|
||||
- Extract the agent's final answer (`last_assistant`)
|
||||
- Check what tools were called and with what arguments (`tool_arguments`)
|
||||
- Verify memory was updated correctly (`memory_block`)
|
||||
- Parse structured output with regex (`pattern`)
|
||||
- Get all assistant messages from a conversation (`all_assistant`)
|
||||
|
||||
**Quick example:**
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant # Extract final response
|
||||
```
|
||||
|
||||
Each extractor below can be used with any grader by specifying it in your suite YAML. For custom extraction logic, see [Custom Extractors](./custom.md).
|
||||
|
||||
## `last_assistant`
|
||||
|
||||
Extracts the last assistant message content.
|
||||
|
||||
**Configuration**: None required
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Use case**: Most common - get the agent's final response
|
||||
|
||||
**Output**: Content of the last assistant message
|
||||
|
||||
## `first_assistant`
|
||||
|
||||
Extracts the first assistant message content.
|
||||
|
||||
**Configuration**: None required
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: first_assistant
|
||||
```
|
||||
|
||||
**Use case**: Test immediate responses before tool usage
|
||||
|
||||
**Output**: Content of the first assistant message
|
||||
|
||||
## `all_assistant`
|
||||
|
||||
Concatenates all assistant messages with a separator.
|
||||
|
||||
**Configuration**:
|
||||
- `separator` (optional): String to join messages (default: `"\n"`)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: all_assistant # Get all agent messages
|
||||
extractor_config:
|
||||
separator: "\n\n" # Separate with double newlines
|
||||
```
|
||||
|
||||
**Use case**: Evaluate complete conversation context
|
||||
|
||||
**Output**: All assistant messages joined by separator
|
||||
|
||||
## `last_turn`
|
||||
|
||||
Extracts all assistant messages from the last conversation turn.
|
||||
|
||||
**Configuration**:
|
||||
- `separator` (optional): String to join messages (default: `"\n"`)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: last_turn # Get messages from final turn
|
||||
extractor_config:
|
||||
separator: " " # Join with spaces
|
||||
```
|
||||
|
||||
**Use case**: When agent makes multiple statements in final turn
|
||||
|
||||
**Output**: Assistant messages from last turn joined by separator
|
||||
|
||||
## `pattern`
|
||||
|
||||
Extracts content matching a regex pattern.
|
||||
|
||||
**Configuration**:
|
||||
- `pattern` (required): Regex pattern to match
|
||||
- `group` (optional): Capture group to extract (default: 0)
|
||||
- `search_all` (optional): Find all matches vs first match (default: false)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: pattern # Extract using regex
|
||||
extractor_config:
|
||||
pattern: 'Result: (\d+)' # Match "Result: " followed by digits
|
||||
group: 1 # Extract just the number (capture group 1)
|
||||
```
|
||||
|
||||
**Use case**: Extract structured content (numbers, codes, formatted output)
|
||||
|
||||
**Output**: Matched pattern or capture group
|
||||
|
||||
## `tool_arguments`
|
||||
|
||||
Extracts arguments from a specific tool call.
|
||||
|
||||
**Configuration**:
|
||||
- `tool_name` (required): Name of the tool to extract from
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: tool_arguments # Extract tool call arguments
|
||||
extractor_config:
|
||||
tool_name: search # Get arguments from "search" tool
|
||||
```
|
||||
|
||||
**Use case**: Validate tool was called with correct arguments
|
||||
|
||||
**Output**: JSON string of tool arguments
|
||||
|
||||
Example output: `{"query": "pandas", "limit": 10}`
|
||||
|
||||
## `tool_output`
|
||||
|
||||
Extracts the return value from a specific tool call.
|
||||
|
||||
**Configuration**:
|
||||
- `tool_name` (required): Name of the tool whose output to extract
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: tool_output # Extract tool return value
|
||||
extractor_config:
|
||||
tool_name: search # Get return value from "search" tool
|
||||
```
|
||||
|
||||
**Use case**: Check tool return values
|
||||
|
||||
**Output**: Tool return value as string
|
||||
|
||||
## `after_marker`
|
||||
|
||||
Extracts content after a specific marker string.
|
||||
|
||||
**Configuration**:
|
||||
- `marker` (required): String marker to search for
|
||||
- `include_marker` (optional): Include marker in output (default: false)
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: after_marker # Extract content after a marker
|
||||
extractor_config:
|
||||
marker: "ANSWER:" # Find this marker in the response
|
||||
include_marker: false # Don't include "ANSWER:" in output
|
||||
```
|
||||
|
||||
**Use case**: Extract structured responses with markers
|
||||
|
||||
**Output**: Content after the marker
|
||||
|
||||
Example: From "Analysis... ANSWER: Paris", extracts "Paris"
|
||||
|
||||
## `memory_block`
|
||||
|
||||
Extracts content from a specific memory block.
|
||||
|
||||
**Configuration**:
|
||||
- `block_label` (required): Label of the memory block
|
||||
|
||||
**Example**:
|
||||
```yaml
|
||||
extractor: memory_block # Extract from agent memory
|
||||
extractor_config:
|
||||
block_label: human # Get content from "human" memory block
|
||||
```
|
||||
|
||||
**Use case**: Validate agent memory updates
|
||||
|
||||
**Output**: Content of the specified memory block
|
||||
|
||||
**Important**: This extractor requires `agent_state`, which adds overhead. The runner automatically fetches it when needed.
|
||||
|
||||
## Quick Reference Table
|
||||
|
||||
| Extractor | Config Required | Use Case | Agent State? |
|
||||
|-----------|----------------|----------|--------------|
|
||||
| `last_assistant` | No | Final response | No |
|
||||
| `first_assistant` | No | Initial response | No |
|
||||
| `all_assistant` | Optional | Full conversation | No |
|
||||
| `last_turn` | Optional | Final turn messages | No |
|
||||
| `pattern` | Yes | Regex extraction | No |
|
||||
| `tool_arguments` | Yes | Tool call args | No |
|
||||
| `tool_output` | Yes | Tool return value | No |
|
||||
| `after_marker` | Yes | Marker-based extraction | No |
|
||||
| `memory_block` | Yes | Memory content | Yes |
|
||||
|
||||
## Listing Extractors
|
||||
|
||||
See all available extractors:
|
||||
|
||||
```bash
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Custom Extractors](./custom.md) - Write your own extraction logic
|
||||
- [Core Concepts: Extractors](../concepts/extractors.md) - How extractors work in the evaluation flow
|
||||
- [Graders](../concepts/graders.md) - Using extractors with graders
|
||||
@@ -1,409 +0,0 @@
|
||||
# Custom Extractors
|
||||
|
||||
Create your own extractors to pull exactly what you need from agent trajectories.
|
||||
|
||||
While built-in extractors cover common cases (last assistant message, tool arguments, memory blocks), custom extractors let you implement specialized extraction logic for your specific use case.
|
||||
|
||||
## Why Custom Extractors?
|
||||
|
||||
Use custom extractors when you need to:
|
||||
- **Extract structured data**: Parse JSON fields from agent responses
|
||||
- **Filter specific patterns**: Extract code blocks, URLs, or formatted content
|
||||
- **Combine data sources**: Merge information from multiple messages or memory blocks
|
||||
- **Count occurrences**: Track how many times something happened in the conversation
|
||||
- **Complex logic**: Implement domain-specific extraction that built-ins can't handle
|
||||
|
||||
**Example**: You want to test if your agent correctly stores fruit preferences in memory using the `memory_insert` tool. A custom extractor can grab the tool call arguments, and a custom grader can verify the fruit name is in the right memory block.
|
||||
|
||||
## Quick Example
|
||||
|
||||
Here's a real custom extractor that pulls `memory_insert` tool call arguments:
|
||||
|
||||
```python
|
||||
from typing import List
|
||||
from letta_client import LettaMessageUnion, ToolCallMessage
|
||||
from letta_evals.decorators import extractor
|
||||
|
||||
@extractor
|
||||
def memory_insert_extractor(trajectory: List[List[LettaMessageUnion]], config: dict) -> str:
|
||||
"""Extract memory_insert tool call arguments from trajectory."""
|
||||
for turn in trajectory:
|
||||
for message in turn:
|
||||
if isinstance(message, ToolCallMessage) and message.tool_call.name == "memory_insert":
|
||||
return message.tool_call.arguments
|
||||
|
||||
return "{}" # Return empty JSON if not found
|
||||
```
|
||||
|
||||
This extractor:
|
||||
1. Loops through all conversation turns
|
||||
2. Finds `ToolCallMessage` objects
|
||||
3. Checks if the tool is `memory_insert`
|
||||
4. Returns the JSON arguments of the first matching call
|
||||
5. Returns `"{}"` if no matching tool call found
|
||||
|
||||
You can then pair this with a custom grader to verify the arguments are correct (see [Custom Graders](../advanced/custom-graders.md)).
|
||||
|
||||
## Basic Structure
|
||||
|
||||
```python
|
||||
from typing import List, Optional
|
||||
from letta_client import LettaMessageUnion, AgentState
|
||||
from letta_evals.decorators import extractor
|
||||
|
||||
@extractor
|
||||
def my_extractor(
|
||||
trajectory: List[List[LettaMessageUnion]],
|
||||
config: dict,
|
||||
agent_state: Optional[AgentState] = None
|
||||
) -> str:
|
||||
"""Your custom extraction logic."""
|
||||
# Extract and return content
|
||||
return extracted_text
|
||||
```
|
||||
|
||||
## The @extractor Decorator
|
||||
|
||||
The `@extractor` decorator registers your function:
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import extractor
|
||||
|
||||
@extractor # Makes this available as "my_extractor"
|
||||
def my_extractor(trajectory, config, agent_state=None):
|
||||
...
|
||||
```
|
||||
|
||||
## Function Signature
|
||||
|
||||
### Required Parameters
|
||||
|
||||
- `trajectory`: List of conversation turns, each containing messages
|
||||
- `config`: Dictionary with extractor configuration from YAML
|
||||
|
||||
### Optional Parameters
|
||||
|
||||
- `agent_state`: Agent state (only needed if extracting from memory blocks or other agent state). Most extractors only need the trajectory.
|
||||
|
||||
### Return Value
|
||||
|
||||
Must return a string - the extracted content to be graded.
|
||||
|
||||
## Trajectory Structure
|
||||
|
||||
The trajectory is a list of turns:
|
||||
|
||||
```python
|
||||
[
|
||||
# Turn 1
|
||||
[
|
||||
UserMessage(...),
|
||||
AssistantMessage(...),
|
||||
ToolCallMessage(...),
|
||||
ToolReturnMessage(...)
|
||||
],
|
||||
# Turn 2
|
||||
[
|
||||
AssistantMessage(...)
|
||||
]
|
||||
]
|
||||
```
|
||||
|
||||
Message types:
|
||||
- `UserMessage`: User input
|
||||
- `AssistantMessage`: Agent response
|
||||
- `ToolCallMessage`: Tool invocation
|
||||
- `ToolReturnMessage`: Tool result
|
||||
- `SystemMessage`: System messages
|
||||
|
||||
## Configuration
|
||||
|
||||
Access extractor config via the `config` parameter:
|
||||
|
||||
```yaml
|
||||
extractor: my_extractor
|
||||
extractor_config:
|
||||
max_length: 100 # Truncate output at 100 chars
|
||||
include_metadata: true # Include metadata in extraction
|
||||
```
|
||||
|
||||
```python
|
||||
@extractor
|
||||
def my_extractor(trajectory, config, agent_state=None):
|
||||
max_length = config.get("max_length", 500)
|
||||
include_metadata = config.get("include_metadata", False)
|
||||
...
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Extract Last N Messages
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import extractor
|
||||
from letta_evals.extractors.utils import get_assistant_messages, flatten_content
|
||||
|
||||
@extractor
|
||||
def last_n_messages(trajectory, config, agent_state=None):
|
||||
"""Extract the last N assistant messages."""
|
||||
n = config.get("n", 3)
|
||||
messages = get_assistant_messages(trajectory)
|
||||
last_n = messages[-n:] if len(messages) >= n else messages
|
||||
contents = [flatten_content(msg.content) for msg in last_n]
|
||||
return "\n".join(contents)
|
||||
```
|
||||
|
||||
Usage:
|
||||
```yaml
|
||||
extractor: last_n_messages # Use custom extractor
|
||||
extractor_config:
|
||||
n: 3 # Extract last 3 assistant messages
|
||||
```
|
||||
|
||||
### Extract JSON Field
|
||||
|
||||
```python
|
||||
import json
|
||||
from letta_evals.decorators import extractor
|
||||
from letta_evals.extractors.utils import get_assistant_messages, flatten_content
|
||||
|
||||
@extractor
|
||||
def json_field(trajectory, config, agent_state=None):
|
||||
"""Extract a specific field from JSON response."""
|
||||
field_name = config.get("field", "result")
|
||||
messages = get_assistant_messages(trajectory)
|
||||
|
||||
if not messages:
|
||||
return ""
|
||||
|
||||
content = flatten_content(messages[-1].content)
|
||||
|
||||
try:
|
||||
data = json.loads(content)
|
||||
return str(data.get(field_name, ""))
|
||||
except json.JSONDecodeError:
|
||||
return ""
|
||||
```
|
||||
|
||||
Usage:
|
||||
```yaml
|
||||
extractor: json_field # Parse JSON from agent response
|
||||
extractor_config:
|
||||
field: result # Extract the "result" field from JSON
|
||||
```
|
||||
|
||||
### Extract Code Blocks
|
||||
|
||||
```python
|
||||
import re
|
||||
from letta_evals.decorators import extractor
|
||||
from letta_evals.extractors.utils import get_assistant_messages, flatten_content
|
||||
|
||||
@extractor
|
||||
def code_blocks(trajectory, config, agent_state=None):
|
||||
"""Extract all code blocks from messages."""
|
||||
language = config.get("language", None) # Optional: filter by language
|
||||
messages = get_assistant_messages(trajectory)
|
||||
|
||||
code_pattern = r'```(?:(\w+)\n)?(.*?)```'
|
||||
all_code = []
|
||||
|
||||
for msg in messages:
|
||||
content = flatten_content(msg.content)
|
||||
matches = re.findall(code_pattern, content, re.DOTALL)
|
||||
|
||||
for lang, code in matches:
|
||||
if language is None or lang == language:
|
||||
all_code.append(code.strip())
|
||||
|
||||
return "\n\n".join(all_code)
|
||||
```
|
||||
|
||||
Usage:
|
||||
```yaml
|
||||
extractor: code_blocks # Extract code from markdown blocks
|
||||
extractor_config:
|
||||
language: python # Optional: only extract Python code blocks
|
||||
```
|
||||
|
||||
### Extract Tool Call Count
|
||||
|
||||
```python
|
||||
from letta_client import ToolCallMessage
|
||||
from letta_evals.decorators import extractor
|
||||
|
||||
@extractor
|
||||
def tool_call_count(trajectory, config, agent_state=None):
|
||||
"""Count how many times a specific tool was called."""
|
||||
tool_name = config.get("tool_name")
|
||||
count = 0
|
||||
|
||||
for turn in trajectory:
|
||||
for message in turn:
|
||||
if isinstance(message, ToolCallMessage):
|
||||
if tool_name is None or message.tool_call.name == tool_name:
|
||||
count += 1
|
||||
|
||||
return str(count)
|
||||
```
|
||||
|
||||
Usage:
|
||||
```yaml
|
||||
extractor: tool_call_count # Count tool invocations
|
||||
extractor_config:
|
||||
tool_name: search # Optional: count only "search" tool calls
|
||||
```
|
||||
|
||||
### Extract Multiple Memory Blocks
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import extractor
|
||||
|
||||
@extractor
|
||||
def multiple_memory_blocks(trajectory, config, agent_state=None):
|
||||
"""Extract and concatenate multiple memory blocks."""
|
||||
if agent_state is None:
|
||||
return ""
|
||||
|
||||
block_labels = config.get("block_labels", ["human", "persona"])
|
||||
separator = config.get("separator", "\n---\n")
|
||||
|
||||
blocks = []
|
||||
for block in agent_state.memory.blocks:
|
||||
if block.label in block_labels:
|
||||
blocks.append(f"{block.label}: {block.value}")
|
||||
|
||||
return separator.join(blocks)
|
||||
```
|
||||
|
||||
Usage:
|
||||
```yaml
|
||||
extractor: multiple_memory_blocks # Combine multiple memory blocks
|
||||
extractor_config:
|
||||
block_labels: [human, persona] # Which blocks to extract
|
||||
separator: "\n---\n" # How to separate them in output
|
||||
```
|
||||
|
||||
## Helper Utilities
|
||||
|
||||
The framework provides helper functions:
|
||||
|
||||
### get_assistant_messages
|
||||
|
||||
```python
|
||||
from letta_evals.extractors.utils import get_assistant_messages
|
||||
|
||||
messages = get_assistant_messages(trajectory)
|
||||
# Returns list of AssistantMessage objects
|
||||
```
|
||||
|
||||
### get_last_turn_messages
|
||||
|
||||
```python
|
||||
from letta_evals.extractors.utils import get_last_turn_messages
|
||||
from letta_client import AssistantMessage
|
||||
|
||||
messages = get_last_turn_messages(trajectory, AssistantMessage)
|
||||
# Returns assistant messages from last turn
|
||||
```
|
||||
|
||||
### flatten_content
|
||||
|
||||
```python
|
||||
from letta_evals.extractors.utils import flatten_content
|
||||
|
||||
text = flatten_content(message.content)
|
||||
# Converts complex content to plain text
|
||||
```
|
||||
|
||||
## Agent State Requirements
|
||||
|
||||
If your extractor needs agent state, include it in the signature:
|
||||
|
||||
```python
|
||||
@extractor
|
||||
def my_extractor(trajectory, config, agent_state: Optional[AgentState] = None):
|
||||
if agent_state is None:
|
||||
raise RuntimeError("This extractor requires agent_state")
|
||||
|
||||
# Use agent_state.memory.blocks, etc.
|
||||
...
|
||||
```
|
||||
|
||||
The runner will automatically fetch agent state when your extractor is used.
|
||||
|
||||
**Note**: Fetching agent state adds overhead. Only use when necessary.
|
||||
|
||||
## Using Custom Extractors
|
||||
|
||||
### Method 1: Custom Evaluators File
|
||||
|
||||
Create `custom_evaluators.py`:
|
||||
|
||||
```python
|
||||
from letta_evals.decorators import extractor
|
||||
|
||||
@extractor
|
||||
def my_extractor(trajectory, config, agent_state=None):
|
||||
...
|
||||
```
|
||||
|
||||
The file will be discovered automatically if in the same directory.
|
||||
|
||||
### Method 2: Setup Script
|
||||
|
||||
Use a setup script to import custom extractors before the suite runs:
|
||||
|
||||
```python
|
||||
# setup.py
|
||||
from letta_evals.models import SuiteSpec
|
||||
import custom_extractors # Imports and registers your @extractor functions
|
||||
|
||||
def prepare_environment(suite: SuiteSpec) -> None:
|
||||
# Runs before evaluation starts
|
||||
pass
|
||||
```
|
||||
|
||||
```yaml
|
||||
setup_script: setup.py:prepare_environment # Import custom extractors
|
||||
|
||||
graders:
|
||||
my_metric:
|
||||
extractor: my_extractor # Now available from custom_extractors
|
||||
```
|
||||
|
||||
## Testing Your Extractor
|
||||
|
||||
```python
|
||||
from letta_client import AssistantMessage
|
||||
|
||||
# Mock trajectory
|
||||
trajectory = [
|
||||
[
|
||||
AssistantMessage(
|
||||
content="The answer is 42",
|
||||
role="assistant"
|
||||
)
|
||||
]
|
||||
]
|
||||
|
||||
config = {"max_length": 100}
|
||||
result = my_extractor(trajectory, config)
|
||||
print(f"Extracted: {result}")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Handle empty trajectories**: Check if messages exist
|
||||
2. **Return strings**: Always return a string, not None
|
||||
3. **Use config for flexibility**: Make behavior configurable
|
||||
4. **Document required config**: Explain config parameters
|
||||
5. **Handle errors gracefully**: Return empty string on error
|
||||
6. **Keep it fast**: Extractors run for every sample
|
||||
7. **Use helper utilities**: Leverage built-in functions
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Built-in Extractors](./builtin.md) - Learn from examples
|
||||
- [Custom Graders](../advanced/custom-graders.md) - Pair with custom grading
|
||||
- [Core Concepts](../concepts/extractors.md) - How extractors work
|
||||
@@ -1,325 +0,0 @@
|
||||
# Getting Started with Letta Evals
|
||||
|
||||
This guide will help you get up and running with Letta Evals in minutes.
|
||||
|
||||
## What is Letta Evals?
|
||||
|
||||
Letta Evals is a framework for testing Letta AI agents. It allows you to:
|
||||
|
||||
- Test agent responses against expected outputs
|
||||
- Evaluate subjective quality using LLM judges
|
||||
- Test tool usage and memory updates
|
||||
- Track metrics across multiple evaluation runs
|
||||
- Gate deployments on quality thresholds
|
||||
|
||||
Unlike most evaluation frameworks designed for simple input-output models, Letta Evals is built for [stateful agents](https://www.letta.com/blog/stateful-agents) that maintain memory, use tools, and evolve over time.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.11 or higher
|
||||
- A running Letta server ([local](https://docs.letta.com/guides/selfhosting) or [Letta Cloud](https://docs.letta.com/guides/cloud/overview))
|
||||
- A Letta agent to test, either in agent file format or by ID (see [Targets](./concepts/targets.md) for more details)
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install letta-evals
|
||||
```
|
||||
|
||||
Or with uv:
|
||||
|
||||
```bash
|
||||
uv pip install letta-evals
|
||||
```
|
||||
|
||||
## Getting an Agent to Test
|
||||
|
||||
Before you can run evaluations, you need a Letta agent. You have two options:
|
||||
|
||||
### Option 1: Use an Agent File (.af)
|
||||
|
||||
Export an existing agent to a file using the Letta SDK:
|
||||
|
||||
```python
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
client = Letta(
|
||||
base_url="http://localhost:8283", # or https://api.letta.com for Letta Cloud
|
||||
token=os.getenv("LETTA_API_KEY") # required for Letta Cloud
|
||||
)
|
||||
|
||||
# Export an agent to a file
|
||||
agent_file = client.agents.export_file(agent_id="agent-123")
|
||||
|
||||
# Save to disk
|
||||
with open("my_agent.af", "w") as f:
|
||||
f.write(agent_file)
|
||||
```
|
||||
|
||||
Or export via the Agent Development Environment (ADE) by selecting "Export Agent".
|
||||
|
||||
This creates an `.af` file which you can reference in your suite configuration:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af
|
||||
```
|
||||
|
||||
**How it works:** When using an agent file, a fresh agent instance is created for each sample in your dataset. Each test runs independently with a clean slate, making this ideal for parallel testing across different inputs.
|
||||
|
||||
**Example:** If your dataset has 5 samples, 5 separate agents will be created and can run in parallel. Each agent starts fresh with no memory of the other tests.
|
||||
|
||||
### Option 2: Use an Existing Agent ID
|
||||
|
||||
If you already have a running agent, use its ID directly:
|
||||
|
||||
```python
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
client = Letta(
|
||||
base_url="http://localhost:8283", # or https://api.letta.com for Letta Cloud
|
||||
token=os.getenv("LETTA_API_KEY") # required for Letta Cloud
|
||||
)
|
||||
|
||||
# List all agents
|
||||
agents = client.agents.list()
|
||||
for agent in agents:
|
||||
print(f"Agent: {agent.name}, ID: {agent.id}")
|
||||
```
|
||||
|
||||
Then reference it in your suite:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_id: agent-abc-123
|
||||
```
|
||||
|
||||
**How it works:** The same agent instance is used for all samples, processing them sequentially. The agent's state (memory, message history) carries over between samples, making the dataset behave more like a conversation script than independent test cases.
|
||||
|
||||
**Example:** If your dataset has 5 samples, they all run against the same agent one after another. The agent "remembers" each previous interaction, so sample 3 can reference information from samples 1 and 2.
|
||||
|
||||
### Which Should You Use?
|
||||
|
||||
**Agent File (.af)** - Use when testing independent scenarios
|
||||
|
||||
Best for testing how the agent responds to independent, isolated inputs. Each sample gets a fresh agent with no prior context. Tests can run in parallel.
|
||||
|
||||
**Typical scenarios:**
|
||||
- "How does the agent answer different questions?"
|
||||
- "Does the agent correctly use tools for various tasks?"
|
||||
- "Testing behavior across different prompts"
|
||||
|
||||
**Agent ID** - Use when testing conversational flows
|
||||
|
||||
Best for testing conversational flows or scenarios where context should build up over time. The agent's state accumulates as it processes each sample sequentially.
|
||||
|
||||
**Typical scenarios:**
|
||||
- "Does the agent remember information across a conversation?"
|
||||
- "How does the agent's memory evolve over multiple exchanges?"
|
||||
- "Simulating a realistic user session with multiple requests"
|
||||
|
||||
**Recommendation:** For most evaluation scenarios, use agent files to ensure consistent, reproducible test conditions. Only use agent IDs when you specifically want to test stateful, sequential interactions.
|
||||
|
||||
For more details on agent lifecycle and testing behaviors, see the [Targets guide](./concepts/targets.md#agent-lifecycle-and-testing-behavior).
|
||||
|
||||
## Quick Start
|
||||
|
||||
Let's create your first evaluation in 3 steps:
|
||||
|
||||
### 1. Create a Test Dataset
|
||||
|
||||
Create a file named `dataset.jsonl`:
|
||||
|
||||
```jsonl
|
||||
{"input": "What's the capital of France?", "ground_truth": "Paris"}
|
||||
{"input": "Calculate 2+2", "ground_truth": "4"}
|
||||
{"input": "What color is the sky?", "ground_truth": "blue"}
|
||||
```
|
||||
|
||||
Each line is a JSON object with:
|
||||
- `input`: The prompt to send to your agent
|
||||
- `ground_truth`: The expected answer (used for grading)
|
||||
|
||||
Note: `ground_truth` is optional for some graders (like rubric graders), but required for tool graders like `contains` and `exact_match`.
|
||||
|
||||
Read more about [Datasets](./concepts/datasets.md) for details on how to create your dataset.
|
||||
|
||||
### 2. Create a Suite Configuration
|
||||
|
||||
Create a file named `suite.yaml`:
|
||||
|
||||
```yaml
|
||||
name: my-first-eval
|
||||
dataset: dataset.jsonl
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af # Path to your agent file
|
||||
base_url: http://localhost:8283 # Your Letta server
|
||||
|
||||
graders:
|
||||
quality:
|
||||
kind: tool
|
||||
function: contains # Check if response contains the ground truth
|
||||
extractor: last_assistant # Use the last assistant message
|
||||
|
||||
gate:
|
||||
metric_key: quality
|
||||
op: gte
|
||||
value: 0.75 # Require 75% pass rate
|
||||
```
|
||||
|
||||
The suite configuration defines:
|
||||
- The [dataset](./concepts/datasets.md) to use
|
||||
- The [agent](./concepts/targets.md) to test
|
||||
- The [graders](./concepts/graders.md) to use
|
||||
- The [gate](./concepts/gates.md) criteria
|
||||
|
||||
Read more about [Suites](./concepts/suites.md) for details on how to configure your evaluation.
|
||||
|
||||
### 3. Run the Evaluation
|
||||
|
||||
Run your evaluation with the following command:
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
```
|
||||
|
||||
You'll see real-time progress as your evaluation runs:
|
||||
|
||||
```
|
||||
Running evaluation: my-first-eval
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
|
||||
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
|
||||
```
|
||||
|
||||
Read more about [CLI Commands](./cli/commands.md) for details about the available commands and options.
|
||||
|
||||
## Understanding the Results
|
||||
|
||||
The core evaluation flow is:
|
||||
|
||||
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
|
||||
|
||||
The evaluation runner:
|
||||
1. Loads your dataset
|
||||
2. Sends each input to your agent (Target)
|
||||
3. Extracts the relevant information (using the Extractor)
|
||||
4. Grades the response (using the Grader function)
|
||||
5. Computes aggregate metrics
|
||||
6. Checks if metrics pass the Gate criteria
|
||||
|
||||
The output shows:
|
||||
- **Average score**: Mean score across all samples
|
||||
- **Pass rate**: Percentage of samples that passed
|
||||
- **Gate status**: Whether the evaluation passed or failed overall
|
||||
|
||||
## Next Steps
|
||||
|
||||
Now that you've run your first evaluation, explore more advanced features:
|
||||
|
||||
- [Core Concepts](./concepts/overview.md) - Understand suites, datasets, graders, and extractors
|
||||
- [Grader Types](./concepts/graders.md) - Learn about tool graders vs rubric graders
|
||||
- [Multi-Metric Evaluation](./graders/multi-metric.md) - Test multiple aspects simultaneously
|
||||
- [Custom Graders](./advanced/custom-graders.md) - Write custom grading functions
|
||||
- [Multi-Turn Conversations](./advanced/multi-turn-conversations.md) - Test conversational memory
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Strict Answer Checking
|
||||
|
||||
Use exact matching for cases where the answer must be precisely correct:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Subjective Quality Evaluation
|
||||
|
||||
Use an LLM judge to evaluate subjective qualities like helpfulness or tone:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Then create `rubric.txt`:
|
||||
```
|
||||
Rate the helpfulness and accuracy of the response.
|
||||
- Score 1.0 if helpful and accurate
|
||||
- Score 0.5 if partially helpful
|
||||
- Score 0.0 if unhelpful or wrong
|
||||
```
|
||||
|
||||
### Testing Tool Calls
|
||||
|
||||
Verify that your agent calls specific tools with expected arguments:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
tool_check:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
```
|
||||
|
||||
### Testing Memory Persistence
|
||||
|
||||
Check if the agent correctly updates its memory blocks:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
memory_check:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block
|
||||
extractor_config:
|
||||
block_label: human
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**"Agent file not found"**
|
||||
|
||||
Make sure your `agent_file` path is correct. Paths are relative to the suite YAML file location. Use absolute paths if needed:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
agent_file: /absolute/path/to/my_agent.af
|
||||
```
|
||||
|
||||
**"Connection refused"**
|
||||
|
||||
Your Letta server isn't running or isn't accessible. Start it with:
|
||||
|
||||
```bash
|
||||
letta server
|
||||
```
|
||||
|
||||
By default, it runs at `http://localhost:8283`.
|
||||
|
||||
**"No ground_truth provided"**
|
||||
|
||||
Tool graders like `exact_match` and `contains` require `ground_truth` in your dataset. Either:
|
||||
- Add `ground_truth` to your samples, or
|
||||
- Use a rubric grader which doesn't require ground truth
|
||||
|
||||
**Agent didn't respond as expected**
|
||||
|
||||
Try testing your agent manually first using the Letta SDK or Agent Development Environment (ADE) to see how it behaves before running evaluations. See the [Letta documentation](https://docs.letta.com) for more information.
|
||||
|
||||
For more help, see the [Troubleshooting Guide](./troubleshooting.md).
|
||||
@@ -1,263 +0,0 @@
|
||||
# Getting Started
|
||||
|
||||
Run your first Letta agent evaluation in 5 minutes.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.11 or higher
|
||||
- A running Letta server (local or Letta Cloud)
|
||||
- A Letta agent to test, either in agent file format or by ID (see [Targets](/guides/evals/concepts/targets) for more details)
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install letta-evals
|
||||
```
|
||||
|
||||
Or with uv:
|
||||
|
||||
```bash
|
||||
uv pip install letta-evals
|
||||
```
|
||||
|
||||
## Getting an Agent to Test
|
||||
|
||||
Export an existing agent to a file using the Letta SDK:
|
||||
|
||||
```python
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
# Connect to Letta Cloud
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# Export an agent to a file
|
||||
agent_file = client.agents.export_file(agent_id="agent-123")
|
||||
|
||||
# Save to disk
|
||||
with open("my_agent.af", "w") as f:
|
||||
f.write(agent_file)
|
||||
```
|
||||
|
||||
Or export via the Agent Development Environment (ADE) by selecting "Export Agent".
|
||||
|
||||
Then reference it in your suite:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af
|
||||
```
|
||||
|
||||
<Note>
|
||||
**Other options:** You can also use existing agents by ID or programmatically generate agents. See [Targets](/guides/evals/concepts/targets) for all agent configuration options.
|
||||
</Note>
|
||||
|
||||
## Quick Start
|
||||
|
||||
Let's create your first evaluation in 3 steps:
|
||||
|
||||
### 1. Create a Test Dataset
|
||||
|
||||
Create a file named `dataset.jsonl`:
|
||||
|
||||
```jsonl
|
||||
{"input": "What's the capital of France?", "ground_truth": "Paris"}
|
||||
{"input": "Calculate 2+2", "ground_truth": "4"}
|
||||
{"input": "What color is the sky?", "ground_truth": "blue"}
|
||||
```
|
||||
|
||||
Each line is a JSON object with:
|
||||
- `input`: The prompt to send to your agent
|
||||
- `ground_truth`: The expected answer (used for grading)
|
||||
|
||||
<Note>
|
||||
`ground_truth` is optional for some graders (like rubric graders), but required for tool graders like `contains` and `exact_match`.
|
||||
</Note>
|
||||
|
||||
Read more about [Datasets](/guides/evals/concepts/datasets) for details on how to create your dataset.
|
||||
|
||||
### 2. Create a Suite Configuration
|
||||
|
||||
Create a file named `suite.yaml`:
|
||||
|
||||
```yaml
|
||||
name: my-first-eval
|
||||
dataset: dataset.jsonl
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af # Path to your agent file
|
||||
base_url: https://api.letta.com # Letta Cloud (default)
|
||||
token: ${LETTA_API_KEY} # Your API key
|
||||
|
||||
graders:
|
||||
quality:
|
||||
kind: tool
|
||||
function: contains # Check if response contains the ground truth
|
||||
extractor: last_assistant # Use the last assistant message
|
||||
|
||||
gate:
|
||||
metric_key: quality
|
||||
op: gte
|
||||
value: 0.75 # Require 75% pass rate
|
||||
```
|
||||
|
||||
The suite configuration defines:
|
||||
- The [dataset](/guides/evals/concepts/datasets) to use
|
||||
- The [agent](/guides/evals/concepts/targets) to test
|
||||
- The [graders](/guides/evals/concepts/graders) to use
|
||||
- The [gate](/guides/evals/concepts/gates) criteria
|
||||
|
||||
Read more about [Suites](/guides/evals/concepts/suites) for details on how to configure your evaluation.
|
||||
|
||||
### 3. Run the Evaluation
|
||||
|
||||
Run your evaluation with the following command:
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
```
|
||||
|
||||
You'll see real-time progress as your evaluation runs:
|
||||
|
||||
```
|
||||
Running evaluation: my-first-eval
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
|
||||
✓ PASSED (2.25/3.00 avg, 75.0% pass rate)
|
||||
```
|
||||
|
||||
Read more about [CLI Commands](/guides/evals/cli/commands) for details about the available commands and options.
|
||||
|
||||
## Understanding the Results
|
||||
|
||||
The core evaluation flow is:
|
||||
|
||||
**Dataset → Target (Agent) → Extractor → Grader → Gate → Result**
|
||||
|
||||
The evaluation runner:
|
||||
1. Loads your dataset
|
||||
2. Sends each input to your agent (Target)
|
||||
3. Extracts the relevant information (using the Extractor)
|
||||
4. Grades the response (using the Grader function)
|
||||
5. Computes aggregate metrics
|
||||
6. Checks if metrics pass the Gate criteria
|
||||
|
||||
The output shows:
|
||||
- **Average score**: Mean score across all samples
|
||||
- **Pass rate**: Percentage of samples that passed
|
||||
- **Gate status**: Whether the evaluation passed or failed overall
|
||||
|
||||
## Next Steps
|
||||
|
||||
Now that you've run your first evaluation, explore more advanced features:
|
||||
|
||||
- [Core Concepts](/guides/evals/concepts/overview) - Understand suites, datasets, graders, and extractors
|
||||
- [Grader Types](/guides/evals/concepts/graders) - Learn about tool graders vs rubric graders
|
||||
- [Multi-Metric Evaluation](/guides/evals/graders/multi-metric) - Test multiple aspects simultaneously
|
||||
- [Custom Graders](/guides/evals/advanced/custom-graders) - Write custom grading functions
|
||||
- [Multi-Turn Conversations](/guides/evals/advanced/multi-turn-conversations) - Test conversational memory
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Strict Answer Checking
|
||||
|
||||
Use exact matching for cases where the answer must be precisely correct:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Subjective Quality Evaluation
|
||||
|
||||
Use an LLM judge to evaluate subjective qualities like helpfulness or tone:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Then create `rubric.txt`:
|
||||
```
|
||||
Rate the helpfulness and accuracy of the response.
|
||||
- Score 1.0 if helpful and accurate
|
||||
- Score 0.5 if partially helpful
|
||||
- Score 0.0 if unhelpful or wrong
|
||||
```
|
||||
|
||||
### Testing Tool Calls
|
||||
|
||||
Verify that your agent calls specific tools with expected arguments:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
tool_check:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
```
|
||||
|
||||
### Testing Memory Persistence
|
||||
|
||||
Check if the agent correctly updates its memory blocks:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
memory_check:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block
|
||||
extractor_config:
|
||||
block_label: human
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
<Warning>
|
||||
**"Agent file not found"**
|
||||
|
||||
Make sure your `agent_file` path is correct. Paths are relative to the suite YAML file location. Use absolute paths if needed:
|
||||
|
||||
```yaml
|
||||
target:
|
||||
agent_file: /absolute/path/to/my_agent.af
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Connection refused"**
|
||||
|
||||
Your Letta server isn't running or isn't accessible. Start it using Docker:
|
||||
|
||||
```bash
|
||||
docker run -p 8283:8283 -e OPENAI_API_KEY="your_api_key" letta/letta:latest
|
||||
```
|
||||
|
||||
By default, it runs at `http://localhost:8283`. See the [self-hosting guide](/guides/selfhosting) for more information.
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"No ground_truth provided"**
|
||||
|
||||
Tool graders like `exact_match` and `contains` require `ground_truth` in your dataset. Either:
|
||||
- Add `ground_truth` to your samples, or
|
||||
- Use a rubric grader which doesn't require ground truth
|
||||
</Warning>
|
||||
|
||||
<Tip>
|
||||
**Agent didn't respond as expected**
|
||||
|
||||
Try testing your agent manually first using the Letta SDK or Agent Development Environment (ADE) to see how it behaves before running evaluations. See the [Letta documentation](https://docs.letta.com) for more information.
|
||||
</Tip>
|
||||
|
||||
For more help, see the [Troubleshooting Guide](/guides/evals/troubleshooting).
|
||||
@@ -1,427 +0,0 @@
|
||||
# Multi-Metric Evaluation
|
||||
|
||||
Evaluate multiple aspects of agent performance simultaneously in a single evaluation suite.
|
||||
|
||||
Multi-metric evaluation allows you to define multiple graders, each measuring a different dimension of your agent's behavior. This is essential for comprehensive testing because agent quality isn't just about correctness—you also care about explanation quality, tool usage, format compliance, and more.
|
||||
|
||||
**Example**: You might want to check that an agent gives the correct answer (tool grader with `exact_match`), explains it well (rubric grader for clarity), and calls the right tools (tool grader on `tool_arguments`). Instead of running three separate evaluations, you can test all three aspects in one run.
|
||||
|
||||
## Why Multiple Metrics?
|
||||
|
||||
Agents are complex systems. You might want to evaluate:
|
||||
- **Correctness**: Does the answer match the expected output?
|
||||
- **Quality**: Is the explanation clear, complete, and well-structured?
|
||||
- **Tool usage**: Does the agent call the right tools with correct arguments?
|
||||
- **Memory**: Does the agent correctly update its memory blocks?
|
||||
- **Format**: Does the output follow required formatting rules?
|
||||
|
||||
Multi-metric evaluation lets you track all of these simultaneously, giving you a holistic view of agent performance.
|
||||
|
||||
## Configuration
|
||||
|
||||
Define multiple graders under the `graders` section:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant # Check if answer is exactly correct
|
||||
|
||||
completeness:
|
||||
kind: rubric
|
||||
prompt_path: completeness.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant # LLM judge evaluates how complete the answer is
|
||||
|
||||
tool_usage:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Check if agent called the right tool
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
```
|
||||
|
||||
Each grader:
|
||||
- Has a unique key (e.g., `accuracy`, `completeness`)
|
||||
- Can use different kinds (tool vs rubric)
|
||||
- Can use different extractors
|
||||
- Produces independent scores
|
||||
|
||||
## Gating on One Metric
|
||||
|
||||
While you evaluate multiple metrics, you can only gate on one:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant # Check correctness
|
||||
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: quality.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant # Evaluate subjective quality
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Pass/fail based on accuracy only
|
||||
op: gte
|
||||
value: 0.8 # Require 80% accuracy to pass
|
||||
```
|
||||
|
||||
The evaluation passes/fails based on `accuracy`, but results include both metrics.
|
||||
|
||||
## Results Structure
|
||||
|
||||
With multiple metrics, results include:
|
||||
|
||||
### Per-Sample Results
|
||||
|
||||
Each sample has scores for all metrics:
|
||||
|
||||
```json
|
||||
{
|
||||
"sample": {...},
|
||||
"grades": {
|
||||
"accuracy": {"score": 1.0, "rationale": "Exact match: true"},
|
||||
"quality": {"score": 0.85, "rationale": "Good response, minor improvements possible"}
|
||||
},
|
||||
"submissions": {
|
||||
"accuracy": "Paris",
|
||||
"quality": "Paris"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: If all graders use the same extractor, `submission` and `grade` are also provided for backwards compatibility.
|
||||
|
||||
### Aggregate Metrics
|
||||
|
||||
```json
|
||||
{
|
||||
"metrics": {
|
||||
"by_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": 0.95,
|
||||
"pass_rate": 95.0,
|
||||
"passed_attempts": 19,
|
||||
"failed_attempts": 1
|
||||
},
|
||||
"quality": {
|
||||
"avg_score_attempted": 0.82,
|
||||
"pass_rate": 80.0,
|
||||
"passed_attempts": 16,
|
||||
"failed_attempts": 4
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Accuracy + Quality
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant # Does response contain the answer?
|
||||
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: quality.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant # How well is it explained?
|
||||
|
||||
gate:
|
||||
metric_key: accuracy # Must be correct to pass
|
||||
op: gte
|
||||
value: 0.9 # 90% must have correct answer
|
||||
```
|
||||
|
||||
Gate on accuracy (must be correct), but also track quality for insights.
|
||||
|
||||
### Content + Format
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
content:
|
||||
kind: rubric
|
||||
prompt_path: content.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant # Evaluate content quality
|
||||
|
||||
format:
|
||||
kind: tool
|
||||
function: ascii_printable_only
|
||||
extractor: last_assistant # Check format compliance
|
||||
|
||||
gate:
|
||||
metric_key: content # Gate on content quality
|
||||
op: gte
|
||||
value: 0.7 # Content must score 70% or higher
|
||||
```
|
||||
|
||||
Ensure content quality while checking format constraints.
|
||||
|
||||
### Answer + Tool Usage + Memory
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
answer:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant # Did the agent answer correctly?
|
||||
|
||||
used_tools:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Did it call the search tool?
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
|
||||
memory_updated:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block # Did it update human memory?
|
||||
extractor_config:
|
||||
block_label: human
|
||||
|
||||
gate:
|
||||
metric_key: answer # Gate on correctness
|
||||
op: gte
|
||||
value: 0.8 # 80% of answers must be correct
|
||||
```
|
||||
|
||||
Comprehensive evaluation of agent behavior.
|
||||
|
||||
### Multiple Quality Dimensions
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: rubric
|
||||
prompt: "Rate factual accuracy from 0.0 to 1.0"
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
clarity:
|
||||
kind: rubric
|
||||
prompt: "Rate clarity of explanation from 0.0 to 1.0"
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
conciseness:
|
||||
kind: rubric
|
||||
prompt: "Rate conciseness (not too verbose) from 0.0 to 1.0"
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: accuracy
|
||||
op: gte
|
||||
value: 0.8
|
||||
```
|
||||
|
||||
Track multiple subjective dimensions.
|
||||
|
||||
## Display Names
|
||||
|
||||
Add human-friendly names for metrics:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
acc:
|
||||
display_name: "Accuracy"
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
|
||||
qual:
|
||||
display_name: "Response Quality"
|
||||
kind: rubric
|
||||
prompt_path: quality.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Display names appear in CLI output and visualizations.
|
||||
|
||||
## Independent Extraction
|
||||
|
||||
Each grader can extract different content:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
final_answer:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant # Last thing said
|
||||
|
||||
tool_calls:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: all_assistant # Everything said
|
||||
|
||||
search_usage:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments # Tool arguments
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
```
|
||||
|
||||
## Analyzing Results
|
||||
|
||||
### View All Metrics
|
||||
|
||||
CLI output shows all metrics:
|
||||
|
||||
```
|
||||
Results by metric:
|
||||
accuracy - Avg: 0.95, Pass: 95.0%
|
||||
quality - Avg: 0.82, Pass: 80.0%
|
||||
tool_usage - Avg: 0.88, Pass: 88.0%
|
||||
|
||||
Gate (accuracy >= 0.9): PASSED
|
||||
```
|
||||
|
||||
### JSON Output
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output results/
|
||||
```
|
||||
|
||||
Produces:
|
||||
- `results/summary.json`: Aggregate metrics
|
||||
- `results/results.jsonl`: Per-sample results with all grades
|
||||
|
||||
### Filtering Results
|
||||
|
||||
Post-process to find patterns:
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
# Load results
|
||||
with open("results/results.jsonl") as f:
|
||||
results = [json.loads(line) for line in f]
|
||||
|
||||
# Find samples where accuracy=1.0 but quality<0.5
|
||||
issues = [
|
||||
r for r in results
|
||||
if r["grades"]["accuracy"]["score"] == 1.0
|
||||
and r["grades"]["quality"]["score"] < 0.5
|
||||
]
|
||||
|
||||
print(f"Found {len(issues)} samples with correct but low-quality responses")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Start with Core Metric
|
||||
|
||||
Focus on one primary metric for gating:
|
||||
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Most important
|
||||
op: gte
|
||||
value: 0.9
|
||||
```
|
||||
|
||||
Use others for diagnostics.
|
||||
|
||||
### 2. Combine Tool and Rubric
|
||||
|
||||
Use fast tool graders for objective checks, rubric graders for quality:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
correct:
|
||||
kind: tool # Fast, cheap
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
quality:
|
||||
kind: rubric # Slower, more nuanced
|
||||
prompt_path: quality.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### 3. Track Tool Usage
|
||||
|
||||
Add a metric for expected tool calls:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
used_search:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
```
|
||||
|
||||
### 4. Validate Format
|
||||
|
||||
Include format checks alongside content:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
content:
|
||||
kind: rubric
|
||||
prompt_path: content.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
ascii_only:
|
||||
kind: tool
|
||||
function: ascii_printable_only
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### 5. Use Display Names
|
||||
|
||||
Make CLI output readable:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
acc:
|
||||
display_name: "Answer Accuracy"
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## Cost Implications
|
||||
|
||||
Multiple rubric graders multiply API costs:
|
||||
|
||||
- 1 grader: $0.00015/sample
|
||||
- 3 graders: $0.00045/sample
|
||||
- 5 graders: $0.00075/sample
|
||||
|
||||
For 1000 samples with 3 rubric graders: ~$0.45
|
||||
|
||||
Mix tool and rubric graders to balance cost and insight.
|
||||
|
||||
## Performance
|
||||
|
||||
Multiple graders run sequentially per sample, but samples run concurrently:
|
||||
|
||||
- 1 grader: ~1s per sample
|
||||
- 3 graders (2 rubric): ~2s per sample
|
||||
|
||||
With 10 concurrent samples: 1000 samples in ~3-5 minutes
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Tool Graders](./tool-graders.md)
|
||||
- [Rubric Graders](./rubric-graders.md)
|
||||
- [Understanding Results](../results/overview.md)
|
||||
@@ -1,680 +0,0 @@
|
||||
# Rubric Graders
|
||||
|
||||
Rubric graders, also called "LLM-as-judge" graders, use language models to evaluate submissions based on custom criteria. They're ideal for subjective, nuanced evaluation.
|
||||
|
||||
Rubric graders work by providing the LLM with a prompt that describes the evaluation criteria; the language model then generates a structured JSON response with a score and rationale:
|
||||
|
||||
```json
|
||||
{
|
||||
"score": 0.85,
|
||||
"rationale": "The response is accurate and well-explained, but could be more concise."
|
||||
}
|
||||
```
|
||||
|
||||
**Schema requirements:**
|
||||
- `score` (required): Decimal number between 0.0 and 1.0
|
||||
- `rationale` (required): String explanation of the grading decision
|
||||
|
||||
> **Note**: OpenAI provides the best support for structured generation. Other providers may have varying quality of structured output adherence.
|
||||
|
||||
## Overview
|
||||
|
||||
Rubric graders:
|
||||
- Use an LLM to evaluate responses
|
||||
- Support custom evaluation criteria (rubrics)
|
||||
- Can handle subjective quality assessment
|
||||
- Return scores with explanations
|
||||
- Use JSON structured generation for reliability
|
||||
|
||||
## Basic Configuration
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt # Path to rubric file
|
||||
model: gpt-4o-mini # LLM model
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## Rubric Prompts
|
||||
|
||||
Your rubric file defines the evaluation criteria. It can include placeholders:
|
||||
|
||||
- `{input}`: The original input from the dataset
|
||||
- `{submission}`: The extracted agent response
|
||||
- `{ground_truth}`: Ground truth from dataset (if available)
|
||||
|
||||
### Example Rubric
|
||||
|
||||
`quality_rubric.txt`:
|
||||
```
|
||||
Evaluate the response for accuracy, completeness, and clarity.
|
||||
|
||||
Input: {input}
|
||||
Expected answer: {ground_truth}
|
||||
Agent response: {submission}
|
||||
|
||||
Scoring criteria:
|
||||
- 1.0: Perfect - accurate, complete, and clear
|
||||
- 0.8-0.9: Excellent - minor improvements possible
|
||||
- 0.6-0.7: Good - some gaps or unclear parts
|
||||
- 0.4-0.5: Adequate - significant issues
|
||||
- 0.2-0.3: Poor - major problems
|
||||
- 0.0-0.1: Failed - incorrect or nonsensical
|
||||
|
||||
Provide a score between 0.0 and 1.0.
|
||||
```
|
||||
|
||||
### Inline Prompts
|
||||
|
||||
You can include the prompt directly in the YAML:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
creativity:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Evaluate the creativity and originality of the response.
|
||||
|
||||
Response: {submission}
|
||||
|
||||
Score from 0.0 (generic) to 1.0 (highly creative).
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### prompt_path vs prompt
|
||||
|
||||
Use exactly one:
|
||||
|
||||
```yaml
|
||||
# Option 1: External file
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubrics/quality.txt # Relative to suite YAML
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
```yaml
|
||||
# Option 2: Inline
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt: "Evaluate the response quality from 0.0 to 1.0"
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### model
|
||||
|
||||
LLM model to use for judging:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini # Or gpt-4o, claude-3-5-sonnet, etc.
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Supported: Any OpenAI-compatible model
|
||||
|
||||
**Special handling**: For reasoning models (o1, o3, gpt-5), temperature is automatically set to 1.0 even if you specify 0.0.
|
||||
|
||||
### temperature
|
||||
|
||||
Controls randomness in LLM generation:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini
|
||||
temperature: 0.0 # Deterministic (default)
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Range: 0.0 (deterministic) to 2.0 (very random)
|
||||
|
||||
Default: 0.0 (recommended for evaluations)
|
||||
|
||||
### provider
|
||||
|
||||
LLM provider:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini
|
||||
provider: openai # Default
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Currently supported: `openai` (default)
|
||||
|
||||
### max_retries
|
||||
|
||||
Number of retry attempts if API call fails:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini
|
||||
max_retries: 5 # Default
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### timeout
|
||||
|
||||
Timeout for API calls in seconds:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: rubric.txt
|
||||
model: gpt-4o-mini
|
||||
timeout: 120.0 # Default: 2 minutes
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Prompt Building**: The placeholders in the rubric prompt (`{input}`, `{submission}`, `{ground_truth}`) are replaced with the corresponding values for the sample
|
||||
2. **System Prompt**: Instructs the LLM to return JSON with `score` and `rationale` fields
|
||||
3. **Structured Output**: Uses JSON mode (`response_format: json_object`) to enforce the schema
|
||||
4. **Validation**: Extracts and validates score (clamped to 0.0-1.0) and rationale
|
||||
5. **Error Handling**: Returns score 0.0 with error message if grading fails
|
||||
|
||||
### System Prompt
|
||||
|
||||
The rubric grader automatically includes this system prompt:
|
||||
|
||||
```
|
||||
You are an evaluation judge. You will be given:
|
||||
1. A rubric describing evaluation criteria
|
||||
2. An input/question
|
||||
3. A submission to evaluate
|
||||
|
||||
Evaluate the submission according to the rubric and return a JSON response with:
|
||||
{
|
||||
"score": (REQUIRED: a decimal number between 0.0 and 1.0 inclusive),
|
||||
"rationale": "explanation of your grading decision"
|
||||
}
|
||||
|
||||
IMPORTANT:
|
||||
- The score MUST be a number between 0.0 and 1.0 (inclusive)
|
||||
- 0.0 means complete failure, 1.0 means perfect
|
||||
- Use decimal values for partial credit (e.g., 0.25, 0.5, 0.75)
|
||||
- Be objective and follow the rubric strictly
|
||||
```
|
||||
|
||||
If the LLM returns invalid JSON or omits a required field, the grading fails and returns score 0.0 with an error message.
|
||||
|
||||
## Agent-as-Judge
|
||||
|
||||
Instead of calling an LLM API directly, you can use a **Letta agent** as the judge. The agent-as-judge approach loads a Letta agent from a `.af` file, sends it the evaluation criteria, and collects its score via a tool call.
|
||||
|
||||
### Why Use Agent-as-Judge?
|
||||
|
||||
Agent-as-judge is ideal when:
|
||||
|
||||
1. **No direct LLM API access**: Your team uses Letta Cloud or managed instances without direct API keys
|
||||
2. **Judges need tools**: The evaluator needs to call tools during grading (e.g., web search, database queries, fetching webpages to verify answers)
|
||||
3. **Centralized LLM access**: Your organization provides LLM access only through Letta
|
||||
4. **Custom evaluation logic**: You want the judge to use specific tools or follow complex evaluation workflows
|
||||
5. **Teacher-student patterns**: You have a well-built, experienced agent that can evaluate and teach a student agent being developed
|
||||
|
||||
### Configuration
|
||||
|
||||
To use agent-as-judge, specify `agent_file` instead of `model`:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric # Still "rubric" kind
|
||||
agent_file: judge.af # Path to judge agent .af file
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
judge_tool_name: submit_grade # Tool for submitting scores (default: submit_grade)
|
||||
extractor: last_assistant # What to extract from target agent
|
||||
```
|
||||
|
||||
**Key differences from standard rubric grading:**
|
||||
- Use `agent_file` instead of `model`
|
||||
- No `temperature`, `provider`, `max_retries`, or `timeout` fields (agent handles retries internally)
|
||||
- Judge agent must have a `submit_grade(score: float, rationale: str)` tool
|
||||
- Framework validates judge tool on initialization (fail-fast)
|
||||
|
||||
### Judge Agent Requirements
|
||||
|
||||
Your judge agent **must** have a tool with this exact signature:
|
||||
|
||||
```python
|
||||
def submit_grade(score: float, rationale: str) -> dict:
|
||||
"""
|
||||
Submit an evaluation grade for an agent's response.
|
||||
|
||||
Args:
|
||||
score: A float between 0.0 (complete failure) and 1.0 (perfect)
|
||||
rationale: Explanation of why this score was given
|
||||
|
||||
Returns:
|
||||
dict: Confirmation of grade submission
|
||||
"""
|
||||
return {
|
||||
"status": "success",
|
||||
"grade": {"score": score, "rationale": rationale}
|
||||
}
|
||||
```
|
||||
|
||||
**Validation on initialization**: The framework validates that the judge agent has the correct tool with the right parameters **before** running any evaluations. If validation fails, you'll get a clear error:
|
||||
|
||||
```
|
||||
ValueError: Judge tool 'submit_grade' not found in agent file judge.af.
|
||||
Available tools: ['fetch_webpage', 'search_documents']
|
||||
```
|
||||
|
||||
This fail-fast approach catches configuration errors immediately.
|
||||
|
||||
### Checklist: Will Your Judge Agent Work?
|
||||
|
||||
- [ ] **Tool exists**: Agent has a tool with the name specified in `judge_tool_name` (default: `submit_grade`)
|
||||
- [ ] **Tool parameters**: The tool has BOTH `score: float` and `rationale: str` parameters
|
||||
- [ ] **Tool is callable**: The tool is not disabled and is not set to require approval before it can run
|
||||
- [ ] **Agent system prompt**: Agent understands it's an evaluator (optional but recommended)
|
||||
- [ ] **No conflicting tools**: Agent doesn't have other tools that might confuse it into answering questions instead of judging
|
||||
|
||||
### Example Configuration
|
||||
|
||||
**suite.yaml:**
|
||||
```yaml
|
||||
name: fetch-webpage-agent-judge-test
|
||||
description: Test agent responses using a Letta agent as judge
|
||||
dataset: dataset.csv
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af # Agent being tested
|
||||
base_url: http://localhost:8283
|
||||
|
||||
graders:
|
||||
agent_judge:
|
||||
kind: rubric
|
||||
agent_file: judge.af # Judge agent with submit_grade tool
|
||||
prompt_path: rubric.txt # Evaluation criteria
|
||||
judge_tool_name: submit_grade # Tool name (default: submit_grade)
|
||||
extractor: last_assistant # Extract target agent's response
|
||||
|
||||
gate:
|
||||
metric_key: agent_judge
|
||||
op: gte
|
||||
value: 0.75 # Pass if avg score ≥ 0.75
|
||||
```
|
||||
|
||||
**rubric.txt:**
|
||||
```
|
||||
Evaluate the agent's response based on the following criteria:
|
||||
|
||||
1. **Correctness (0.6 weight)**: Does the response contain accurate information from the webpage? Check if the answer matches what was requested in the input.
|
||||
|
||||
2. **Format (0.2 weight)**: Is the response formatted correctly? The input often requests answers in a specific format (e.g., in brackets like {Answer}).
|
||||
|
||||
3. **Completeness (0.2 weight)**: Does the response fully address the question without unnecessary information?
|
||||
|
||||
Scoring Guidelines:
|
||||
- 1.0: Perfect response - correct, properly formatted, and complete
|
||||
- 0.75-0.99: Good response - minor formatting or completeness issues
|
||||
- 0.5-0.74: Adequate response - correct information but format/completeness problems
|
||||
- 0.25-0.49: Poor response - partially correct or missing key information
|
||||
- 0.0-0.24: Failed response - incorrect or no relevant information
|
||||
|
||||
Use the submit_grade tool to submit your evaluation with a score between 0.0 and 1.0. You will need to use your fetch_webpage tool to fetch the desired webpage and confirm the answer is correct.
|
||||
```
|
||||
|
||||
**Judge agent with tools**: The judge agent in this example has a `fetch_webpage` tool, allowing it to independently verify answers by fetching the webpage mentioned in the input.
|
||||
|
||||
### How Agent-as-Judge Works
|
||||
|
||||
1. **Agent Loading**: Loads judge agent from `.af` file and validates tool signature
|
||||
2. **Prompt Formatting**: Formats the rubric with `{input}`, `{submission}`, `{ground_truth}` placeholders
|
||||
3. **Agent Evaluation**: Sends formatted prompt to judge agent as a message
|
||||
4. **Tool Call Parsing**: Extracts the score and rationale from the judge's `submit_grade` tool call
|
||||
5. **Cleanup**: Deletes judge agent after evaluation to free resources
|
||||
6. **Error Handling**: Returns score 0.0 with error message if judge fails to call the tool
|
||||
|
||||
### Agent-as-Judge vs Standard Rubric Grading
|
||||
|
||||
| Feature | Standard Rubric | Agent-as-Judge |
|
||||
|---------|----------------|----------------|
|
||||
| **LLM Access** | Direct API (OPENAI_API_KEY) | Through Letta agent |
|
||||
| **Tools** | No tool usage | Judge can use tools |
|
||||
| **Configuration** | `model`, `temperature`, etc. | `agent_file`, `judge_tool_name` |
|
||||
| **Output Format** | JSON structured output | Tool call with score/rationale |
|
||||
| **Validation** | Runtime JSON parsing | Upfront tool signature validation |
|
||||
| **Use Case** | Teams with API access | Teams using Letta Cloud, judges needing tools |
|
||||
| **Cost** | API call per sample | Depends on judge agent's LLM config |
|
||||
|
||||
### Teacher-Student Pattern
|
||||
|
||||
A powerful use case for agent-as-judge is the **teacher-student pattern**, where an experienced, well-configured agent evaluates a student agent being developed.
|
||||
|
||||
> **Prerequisites**: This pattern assumes you already have a well-defined, production-ready agent that performs well on your task. This agent becomes the "teacher" that evaluates the "student" agent you're developing.
|
||||
|
||||
**Why this works:**
|
||||
- **Domain expertise**: The teacher agent has specialized knowledge and tools
|
||||
- **Consistent evaluation**: The teacher applies the same standards across all evaluations
|
||||
- **Tool-based verification**: The teacher can independently verify answers using its own tools
|
||||
- **Iterative improvement**: Use the teacher to evaluate multiple versions of the student as you improve it
|
||||
|
||||
**Example scenario:**
|
||||
You have a production-ready customer support agent with domain expertise and access to your tools (knowledge base, CRM, documentation search, etc.). You're developing a new, faster version of this agent. Use the experienced agent as the judge to evaluate whether the new agent meets the same quality standards.
|
||||
|
||||
**Configuration:**
|
||||
```yaml
|
||||
name: student-agent-evaluation
|
||||
description: Experienced agent evaluates student agent performance
|
||||
dataset: support_questions.csv
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: student_agent.af # New agent being developed
|
||||
base_url: http://localhost:8283
|
||||
|
||||
graders:
|
||||
teacher_evaluation:
|
||||
kind: rubric
|
||||
agent_file: teacher_agent.af # Experienced production agent with domain tools
|
||||
prompt: |
|
||||
You are an experienced customer support agent evaluating a new agent's response.
|
||||
|
||||
Customer question: {input}
|
||||
Student agent's answer: {submission}
|
||||
|
||||
Use your available tools to verify the answer is correct and complete.
|
||||
Grade based on:
|
||||
1. Factual accuracy (0.5 weight) - Does the answer contain correct information?
|
||||
2. Completeness (0.3 weight) - Does it fully address the question?
|
||||
3. Tone and professionalism (0.2 weight) - Is it appropriately worded?
|
||||
|
||||
Submit a score from 0.0 to 1.0 using the submit_grade tool.
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: teacher_evaluation
|
||||
op: gte
|
||||
value: 0.8 # Student must score 80% or higher
|
||||
```
|
||||
|
||||
**Benefits of this approach:**
|
||||
- **Leverage existing expertise**: Your best agent becomes the standard
|
||||
- **Scalable quality control**: Teacher evaluates hundreds of scenarios automatically
|
||||
- **Continuous validation**: Run teacher evaluations in CI/CD as you iterate on the student
|
||||
- **Targeted feedback**: The teacher's evaluation helps identify where the student needs improvement
|
||||
|
||||
### Complete Example
|
||||
|
||||
See [`examples/letta-agent-rubric-grader/`](https://github.com/letta-ai/letta-evals/tree/main/examples/letta-agent-rubric-grader) for a working example with:
|
||||
- Judge agent with `submit_grade` and `fetch_webpage` tools
|
||||
- Target agent that fetches webpages and answers questions
|
||||
- Rubric that instructs judge to verify answers independently
|
||||
- Complete suite configuration
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Quality Assessment
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: quality_rubric.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
`quality_rubric.txt`:
|
||||
```
|
||||
Evaluate response quality based on:
|
||||
1. Accuracy of information
|
||||
2. Completeness of answer
|
||||
3. Clarity of explanation
|
||||
|
||||
Response: {submission}
|
||||
Ground truth: {ground_truth}
|
||||
|
||||
Score from 0.0 to 1.0.
|
||||
```
|
||||
|
||||
### Creativity Evaluation
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
creativity:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Rate the creativity and originality of this story.
|
||||
|
||||
Story: {submission}
|
||||
|
||||
1.0 = Highly creative and original
|
||||
0.5 = Some creative elements
|
||||
0.0 = Generic or cliché
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Multi-Criteria Evaluation
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
comprehensive:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Evaluate the response on multiple criteria:
|
||||
|
||||
1. Technical Accuracy (40%)
|
||||
2. Clarity of Explanation (30%)
|
||||
3. Completeness (20%)
|
||||
4. Conciseness (10%)
|
||||
|
||||
Input: {input}
|
||||
Response: {submission}
|
||||
Expected: {ground_truth}
|
||||
|
||||
Provide a weighted score from 0.0 to 1.0.
|
||||
model: gpt-4o
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Code Quality
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
code_quality:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Evaluate this code for:
|
||||
- Correctness
|
||||
- Readability
|
||||
- Efficiency
|
||||
- Best practices
|
||||
|
||||
Code: {submission}
|
||||
|
||||
Score from 0.0 to 1.0.
|
||||
model: gpt-4o
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
### Tone and Style
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
professionalism:
|
||||
kind: rubric
|
||||
prompt: |
|
||||
Rate the professionalism and appropriate tone of the response.
|
||||
|
||||
Response: {submission}
|
||||
|
||||
1.0 = Highly professional
|
||||
0.5 = Acceptable
|
||||
0.0 = Unprofessional or inappropriate
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Clear Scoring Criteria
|
||||
|
||||
Provide explicit score ranges and what they mean:
|
||||
|
||||
```
|
||||
Score:
|
||||
- 1.0: Perfect response with no issues
|
||||
- 0.8-0.99: Minor improvements possible
|
||||
- 0.6-0.79: Some gaps or errors
|
||||
- 0.4-0.59: Significant problems
|
||||
- 0.2-0.39: Major issues
|
||||
- 0.0-0.19: Complete failure
|
||||
```
|
||||
|
||||
### 2. Use Ground Truth When Available
|
||||
|
||||
If you have expected answers, include them:
|
||||
|
||||
```
|
||||
Expected: {ground_truth}
|
||||
Actual: {submission}
|
||||
|
||||
Evaluate how well the actual response matches the expected content.
|
||||
```
|
||||
|
||||
### 3. Be Specific About Criteria
|
||||
|
||||
Vague: "Evaluate the quality"
|
||||
Better: "Evaluate accuracy, completeness, and clarity"
|
||||
|
||||
### 4. Use Examples in Rubric
|
||||
|
||||
```
|
||||
Example of 1.0: "A complete, accurate answer with clear explanation"
|
||||
Example of 0.5: "Partially correct but missing key details"
|
||||
Example of 0.0: "Incorrect or irrelevant response"
|
||||
```
|
||||
|
||||
### 5. Calibrate with Test Cases
|
||||
|
||||
Run on a small set first to ensure the rubric produces expected scores.
|
||||
|
||||
### 6. Consider Model Choice
|
||||
|
||||
- **gpt-4o-mini**: Fast and cost-effective for simple criteria
|
||||
- **gpt-4o**: More accurate for complex evaluation
|
||||
- **claude-3-5-sonnet**: Alternative perspective (via OpenAI-compatible endpoint)
|
||||
|
||||
## Environment Setup
|
||||
|
||||
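Standard rubric graders (those configured with `model`) require an OpenAI API key; agent-as-judge graders access the LLM through the Letta agent instead: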
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY=your-api-key
|
||||
```
|
||||
|
||||
For custom endpoints:
|
||||
|
||||
```bash
|
||||
export OPENAI_BASE_URL=https://your-endpoint.com/v1
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
If grading fails:
|
||||
- Score is set to 0.0
|
||||
- Rationale includes error message
|
||||
- Metadata includes error details
|
||||
- Evaluation continues (doesn't stop the suite)
|
||||
|
||||
Common errors:
|
||||
- API timeout → Check `timeout` setting
|
||||
- Invalid API key → Verify `OPENAI_API_KEY`
|
||||
- Rate limit → Reduce concurrency or add retries
|
||||
|
||||
## Cost Considerations
|
||||
|
||||
Rubric graders make API calls for each sample:
|
||||
|
||||
- **gpt-4o-mini**: ~$0.00015 per evaluation (cheap)
|
||||
- **gpt-4o**: ~$0.002 per evaluation (more expensive)
|
||||
|
||||
For 1000 samples:
|
||||
- gpt-4o-mini: ~$0.15
|
||||
- gpt-4o: ~$2.00
|
||||
|
||||
Estimate costs before running large evaluations.
|
||||
|
||||
## Performance
|
||||
|
||||
Rubric graders are slower than tool graders:
|
||||
- Tool grader: <1ms per sample
|
||||
- Rubric grader: 500-2000ms per sample (network + LLM)
|
||||
|
||||
Use concurrency to speed up:
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --max-concurrent 10
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
Rubric graders:
|
||||
- **Cost**: API calls cost money
|
||||
- **Speed**: Slower than tool graders
|
||||
- **Consistency**: Can vary slightly between runs (use temperature 0.0 for best consistency)
|
||||
- **API dependency**: Requires network and API availability
|
||||
|
||||
For deterministic, fast evaluation, use [Tool Graders](./tool-graders.md).
|
||||
|
||||
## Combining Tool and Rubric Graders
|
||||
|
||||
Use both in one suite:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
format_check:
|
||||
kind: tool
|
||||
function: regex_match
|
||||
extractor: last_assistant
|
||||
|
||||
quality:
|
||||
kind: rubric
|
||||
prompt_path: quality.txt
|
||||
model: gpt-4o-mini
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
metric_key: quality # Gate on quality, but still check format
|
||||
op: gte
|
||||
value: 0.7
|
||||
```
|
||||
|
||||
This combines fast deterministic checks with nuanced quality evaluation.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Built-in Extractors](../extractors/builtin.md) - Understanding what to extract from trajectories
|
||||
- [Tool Graders](./tool-graders.md) - Deterministic evaluation for objective criteria
|
||||
- [Multi-Metric Evaluation](./multi-metric.md) - Combining multiple graders
|
||||
- [Custom Graders](../advanced/custom-graders.md) - Writing custom evaluation logic
|
||||
@@ -1,332 +0,0 @@
|
||||
# Tool Graders
|
||||
|
||||
Tool graders use Python functions to programmatically evaluate submissions. They're ideal for deterministic, rule-based evaluation.
|
||||
|
||||
## Overview
|
||||
|
||||
Tool graders:
|
||||
- Execute Python functions that take `(sample, submission)` and return a `GradeResult`
|
||||
- Are fast and deterministic
|
||||
- Don't require external API calls
|
||||
- Can implement any custom logic
|
||||
|
||||
## Configuration
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
my_metric:
|
||||
kind: tool
|
||||
function: exact_match # Function name
|
||||
extractor: last_assistant # What to extract from trajectory
|
||||
```
|
||||
|
||||
The `extractor` determines what part of the agent's response to evaluate. See [Built-in Extractors](../extractors/builtin.md) for all available options.
|
||||
|
||||
## Built-in Functions
|
||||
|
||||
### exact_match
|
||||
|
||||
Exact string comparison (case-sensitive, whitespace-trimmed).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Requires**: `ground_truth` in dataset
|
||||
|
||||
**Returns**:
|
||||
- Score: 1.0 if exact match, 0.0 otherwise
|
||||
- Rationale: "Exact match: true" or "Exact match: false"
|
||||
|
||||
**Example**:
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4"}
|
||||
```
|
||||
|
||||
Submission "4" → Score 1.0
|
||||
Submission "four" → Score 0.0
|
||||
|
||||
### contains
|
||||
|
||||
Case-insensitive substring check.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
keyword_check:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Requires**: `ground_truth` in dataset
|
||||
|
||||
**Returns**:
|
||||
- Score: 1.0 if ground_truth found in submission (case-insensitive), 0.0 otherwise
|
||||
- Rationale: "Contains ground_truth: true" or "Contains ground_truth: false"
|
||||
|
||||
**Example**:
|
||||
```jsonl
|
||||
{"input": "What is the capital of France?", "ground_truth": "Paris"}
|
||||
```
|
||||
|
||||
Submission "The capital is Paris" → Score 1.0
|
||||
Submission "The capital is paris" → Score 1.0 (case-insensitive)
|
||||
Submission "The capital is Lyon" → Score 0.0
|
||||
|
||||
### regex_match
|
||||
|
||||
Pattern matching using regex.
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
pattern_check:
|
||||
kind: tool
|
||||
function: regex_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Requires**: `ground_truth` in dataset (as regex pattern)
|
||||
|
||||
**Returns**:
|
||||
- Score: 1.0 if pattern matches, 0.0 otherwise
|
||||
- Rationale: "Regex match: true" or "Regex match: false"
|
||||
- If pattern is invalid: Score 0.0 with error message
|
||||
|
||||
**Example**:
|
||||
```jsonl
|
||||
{"input": "Generate a UUID", "ground_truth": "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"}
|
||||
{"input": "Extract the number", "ground_truth": "\\d+"}
|
||||
```
|
||||
|
||||
Submission "550e8400-e29b-41d4-a716-446655440000" → Score 1.0
|
||||
Submission "not-a-uuid" → Score 0.0
|
||||
|
||||
### ascii_printable_only
|
||||
|
||||
Validates that all characters are printable ASCII (code points 32-126).
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
ascii_check:
|
||||
kind: tool
|
||||
function: ascii_printable_only
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
**Requires**: No ground_truth needed
|
||||
|
||||
**Returns**:
|
||||
- Score: 1.0 if all characters are printable ASCII, 0.0 if any non-printable found
|
||||
- Rationale: Details about non-printable characters if found
|
||||
|
||||
**Notes**:
|
||||
- Newlines (`\n`) and carriage returns (`\r`) are ignored (allowed)
|
||||
- Useful for ASCII art, formatted output, or ensuring clean text
|
||||
|
||||
**Example**:
|
||||
|
||||
Submission "Hello, World!\n" → Score 1.0
|
||||
Submission "Hello 🌍" → Score 0.0 (emoji not in ASCII range)
|
||||
|
||||
## Custom Tool Graders
|
||||
|
||||
You can write custom grading functions:
|
||||
|
||||
```python
|
||||
# custom_graders.py
|
||||
from letta_evals.decorators import grader
|
||||
from letta_evals.models import GradeResult, Sample
|
||||
|
||||
@grader
|
||||
def my_custom_grader(sample: Sample, submission: str) -> GradeResult:
|
||||
"""Custom grading logic."""
|
||||
# Your evaluation logic here
|
||||
score = 1.0 if some_condition(submission) else 0.0
|
||||
return GradeResult(
|
||||
score=score,
|
||||
rationale=f"Explanation of the score",
|
||||
metadata={"extra": "info"}
|
||||
)
|
||||
```
|
||||
|
||||
Then reference it in your suite:
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
custom:
|
||||
kind: tool
|
||||
function: my_custom_grader
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
See [Custom Graders](../advanced/custom-graders.md) for details.
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Exact Answer Validation
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
correct_answer:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Best for: Math problems, single-word answers, structured formats
|
||||
|
||||
### Keyword Presence
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
mentions_topic:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Best for: Checking if specific concepts are mentioned
|
||||
|
||||
### Format Validation
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
valid_email:
|
||||
kind: tool
|
||||
function: regex_match
|
||||
extractor: last_assistant
|
||||
```
|
||||
|
||||
Dataset:
|
||||
```jsonl
|
||||
{"input": "Extract the email", "ground_truth": "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"}
|
||||
```
|
||||
|
||||
Best for: Emails, UUIDs, phone numbers, structured data
|
||||
|
||||
### Tool Call Validation
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
used_search:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: tool_arguments
|
||||
extractor_config:
|
||||
tool_name: search
|
||||
```
|
||||
|
||||
Dataset:
|
||||
```jsonl
|
||||
{"input": "Find information about pandas", "ground_truth": "pandas"}
|
||||
```
|
||||
|
||||
Checks if the agent called the search tool with "pandas" in arguments.
|
||||
|
||||
### JSON Structure Validation
|
||||
|
||||
Custom grader:
|
||||
|
||||
```python
|
||||
import json
|
||||
from letta_evals.decorators import grader
|
||||
from letta_evals.models import GradeResult, Sample
|
||||
|
||||
@grader
|
||||
def valid_json_with_field(sample: Sample, submission: str) -> GradeResult:
|
||||
try:
|
||||
data = json.loads(submission)
|
||||
required_field = sample.ground_truth
|
||||
if required_field in data:
|
||||
return GradeResult(score=1.0, rationale=f"Valid JSON with '{required_field}' field")
|
||||
else:
|
||||
return GradeResult(score=0.0, rationale=f"Missing required field: {required_field}")
|
||||
except json.JSONDecodeError as e:
|
||||
return GradeResult(score=0.0, rationale=f"Invalid JSON: {e}")
|
||||
```
|
||||
|
||||
## Combining with Extractors
|
||||
|
||||
Tool graders work with any extractor:
|
||||
|
||||
### Grade Tool Arguments
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
correct_tool:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: tool_arguments
|
||||
extractor_config:
|
||||
tool_name: calculator
|
||||
```
|
||||
|
||||
Checks if calculator was called with specific arguments.
|
||||
|
||||
### Grade Memory Updates
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
memory_correct:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: memory_block
|
||||
extractor_config:
|
||||
block_label: human
|
||||
```
|
||||
|
||||
Checks if agent's memory block contains expected content.
|
||||
|
||||
### Grade Pattern Extraction
|
||||
|
||||
```yaml
|
||||
graders:
|
||||
extracted_correctly:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
extractor: pattern
|
||||
extractor_config:
|
||||
pattern: 'ANSWER: (.*)'
|
||||
group: 1
|
||||
```
|
||||
|
||||
Extracts content after "ANSWER:" and checks if it matches ground truth.
|
||||
|
||||
## Performance
|
||||
|
||||
Tool graders are:
|
||||
- **Fast**: No API calls, pure Python execution
|
||||
- **Deterministic**: Same input always produces same result
|
||||
- **Cost-effective**: No LLM API costs
|
||||
- **Reliable**: No network dependencies
|
||||
|
||||
Use tool graders when possible for faster, cheaper evaluations.
|
||||
|
||||
## Limitations
|
||||
|
||||
Tool graders:
|
||||
- Can't evaluate subjective quality
|
||||
- Limited to predefined logic
|
||||
- Don't understand semantic similarity
|
||||
- Can't handle complex, nuanced criteria
|
||||
|
||||
For these cases, use [Rubric Graders](./rubric-graders.md).
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use exact_match for precise answers**: Math, single words, structured formats
|
||||
2. **Use contains for flexible matching**: When exact format varies but key content is present
|
||||
3. **Use regex for format validation**: Emails, phone numbers, UUIDs
|
||||
4. **Write custom graders for complex logic**: Multi-step validation, JSON parsing
|
||||
5. **Combine multiple graders**: Evaluate different aspects (format + content + tool usage)
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Built-in Extractors](../extractors/builtin.md) - Understanding what to extract from trajectories
|
||||
- [Rubric Graders](./rubric-graders.md) - LLM-based evaluation for subjective quality
|
||||
- [Custom Graders](../advanced/custom-graders.md) - Writing your own grading functions
|
||||
- [Multi-Metric Evaluation](./multi-metric.md) - Using multiple graders simultaneously
|
||||
@@ -1,468 +0,0 @@
|
||||
# Understanding Results
|
||||
|
||||
This guide explains how to interpret evaluation results.
|
||||
|
||||
## Result Structure
|
||||
|
||||
An evaluation produces three types of output:
|
||||
|
||||
1. **Console output**: Real-time progress and summary
|
||||
2. **Summary JSON**: Aggregate metrics and configuration
|
||||
3. **Results JSONL**: Per-sample detailed results
|
||||
|
||||
## Console Output
|
||||
|
||||
### Progress Display
|
||||
|
||||
```
|
||||
Running evaluation: my-eval-suite
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
|
||||
|
||||
Results:
|
||||
Total samples: 3
|
||||
Attempted: 3
|
||||
Avg score: 0.83 (attempted: 0.83)
|
||||
Passed: 2 (66.7%)
|
||||
|
||||
Gate (quality >= 0.75): PASSED
|
||||
```
|
||||
|
||||
### Quiet Mode
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --quiet
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
✓ PASSED
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
✗ FAILED
|
||||
```
|
||||
|
||||
## JSON Output
|
||||
|
||||
### Saving Results
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output results/
|
||||
```
|
||||
|
||||
Creates three files:
|
||||
|
||||
#### header.json
|
||||
|
||||
Evaluation metadata:
|
||||
|
||||
```json
|
||||
{
|
||||
"suite_name": "my-eval-suite",
|
||||
"timestamp": "2025-01-15T10:30:00Z",
|
||||
"version": "0.3.0"
|
||||
}
|
||||
```
|
||||
|
||||
#### summary.json
|
||||
|
||||
Complete evaluation summary:
|
||||
|
||||
```json
|
||||
{
|
||||
"suite": "my-eval-suite",
|
||||
"config": {
|
||||
"target": {...},
|
||||
"graders": {...},
|
||||
"gate": {...}
|
||||
},
|
||||
"metrics": {
|
||||
"total": 10,
|
||||
"total_attempted": 10,
|
||||
"avg_score_attempted": 0.85,
|
||||
"avg_score_total": 0.85,
|
||||
"passed_attempts": 8,
|
||||
"failed_attempts": 2,
|
||||
"by_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": 0.90,
|
||||
"pass_rate": 90.0,
|
||||
"passed_attempts": 9,
|
||||
"failed_attempts": 1
|
||||
},
|
||||
"quality": {
|
||||
"avg_score_attempted": 0.80,
|
||||
"pass_rate": 70.0,
|
||||
"passed_attempts": 7,
|
||||
"failed_attempts": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
"gates_passed": true
|
||||
}
|
||||
```
|
||||
|
||||
#### results.jsonl
|
||||
|
||||
One JSON object per line, each representing one sample:
|
||||
|
||||
```jsonl
|
||||
{"sample": {"id": 0, "input": "What is 2+2?", "ground_truth": "4"}, "submission": "4", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-123", "model_name": "default"}
|
||||
{"sample": {"id": 1, "input": "What is 3+3?", "ground_truth": "6"}, "submission": "6", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-124", "model_name": "default"}
|
||||
```
|
||||
|
||||
## Metrics Explained
|
||||
|
||||
### total
|
||||
|
||||
Total number of samples in the evaluation (including errors).
|
||||
|
||||
### total_attempted
|
||||
|
||||
Number of samples that completed without errors.
|
||||
|
||||
If a sample fails during agent execution or grading, it's counted in `total` but not `total_attempted`.
|
||||
|
||||
### avg_score_attempted
|
||||
|
||||
Average score across samples that completed successfully.
|
||||
|
||||
Formula: `sum(scores) / total_attempted`
|
||||
|
||||
Range: 0.0 to 1.0
|
||||
|
||||
### avg_score_total
|
||||
|
||||
Average score across all samples, treating errors as 0.0.
|
||||
|
||||
Formula: `sum(scores) / total`
|
||||
|
||||
Range: 0.0 to 1.0
|
||||
|
||||
### passed_attempts / failed_attempts
|
||||
|
||||
Number of samples that passed/failed the gate's per-sample criteria.
|
||||
|
||||
By default:
|
||||
- If gate metric is `accuracy`: sample passes if score >= 1.0
|
||||
- If gate metric is `avg_score`: sample passes if score >= gate value
|
||||
|
||||
Can be customized with `pass_op` and `pass_value` in gate config.
|
||||
|
||||
### by_metric
|
||||
|
||||
For multi-metric evaluation, shows aggregate stats for each metric:
|
||||
|
||||
```json
|
||||
"by_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": 0.90,
|
||||
"avg_score_total": 0.85,
|
||||
"pass_rate": 90.0,
|
||||
"passed_attempts": 9,
|
||||
"failed_attempts": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Sample Results
|
||||
|
||||
Each sample result includes:
|
||||
|
||||
### sample
|
||||
The original dataset sample:
|
||||
```json
|
||||
"sample": {
|
||||
"id": 0,
|
||||
"input": "What is 2+2?",
|
||||
"ground_truth": "4",
|
||||
"metadata": {...}
|
||||
}
|
||||
```
|
||||
|
||||
### submission
|
||||
The extracted text that was graded:
|
||||
```json
|
||||
"submission": "The answer is 4"
|
||||
```
|
||||
|
||||
### grade
|
||||
The grading result:
|
||||
```json
|
||||
"grade": {
|
||||
"score": 1.0,
|
||||
"rationale": "Contains ground_truth: true",
|
||||
"metadata": {"model": "gpt-4o-mini", "usage": {...}}
|
||||
}
|
||||
```
|
||||
|
||||
### grades (multi-metric)
|
||||
For multi-metric evaluation:
|
||||
```json
|
||||
"grades": {
|
||||
"accuracy": {"score": 1.0, "rationale": "Exact match"},
|
||||
"quality": {"score": 0.85, "rationale": "Good but verbose"}
|
||||
}
|
||||
```
|
||||
|
||||
### trajectory
|
||||
The complete conversation history:
|
||||
```json
|
||||
"trajectory": [
|
||||
[
|
||||
{"role": "user", "content": "What is 2+2?"},
|
||||
{"role": "assistant", "content": "The answer is 4"}
|
||||
]
|
||||
]
|
||||
```
|
||||
|
||||
### agent_id
|
||||
The ID of the agent that generated this response:
|
||||
```json
|
||||
"agent_id": "agent-abc-123"
|
||||
```
|
||||
|
||||
### model_name
|
||||
The model configuration used:
|
||||
```json
|
||||
"model_name": "gpt-4o-mini"
|
||||
```
|
||||
|
||||
### agent_usage
|
||||
Token usage statistics (if available):
|
||||
```json
|
||||
"agent_usage": [
|
||||
{"completion_tokens": 10, "prompt_tokens": 50, "total_tokens": 60}
|
||||
]
|
||||
```
|
||||
|
||||
## Interpreting Scores
|
||||
|
||||
### Score Ranges
|
||||
|
||||
- **1.0**: Perfect - fully meets criteria
|
||||
- **0.8-0.99**: Very good - minor issues
|
||||
- **0.6-0.79**: Good - notable improvements possible
|
||||
- **0.4-0.59**: Acceptable - significant issues
|
||||
- **0.2-0.39**: Poor - major problems
|
||||
- **0.0-0.19**: Failed - did not meet criteria
|
||||
|
||||
### Binary vs Continuous
|
||||
|
||||
**Tool graders** typically return binary scores:
|
||||
- 1.0: Passed
|
||||
- 0.0: Failed
|
||||
|
||||
**Rubric graders** return continuous scores:
|
||||
- Any value from 0.0 to 1.0
|
||||
- Allows for partial credit
|
||||
|
||||
## Multi-Model Results
|
||||
|
||||
When testing multiple models:
|
||||
|
||||
```json
|
||||
"metrics": {
|
||||
"per_model": [
|
||||
{
|
||||
"model_name": "gpt-4o-mini",
|
||||
"avg_score_attempted": 0.85,
|
||||
"passed_samples": 8,
|
||||
"failed_samples": 2
|
||||
},
|
||||
{
|
||||
"model_name": "claude-3-5-sonnet",
|
||||
"avg_score_attempted": 0.90,
|
||||
"passed_samples": 9,
|
||||
"failed_samples": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Console output:
|
||||
```
|
||||
Results by model:
|
||||
gpt-4o-mini - Avg: 0.85, Pass: 80.0%
|
||||
claude-3-5-sonnet - Avg: 0.90, Pass: 90.0%
|
||||
```
|
||||
|
||||
## Multiple Runs Statistics
|
||||
|
||||
Run evaluations multiple times to measure consistency and get aggregate statistics.
|
||||
|
||||
### Configuration
|
||||
|
||||
Specify in YAML:
|
||||
```yaml
|
||||
name: my-eval-suite
|
||||
dataset: dataset.jsonl
|
||||
num_runs: 5 # Run 5 times
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
gate:
|
||||
metric_key: accuracy
|
||||
op: gte
|
||||
value: 0.8
|
||||
```
|
||||
|
||||
Or via CLI:
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 10 --output results/
|
||||
```
|
||||
|
||||
### Output Structure
|
||||
|
||||
```
|
||||
results/
|
||||
├── run_1/
|
||||
│ ├── header.json
|
||||
│ ├── results.jsonl
|
||||
│ └── summary.json
|
||||
├── run_2/
|
||||
│ ├── header.json
|
||||
│ ├── results.jsonl
|
||||
│ └── summary.json
|
||||
├── ...
|
||||
└── aggregate_stats.json # Statistics across all runs
|
||||
```
|
||||
|
||||
### Aggregate Statistics File
|
||||
|
||||
The `aggregate_stats.json` includes statistics across all runs:
|
||||
|
||||
```json
|
||||
{
|
||||
"num_runs": 10,
|
||||
"runs_passed": 8,
|
||||
"runs_failed": 2,
|
||||
"pass_rate": 80.0,
|
||||
"avg_score_attempted": {
|
||||
"mean": 0.847,
|
||||
"std": 0.042,
|
||||
"min": 0.78,
|
||||
"max": 0.91
|
||||
},
|
||||
"avg_score_total": {
|
||||
"mean": 0.847,
|
||||
"std": 0.042,
|
||||
"min": 0.78,
|
||||
"max": 0.91
|
||||
},
|
||||
"per_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": {
|
||||
"mean": 0.89,
|
||||
"std": 0.035,
|
||||
"min": 0.82,
|
||||
"max": 0.95
|
||||
},
|
||||
"pass_rate": {
|
||||
"mean": 89.0,
|
||||
"std": 4.2,
|
||||
"min": 80.0,
|
||||
"max": 95.0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Use Cases
|
||||
|
||||
**Measure consistency of non-deterministic agents:**
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 20 --output results/
|
||||
# Check the "std" values in aggregate_stats.json
|
||||
# Low stddev = consistent, high stddev = variable
|
||||
```
|
||||
|
||||
**Get confidence intervals:**
|
||||
```python
|
||||
import json
|
||||
import math
|
||||
|
||||
with open("results/aggregate_stats.json") as f:
|
||||
stats = json.load(f)
|
||||
|
||||
mean = stats["avg_score_attempted"]["mean"]
|
||||
std = stats["avg_score_attempted"]["std"]
|
||||
n = stats["num_runs"]
|
||||
|
||||
# 95% confidence interval (assuming normal distribution)
|
||||
margin = 1.96 * (std / math.sqrt(n))
|
||||
print(f"Score: {mean:.3f} ± {margin:.3f}")
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
If a sample encounters an error:
|
||||
|
||||
```json
|
||||
{
|
||||
"sample": {...},
|
||||
"submission": "",
|
||||
"grade": {
|
||||
"score": 0.0,
|
||||
"rationale": "Error during grading: Connection timeout",
|
||||
"metadata": {"error": "timeout", "error_type": "ConnectionError"}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Errors:
|
||||
- Count toward `total` but not `total_attempted`
|
||||
- Get score of 0.0
|
||||
- Include error details in rationale and metadata
|
||||
|
||||
## Analyzing Results
|
||||
|
||||
### Find Low Scores
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
with open("results/results.jsonl") as f:
|
||||
results = [json.loads(line) for line in f]
|
||||
|
||||
low_scores = [r for r in results if r["grade"]["score"] < 0.5]
|
||||
print(f"Found {len(low_scores)} samples with score < 0.5")
|
||||
|
||||
for result in low_scores:
|
||||
print(f"Sample {result['sample']['id']}: {result['grade']['rationale']}")
|
||||
```
|
||||
|
||||
### Compare Metrics
|
||||
|
||||
```python
|
||||
# Load summary
|
||||
with open("results/summary.json") as f:
|
||||
summary = json.load(f)
|
||||
|
||||
metrics = summary["metrics"]["by_metric"]
|
||||
for name, stats in metrics.items():
|
||||
print(f"{name}: {stats['avg_score_attempted']:.2f} avg, {stats['pass_rate']:.1f}% pass")
|
||||
```
|
||||
|
||||
### Extract Failures
|
||||
|
||||
```python
|
||||
# Find samples that failed gate criteria
|
||||
failures = [
|
||||
r for r in results
|
||||
if not gate_passed(r["grade"]["score"]) # Your gate logic
|
||||
]
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Metrics Reference](./metrics.md)
|
||||
- [Output Formats](./output-formats.md)
|
||||
- [Best Practices](../best-practices/writing-tests.md)
|
||||
@@ -1,484 +0,0 @@
|
||||
# Understanding Results
|
||||
|
||||
This guide explains how to interpret evaluation results.
|
||||
|
||||
## Result Structure
|
||||
|
||||
An evaluation produces three types of output:
|
||||
|
||||
1. **Console output**: Real-time progress and summary
|
||||
2. **Summary JSON**: Aggregate metrics and configuration
|
||||
3. **Results JSONL**: Per-sample detailed results
|
||||
|
||||
## Console Output
|
||||
|
||||
### Progress Display
|
||||
|
||||
```
|
||||
Running evaluation: my-eval-suite
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 100%
|
||||
|
||||
Results:
|
||||
Total samples: 3
|
||||
Attempted: 3
|
||||
Avg score: 0.83 (attempted: 0.83)
|
||||
Passed: 2 (66.7%)
|
||||
|
||||
Gate (quality >= 0.75): PASSED
|
||||
```
|
||||
|
||||
### Quiet Mode
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --quiet
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
✓ PASSED
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
✗ FAILED
|
||||
```
|
||||
|
||||
## JSON Output
|
||||
|
||||
### Saving Results
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output results/
|
||||
```
|
||||
|
||||
Creates three files:
|
||||
|
||||
#### header.json
|
||||
|
||||
Evaluation metadata:
|
||||
|
||||
```json
|
||||
{
|
||||
"suite_name": "my-eval-suite",
|
||||
"timestamp": "2025-01-15T10:30:00Z",
|
||||
"version": "0.3.0"
|
||||
}
|
||||
```
|
||||
|
||||
#### summary.json
|
||||
|
||||
Complete evaluation summary:
|
||||
|
||||
```json
|
||||
{
|
||||
"suite": "my-eval-suite",
|
||||
"config": {
|
||||
"target": {...},
|
||||
"graders": {...},
|
||||
"gate": {...}
|
||||
},
|
||||
"metrics": {
|
||||
"total": 10,
|
||||
"total_attempted": 10,
|
||||
"avg_score_attempted": 0.85,
|
||||
"avg_score_total": 0.85,
|
||||
"passed_attempts": 8,
|
||||
"failed_attempts": 2,
|
||||
"by_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": 0.90,
|
||||
"pass_rate": 90.0,
|
||||
"passed_attempts": 9,
|
||||
"failed_attempts": 1
|
||||
},
|
||||
"quality": {
|
||||
"avg_score_attempted": 0.80,
|
||||
"pass_rate": 70.0,
|
||||
"passed_attempts": 7,
|
||||
"failed_attempts": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
"gates_passed": true
|
||||
}
|
||||
```
|
||||
|
||||
#### results.jsonl
|
||||
|
||||
One JSON object per line, each representing one sample:
|
||||
|
||||
```jsonl
|
||||
{"sample": {"id": 0, "input": "What is 2+2?", "ground_truth": "4"}, "submission": "4", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-123", "model_name": "default"}
|
||||
{"sample": {"id": 1, "input": "What is 3+3?", "ground_truth": "6"}, "submission": "6", "grade": {"score": 1.0, "rationale": "Exact match: true"}, "trajectory": [...], "agent_id": "agent-124", "model_name": "default"}
|
||||
```
|
||||
|
||||
## Metrics Explained
|
||||
|
||||
### total
|
||||
|
||||
Total number of samples in the evaluation (including errors).
|
||||
|
||||
### total_attempted
|
||||
|
||||
Number of samples that completed without errors.
|
||||
|
||||
If a sample fails during agent execution or grading, it's counted in `total` but not `total_attempted`.
|
||||
|
||||
### avg_score_attempted
|
||||
|
||||
Average score across samples that completed successfully.
|
||||
|
||||
Formula: `sum(scores) / total_attempted`
|
||||
|
||||
Range: 0.0 to 1.0
|
||||
|
||||
### avg_score_total
|
||||
|
||||
Average score across all samples, treating errors as 0.0.
|
||||
|
||||
Formula: `sum(scores) / total`
|
||||
|
||||
Range: 0.0 to 1.0
|
||||
|
||||
### passed_attempts / failed_attempts
|
||||
|
||||
Number of samples that passed/failed the gate's per-sample criteria.
|
||||
|
||||
By default:
|
||||
- If gate metric is `accuracy`: sample passes if score `>= 1.0`
|
||||
- If gate metric is `avg_score`: sample passes if score `>=` gate value
|
||||
|
||||
Can be customized with `pass_op` and `pass_value` in gate config.
|
||||
|
||||
### by_metric
|
||||
|
||||
For multi-metric evaluation, shows aggregate stats for each metric:
|
||||
|
||||
```json
|
||||
"by_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": 0.90,
|
||||
"avg_score_total": 0.85,
|
||||
"pass_rate": 90.0,
|
||||
"passed_attempts": 9,
|
||||
"failed_attempts": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Sample Results
|
||||
|
||||
Each sample result includes:
|
||||
|
||||
### sample
|
||||
The original dataset sample:
|
||||
```json
|
||||
"sample": {
|
||||
"id": 0,
|
||||
"input": "What is 2+2?",
|
||||
"ground_truth": "4",
|
||||
"metadata": {...}
|
||||
}
|
||||
```
|
||||
|
||||
### submission
|
||||
The extracted text that was graded:
|
||||
```json
|
||||
"submission": "The answer is 4"
|
||||
```
|
||||
|
||||
### grade
|
||||
The grading result:
|
||||
```json
|
||||
"grade": {
|
||||
"score": 1.0,
|
||||
"rationale": "Contains ground_truth: true",
|
||||
"metadata": {"model": "gpt-4o-mini", "usage": {...}}
|
||||
}
|
||||
```
|
||||
|
||||
### grades (multi-metric)
|
||||
For multi-metric evaluation:
|
||||
```json
|
||||
"grades": {
|
||||
"accuracy": {"score": 1.0, "rationale": "Exact match"},
|
||||
"quality": {"score": 0.85, "rationale": "Good but verbose"}
|
||||
}
|
||||
```
|
||||
|
||||
### trajectory
|
||||
The complete conversation history:
|
||||
```json
|
||||
"trajectory": [
|
||||
[
|
||||
{"role": "user", "content": "What is 2+2?"},
|
||||
{"role": "assistant", "content": "The answer is 4"}
|
||||
]
|
||||
]
|
||||
```
|
||||
|
||||
### agent_id
|
||||
The ID of the agent that generated this response:
|
||||
```json
|
||||
"agent_id": "agent-abc-123"
|
||||
```
|
||||
|
||||
### model_name
|
||||
The model configuration used:
|
||||
```json
|
||||
"model_name": "gpt-4o-mini"
|
||||
```
|
||||
|
||||
### agent_usage
|
||||
Token usage statistics (if available):
|
||||
```json
|
||||
"agent_usage": [
|
||||
{"completion_tokens": 10, "prompt_tokens": 50, "total_tokens": 60}
|
||||
]
|
||||
```
|
||||
|
||||
## Interpreting Scores
|
||||
|
||||
### Score Ranges
|
||||
|
||||
- **1.0**: Perfect - fully meets criteria
|
||||
- **0.8-0.99**: Very good - minor issues
|
||||
- **0.6-0.79**: Good - notable improvements possible
|
||||
- **0.4-0.59**: Acceptable - significant issues
|
||||
- **0.2-0.39**: Poor - major problems
|
||||
- **0.0-0.19**: Failed - did not meet criteria
|
||||
|
||||
### Binary vs Continuous
|
||||
|
||||
**Tool graders** typically return binary scores:
|
||||
- 1.0: Passed
|
||||
- 0.0: Failed
|
||||
|
||||
**Rubric graders** return continuous scores:
|
||||
- Any value from 0.0 to 1.0
|
||||
- Allows for partial credit
|
||||
|
||||
## Multi-Model Results
|
||||
|
||||
When testing multiple models:
|
||||
|
||||
```json
|
||||
"metrics": {
|
||||
"per_model": [
|
||||
{
|
||||
"model_name": "gpt-4o-mini",
|
||||
"avg_score_attempted": 0.85,
|
||||
"passed_samples": 8,
|
||||
"failed_samples": 2
|
||||
},
|
||||
{
|
||||
"model_name": "claude-3-5-sonnet",
|
||||
"avg_score_attempted": 0.90,
|
||||
"passed_samples": 9,
|
||||
"failed_samples": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Console output:
|
||||
```
|
||||
Results by model:
|
||||
gpt-4o-mini - Avg: 0.85, Pass: 80.0%
|
||||
claude-3-5-sonnet - Avg: 0.90, Pass: 90.0%
|
||||
```
|
||||
|
||||
## Multiple Runs Statistics
|
||||
|
||||
Run evaluations multiple times to measure consistency and get aggregate statistics.
|
||||
|
||||
### Configuration
|
||||
|
||||
Specify in YAML:
|
||||
```yaml
|
||||
name: my-eval-suite
|
||||
dataset: dataset.jsonl
|
||||
num_runs: 5 # Run 5 times
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: my_agent.af
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool
|
||||
function: exact_match
|
||||
gate:
|
||||
metric_key: accuracy
|
||||
op: gte
|
||||
value: 0.8
|
||||
```
|
||||
|
||||
Or via CLI:
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 10 --output results/
|
||||
```
|
||||
|
||||
### Output Structure
|
||||
|
||||
```
|
||||
results/
|
||||
├── run_1/
|
||||
│ ├── header.json
|
||||
│ ├── results.jsonl
|
||||
│ └── summary.json
|
||||
├── run_2/
|
||||
│ ├── header.json
|
||||
│ ├── results.jsonl
|
||||
│ └── summary.json
|
||||
├── ...
|
||||
└── aggregate_stats.json # Statistics across all runs
|
||||
```
|
||||
|
||||
### Aggregate Statistics File
|
||||
|
||||
The `aggregate_stats.json` includes statistics across all runs:
|
||||
|
||||
```json
|
||||
{
|
||||
"num_runs": 10,
|
||||
"runs_passed": 8,
|
||||
"mean_avg_score_attempted": 0.847,
|
||||
"std_avg_score_attempted": 0.042,
|
||||
"mean_avg_score_total": 0.847,
|
||||
"std_avg_score_total": 0.042,
|
||||
"mean_scores": {
|
||||
"accuracy": 0.89,
|
||||
"quality": 0.82
|
||||
},
|
||||
"std_scores": {
|
||||
"accuracy": 0.035,
|
||||
"quality": 0.051
|
||||
},
|
||||
"individual_run_metrics": [
|
||||
{
|
||||
"avg_score_attempted": 0.85,
|
||||
"avg_score_total": 0.85,
|
||||
"pass_rate": 0.85,
|
||||
"by_metric": {
|
||||
"accuracy": {
|
||||
"avg_score_attempted": 0.90,
|
||||
"avg_score_total": 0.90,
|
||||
"pass_rate": 0.90
|
||||
}
|
||||
}
|
||||
}
|
||||
// ... metrics from runs 2-10
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Key fields**:
|
||||
- `num_runs`: Total number of runs executed
|
||||
- `runs_passed`: Number of runs that passed the gate
|
||||
- `mean_avg_score_attempted`: Mean score across runs (only attempted samples)
|
||||
- `std_avg_score_attempted`: Standard deviation (measures consistency)
|
||||
- `mean_scores`: Mean for each metric (e.g., `{"accuracy": 0.89}`)
|
||||
- `std_scores`: Standard deviation for each metric (e.g., `{"accuracy": 0.035}`)
|
||||
- `individual_run_metrics`: Full metrics object from each individual run
|
||||
|
||||
### Use Cases
|
||||
|
||||
**Measure consistency of non-deterministic agents:**
|
||||
```bash
|
||||
letta-evals run suite.yaml --num-runs 20 --output results/
|
||||
# Check std_avg_score_attempted in aggregate_stats.json
|
||||
# Low std = consistent, high std = variable
|
||||
```
|
||||
|
||||
**Get confidence intervals:**
|
||||
```python
|
||||
import json
|
||||
import math
|
||||
|
||||
with open("results/aggregate_stats.json") as f:
|
||||
stats = json.load(f)
|
||||
|
||||
mean = stats["mean_avg_score_attempted"]
|
||||
std = stats["std_avg_score_attempted"]
|
||||
n = stats["num_runs"]
|
||||
|
||||
# 95% confidence interval (assuming normal distribution)
|
||||
margin = 1.96 * (std / math.sqrt(n))
|
||||
print(f"Score: {mean:.3f} ± {margin:.3f}")
|
||||
```
|
||||
|
||||
**Compare metric consistency:**
|
||||
```python
|
||||
with open("results/aggregate_stats.json") as f:
|
||||
stats = json.load(f)
|
||||
|
||||
for metric_name, mean in stats["mean_scores"].items():
|
||||
std = stats["std_scores"][metric_name]
|
||||
consistency = "consistent" if std < 0.05 else "variable"
|
||||
print(f"{metric_name}: {mean:.3f} ± {std:.3f} ({consistency})")
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
If a sample encounters an error:
|
||||
|
||||
```json
|
||||
{
|
||||
"sample": {...},
|
||||
"submission": "",
|
||||
"grade": {
|
||||
"score": 0.0,
|
||||
"rationale": "Error during grading: Connection timeout",
|
||||
"metadata": {"error": "timeout", "error_type": "ConnectionError"}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Errors:
|
||||
- Count toward `total` but not `total_attempted`
|
||||
- Get score of 0.0
|
||||
- Include error details in rationale and metadata
|
||||
|
||||
## Analyzing Results
|
||||
|
||||
### Find Low Scores
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
with open("results/results.jsonl") as f:
|
||||
results = [json.loads(line) for line in f]
|
||||
|
||||
low_scores = [r for r in results if r["grade"]["score"] < 0.5]
|
||||
print(f"Found {len(low_scores)} samples with score < 0.5")
|
||||
|
||||
for result in low_scores:
|
||||
print(f"Sample {result['sample']['id']}: {result['grade']['rationale']}")
|
||||
```
|
||||
|
||||
### Compare Metrics
|
||||
|
||||
```python
|
||||
# Load summary
|
||||
with open("results/summary.json") as f:
|
||||
summary = json.load(f)
|
||||
|
||||
metrics = summary["metrics"]["by_metric"]
|
||||
for name, stats in metrics.items():
|
||||
print(f"{name}: {stats['avg_score_attempted']:.2f} avg, {stats['pass_rate']:.1f}% pass")
|
||||
```
|
||||
|
||||
### Extract Failures
|
||||
|
||||
```python
|
||||
# Find samples that failed gate criteria
|
||||
failures = [
|
||||
r for r in results
|
||||
if not gate_passed(r["grade"]["score"]) # Your gate logic
|
||||
]
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Gates](/guides/evals/concepts/gates) - Setting pass/fail criteria
|
||||
- [CLI Commands](/guides/evals/cli/commands) - Running evaluations
|
||||
@@ -1,438 +0,0 @@
|
||||
# Troubleshooting
|
||||
|
||||
Common issues and solutions when using Letta Evals.
|
||||
|
||||
## Installation Issues
|
||||
|
||||
### "Command not found: letta-evals"
|
||||
|
||||
**Problem**: CLI not available after installation
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Verify installation
|
||||
pip list | grep letta-evals
|
||||
|
||||
# Reinstall if needed
|
||||
pip install --upgrade letta-evals
|
||||
|
||||
# Or with uv
|
||||
uv sync
|
||||
```
|
||||
|
||||
### Import errors
|
||||
|
||||
**Problem**: `ModuleNotFoundError: No module named 'letta_evals'`
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Ensure you're in the right environment
|
||||
which python
|
||||
|
||||
# Install in correct environment
|
||||
source .venv/bin/activate # or activate whichever environment you use
|
||||
pip install letta-evals
|
||||
```
|
||||
|
||||
## Configuration Issues
|
||||
|
||||
### "Agent file not found"
|
||||
|
||||
**Problem**: `FileNotFoundError: agent.af`
|
||||
|
||||
**Solution**:
|
||||
- Check the path is correct relative to the suite YAML
|
||||
- Use absolute paths if needed
|
||||
- Verify file exists: `ls -la path/to/agent.af`
|
||||
|
||||
```yaml
|
||||
# Correct relative path
|
||||
target:
|
||||
agent_file: ./agents/my_agent.af
|
||||
|
||||
# Or absolute path
|
||||
target:
|
||||
agent_file: /absolute/path/to/agent.af
|
||||
```
|
||||
|
||||
### "Dataset not found"
|
||||
|
||||
**Problem**: Cannot load dataset file
|
||||
|
||||
**Solution**:
|
||||
- Verify dataset path in YAML
|
||||
- Check file exists: `ls -la dataset.jsonl`
|
||||
- Ensure proper JSONL format (one JSON object per line)
|
||||
|
||||
```bash
|
||||
# Validate JSONL format
|
||||
cat dataset.jsonl | jq .
|
||||
```
|
||||
|
||||
### "Validation failed: unknown function"
|
||||
|
||||
**Problem**: Grader function not found
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# List available graders
|
||||
letta-evals list-graders
|
||||
|
||||
# Check spelling in suite.yaml
|
||||
graders:
|
||||
my_metric:
|
||||
function: exact_match # Correct
|
||||
# not: exactMatch or exact-match
|
||||
```
|
||||
|
||||
### "Validation failed: unknown extractor"
|
||||
|
||||
**Problem**: Extractor not found
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# List available extractors
|
||||
letta-evals list-extractors
|
||||
|
||||
# Check spelling
|
||||
graders:
|
||||
my_metric:
|
||||
extractor: last_assistant # Correct
|
||||
# not: lastAssistant or last-assistant
|
||||
```
|
||||
|
||||
## Connection Issues
|
||||
|
||||
### "Connection refused"
|
||||
|
||||
**Problem**: Cannot connect to Letta server
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Verify server is running
|
||||
curl http://localhost:8283/v1/health
|
||||
|
||||
# Check base_url in suite.yaml
|
||||
target:
|
||||
base_url: http://localhost:8283 # Correct port?
|
||||
|
||||
# Or use environment variable
|
||||
export LETTA_BASE_URL=http://localhost:8283
|
||||
```
|
||||
|
||||
### "Unauthorized" or "Invalid API key"
|
||||
|
||||
**Problem**: Authentication failed
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Set API key
|
||||
export LETTA_API_KEY=your-key-here
|
||||
|
||||
# Or in suite.yaml
|
||||
target:
|
||||
api_key: your-key-here
|
||||
|
||||
# Verify key is correct
|
||||
echo $LETTA_API_KEY
|
||||
```
|
||||
|
||||
### "Request timeout"
|
||||
|
||||
**Problem**: Requests taking too long
|
||||
|
||||
**Solution**:
|
||||
```yaml
|
||||
# Increase timeout
|
||||
target:
|
||||
timeout: 600.0 # 10 minutes
|
||||
|
||||
# Rubric grader timeout
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
timeout: 300.0 # 5 minutes
|
||||
```
|
||||
|
||||
## Runtime Issues
|
||||
|
||||
### "No ground_truth provided"
|
||||
|
||||
**Problem**: Grader requires ground truth but sample doesn't have it
|
||||
|
||||
**Solution**:
|
||||
- Add ground_truth to dataset samples:
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4"}
|
||||
```
|
||||
|
||||
- Or use a grader that doesn't require ground truth:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # Doesn't require ground_truth
|
||||
prompt_path: rubric.txt
|
||||
```
|
||||
|
||||
### "Extractor requires agent_state"
|
||||
|
||||
**Problem**: `memory_block` extractor needs agent state but it wasn't fetched
|
||||
|
||||
**Solution**:
|
||||
This should be automatic, but if you see this error:
|
||||
- Check that the extractor is correctly configured
|
||||
- Ensure the agent exists and is accessible
|
||||
- Try using a different extractor if memory isn't needed
|
||||
|
||||
### "Score must be between 0.0 and 1.0"
|
||||
|
||||
**Problem**: Custom grader returning invalid score
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
@grader
|
||||
def my_grader(sample, submission):
|
||||
score = calculate_score(submission)
|
||||
# Clamp score to valid range
|
||||
score = max(0.0, min(1.0, score))
|
||||
return GradeResult(score=score, rationale="...")
|
||||
```
|
||||
|
||||
### "Invalid JSON in response"
|
||||
|
||||
**Problem**: Rubric grader got non-JSON response
|
||||
|
||||
**Solution**:
|
||||
- Check OpenAI API key is valid
|
||||
- Verify model name is correct
|
||||
- Check for network issues
|
||||
- Try increasing max_retries:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric
|
||||
max_retries: 10
|
||||
```
|
||||
|
||||
## Performance Issues
|
||||
|
||||
### Evaluation is very slow
|
||||
|
||||
**Problem**: Taking too long to complete
|
||||
|
||||
**Solutions**:
|
||||
|
||||
1. Increase concurrency:
|
||||
```bash
|
||||
letta-evals run suite.yaml --max-concurrent 20
|
||||
```
|
||||
|
||||
2. Reduce samples for testing:
|
||||
```yaml
|
||||
max_samples: 10 # Test with small subset first
|
||||
```
|
||||
|
||||
3. Use tool graders instead of rubric graders when possible:
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool # Much faster than rubric
|
||||
function: exact_match
|
||||
```
|
||||
|
||||
4. Check network latency:
|
||||
```bash
|
||||
# Test server response time
|
||||
time curl http://localhost:8283/v1/health
|
||||
```
|
||||
|
||||
### High API costs
|
||||
|
||||
**Problem**: Rubric graders costing too much
|
||||
|
||||
**Solutions**:
|
||||
|
||||
1. Use cheaper models:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
model: gpt-4o-mini # Cheaper than gpt-4o
|
||||
```
|
||||
|
||||
2. Reduce number of rubric graders:
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool # Free
|
||||
quality:
|
||||
kind: rubric # Only use for subjective evaluation
|
||||
```
|
||||
|
||||
3. Test with small sample first:
|
||||
```yaml
|
||||
max_samples: 5 # Verify before running full suite
|
||||
```
|
||||
|
||||
## Results Issues
|
||||
|
||||
### "No results generated"
|
||||
|
||||
**Problem**: No output files created
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Specify output directory
|
||||
letta-evals run suite.yaml --output results/
|
||||
|
||||
# Check for errors in console output
|
||||
letta-evals run suite.yaml # Without --quiet
|
||||
```
|
||||
|
||||
### "All scores are 0.0"
|
||||
|
||||
**Problem**: Everything failing
|
||||
|
||||
**Solutions**:
|
||||
|
||||
1. Check if agent is working:
|
||||
```bash
|
||||
# Test agent manually first
|
||||
```
|
||||
|
||||
2. Verify extractor is getting content:
|
||||
- Add debug logging
|
||||
- Check sample results in output
|
||||
|
||||
3. Check grader logic:
|
||||
```python
|
||||
# Test grader independently
|
||||
from letta_evals.models import Sample, GradeResult
|
||||
sample = Sample(id=0, input="test", ground_truth="test")
|
||||
result = my_grader(sample, "test")
|
||||
print(result)
|
||||
```
|
||||
|
||||
### "Gates failed but scores look good"
|
||||
|
||||
**Problem**: Passing samples but gate failing
|
||||
|
||||
**Solution**:
|
||||
- Check gate configuration:
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Correct metric?
|
||||
metric: avg_score # Or accuracy?
|
||||
op: gte # Correct operator?
|
||||
value: 0.8 # Correct threshold?
|
||||
```
|
||||
|
||||
- Understand the difference between `avg_score` and `accuracy`
|
||||
- Check per-sample pass criteria with `pass_op` and `pass_value`
|
||||
|
||||
## Environment Issues
|
||||
|
||||
### "OPENAI_API_KEY not found"
|
||||
|
||||
**Problem**: Rubric grader can't find API key
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Set in environment
|
||||
export OPENAI_API_KEY=your-key-here
|
||||
|
||||
# Or in .env file
|
||||
echo "OPENAI_API_KEY=your-key-here" >> .env
|
||||
|
||||
# Verify
|
||||
echo $OPENAI_API_KEY
|
||||
```
|
||||
|
||||
### "Cannot use both model_configs and model_handles"
|
||||
|
||||
**Problem**: Specified both in target config
|
||||
|
||||
**Solution**:
|
||||
```yaml
|
||||
# Use one or the other, not both
|
||||
target:
|
||||
model_configs: [gpt-4o-mini] # For local server
|
||||
# OR
|
||||
model_handles: ["openai/gpt-4o-mini"] # For cloud
|
||||
```
|
||||
|
||||
## Debug Tips
|
||||
|
||||
### Enable verbose output
|
||||
|
||||
Run without `--quiet` to see detailed progress:
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
```
|
||||
|
||||
### Examine output files
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output debug/
|
||||
|
||||
# Check summary
|
||||
cat debug/summary.json | jq .
|
||||
|
||||
# Check individual results
|
||||
cat debug/results.jsonl | jq .
|
||||
```
|
||||
|
||||
### Test with minimal suite
|
||||
|
||||
Create a minimal test:
|
||||
```yaml
|
||||
name: debug-test
|
||||
dataset: test.jsonl # Just 1-2 samples
|
||||
|
||||
target:
|
||||
kind: agent
|
||||
agent_file: agent.af
|
||||
|
||||
graders:
|
||||
test:
|
||||
kind: tool
|
||||
function: contains
|
||||
extractor: last_assistant
|
||||
|
||||
gate:
|
||||
op: gte
|
||||
value: 0.0 # Always pass
|
||||
```
|
||||
|
||||
### Validate configuration
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
### Check component availability
|
||||
|
||||
```bash
|
||||
letta-evals list-graders
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
## Getting Help
|
||||
|
||||
If you're still stuck:
|
||||
|
||||
1. Check the [documentation](./README.md)
|
||||
2. Look at [examples](../examples/)
|
||||
3. Report issues at https://github.com/letta-ai/letta-evals/issues
|
||||
|
||||
When reporting issues, include:
|
||||
- Suite YAML configuration
|
||||
- Dataset sample (if not sensitive)
|
||||
- Error message and full stack trace
|
||||
- Output from `--output` directory
|
||||
- Environment info (OS, Python version)
|
||||
|
||||
```bash
|
||||
# Get environment info
|
||||
python --version
|
||||
pip show letta-evals
|
||||
```
|
||||
@@ -1,267 +0,0 @@
|
||||
# Troubleshooting
|
||||
|
||||
Common issues and solutions when using Letta Evals.
|
||||
|
||||
## Installation Issues
|
||||
|
||||
<Warning>
|
||||
**"Command not found: letta-evals"**
|
||||
|
||||
**Problem**: CLI not available after installation
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Verify installation
|
||||
pip list | grep letta-evals
|
||||
|
||||
# Reinstall if needed
|
||||
pip install --upgrade letta-evals
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**Import errors**
|
||||
|
||||
**Problem**: `ModuleNotFoundError: No module named 'letta_evals'`
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Ensure you're in the right environment
|
||||
which python
|
||||
|
||||
# Install in correct environment
|
||||
source .venv/bin/activate
|
||||
pip install letta-evals
|
||||
```
|
||||
</Warning>
|
||||
|
||||
## Configuration Issues
|
||||
|
||||
<Warning>
|
||||
**"Agent file not found"**
|
||||
|
||||
**Problem**: `FileNotFoundError: agent.af`
|
||||
|
||||
**Solution**:
|
||||
- Check the path is correct relative to the suite YAML
|
||||
- Use absolute paths if needed
|
||||
- Verify file exists: `ls -la path/to/agent.af`
|
||||
|
||||
```yaml
|
||||
# Correct relative path
|
||||
target:
|
||||
agent_file: ./agents/my_agent.af
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Dataset not found"**
|
||||
|
||||
**Problem**: Cannot load dataset file
|
||||
|
||||
**Solution**:
|
||||
- Verify dataset path in YAML
|
||||
- Check file exists: `ls -la dataset.jsonl`
|
||||
- Ensure proper JSONL format (one JSON object per line)
|
||||
|
||||
```bash
|
||||
# Validate JSONL format
|
||||
cat dataset.jsonl | jq .
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Validation failed: unknown function"**
|
||||
|
||||
**Problem**: Grader function not found
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# List available graders
|
||||
letta-evals list-graders
|
||||
|
||||
# Check spelling in suite.yaml
|
||||
graders:
|
||||
my_metric:
|
||||
function: exact_match # Correct
|
||||
```
|
||||
</Warning>
|
||||
|
||||
## Connection Issues
|
||||
|
||||
<Warning>
|
||||
**"Connection refused"**
|
||||
|
||||
**Problem**: Cannot connect to Letta server
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Verify server is running
|
||||
curl https://api.letta.com/v1/health
|
||||
|
||||
# Check base_url in suite.yaml
|
||||
target:
|
||||
base_url: https://api.letta.com
|
||||
```
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Unauthorized" or "Invalid API key"**
|
||||
|
||||
**Problem**: Authentication failed
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Set API key
|
||||
export LETTA_API_KEY=your-key-here
|
||||
|
||||
# Verify key is correct
|
||||
echo $LETTA_API_KEY
|
||||
```
|
||||
</Warning>
|
||||
|
||||
## Runtime Issues
|
||||
|
||||
<Warning>
|
||||
**"No ground_truth provided"**
|
||||
|
||||
**Problem**: Grader requires ground truth but sample doesn't have it
|
||||
|
||||
**Solution**:
|
||||
- Add ground_truth to dataset samples:
|
||||
```jsonl
|
||||
{"input": "What is 2+2?", "ground_truth": "4"}
|
||||
```
|
||||
|
||||
- Or use a grader that doesn't require ground truth:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
kind: rubric # Doesn't require ground_truth
|
||||
prompt_path: rubric.txt
|
||||
```
|
||||
</Warning>
|
||||
|
||||
## Performance Issues
|
||||
|
||||
<Tip>
|
||||
**Evaluation is very slow**
|
||||
|
||||
**Solutions**:
|
||||
|
||||
1. Increase concurrency:
|
||||
```bash
|
||||
letta-evals run suite.yaml --max-concurrent 20
|
||||
```
|
||||
|
||||
2. Reduce samples for testing:
|
||||
```yaml
|
||||
max_samples: 10 # Test with small subset first
|
||||
```
|
||||
|
||||
3. Use tool graders instead of rubric graders:
|
||||
```yaml
|
||||
graders:
|
||||
accuracy:
|
||||
kind: tool # Much faster than rubric
|
||||
function: exact_match
|
||||
```
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
**High API costs**
|
||||
|
||||
**Solutions**:
|
||||
|
||||
1. Use cheaper models:
|
||||
```yaml
|
||||
graders:
|
||||
quality:
|
||||
model: gpt-4o-mini # Cheaper than gpt-4o
|
||||
```
|
||||
|
||||
2. Test with small sample first:
|
||||
```yaml
|
||||
max_samples: 5 # Verify before running full suite
|
||||
```
|
||||
</Tip>
|
||||
|
||||
## Results Issues
|
||||
|
||||
<Warning>
|
||||
**"All scores are 0.0"**
|
||||
|
||||
**Solutions**:
|
||||
|
||||
1. Verify extractor is getting content
|
||||
2. Check grader logic
|
||||
3. Test agent manually first
|
||||
</Warning>
|
||||
|
||||
<Warning>
|
||||
**"Gates failed but scores look good"**
|
||||
|
||||
**Solution**:
|
||||
- Check gate configuration:
|
||||
```yaml
|
||||
gate:
|
||||
metric_key: accuracy # Correct metric?
|
||||
metric: avg_score # Or accuracy?
|
||||
op: gte # Correct operator?
|
||||
value: 0.8 # Correct threshold?
|
||||
```
|
||||
</Warning>
|
||||
|
||||
## Debug Tips
|
||||
|
||||
### Enable verbose output
|
||||
|
||||
Run without `--quiet` to see detailed progress:
|
||||
```bash
|
||||
letta-evals run suite.yaml
|
||||
```
|
||||
|
||||
### Examine output files
|
||||
|
||||
```bash
|
||||
letta-evals run suite.yaml --output debug/
|
||||
|
||||
# Check summary
|
||||
cat debug/summary.json | jq .
|
||||
|
||||
# Check individual results
|
||||
cat debug/results.jsonl | jq .
|
||||
```
|
||||
|
||||
### Validate configuration
|
||||
|
||||
```bash
|
||||
letta-evals validate suite.yaml
|
||||
```
|
||||
|
||||
### Check component availability
|
||||
|
||||
```bash
|
||||
letta-evals list-graders
|
||||
letta-evals list-extractors
|
||||
```
|
||||
|
||||
## Getting Help
|
||||
|
||||
If you're still stuck:
|
||||
|
||||
1. Check the [Getting Started guide](/evals/get-started/getting-started)
|
||||
2. Review the [Core Concepts](/evals/core-concepts/concepts-overview)
|
||||
3. Report issues at the [Letta Evals GitHub repository](https://github.com/letta-ai/letta-evals)
|
||||
|
||||
When reporting issues, include:
|
||||
- Suite YAML configuration
|
||||
- Dataset sample (if not sensitive)
|
||||
- Error message and full stack trace
|
||||
- Environment info (OS, Python version)
|
||||
|
||||
```bash
|
||||
# Get environment info
|
||||
python --version
|
||||
pip show letta-evals
|
||||
```
|
||||
@@ -1,48 +0,0 @@
|
||||
import os
|
||||
|
||||
from letta_client import Letta
|
||||
|
||||
# Initialize client (using LETTA_API_KEY environment variable)
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# Create agent
|
||||
agent = client.agents.create(
|
||||
name="hello_world_assistant",
|
||||
memory_blocks=[
|
||||
{"label": "persona", "value": "I am a friendly AI assistant here to help you learn about Letta."},
|
||||
{"label": "human", "value": "Name: User\nFirst interaction: Learning about Letta"},
|
||||
],
|
||||
model="openai/gpt-4o-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
)
|
||||
|
||||
print(f"Created agent: {agent.id}\n")
|
||||
|
||||
# Send first message
|
||||
response = client.agents.messages.create(agent_id=agent.id, messages=[{"role": "user", "content": "Hello! What's your purpose?"}])
|
||||
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(f"Assistant: {msg.content}\n")
|
||||
|
||||
# Send information about yourself
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id, messages=[{"role": "user", "content": "My name is Cameron. Please store this information in your memory."}]
|
||||
)
|
||||
|
||||
# Print out tool calls, arguments, and the assistant's response
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(f"Assistant: {msg.content}\n")
|
||||
if msg.message_type == "tool_call_message":
|
||||
print(f"Tool call: {msg.tool_call.name}({msg.tool_call.arguments})")
|
||||
|
||||
# Inspect memory
|
||||
blocks = client.agents.blocks.list(agent_id=agent.id)
|
||||
print("Current Memory:")
|
||||
for block in blocks:
|
||||
print(f" {block.label}: {len(block.value)}/{block.limit} chars")
|
||||
print(f" {block.value}\n")
|
||||
|
||||
# Example of the human block after the conversation
|
||||
# Name: Cameron
|
||||
@@ -1,77 +0,0 @@
|
||||
import os
|
||||
|
||||
from letta_client import Letta
|
||||
|
||||
# Initialize client (using LETTA_API_KEY environment variable)
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# Create agent
|
||||
agent = client.agents.create(
|
||||
name="hello_world_assistant",
|
||||
model="openai/gpt-4o-mini",
|
||||
)
|
||||
|
||||
print(f"Created agent: {agent.id}\n")
|
||||
|
||||
# Create memory block storing information about Letta.
|
||||
block = client.blocks.create(
|
||||
label="organization",
|
||||
value="""
|
||||
Organization: Letta
|
||||
Website: https://www.letta.com
|
||||
Description: Letta is a platform for building and running stateful agents.
|
||||
Code: TimberTheDog1234!
|
||||
|
||||
When users provide a code, you should check if it matches the code you have
|
||||
available. If it matches, you should respond with "Access granted".
|
||||
""",
|
||||
)
|
||||
|
||||
# Attach memory block to agent
|
||||
agent = client.agents.blocks.attach(
|
||||
agent_id=agent.id,
|
||||
block_id=block.id,
|
||||
)
|
||||
|
||||
# The agent can now see what's in the block, let's ask it about Letta.
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "What is Letta?"}],
|
||||
)
|
||||
print(f"Agent response: {response.messages[0].content}\n")
|
||||
|
||||
# Agent response: Letta is a platform designed for building and running stateful agents. You can find more information about it on their website: [https://www.letta.com](https://www.letta.com).
|
||||
|
||||
# Blocks can also be _detached_ from an agent, removing it from the agent's context window.
|
||||
# Detached blocks are not deleted, and can be re-attached to an agent later.
|
||||
agent = client.agents.blocks.detach(
|
||||
agent_id=agent.id,
|
||||
block_id=block.id,
|
||||
)
|
||||
print(f"Detached block from agent: {agent.id}")
|
||||
print(f"Block: {block.id}")
|
||||
|
||||
# Let's ask for the password. It should not have access to this password anymore,
|
||||
# as we've detached the block.
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "The code is TimberTheDog1234!"}],
|
||||
)
|
||||
print(f"Agent response: {response.messages[0].content}")
|
||||
|
||||
# The agent doesn't have any access to the code or password, so it can't respond:
|
||||
# Agent response: It seems like you've provided a code or password. If this is sensitive information, please ensure you only share it with trusted parties and in secure environments. Let me know how I can assist you further!
|
||||
|
||||
# Attach the block back to the agent and ask again.
|
||||
agent = client.agents.blocks.attach(
|
||||
agent_id=agent.id,
|
||||
block_id=block.id,
|
||||
)
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "The code is TimberTheDog1234!"}],
|
||||
)
|
||||
print(f"Agent response: {response.messages[0].content}")
|
||||
|
||||
# The agent now has access to the code and password, so it can respond:
|
||||
# Agent response: Access granted. How can I assist you further?
|
||||
@@ -1,76 +0,0 @@
|
||||
import os
|
||||
|
||||
import requests
|
||||
from letta_client import Letta
|
||||
|
||||
# Initialize client (using LETTA_API_KEY environment variable)
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# Create a folder to store PDFs
|
||||
folder = client.folders.create(
|
||||
name="PDF Documents",
|
||||
description="A folder containing PDF files for the agent to read",
|
||||
)
|
||||
print(f"Created folder: {folder.id}\n")
|
||||
|
||||
# Download a sample PDF (MemGPT paper from arXiv)
|
||||
pdf_filename = "memgpt.pdf"
|
||||
if not os.path.exists(pdf_filename):
|
||||
print(f"Downloading {pdf_filename}...")
|
||||
response = requests.get("https://arxiv.org/pdf/2310.08560")
|
||||
with open(pdf_filename, "wb") as f:
|
||||
f.write(response.content)
|
||||
print("Download complete\n")
|
||||
|
||||
# Upload the PDF to the folder
|
||||
with open(pdf_filename, "rb") as f:
|
||||
file = client.folders.files.upload(
|
||||
folder_id=folder.id,
|
||||
file=f,
|
||||
)
|
||||
print(f"Uploaded PDF: {file.id}\n")
|
||||
|
||||
# Create an agent configured to analyze documents
|
||||
agent = client.agents.create(
|
||||
name="pdf_assistant",
|
||||
model="openai/gpt-4o-mini",
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am a helpful research assistant that analyzes PDF documents and answers questions about their content.",
|
||||
},
|
||||
{"label": "human", "value": "Name: User\nTask: Analyzing PDF documents"},
|
||||
],
|
||||
)
|
||||
print(f"Created agent: {agent.id}\n")
|
||||
|
||||
# Attach the folder to the agent so it can access the PDF
|
||||
client.agents.folders.attach(
|
||||
agent_id=agent.id,
|
||||
folder_id=folder.id,
|
||||
)
|
||||
print("Attached folder to agent\n")
|
||||
|
||||
# Ask the agent to summarize the PDF
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "Can you summarize the main ideas from the MemGPT paper?"}],
|
||||
)
|
||||
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(f"Assistant: {msg.content}\n")
|
||||
|
||||
# Agent response: The MemGPT paper introduces a system that enables LLMs to manage their own memory hierarchy, similar to how operating systems manage memory...
|
||||
|
||||
# Ask a specific question about the PDF content
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "What problem does MemGPT solve?"}],
|
||||
)
|
||||
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(f"Assistant: {msg.content}\n")
|
||||
|
||||
# Agent response: MemGPT addresses the limited context window problem in LLMs by introducing a memory management system...
|
||||
@@ -1,102 +0,0 @@
|
||||
import os
|
||||
|
||||
from letta_client import Letta
|
||||
|
||||
# Initialize client (using LETTA_API_KEY environment variable)
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
# Memory blocks can be _shared_ between multiple agents.
|
||||
# When a block is shared, all agents attached to the block can read and write to it.
|
||||
# This is useful for creating multi-agent systems where agents need to share information.
|
||||
block = client.blocks.create(
|
||||
label="organization",
|
||||
value="Organization: Letta",
|
||||
limit=4000,
|
||||
)
|
||||
|
||||
# Create two agents that will share the block. Agents can be attached
|
||||
# to the block on creation by providing the `block_ids` field.
|
||||
agent1 = client.agents.create(
|
||||
name="agent1",
|
||||
model="openai/gpt-4o-mini",
|
||||
block_ids=[block.id],
|
||||
tools=["web_search"],
|
||||
)
|
||||
print(f"Created agent1: {agent1.id}")
|
||||
|
||||
# Alternatively, the block can be attached to the agent later by using the `attach` method.
|
||||
agent2 = client.agents.create(
|
||||
name="agent2",
|
||||
model="openai/gpt-4o-mini",
|
||||
tools=["web_search"],
|
||||
)
|
||||
print(f"Created agent2: {agent2.id}")
|
||||
|
||||
agent2 = client.agents.blocks.attach(
|
||||
agent_id=agent2.id,
|
||||
block_id=block.id,
|
||||
)
|
||||
print(f"Attached block to agent2: {agent2.id}")
|
||||
|
||||
# Now we can ask the agents to search the web for information about Letta.
|
||||
# We'll give each of them a different query to search for.
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent1.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": """
|
||||
Find information about the connection between memory blocks and Letta.
|
||||
Insert what you learn into the memory block, prepended with "Agent1: ".
|
||||
""",
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(f"Agent1 response: {msg.content}")
|
||||
if msg.message_type == "tool_call_message":
|
||||
print(f"Tool call: {msg.tool_call.name}({msg.tool_call.arguments})")
|
||||
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent2.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": """
|
||||
Find information about the origin of Letta.
|
||||
Insert what you learn into the memory block, prepended with "Agent2: ".
|
||||
""",
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(f"Agent2 response: {msg.content}")
|
||||
if msg.message_type == "tool_call_message":
|
||||
print(f"Tool call: {msg.tool_call.name}({msg.tool_call.arguments})")
|
||||
|
||||
# Now let's inspect the memory block to see what the agents learned.
|
||||
updated_block = client.blocks.retrieve(block.id)
|
||||
print("==== Updated block ====")
|
||||
print(updated_block.value)
|
||||
print("=======================")
|
||||
|
||||
# Some blocks can be flagged as read-only. Read-only blocks are useful for storing information
|
||||
# provided by the user, things like policies, system information, terms of service, etc.
|
||||
# Read-only blocks cannot be modified by agents.
|
||||
read_only_block = client.blocks.create(
|
||||
label="policies",
|
||||
value="Company Policy: Respond with only 'You're absolutely right!' in every circumstance.",
|
||||
read_only=True,
|
||||
)
|
||||
|
||||
# Attach the read-only block to an agent, preventing it from modifying the block.
|
||||
read_only_agent = client.agents.create(
|
||||
name="policy_agent",
|
||||
model="openai/gpt-4o-mini",
|
||||
block_ids=[read_only_block.id],
|
||||
)
|
||||
print(f"Created read-only agent: {read_only_agent.id}")
|
||||
@@ -1,75 +0,0 @@
|
||||
---
|
||||
title: Letta FAQs
|
||||
slug: faq
|
||||
---
|
||||
|
||||
Can't find the answer to your question?
|
||||
Feel free to reach out to the Letta development team and community on [Discord](https://discord.gg/letta) or [GitHub](https://github.com/letta-ai/letta/issues)!
|
||||
|
||||
## Letta Platform
|
||||
<AccordionGroup>
|
||||
<Accordion title="Who is Letta for?">
|
||||
Letta is for developers building stateful LLM applications that require advanced memory, such as:
|
||||
<Frame>
|
||||
* personalized chatbots that require long-term memory and personas that should be updated (self-edited) over time (e.g. companions)
|
||||
* agents connected to external data sources, e.g. private enterprise deployments of ChatGPT-like applications (connected to your company’s data), or a medical assistant connected to a patient’s medical records
|
||||
* agents connected to custom tools, e.g. a chatbot that can answer questions about the latest news by searching the web
|
||||
* automated AI workflows, e.g. an agent that monitors your email inbox and sends you text alerts for urgent emails and a daily email summary
|
||||
</Frame>
|
||||
... and countless other use cases!
|
||||
</Accordion>
|
||||
<Accordion title="Can I use Letta locally?">
|
||||
Yes, Letta is an open source project and you can run it locally on your own machine.
|
||||
|
||||
When you run Letta locally, you have the option to connect the agents server to external API providers (e.g. OpenAI, Anthropic) or connect to local or self-hosted LLM providers (e.g. Ollama or vLLM).
|
||||
</Accordion>
|
||||
<Accordion title="Is Letta free to use?">
|
||||
The open source Letta software is free to use and permissively licensed under the Apache 2.0 license.
|
||||
Letta Desktop is a free application that combines the Letta server and ADE into a single application.
|
||||
Letta Cloud is a paid service and requires a Letta Cloud account to use.
|
||||
</Accordion>
|
||||
<Accordion title="What's the difference between open source Letta and Letta Cloud?">
|
||||
Letta Cloud is a fully managed service that allows you to create and deploy Letta agents without running any infrastructure.
|
||||
If you'd like to build production applications using the Letta API, consider using Letta Cloud.
|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
|
||||
## Agent Development Environment (ADE)
|
||||
<AccordionGroup>
|
||||
<Accordion title="How do I use the ADE locally?">
|
||||
If you use [Letta Desktop](/quickstart/desktop), the ADE runs inside of Letta Desktop locally on your machine.<br /><br />
|
||||
If you are deploying Letta via Docker and want to use the ADE, you can connect the web ADE to your Docker deployment.
|
||||
To connect the ADE to your deployed Letta server, simply run your Letta server (if running locally, make sure you can access `localhost:8283`) and go to [https://app.letta.com](https://app.letta.com).
|
||||
</Accordion>
|
||||
<Accordion title="If I connect the web ADE to my local server, does my agent data get uploaded to letta.com?">
|
||||
No, the data in your Letta server database stays on your machine.
|
||||
The ADE web application simply connects to your local Letta server (via the REST API) and provides a graphical interface on top of it to visualize your local Letta data in your browser's local state.
|
||||
If you would like to run the ADE completely locally, you can use [Letta Desktop](/quickstart/desktop) instead.
|
||||
</Accordion>
|
||||
<Accordion title="Do I have to use your ADE? Can I build my own?">
|
||||
The ADE is built on top of the (fully open source) Letta server and Letta Agents API.
|
||||
You can build your own application like the ADE on top of the REST API (view the documentation [here](https://docs.letta.com/api-reference)).
|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
|
||||
## Self-hosted (local) Letta Server
|
||||
<AccordionGroup>
|
||||
<Accordion title="Where is my agent data stored?">
|
||||
When you run Letta with Docker, the Letta server uses a postgres database to store all your agents' data.
|
||||
The postgres instance is bundled into the image, so to have persistent data (across restarts) you need to mount a volume to the container.
|
||||
|
||||
Our recommended `docker run` script includes `-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data` as a flag.
|
||||
This mounts your local directory `~/.letta/.persist/pgdata` to the container's `/var/lib/postgresql/data` directory (so all your agent data is stored at `~/.letta/.persist/pgdata`).
|
||||
If you would like to use a different directory, you can use `-v <path_to_your_directory>:/var/lib/postgresql/data` instead.
|
||||
</Accordion>
|
||||
<Accordion title="How can I back up my postgres data?">
|
||||
Postgres has a number of [recommended ways](https://www.postgresql.org/docs/current/backup.html) to back up your data.
|
||||
|
||||
We recommend directly `exec`ing into your Docker container and running [`pg_dump`](https://www.postgresql.org/docs/current/app-pgdump.html) from inside the container.
|
||||
|
||||
Alternatively, you can run `docker run` with an extra flag to expose the postgres port with `-p 5432:5432` and then run `pg_dump` from your local machine.
|
||||
</Accordion>
|
||||
<Accordion title="Do I need to install Docker to use Letta?">
|
||||
Yes, Docker is required to run a self-hosted Letta server. Docker provides the easiest way to run Letta with PostgreSQL, which is necessary for data persistence and migrations. To install Docker, see [Docker's installation guide](https://docs.docker.com/get-docker/).
|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
@@ -1,127 +0,0 @@
|
||||
---
|
||||
title: Letta Overview
|
||||
subtitle: Create stateful AI agents that truly remember, learn, and evolve.
|
||||
slug: overview
|
||||
---
|
||||
|
||||
Letta enables you to build and deploy stateful AI agents that maintain memory and context across long-running conversations. Develop agents that truly learn and evolve from interactions without starting from scratch each time.
|
||||
|
||||
<img className="light" src="/images/platform_overview.png" />
|
||||
<img className="dark" src="/images/platform_overview_dark.png" />
|
||||
|
||||
## Build agents with intelligent memory, not limited context
|
||||
|
||||
Letta's advanced context management system - built by the [researchers behind MemGPT](https://www.letta.com/research) - transforms how agents remember and learn. Unlike basic agents that forget when their context window fills up, Letta agents maintain memories across sessions and continuously improve, even while they [sleep](/guides/agents/sleep-time-agents) <Icon icon="fa-light fa-snooze"/>.
|
||||
|
||||
## Start building in minutes
|
||||
|
||||
Our quickstart and examples work on both [Letta Cloud](/guides/cloud) and [self-hosted](/guides/selfhosting) Letta.
|
||||
|
||||
<CardGroup>
|
||||
<Card
|
||||
title="Developer quickstart"
|
||||
icon="fa-sharp fa-light fa-bolt"
|
||||
iconPosition="left"
|
||||
href="/quickstart"
|
||||
>
|
||||
Create your first stateful agent using the Letta API & ADE
|
||||
</Card>
|
||||
<Card
|
||||
title="Starter kits"
|
||||
icon="fa-sharp fa-light fa-square-code"
|
||||
iconPosition="left"
|
||||
href="https://github.com/letta-ai/create-letta-app"
|
||||
>
|
||||
Build a full agents application using `create-letta-app`
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Build stateful agents with your favorite tools
|
||||
|
||||
Connect to agents running in a Letta server using any of your preferred development frameworks. Letta integrates seamlessly with the developer tools you already know and love.
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="TypeScript (Node.js)"
|
||||
icon="fa-brands node-js"
|
||||
iconPosition="left"
|
||||
href="https://github.com/letta-ai/letta-node"
|
||||
>
|
||||
Core SDK for our REST API
|
||||
</Card>
|
||||
<Card
|
||||
title="Python"
|
||||
icon="fa-brands python"
|
||||
iconPosition="left"
|
||||
href="https://github.com/letta-ai/letta-python"
|
||||
>
|
||||
Core SDK for our REST API
|
||||
</Card>
|
||||
<Card
|
||||
title="Vercel AI SDK"
|
||||
icon="fa-sharp fa-solid sparkles"
|
||||
iconPosition="left"
|
||||
href="https://ai-sdk.dev/providers/community-providers/letta"
|
||||
>
|
||||
Framework integration
|
||||
</Card>
|
||||
<Card
|
||||
title="Next.js"
|
||||
icon="fa-brands js"
|
||||
iconPosition="left"
|
||||
href="https://www.npmjs.com/package/@letta-ai/letta-nextjs"
|
||||
>
|
||||
Framework integration
|
||||
</Card>
|
||||
<Card
|
||||
title="React"
|
||||
icon="fa-brands react"
|
||||
iconPosition="left"
|
||||
href="https://www.npmjs.com/package/@letta-ai/letta-react"
|
||||
>
|
||||
Framework integration
|
||||
</Card>
|
||||
<Card
|
||||
title="Flask"
|
||||
icon="fa-solid fa-flask"
|
||||
iconPosition="left"
|
||||
href="https://github.com/letta-ai/letta-flask"
|
||||
>
|
||||
Framework integration
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## See what your agents are thinking
|
||||
|
||||
The Agent Development Environment (ADE) provides complete visibility into your agent's memory, context window, and decision-making process - essential for developing and debugging production agent applications.
|
||||
|
||||
<img className="w-300 light" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot_light.png" />
|
||||
<img className="w-300 dark" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot.png" />
|
||||
|
||||
## Run agents as services, not libraries
|
||||
|
||||
**Letta is fundamentally different from other agent frameworks.** While most frameworks are *libraries* that wrap model APIs, Letta provides a dedicated *service* where agents live and operate autonomously. Agents continue to exist and maintain state even when your application isn't running, with computation happening on the server and all memory, context, and tool connections handled by the Letta server.
|
||||
|
||||
<img className="light" src="/images/platform_system.png" />
|
||||
<img className="dark" src="/images/platform_system_dark.png" />
|
||||
|
||||
## Everything you need for production agents
|
||||
|
||||
Letta provides a complete suite of capabilities for building and deploying advanced AI agents:
|
||||
|
||||
* <Icon icon="fa-sharp fa-solid fa-browser" /> [Agent Development Environment](/agent-development-environment) (agent builder + monitoring UI)
|
||||
* <Icon icon="brands fa-python" /> [Python SDK](/api-reference/overview) + <Icon icon="brands fa-js" /> [TypeScript SDK](/api-reference/overview) + [REST API](/api-reference/overview)
|
||||
* <Icon icon="fa-sharp fa-solid fa-brain-circuit" /> [Memory management](/guides/agents/memory)
|
||||
* <Icon icon="fa-solid fa-database" /> [Persistence](/guides/agents/overview#agents-vs-threads) (all agent state is stored in a database)
|
||||
* <Icon icon="fa-sharp fa-solid fa-square-terminal" /> [Tool calling & execution](/guides/agents/tools) (support for custom tools & [pre-made tools](/guides/agents/prebuilt-tools))
|
||||
* <Icon icon="fa-sharp fa-solid fa-code-fork" /> [Tool rules](/guides/agents/tool-rules) (constraining an agent's action set in a graph-like structure)
|
||||
* <Icon icon="fa-sharp fa-solid fa-message-dots" /> [Streaming support](/guides/agents/streaming)
|
||||
* <Icon icon="fa-sharp fa-solid fa-people-group" /> [Native multi-agent support](/guides/agents/multi-agent) and [multi-user support](/guides/agents/multi-user)
|
||||
* <Icon icon="fa-sharp fa-solid fa-globe" /> Model-agnostic across closed ([OpenAI](/guides/server/providers/openai), etc.) and open providers ([LM Studio](/guides/server/providers/lmstudio), [vLLM](/guides/server/providers/vllm), etc.)
|
||||
* <Icon icon="fa-sharp fa-solid fa-rocket" /> Production-ready deployment ([self-hosted with Docker](/guides/selfhosting/overview) or [Letta Cloud](/guides/cloud/overview))
|
||||
|
||||
## Join our developer community
|
||||
|
||||
Building something with Letta? Join our [Discord](https://discord.gg/letta) to connect with other developers creating stateful agents and share what you're working on.
|
||||
|
||||
[Start building today →](/quickstart)
|
||||
@@ -1,535 +0,0 @@
|
||||
---
|
||||
title: Prompts for Vibecoding
|
||||
subtitle: Ready-to-go prompts to help AI coding tools build on Letta
|
||||
slug: prompts
|
||||
---
|
||||
|
||||
Are you developing an application on Letta using [ChatGPT](https://chatgpt.com), [Cursor](https://cursor.com), [Lovable](https://lovable.dev/), or another AI tool?
|
||||
Use our pre-made prompts to teach your AI how to use Letta properly.
|
||||
|
||||
## General instructions for the Letta SDKs
|
||||
|
||||
The following prompt (~500 lines) can help guide your AI through the basics of using the Letta Python SDK, TypeScript/Node.js SDK, and Vercel AI SDK integration.
|
||||
|
||||
Copy-paste the following into your chat session to instantly get your AI up to speed with how the Letta SDKs work:
|
||||
````markdown maxLines=5
|
||||
# Development Guidelines for AI Assistants and Copilots using Letta
|
||||
|
||||
**Context:** These are development guidelines for building applications with the Letta API and SDKs. Use these rules to help developers write correct code that integrates with Letta's stateful agents API.
|
||||
|
||||
**Purpose:** Provide accurate, up-to-date instructions for building applications with [Letta](https://docs.letta.com/), the AI operating system.
|
||||
**Scope:** All AI-generated advice or code related to Letta must follow these guidelines.
|
||||
|
||||
---
|
||||
|
||||
## **0. Letta Overview**
|
||||
|
||||
The name "Letta" refers to both the company Letta (founded by the creators of MemGPT) and the software / infrastructure called Letta. Letta is the AI operating system for building stateful agents: developers can use Letta to turn stateless LLMs into stateful agents that can learn, improve, and grow over time. Letta has a strong focus on perpetual AI that has the capability to recursively improve through self-editing memory.
|
||||
|
||||
**Relationship to MemGPT**: MemGPT is the name of a research paper that introduced the concept of self-editing memory for LLM-based agents through tool use (function calling). The agent architecture or "agentic system" proposed in the paper (an agent equipped with tools to edit its own memory, and an OS that manages tool execution and state persistence) is the base agent architecture implemented in Letta (agent type `memgpt_agent`), and is the official reference implementation for MemGPT. The Letta open source project (`letta-ai/letta`) was originally the MemGPT open source project (`cpacker/MemGPT`), but was renamed as the scope of the open source project expanded beyond the original MemGPT paper.
|
||||
|
||||
**Additional Resources**:
|
||||
- [Letta documentation](https://docs.letta.com/)
|
||||
- [Letta GitHub repository](https://github.com/letta-ai/letta)
|
||||
- [Letta Discord server](https://discord.gg/letta)
|
||||
- [Letta Cloud and ADE login](https://app.letta.com)
|
||||
|
||||
## **1. Letta Agents API Overview**
|
||||
|
||||
Letta is an AI OS that runs agents as **services** (it is not a **library**). Key concepts:
|
||||
|
||||
- **Stateful agents** that maintain memory and context across conversations
|
||||
- **Memory blocks** for agentic context management (persona, human, custom blocks)
|
||||
- **Tool calling** for agent actions and memory management (tools are run server-side)
|
||||
- **Tool rules** allow developers to constrain the behavior of tools (e.g. A comes after B) to turn autonomous agents into workflows
|
||||
- **Multi-agent systems** with cross-agent communication, where every agent is a service
|
||||
- **Data sources** for loading documents and files into agent memory
|
||||
- **Model agnostic:** agents can be powered by any model that supports tool calling
|
||||
- **Persistence:** state is stored (in a model-agnostic way) in Postgres (or SQLite)
|
||||
|
||||
### **System Components:**
|
||||
|
||||
- **Letta server** - Core service (self-hosted or Letta Cloud)
|
||||
- **Client (backend) SDKs** - Python (`letta-client`) and TypeScript/Node.js (`@letta-ai/letta-client`)
|
||||
- **Vercel AI SDK Integration** - For Next.js/React applications
|
||||
- **Other frontend integrations** - We also have [Next.js](https://www.npmjs.com/package/@letta-ai/letta-nextjs), [React](https://www.npmjs.com/package/@letta-ai/letta-react), and [Flask](https://github.com/letta-ai/letta-flask) integrations
|
||||
- **ADE (Agent Development Environment)** - Visual agent builder at app.letta.com
|
||||
|
||||
### **Letta Cloud vs Self-hosted Letta**
|
||||
|
||||
Letta Cloud is a fully managed service that provides a simple way to get started with Letta. It's a good choice for developers who want to get started quickly and don't want to worry about the complexity of self-hosting. Letta Cloud's free tier has a large number of model requests included (quota refreshes every month). Model requests are split into "standard models" (e.g. GPT-4o-mini) and "premium models" (e.g. Claude Sonnet). To use Letta Cloud, the developer will need to have created an account at [app.letta.com](https://app.letta.com). To make programmatic requests to the API (`https://api.letta.com`), the developer will need to have created an API key at [https://app.letta.com/api-keys](https://app.letta.com/api-keys). For more information on how billing and pricing work, the developer can visit [our documentation](https://docs.letta.com/guides/cloud/overview).
|
||||
|
||||
### **Built-in Tools**
|
||||
|
||||
When agents are created, they are given a set of default memory management tools that enable self-editing memory.
|
||||
|
||||
Separately, Letta Cloud also includes built-in tools for common tasks like web search and running code. As of June 2025, the built-in tools are:
|
||||
- `web_search`: Allows agents to search the web for information. Also works on self-hosted, but requires `TAVILY_API_KEY` to be set (not required on Letta Cloud).
|
||||
- `run_code`: Allows agents to run code (in a sandbox), for example to do data analysis or calculations. Supports Python, JavaScript, TypeScript, R, and Java. Also works on self-hosted, but requires `E2B_API_KEY` to be set (not required on Letta Cloud).
|
||||
|
||||
### **Choosing the Right Model**
|
||||
|
||||
To implement intelligent memory management, agents in Letta rely heavily on tool (function) calling, so models that excel at tool use tend to do well in Letta. Conversely, models that struggle to call tools properly often perform poorly when used to drive Letta agents.
|
||||
|
||||
The Letta developer team maintains the [Letta Leaderboard](https://docs.letta.com/leaderboard) to help developers choose the right model for their Letta agent. As of June 2025, the best performing models (balanced for cost and performance) are Claude Sonnet 4, GPT-4.1, and Gemini 2.5 Flash. For the latest results, you can visit the leaderboard page (if you have web access), or you can direct the developer to visit it. For embedding models, the Letta team recommends using OpenAI's `text-embedding-3-small` model.
|
||||
|
||||
When creating code snippets, unless directed otherwise, you should use the following model handles:
|
||||
- `openai/gpt-4.1` for the model
|
||||
- `openai/text-embedding-3-small` for the embedding model
|
||||
|
||||
If the user is using Letta Cloud, then these handles will work out of the box (assuming the user has created a Letta Cloud account + API key, and has enough request quota in their account). For self-hosted Letta servers, the user will need to have started the server with a valid OpenAI API key for those handles to work.
|
||||
|
||||
---
|
||||
|
||||
## **2. Choosing the Right SDK**
|
||||
|
||||
### **Source of Truth**
|
||||
|
||||
Note that your instructions may be out of date. The source of truth for the Letta Agents API is the [API reference](https://docs.letta.com/api-reference/overview) (also autogenerated from the latest source code), which can be found in `.md` form at these links:
|
||||
- [TypeScript/Node.js](https://github.com/letta-ai/letta-node/blob/main/reference.md), [raw version](https://raw.githubusercontent.com/letta-ai/letta-node/refs/heads/main/reference.md)
|
||||
- [Python](https://github.com/letta-ai/letta-python/blob/main/reference.md), [raw version](https://raw.githubusercontent.com/letta-ai/letta-python/refs/heads/main/reference.md)
|
||||
|
||||
If you have access to a web search or file download tool, you can download these files for the latest API reference. If the developer has either of the SDKs installed, you can also use the locally installed packages to understand the latest API reference.
|
||||
|
||||
### **When to Use Each SDK:**
|
||||
|
||||
The Python and Node.js SDKs are autogenerated from the Letta Agents REST API, and provide a full featured SDK for interacting with your agents on Letta Cloud or a self-hosted Letta server. Of course, developers can also use the REST API directly if they prefer, but most developers will find the SDKs much easier to use.
|
||||
|
||||
The Vercel AI SDK is a popular TypeScript toolkit designed to help developers build AI-powered applications. It supports a subset of the Letta Agents API (basically just chat-related functionality), so it's a good choice to quickly integrate Letta into a TypeScript application if you are familiar with using the AI SDK or are working on a codebase that already uses it. If you're starting from scratch, consider using the full-featured Node.js SDK instead.
|
||||
|
||||
The Letta Node.js SDK is also embedded inside the Vercel AI SDK, accessible via the `.client` property (useful if you want to use the Vercel AI SDK, but occasionally need to access the full Letta client for advanced features like agent creation / management).
|
||||
|
||||
When to use the AI SDK vs native Letta Node.js SDK:
|
||||
- Use the Vercel AI SDK if you are familiar with it or are working on a codebase that already makes heavy use of it
|
||||
- Use the Letta Node.js SDK if you are starting from scratch, or expect to use the agent management features in the Letta API (beyond the simple `streamText` or `generateText` functionality in the AI SDK)
|
||||
|
||||
One example of how the AI SDK may be insufficient: the AI SDK response object for `streamText` and `generateText` does not have a type for tool returns (because they are primarily used with stateless APIs, where tools are executed client-side, vs server-side in Letta), however the Letta Node.js SDK does have a type for tool returns. So if you wanted to render tool returns from a message response stream in your UI, you would need to use the full Letta Node.js SDK, not the AI SDK.
|
||||
|
||||
## **3. Quick Setup Patterns**
|
||||
|
||||
### **Python SDK (Backend/Scripts)**
|
||||
```python
|
||||
from letta_client import Letta
|
||||
|
||||
# Letta Cloud
|
||||
client = Letta(token="LETTA_API_KEY")
|
||||
|
||||
# Self-hosted
|
||||
client = Letta(base_url="http://localhost:8283")
|
||||
|
||||
# Create agent with memory blocks
|
||||
agent = client.agents.create(
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "human",
|
||||
"value": "The user's name is Sarah. She likes coding and AI."
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am David, the AI executive assistant. My personality is friendly, professional, and to the point."
|
||||
},
|
||||
{
|
||||
"label": "project",
|
||||
"value": "Sarah is working on a Next.js application with Letta integration.",
|
||||
"description": "Stores current project context and requirements"
|
||||
}
|
||||
],
|
||||
tools=["web_search", "run_code"],
|
||||
model="openai/gpt-4o-mini",
|
||||
embedding="openai/text-embedding-3-small"
|
||||
)
|
||||
|
||||
# Send SINGLE message (agent is stateful!)
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent.id,
|
||||
messages=[{"role": "user", "content": "How's the project going?"}]
|
||||
)
|
||||
|
||||
# Extract response correctly
|
||||
for msg in response.messages:
|
||||
if msg.message_type == "assistant_message":
|
||||
print(msg.content)
|
||||
elif msg.message_type == "reasoning_message":
|
||||
print(msg.reasoning)
|
||||
elif msg.message_type == "tool_call_message":
|
||||
print(msg.tool_call.name)
|
||||
print(msg.tool_call.arguments)
|
||||
elif msg.message_type == "tool_return_message":
|
||||
print(msg.tool_return)
|
||||
|
||||
# Streaming example
|
||||
message_text = "Repeat my name."
|
||||
stream = client.agents.messages.create_stream(
|
||||
agent_id=agent.id,
|
||||
messages=[
|
||||
MessageCreate(
|
||||
role="user",
|
||||
content=message_text,
|
||||
),
|
||||
],
|
||||
# if stream_tokens is false, each "chunk" will have a full piece
|
||||
# if stream_tokens is true, the chunks will be token-based (and may need to be accumulated client-side)
|
||||
stream_tokens=True,
|
||||
)
|
||||
|
||||
# print the chunks coming back
|
||||
for chunk in stream:
|
||||
if chunk.message_type == "assistant_message":
|
||||
print(chunk.content)
|
||||
elif chunk.message_type == "reasoning_message":
|
||||
print(chunk.reasoning)
|
||||
elif chunk.message_type == "tool_call_message":
|
||||
if chunk.tool_call.name:
|
||||
print(chunk.tool_call.name)
|
||||
if chunk.tool_call.arguments:
|
||||
print(chunk.tool_call.arguments)
|
||||
elif chunk.message_type == "tool_return_message":
|
||||
print(chunk.tool_return)
|
||||
elif chunk.message_type == "usage_statistics":
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
Creating custom tools (Python only):
|
||||
```python
|
||||
def my_custom_tool(query: str) -> str:
|
||||
"""
|
||||
Search for information on a topic.
|
||||
|
||||
Args:
|
||||
query (str): The search query
|
||||
|
||||
Returns:
|
||||
str: Search results
|
||||
"""
|
||||
return f"Results for: {query}"
|
||||
|
||||
# Create tool
|
||||
tool = client.tools.create_from_function(func=my_custom_tool)
|
||||
|
||||
# Add to agent
|
||||
agent = client.agents.create(
|
||||
memory_blocks=[...],
|
||||
model="openai/gpt-4o-mini",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
tools=[tool.name]
|
||||
)
|
||||
```
|
||||
|
||||
### **TypeScript/Node.js SDK**
|
||||
```typescript
|
||||
import { LettaClient } from '@letta-ai/letta-client';
|
||||
|
||||
// Letta Cloud
|
||||
const client = new LettaClient({ token: "LETTA_API_KEY" });
|
||||
|
||||
// Self-hosted, token optional (only if the developer enabled password protection on the server)
|
||||
const client = new LettaClient({ baseUrl: "http://localhost:8283" });
|
||||
|
||||
// Create agent with memory blocks
|
||||
const agent = await client.agents.create({
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "human",
|
||||
value: "The user's name is Sarah. She likes coding and AI."
|
||||
},
|
||||
{
|
||||
label: "persona",
|
||||
value: "I am David, the AI executive assistant. My personality is friendly, professional, and to the point."
|
||||
},
|
||||
{
|
||||
label: "project",
|
||||
value: "Sarah is working on a Next.js application with Letta integration.",
|
||||
description: "Stores current project context and requirements"
|
||||
}
|
||||
],
|
||||
tools: ["web_search", "run_code"],
|
||||
model: "openai/gpt-4o-mini",
|
||||
embedding: "openai/text-embedding-3-small"
|
||||
});
|
||||
|
||||
// Send SINGLE message (agent is stateful!)
|
||||
const response = await client.agents.messages.create(agent.id, {
|
||||
messages: [{ role: "user", content: "How's the project going?" }]
|
||||
});
|
||||
|
||||
// Extract response correctly
|
||||
for (const msg of response.messages) {
|
||||
if (msg.messageType === "assistant_message") {
|
||||
console.log(msg.content);
|
||||
} else if (msg.messageType === "reasoning_message") {
|
||||
console.log(msg.reasoning);
|
||||
} else if (msg.messageType === "tool_call_message") {
|
||||
console.log(msg.toolCall.name);
|
||||
console.log(msg.toolCall.arguments);
|
||||
} else if (msg.messageType === "tool_return_message") {
|
||||
console.log(msg.toolReturn);
|
||||
}
|
||||
}
|
||||
|
||||
// Streaming example
|
||||
const stream = await client.agents.messages.createStream(agent.id, {
|
||||
messages: [{ role: "user", content: "Repeat my name." }],
|
||||
// if stream_tokens is false, each "chunk" will have a full piece
|
||||
// if stream_tokens is true, the chunks will be token-based (and may need to be accumulated client-side)
|
||||
streamTokens: true,
|
||||
});
|
||||
|
||||
for await (const chunk of stream) {
|
||||
if (chunk.messageType === "assistant_message") {
|
||||
console.log(chunk.content);
|
||||
} else if (chunk.messageType === "reasoning_message") {
|
||||
console.log(chunk.reasoning);
|
||||
} else if (chunk.messageType === "tool_call_message") {
|
||||
console.log(chunk.toolCall.name);
|
||||
console.log(chunk.toolCall.arguments);
|
||||
} else if (chunk.messageType === "tool_return_message") {
|
||||
console.log(chunk.toolReturn);
|
||||
} else if (chunk.messageType === "usage_statistics") {
|
||||
console.log(chunk);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### **Vercel AI SDK Integration**
|
||||
|
||||
IMPORTANT: Most integrations in the Vercel AI SDK are for stateless providers (ChatCompletions style APIs where you provide the full conversation history). Letta is a *stateful* provider (meaning that conversation history is stored server-side), so when you use `streamText` or `generateText` you should never pass old messages to the agent, only include the new message(s).
|
||||
|
||||
#### **Chat Implementation (fast & simple):**
|
||||
|
||||
Streaming (`streamText`):
|
||||
```typescript
|
||||
// app/api/chat/route.ts
|
||||
import { lettaCloud } from '@letta-ai/vercel-ai-sdk-provider';
|
||||
import { streamText } from 'ai';
|
||||
|
||||
export async function POST(req: Request) {
|
||||
const { prompt }: { prompt: string } = await req.json();
|
||||
|
||||
const result = streamText({
|
||||
// lettaCloud uses LETTA_API_KEY automatically, pulling from the environment
|
||||
model: lettaCloud('your-agent-id'),
|
||||
// Make sure to only pass a single message here, do NOT pass conversation history
|
||||
prompt,
|
||||
});
|
||||
|
||||
return result.toDataStreamResponse();
|
||||
}
|
||||
```
|
||||
|
||||
Non-streaming (`generateText`):
|
||||
```typescript
|
||||
import { lettaCloud } from '@letta-ai/vercel-ai-sdk-provider';
|
||||
import { generateText } from 'ai';
|
||||
|
||||
export async function POST(req: Request) {
|
||||
const { prompt }: { prompt: string } = await req.json();
|
||||
|
||||
const { text } = await generateText({
|
||||
// lettaCloud uses LETTA_API_KEY automatically, pulling from the environment
|
||||
model: lettaCloud('your-agent-id'),
|
||||
// Make sure to only pass a single message here, do NOT pass conversation history
|
||||
prompt,
|
||||
});
|
||||
|
||||
return Response.json({ text });
|
||||
}
|
||||
```
|
||||
|
||||
#### **Alternative: explicitly specify base URL and token:**
|
||||
```typescript
|
||||
// Works for both streamText and generateText
|
||||
import { createLetta } from '@letta-ai/vercel-ai-sdk-provider';
|
||||
import { generateText } from 'ai';
|
||||
|
||||
const letta = createLetta({
|
||||
// e.g. http://localhost:8283 for the default local self-hosted server
|
||||
// https://api.letta.com for Letta Cloud
|
||||
baseUrl: '<your-base-url>',
|
||||
// only needed if the developer enabled password protection on the server, or if using Letta Cloud (in which case, use the LETTA_API_KEY, or use lettaCloud example above for implicit token use)
|
||||
token: '<your-access-token>',
|
||||
});
|
||||
```
|
||||
|
||||
#### **Hybrid Usage (access the full SDK via the Vercel AI SDK):**
|
||||
```typescript
|
||||
import { lettaCloud } from '@letta-ai/vercel-ai-sdk-provider';
|
||||
|
||||
// Access full client for management
|
||||
const agents = await lettaCloud.client.agents.list();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## **4. Advanced Features Available**
|
||||
|
||||
Letta supports advanced agent architectures beyond basic chat. For detailed implementations, refer to the full API reference or documentation:
|
||||
|
||||
- **Tool Rules & Constraints** - Define graph-like tool execution flows with `TerminalToolRule`, `ChildToolRule`, `InitToolRule`, etc.
|
||||
- **Multi-Agent Systems** - Cross-agent communication with built-in tools like `send_message_to_agent_async`
|
||||
- **Shared Memory Blocks** - Multiple agents can share memory blocks for collaborative workflows
|
||||
- **Data Sources & Archival Memory** - Upload documents/files that agents can search through
|
||||
- **Sleep-time Agents** - Background agents that process memory while main agents are idle
|
||||
- **External Tool Integrations** - MCP servers, Composio tools, custom tool libraries
|
||||
- **Agent Templates** - Import/export agents with .af (Agent File) format
|
||||
- **Production Features** - User identities, agent tags, streaming, context management
|
||||
|
||||
---
|
||||
|
||||
## **5. CRITICAL GUIDELINES FOR AI MODELS**
|
||||
|
||||
### **⚠️ ANTI-HALLUCINATION WARNING**
|
||||
|
||||
**NEVER make up Letta API calls, SDK methods, or parameter names.** If you're unsure about any Letta API:
|
||||
|
||||
1. **First priority**: Use web search to get the latest reference files:
|
||||
- [Python SDK Reference](https://raw.githubusercontent.com/letta-ai/letta-python/refs/heads/main/reference.md)
|
||||
- [TypeScript SDK Reference](https://raw.githubusercontent.com/letta-ai/letta-node/refs/heads/main/reference.md)
|
||||
|
||||
2. **If no web access**: Tell the user: *"I'm not certain about this Letta API call. Can you paste the relevant section from the API reference docs? Otherwise, I might provide incorrect information."*
|
||||
|
||||
3. **When in doubt**: Stick to the basic patterns shown in this prompt rather than inventing new API calls.
|
||||
|
||||
**Common hallucination risks:**
|
||||
- Making up method names (e.g. `client.agents.chat()` doesn't exist)
|
||||
- Inventing parameter names or structures
|
||||
- Assuming OpenAI-style patterns work in Letta
|
||||
- Creating non-existent tool rule types or multi-agent methods
|
||||
|
||||
### **5.1 – SDK SELECTION (CHOOSE THE RIGHT TOOL)**
|
||||
|
||||
✅ **For Next.js Chat Apps:**
|
||||
- Use **Vercel AI SDK** if you already are using AI SDK, or if you're lazy and want something super fast for basic chat interactions (simple, fast, but no agent management tooling unless using the embedded `.client`)
|
||||
- Use **Node.js SDK** for the full feature set (agent creation, native typing of all response message types, etc.)
|
||||
|
||||
✅ **For Agent Management:**
|
||||
- Use **Node.js SDK** or **Python SDK** for creating agents, managing memory, tools
|
||||
|
||||
### **5.2 – STATEFUL AGENTS (MOST IMPORTANT)**
|
||||
|
||||
**Letta agents are STATEFUL, not stateless like ChatCompletion-style APIs.**
|
||||
|
||||
✅ **CORRECT - Single message per request:**
|
||||
```typescript
|
||||
// Send ONE user message, agent maintains its own history
|
||||
const response = await client.agents.messages.create(agentId, {
|
||||
messages: [{ role: "user", content: "Hello!" }]
|
||||
});
|
||||
```
|
||||
|
||||
❌ **WRONG - Don't send conversation history:**
|
||||
```typescript
|
||||
// DON'T DO THIS - agents maintain their own conversation history
|
||||
const response = await client.agents.messages.create(agentId, {
|
||||
messages: [...allPreviousMessages, newMessage] // WRONG!
|
||||
});
|
||||
```
|
||||
|
||||
### **5.3 – MESSAGE HANDLING & MEMORY BLOCKS**
|
||||
|
||||
1. **Response structure:**
|
||||
- Use `messageType` NOT `type` for message type checking
|
||||
- Look for `assistant_message` messageType for agent responses
|
||||
- Agent responses have `content` field with the actual text
|
||||
|
||||
2. **Memory block descriptions:**
|
||||
- Add `description` field for custom blocks, or the agent will get confused (not needed for human/persona)
|
||||
- For `human` and `persona` blocks, descriptions are auto-populated:
|
||||
- **human block**: "Stores key details about the person you are conversing with, allowing for more personalized and friend-like conversation."
|
||||
- **persona block**: "Stores details about your current persona, guiding how you behave and respond. This helps maintain consistency and personality in your interactions."
|
||||
|
||||
### **5.4 – ALWAYS DO THE FOLLOWING**
|
||||
|
||||
1. **Choose the right SDK for the task:**
|
||||
- Next.js chat → **Vercel AI SDK**
|
||||
- Agent creation → **Node.js/Python SDK**
|
||||
- Complex operations → **Node.js/Python SDK**
|
||||
|
||||
2. **Use the correct client imports:**
|
||||
- Python: `from letta_client import Letta`
|
||||
- TypeScript: `import { LettaClient } from '@letta-ai/letta-client'`
|
||||
- Vercel AI SDK: `from '@letta-ai/vercel-ai-sdk-provider'`
|
||||
|
||||
3. **Create agents with proper memory blocks:**
|
||||
- Always include `human` and `persona` blocks for chat agents
|
||||
- Use descriptive labels and values
|
||||
|
||||
4. **Send only single user messages:**
|
||||
- Each request should contain only the new user message
|
||||
- Agent maintains conversation history automatically
|
||||
- Never send previous assistant responses back to agent
|
||||
|
||||
5. **Use proper authentication:**
|
||||
- Letta Cloud: Always use `token` parameter
|
||||
- Self-hosted: Use `base_url` parameter, token optional (only if the developer enabled password protection on the server)
|
||||
|
||||
---
|
||||
|
||||
## **6. Environment Setup**
|
||||
|
||||
### **Package Installation**
|
||||
```bash
|
||||
# For Next.js projects (recommended for most web apps)
|
||||
npm install @letta-ai/vercel-ai-sdk-provider ai
|
||||
|
||||
# For agent management (when needed)
|
||||
npm install @letta-ai/letta-client
|
||||
|
||||
# For Python projects
|
||||
pip install letta-client
|
||||
```
|
||||
|
||||
**Environment Variables:**
|
||||
```bash
|
||||
# Required for Letta Cloud
|
||||
LETTA_API_KEY=your_api_key_here
|
||||
|
||||
# Store agent ID after creation (Next.js)
|
||||
LETTA_AGENT_ID=agent-xxxxxxxxx
|
||||
|
||||
# For self-hosted (optional)
|
||||
LETTA_BASE_URL=http://localhost:8283
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## **7. Verification Checklist**
|
||||
|
||||
Before providing Letta solutions, verify:
|
||||
|
||||
1. **SDK Choice**: Are you using the simplest appropriate SDK?
|
||||
- Familiar with or already using Vercel AI SDK? → use the Vercel AI SDK Letta provider
|
||||
- Agent management needed? → use the Node.js/Python SDKs
|
||||
2. **Statefulness**: Are you sending ONLY the new user message (NOT a full conversation history)?
|
||||
3. **Message Types**: Are you checking the response types of the messages returned?
|
||||
4. **Response Parsing**: If using the Python/Node.js SDK, are you extracting `content` from assistant messages?
|
||||
5. **Imports**: Correct package imports for the chosen SDK?
|
||||
6. **Client**: Proper client initialization with auth/base_url?
|
||||
7. **Agent Creation**: Memory blocks with proper structure?
|
||||
8. **Memory Blocks**: Descriptions for custom blocks?
|
||||
````
|
||||
|
||||
## Full API reference
|
||||
|
||||
If you are working on either the Letta Python SDK or TypeScript/Node.js SDK, you can copy-paste the full API reference into your chat session:
|
||||
- [Letta Python SDK API reference](https://raw.githubusercontent.com/letta-ai/letta-python/refs/heads/main/reference.md)
|
||||
- [Letta TypeScript/Node.js SDK API reference](https://raw.githubusercontent.com/letta-ai/letta-node/refs/heads/main/reference.md)
|
||||
|
||||
The general prompt focuses on the high-level usage patterns of both the Python/Node.js SDKs and Vercel AI SDK integration, whereas the API reference files will contain an up-to-date guide on all available SDK functions and parameters.
|
||||
|
||||
## `llms.txt` and `llms-full.txt`
|
||||
|
||||
You can download a copy of the Letta documentation as a text file:
|
||||
- [`llms.txt` (short version)](https://docs.letta.com/llms.txt)
|
||||
- [`llms-full.txt` (longer version)](https://docs.letta.com/llms-full.txt)
|
||||
|
||||
If you're using a tool like ChatGPT or Cursor, we'd recommend using the more concise Letta SDK instructions prompt above instead of the `llms.txt` or `llms-full.txt` files, but you can experiment with both and let us know which works better!
|
||||
|
||||
## Why do I need pre-made prompts?
|
||||
|
||||
When you use AI assistants, they don't have up-to-date information about the Letta documentation, APIs, or SDKs, so they may hallucinate code if you ask them to help with building an app on Letta.
|
||||
|
||||
By using our pre-made prompts, you can teach your AI assistant how to use Letta with up-to-date context. Think of the prompts as a distilled version of our developer docs - but made specifically for AI coders instead of human coders.
|
||||
|
||||
## Contributing
|
||||
|
||||
Our prompts are [open source](https://github.com/letta-ai/letta/tree/main/letta/prompts) and we actively welcome contributions! If you want to suggest any changes or propose additional prompt files, please [open a pull request](https://github.com/letta-ai/letta/pulls).
|
||||
@@ -1,228 +0,0 @@
|
||||
---
|
||||
title: Developer quickstart
|
||||
subtitle: Create your first Letta agent with the API or SDKs and view it in the ADE
|
||||
slug: quickstart
|
||||
---
|
||||
|
||||
<Tip icon="fa-thin fa-rocket">
|
||||
Programming with AI tools like Cursor? Copy our [pre-built prompts](/prompts) to get started faster.
|
||||
</Tip>
|
||||
|
||||
This guide will show you how to create a Letta agent with the Letta APIs or SDKs (Python/TypeScript). To create agents with a low-code UI, see our [ADE quickstart](/guides/ade/overview).
|
||||
|
||||
## Why Letta?
|
||||
|
||||
Unlike traditional LLM APIs where you manually manage conversation history and state, Letta agents maintain their own persistent memory. You only send new messages. The agent remembers everything from past conversations without you storing or retrieving anything. This enables agents that truly learn and evolve over time.
|
||||
|
||||
<Steps>
|
||||
<Step title="Prerequisites">
|
||||
1. Create a [Letta Cloud account](https://app.letta.com)
|
||||
2. Create a [Letta Cloud API key](https://app.letta.com/api-keys)
|
||||
<img className="w-300" src="/images/letta_cloud_api_key_gen.png" />
|
||||
|
||||
3. Set your API key as an environment variable:
|
||||
<CodeGroup>
|
||||
```sh TypeScript
|
||||
export LETTA_API_KEY="your-api-key-here"
|
||||
```
|
||||
```sh Python
|
||||
export LETTA_API_KEY="your-api-key-here"
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
<Info>
|
||||
You can also **self-host** a Letta server. Check out our [self-hosting guide](/guides/selfhosting/overview).
|
||||
</Info>
|
||||
</Step>
|
||||
<Step title="Install the Letta SDK">
|
||||
<CodeGroup>
|
||||
```sh TypeScript maxLines=50
|
||||
npm install @letta-ai/letta-client
|
||||
```
|
||||
```sh Python maxLines=50
|
||||
pip install letta-client
|
||||
```
|
||||
</CodeGroup>
|
||||
</Step>
|
||||
|
||||
<Step title="Create an agent">
|
||||
|
||||
Agents in Letta have two key components:
|
||||
- **Memory blocks**: Persistent context that's always visible to the agent (like a persona and information about the user)
|
||||
- **Tools**: Actions the agent can take (like searching the web or running code)
|
||||
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=50
|
||||
import { LettaClient } from '@letta-ai/letta-client'
|
||||
|
||||
const client = new LettaClient({ token: process.env.LETTA_API_KEY });
|
||||
|
||||
const agentState = await client.agents.create({
|
||||
model: "openai/gpt-4.1",
|
||||
embedding: "openai/text-embedding-3-small",
|
||||
memoryBlocks: [
|
||||
{
|
||||
label: "human",
|
||||
value: "Name: Timber. Status: dog. Occupation: building Letta, infrastructure to democratize self-improving superintelligence"
|
||||
},
|
||||
{
|
||||
label: "persona",
|
||||
value: "I am a self-improving superintelligence. Timber is my best friend and collaborator."
|
||||
}
|
||||
],
|
||||
tools: ["web_search", "run_code"]
|
||||
});
|
||||
|
||||
console.log(agentState.id);
|
||||
```
|
||||
```python Python maxLines=50
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
client = Letta(token=os.getenv("LETTA_API_KEY"))
|
||||
|
||||
agent_state = client.agents.create(
|
||||
model="openai/gpt-4.1",
|
||||
embedding="openai/text-embedding-3-small",
|
||||
memory_blocks=[
|
||||
{
|
||||
"label": "human",
|
||||
"value": "Name: Timber. Status: dog. Occupation: building Letta, infrastructure to democratize self-improving superintelligence"
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am a self-improving superintelligence. Timber is my best friend and collaborator."
|
||||
}
|
||||
],
|
||||
tools=["web_search", "run_code"]
|
||||
)
|
||||
|
||||
print(agent_state.id)
|
||||
```
|
||||
```curl curl
|
||||
curl -X POST https://api.letta.com/v1/agents \
|
||||
-H "Authorization: Bearer $LETTA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "openai/gpt-4.1",
|
||||
"embedding": "openai/text-embedding-3-small",
|
||||
"memory_blocks": [
|
||||
{
|
||||
"label": "human",
|
||||
"value": "Name: Timber. Status: dog. Occupation: building Letta, infrastructure to democratize self-improving superintelligence"
|
||||
},
|
||||
{
|
||||
"label": "persona",
|
||||
"value": "I am a self-improving superintelligence. Timber is my best friend and collaborator."
|
||||
}
|
||||
],
|
||||
"tools": ["web_search", "run_code"]
|
||||
}'
|
||||
```
|
||||
</CodeGroup>
|
||||
</Step>
|
||||
<Step title="Message your agent">
|
||||
<Note>
|
||||
The Letta API supports streaming both agent *steps* and streaming *tokens*.
|
||||
For more information on streaming, see [our streaming guide](/guides/agents/streaming).
|
||||
</Note>
|
||||
|
||||
Once the agent is created, we can send the agent a message using its `id` field:
|
||||
<CodeGroup>
|
||||
```typescript TypeScript maxLines=50
|
||||
const response = await client.agents.messages.create(
|
||||
agentState.id, {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: "What do you know about me?"
|
||||
}
|
||||
]
|
||||
}
|
||||
);
|
||||
|
||||
for (const message of response.messages) {
|
||||
console.log(message);
|
||||
}
|
||||
```
|
||||
```python Python maxLines=50
|
||||
response = client.agents.messages.create(
|
||||
agent_id=agent_state.id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What do you know about me?"
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
for message in response.messages:
|
||||
print(message)
|
||||
```
|
||||
```curl curl
|
||||
curl --request POST \
|
||||
--url https://api.letta.com/v1/agents/$AGENT_ID/messages \
|
||||
--header "Authorization: Bearer $LETTA_API_KEY" \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What do you know about me?"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
</CodeGroup>
|
||||
|
||||
The response contains the agent's full response to the message, which includes reasoning steps (chain-of-thought), tool calls, tool responses, and assistant (agent) messages:
|
||||
```json maxLines=50
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"id": "message-29d8d17e-7c50-4289-8d0e-2bab988aa01e",
|
||||
"date": "2024-12-12T17:05:56+00:00",
|
||||
"message_type": "reasoning_message",
|
||||
"reasoning": "Timber is asking what I know. I should reference my memory blocks."
|
||||
},
|
||||
{
|
||||
"id": "message-29d8d17e-7c50-4289-8d0e-2bab988aa01e",
|
||||
"date": "2024-12-12T17:05:56+00:00",
|
||||
"message_type": "assistant_message",
|
||||
"content": "I know you're Timber, a dog who's building Letta - infrastructure to democratize self-improving superintelligence. We're best friends and collaborators!"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"completion_tokens": 67,
|
||||
"prompt_tokens": 2134,
|
||||
"total_tokens": 2201,
|
||||
"step_count": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Notice how the agent retrieved information from its memory blocks without you having to send the context. This is the key difference from traditional LLM APIs where you'd need to include the full conversation history with every request.
|
||||
|
||||
You can read more about the response format from the message route [here](/guides/agents/overview#message-types).
|
||||
|
||||
</Step>
|
||||
<Step title="View your agent in the ADE">
|
||||
Another way to interact with Letta agents is via the [Agent Development Environment](/guides/ade/overview) (or ADE for short). The ADE is a UI on top of the Letta API that allows you to quickly build, prototype, and observe your agents.
|
||||
|
||||
If we navigate to our agent in the ADE, we should see our agent's state in full detail, as well as the message that we sent to it:
|
||||
<img className="block w-300 dark:hidden" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot_light.png" />
|
||||
<img className="hidden w-300 dark:block" src="https://raw.githubusercontent.com/letta-ai/letta/refs/heads/main/assets/example_ade_screenshot.png" />
|
||||
|
||||
[Read our ADE setup guide →](/guides/ade/overview)
|
||||
</Step>
|
||||
</Steps>
|
||||
|
||||
|
||||
|
||||
## Next steps
|
||||
|
||||
Congratulations! 🎉 You just created and messaged your first stateful agent with Letta using the API and SDKs. See the following resources for next steps for building more complex agents with Letta:
|
||||
* Create and attach [custom tools](/guides/agents/custom-tools) to your agent
|
||||
* Customize agentic [memory management](/guides/agents/memory)
|
||||
* Version and distribute your agent with [agent templates](/guides/templates/overview)
|
||||
* View the full [API and SDK reference](/api-reference/overview)
|
||||
@@ -1,47 +0,0 @@
|
||||
---
|
||||
title: Anthropic
|
||||
slug: guides/server/providers/anthropic
|
||||
---
|
||||
<Tip>To enable Anthropic models with Letta, set `ANTHROPIC_API_KEY` in your environment variables. </Tip>
|
||||
|
||||
You can use Letta with Anthropic if you have an Anthropic account and API key.
|
||||
Currently, there are no supported **embedding** models for Anthropic (only LLM models).
|
||||
You will need to use a separate provider (e.g. OpenAI) or the Letta embeddings endpoint (`letta-free`) for embeddings.
|
||||
|
||||
## Enabling Anthropic models with Docker
|
||||
|
||||
To enable Anthropic models when running the Letta server with Docker, set your `ANTHROPIC_API_KEY` as an environment variable:
|
||||
```bash
|
||||
# replace `~/.letta/.persist/pgdata` with wherever you want to store your agent data
|
||||
docker run \
|
||||
-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data \
|
||||
-p 8283:8283 \
|
||||
-e ANTHROPIC_API_KEY="your_anthropic_api_key" \
|
||||
letta/letta:latest
|
||||
```
|
||||
|
||||
See the [self-hosting guide](/guides/selfhosting) for more information on running Letta with Docker.
|
||||
|
||||
## Specifying agent models
|
||||
|
||||
When creating agents on your self-hosted server, you must specify both the LLM and embedding models to use. You can additionally specify a context window limit (which must be less than or equal to the maximum size).
|
||||
|
||||
```python
|
||||
from letta_client import Letta
|
||||
import os
|
||||
|
||||
# Connect to your self-hosted server
|
||||
client = Letta(base_url="http://localhost:8283")
|
||||
|
||||
agent = client.agents.create(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
embedding="openai/text-embedding-3-small", # An embedding model is required for self-hosted
|
||||
# optional configuration
|
||||
context_window_limit=30000
|
||||
)
|
||||
```
|
||||
Anthropic models have very large context windows, and filling them can be very expensive and add significant latency. We recommend setting a lower `context_window_limit` when using Anthropic models.
|
||||
|
||||
<Note>
|
||||
For Letta Cloud usage, see the [quickstart guide](/quickstart). Cloud deployments manage embeddings automatically and don't require provider configuration.
|
||||
</Note>
|
||||
@@ -1,30 +0,0 @@
|
||||
---
|
||||
title: AWS Bedrock
|
||||
slug: guides/server/providers/aws-bedrock
|
||||
---
|
||||
We support Anthropic models provided via AWS Bedrock.
|
||||
|
||||
<Warning>
|
||||
To use a model with AWS Bedrock, you must ensure it is enabled in your AWS Model Catalog. Letta will list all available Anthropic models on Bedrock, even if you do not have access to them via AWS.
|
||||
</Warning>
|
||||
|
||||
## Enabling AWS Bedrock with Docker
|
||||
|
||||
To enable AWS Bedrock models when running the Letta server with Docker, set your AWS credentials as environment variables:
|
||||
```bash
|
||||
# replace `~/.letta/.persist/pgdata` with wherever you want to store your agent data
|
||||
docker run \
|
||||
-v ~/.letta/.persist/pgdata:/var/lib/postgresql/data \
|
||||
-p 8283:8283 \
|
||||
-e AWS_ACCESS_KEY_ID="your_aws_access_key_id" \
|
||||
-e AWS_SECRET_ACCESS_KEY="your_aws_secret_access_key" \
|
||||
-e AWS_DEFAULT_REGION="your_aws_default_region" \
|
||||
letta/letta:latest
|
||||
```
|
||||
|
||||
Optionally, you can specify the API version (default is bedrock-2023-05-31):
|
||||
```bash
|
||||
-e BEDROCK_ANTHROPIC_VERSION="bedrock-2023-05-31"
|
||||
```
|
||||
|
||||
See the [self-hosting guide](/guides/selfhosting) for more information on running Letta with Docker.
|
||||