feat: Add support for simple text files (#2630)

This commit is contained in:
Matthew Zhou
2025-06-04 14:36:25 -07:00
committed by GitHub
parent 82b3222a52
commit bbdfc3f2cd
8 changed files with 405 additions and 506 deletions

View File

@@ -27,6 +27,11 @@ from letta.utils import safe_create_task, sanitize_filename
logger = get_logger(__name__)
# Teach the stdlib ``mimetypes`` registry about extensions it may not know.
# ``add_type`` replaces the extension -> type mapping when the extension is
# already registered, so for ".jsonl" the *last* call decides what
# ``guess_type`` returns. Register the "x-" alias first and
# "application/jsonl" last so the guessed type matches the type this module's
# extension map assigns to ".jsonl"; both types still map back to ".jsonl"
# for reverse (type -> extension) lookups.
mimetypes.add_type("text/markdown", ".md")
mimetypes.add_type("text/markdown", ".markdown")
mimetypes.add_type("application/x-jsonlines", ".jsonl")
mimetypes.add_type("application/jsonl", ".jsonl")
router = APIRouter(prefix="/sources", tags=["sources"])
@@ -174,7 +179,15 @@ async def upload_file_to_source(
"""
Upload a file to a data source.
"""
allowed_media_types = {"application/pdf", "text/plain", "application/json"}
allowed_media_types = {
"application/pdf",
"text/plain",
"text/markdown",
"text/x-markdown",
"application/json",
"application/jsonl",
"application/x-jsonlines",
}
# Normalize incoming Content-Type header (strip charset or any parameters).
raw_ct = file.content_type or ""
@@ -192,6 +205,9 @@ async def upload_file_to_source(
".pdf": "application/pdf",
".txt": "text/plain",
".json": "application/json",
".md": "text/markdown",
".markdown": "text/markdown",
".jsonl": "application/jsonl",
}
media_type = ext_map.get(ext, media_type)

View File

@@ -9,6 +9,16 @@ from letta.settings import settings
logger = get_logger(__name__)
# MIME types whose payload is already text: the parser below checks membership
# in this set and, on a match, decodes the bytes as UTF-8 and returns early.
SIMPLE_TEXT_MIME_TYPES = {
    # JSON and JSON Lines (both spellings of the JSON Lines type).
    "application/json",
    "application/jsonl",
    "application/x-jsonlines",
    # Plain text and Markdown (both spellings of the Markdown type).
    "text/plain",
    "text/markdown",
    "text/x-markdown",
}
class MistralFileParser(FileParser):
"""Mistral-based OCR extraction"""
@@ -23,7 +33,7 @@ class MistralFileParser(FileParser):
# TODO: Kind of hacky...we try to exit early here?
# TODO: Create our internal file parser representation we return instead of OCRResponse
if mime_type == "text/plain":
if mime_type in SIMPLE_TEXT_MIME_TYPES or mime_type.startswith("text/"):
text = content.decode("utf-8", errors="replace")
return OCRResponse(
model=self.model,

531
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -73,7 +73,7 @@ llama-index = "^0.12.2"
llama-index-embeddings-openai = "^0.3.1"
e2b-code-interpreter = {version = "^1.0.3", optional = true}
anthropic = "^0.49.0"
letta_client = "^0.1.143"
letta_client = "^0.1.147"
openai = "^1.60.0"
opentelemetry-api = "1.30.0"
opentelemetry-sdk = "1.30.0"

22
tests/data/test.json Normal file
View File

@@ -0,0 +1,22 @@
{
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
}

245
tests/data/test.md Normal file
View File

@@ -0,0 +1,245 @@
---
__Advertisement :)__
- __[pica](https://nodeca.github.io/pica/demo/)__ - high quality and fast image
resize in browser.
- __[babelfish](https://github.com/nodeca/babelfish/)__ - developer friendly
i18n with plurals support and easy syntax.
You will like those projects!
---
# h1 Heading 8-)
## h2 Heading
### h3 Heading
#### h4 Heading
##### h5 Heading
###### h6 Heading
## Horizontal Rules
___
---
***
## Typographic replacements
Enable typographer option to see result.
(c) (C) (r) (R) (tm) (TM) (p) (P) +-
test.. test... test..... test?..... test!....
!!!!!! ???? ,, -- ---
"Smartypants, double quotes" and 'single quotes'
## Emphasis
**This is bold text**
__This is bold text__
*This is italic text*
_This is italic text_
~~Strikethrough~~
## Blockquotes
> Blockquotes can also be nested...
>> ...by using additional greater-than signs right next to each other...
> > > ...or with spaces between arrows.
## Lists
Unordered
+ Create a list by starting a line with `+`, `-`, or `*`
+ Sub-lists are made by indenting 2 spaces:
- Marker character change forces new list start:
* Ac tristique libero volutpat at
+ Facilisis in pretium nisl aliquet
- Nulla volutpat aliquam velit
+ Very easy!
Ordered
1. Lorem ipsum dolor sit amet
2. Consectetur adipiscing elit
3. Integer molestie lorem at massa
1. You can use sequential numbers...
1. ...or keep all the numbers as `1.`
Start numbering with offset:
57. foo
1. bar
## Code
Inline `code`
Indented code
// Some comments
line 1 of code
line 2 of code
line 3 of code
Block code "fences"
```
Sample text here...
```
Syntax highlighting
``` js
var foo = function (bar) {
return bar++;
};
console.log(foo(5));
```
## Tables
| Option | Description |
| ------ | ----------- |
| data | path to data files to supply the data that will be passed into templates. |
| engine | engine to be used for processing templates. Handlebars is the default. |
| ext | extension to be used for dest files. |
Right aligned columns
| Option | Description |
| ------:| -----------:|
| data | path to data files to supply the data that will be passed into templates. |
| engine | engine to be used for processing templates. Handlebars is the default. |
| ext | extension to be used for dest files. |
## Links
[link text](http://dev.nodeca.com)
[link with title](http://nodeca.github.io/pica/demo/ "title text!")
Autoconverted link https://github.com/nodeca/pica (enable linkify to see)
## Images
![Minion](https://octodex.github.com/images/minion.png)
![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg "The Stormtroopocat")
Like links, Images also have a footnote style syntax
![Alt text][id]
With a reference later in the document defining the URL location:
[id]: https://octodex.github.com/images/dojocat.jpg "The Dojocat"
## Plugins
The killer feature of `markdown-it` is very effective support of
[syntax plugins](https://www.npmjs.org/browse/keyword/markdown-it-plugin).
### [Emojies](https://github.com/markdown-it/markdown-it-emoji)
> Classic markup: :wink: :cry: :laughing: :yum:
>
> Shortcuts (emoticons): :-) :-( 8-) ;)
see [how to change output](https://github.com/markdown-it/markdown-it-emoji#change-output) with twemoji.
### [Subscript](https://github.com/markdown-it/markdown-it-sub) / [Superscript](https://github.com/markdown-it/markdown-it-sup)
- 19^th^
- H~2~O
### [\<ins>](https://github.com/markdown-it/markdown-it-ins)
++Inserted text++
### [\<mark>](https://github.com/markdown-it/markdown-it-mark)
==Marked text==
### [Footnotes](https://github.com/markdown-it/markdown-it-footnote)
Footnote 1 link[^first].
Footnote 2 link[^second].
Inline footnote^[Text of inline footnote] definition.
Duplicated footnote reference[^second].
[^first]: Footnote **can have markup**
and multiple paragraphs.
[^second]: Footnote text.
### [Definition lists](https://github.com/markdown-it/markdown-it-deflist)
Term 1
: Definition 1
with lazy continuation.
Term 2 with *inline markup*
: Definition 2
{ some code, part of Definition 2 }
Third paragraph of definition 2.
_Compact style:_
Term 1
~ Definition 1
Term 2
~ Definition 2a
~ Definition 2b
### [Abbreviations](https://github.com/markdown-it/markdown-it-abbr)
This is HTML abbreviation example.
It converts "HTML", but keep intact partial entries like "xxxHTMLyyy" and so on.
*[HTML]: Hyper Text Markup Language
### [Custom containers](https://github.com/markdown-it/markdown-it-container)
::: warning
*here be dragons*
:::

File diff suppressed because one or more lines are too long

View File

@@ -1,4 +1,5 @@
import os
import re
import threading
import time
@@ -59,7 +60,10 @@ def agent_state(client: LettaSDKClient):
"file_path, expected_value, expected_label_regex",
[
("tests/data/test.txt", "test", r"test_[a-z0-9]+\.txt"),
# ("tests/data/memgpt_paper.pdf", "MemGPT", r"memgpt_paper_[a-z0-9]+\.pdf"),
("tests/data/memgpt_paper.pdf", "MemGPT", r"memgpt_paper_[a-z0-9]+\.pdf"),
("tests/data/toy_chat_fine_tuning.jsonl", '{"messages"', r"toy_chat_fine_tuning_[a-z0-9]+\.jsonl"),
("tests/data/test.md", "h2 Heading", r"test_[a-z0-9]+\.md"),
("tests/data/test.json", "glossary", r"test_[a-z0-9]+\.json"),
],
)
def test_file_upload_creates_source_blocks_correctly(
@@ -99,20 +103,22 @@ def test_file_upload_creates_source_blocks_correctly(
assert len(files) == 1
assert files[0].source_id == source.id
# # Check that blocks were created
# blocks = client.agents.retrieve(agent_id=agent_state.id)
# assert len(blocks) == 2
# assert any(expected_value in b.value for b in blocks)
# assert any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
#
# # Remove file from source
# client.sources.files.delete(source_id=source.id, file_id=files[0].id)
#
# # Confirm blocks were removed
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
# assert len(blocks) == 1
# assert not any(expected_value in b.value for b in blocks)
# assert not any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
# Check that blocks were created
agent_state = client.agents.retrieve(agent_id=agent_state.id)
blocks = agent_state.memory.file_blocks
assert len(blocks) == 1
assert any(expected_value in b.value for b in blocks)
assert any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
# Remove file from source
client.sources.files.delete(source_id=source.id, file_id=files[0].id)
# Confirm blocks were removed
agent_state = client.agents.retrieve(agent_id=agent_state.id)
blocks = agent_state.memory.file_blocks
assert len(blocks) == 0
assert not any(expected_value in b.value for b in blocks)
assert not any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
def test_attach_existing_files_creates_source_blocks_correctly(client: LettaSDKClient, agent_state: AgentState):
@@ -149,20 +155,22 @@ def test_attach_existing_files_creates_source_blocks_correctly(client: LettaSDKC
# Attach after uploading the file
client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)
# # Get the agent state, check blocks exist
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
# assert len(blocks) == 2
# assert "test" in [b.value for b in blocks]
# assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
# Get the agent state, check blocks exist
agent_state = client.agents.retrieve(agent_id=agent_state.id)
blocks = agent_state.memory.file_blocks
assert len(blocks) == 1
assert "test" in [b.value for b in blocks]
assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
# Detach the source
client.agents.sources.detach(source_id=source.id, agent_id=agent_state.id)
# # Get the agent state, check blocks do NOT exist
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
# assert len(blocks) == 1
# assert "test" not in [b.value for b in blocks]
# assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
# Get the agent state, check blocks do NOT exist
agent_state = client.agents.retrieve(agent_id=agent_state.id)
blocks = agent_state.memory.file_blocks
assert len(blocks) == 0
assert "test" not in [b.value for b in blocks]
assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
def test_delete_source_removes_source_blocks_correctly(client: LettaSDKClient, agent_state: AgentState):
@@ -195,16 +203,18 @@ def test_delete_source_removes_source_blocks_correctly(client: LettaSDKClient, a
print("Waiting for jobs to complete...", job.status)
# Get the agent state, check blocks exist
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
# assert len(blocks) == 2
# assert "test" in [b.value for b in blocks]
# assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
agent_state = client.agents.retrieve(agent_id=agent_state.id)
blocks = agent_state.memory.file_blocks
assert len(blocks) == 1
assert "test" in [b.value for b in blocks]
assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
# Remove file from source
client.sources.delete(source_id=source.id)
# Get the agent state, check blocks do NOT exist
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
# assert len(blocks) == 1
# assert "test" not in [b.value for b in blocks]
# assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
agent_state = client.agents.retrieve(agent_id=agent_state.id)
blocks = agent_state.memory.file_blocks
assert len(blocks) == 0
assert "test" not in [b.value for b in blocks]
assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)