feat: Add support for simple text files (#2630)
This commit is contained in:
@@ -27,6 +27,11 @@ from letta.utils import safe_create_task, sanitize_filename
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
mimetypes.add_type("text/markdown", ".md")
|
||||
mimetypes.add_type("text/markdown", ".markdown")
|
||||
mimetypes.add_type("application/jsonl", ".jsonl")
|
||||
mimetypes.add_type("application/x-jsonlines", ".jsonl")
|
||||
|
||||
|
||||
router = APIRouter(prefix="/sources", tags=["sources"])
|
||||
|
||||
@@ -174,7 +179,15 @@ async def upload_file_to_source(
|
||||
"""
|
||||
Upload a file to a data source.
|
||||
"""
|
||||
allowed_media_types = {"application/pdf", "text/plain", "application/json"}
|
||||
allowed_media_types = {
|
||||
"application/pdf",
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
"application/json",
|
||||
"application/jsonl",
|
||||
"application/x-jsonlines",
|
||||
}
|
||||
|
||||
# Normalize incoming Content-Type header (strip charset or any parameters).
|
||||
raw_ct = file.content_type or ""
|
||||
@@ -192,6 +205,9 @@ async def upload_file_to_source(
|
||||
".pdf": "application/pdf",
|
||||
".txt": "text/plain",
|
||||
".json": "application/json",
|
||||
".md": "text/markdown",
|
||||
".markdown": "text/markdown",
|
||||
".jsonl": "application/jsonl",
|
||||
}
|
||||
media_type = ext_map.get(ext, media_type)
|
||||
|
||||
|
||||
@@ -9,6 +9,16 @@ from letta.settings import settings
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
SIMPLE_TEXT_MIME_TYPES = {
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
"application/json",
|
||||
"application/jsonl",
|
||||
"application/x-jsonlines",
|
||||
}
|
||||
|
||||
|
||||
class MistralFileParser(FileParser):
|
||||
"""Mistral-based OCR extraction"""
|
||||
|
||||
@@ -23,7 +33,7 @@ class MistralFileParser(FileParser):
|
||||
|
||||
# TODO: Kind of hacky...we try to exit early here?
|
||||
# TODO: Create our internal file parser representation we return instead of OCRResponse
|
||||
if mime_type == "text/plain":
|
||||
if mime_type in SIMPLE_TEXT_MIME_TYPES or mime_type.startswith("text/"):
|
||||
text = content.decode("utf-8", errors="replace")
|
||||
return OCRResponse(
|
||||
model=self.model,
|
||||
|
||||
531
poetry.lock
generated
531
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -73,7 +73,7 @@ llama-index = "^0.12.2"
|
||||
llama-index-embeddings-openai = "^0.3.1"
|
||||
e2b-code-interpreter = {version = "^1.0.3", optional = true}
|
||||
anthropic = "^0.49.0"
|
||||
letta_client = "^0.1.143"
|
||||
letta_client = "^0.1.147"
|
||||
openai = "^1.60.0"
|
||||
opentelemetry-api = "1.30.0"
|
||||
opentelemetry-sdk = "1.30.0"
|
||||
|
||||
22
tests/data/test.json
Normal file
22
tests/data/test.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"glossary": {
|
||||
"title": "example glossary",
|
||||
"GlossDiv": {
|
||||
"title": "S",
|
||||
"GlossList": {
|
||||
"GlossEntry": {
|
||||
"ID": "SGML",
|
||||
"SortAs": "SGML",
|
||||
"GlossTerm": "Standard Generalized Markup Language",
|
||||
"Acronym": "SGML",
|
||||
"Abbrev": "ISO 8879:1986",
|
||||
"GlossDef": {
|
||||
"para": "A meta-markup language, used to create markup languages such as DocBook.",
|
||||
"GlossSeeAlso": ["GML", "XML"]
|
||||
},
|
||||
"GlossSee": "markup"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
245
tests/data/test.md
Normal file
245
tests/data/test.md
Normal file
@@ -0,0 +1,245 @@
|
||||
---
|
||||
__Advertisement :)__
|
||||
|
||||
- __[pica](https://nodeca.github.io/pica/demo/)__ - high quality and fast image
|
||||
resize in browser.
|
||||
- __[babelfish](https://github.com/nodeca/babelfish/)__ - developer friendly
|
||||
i18n with plurals support and easy syntax.
|
||||
|
||||
You will like those projects!
|
||||
|
||||
---
|
||||
|
||||
# h1 Heading 8-)
|
||||
## h2 Heading
|
||||
### h3 Heading
|
||||
#### h4 Heading
|
||||
##### h5 Heading
|
||||
###### h6 Heading
|
||||
|
||||
|
||||
## Horizontal Rules
|
||||
|
||||
___
|
||||
|
||||
---
|
||||
|
||||
***
|
||||
|
||||
|
||||
## Typographic replacements
|
||||
|
||||
Enable typographer option to see result.
|
||||
|
||||
(c) (C) (r) (R) (tm) (TM) (p) (P) +-
|
||||
|
||||
test.. test... test..... test?..... test!....
|
||||
|
||||
!!!!!! ???? ,, -- ---
|
||||
|
||||
"Smartypants, double quotes" and 'single quotes'
|
||||
|
||||
|
||||
## Emphasis
|
||||
|
||||
**This is bold text**
|
||||
|
||||
__This is bold text__
|
||||
|
||||
*This is italic text*
|
||||
|
||||
_This is italic text_
|
||||
|
||||
~~Strikethrough~~
|
||||
|
||||
|
||||
## Blockquotes
|
||||
|
||||
|
||||
> Blockquotes can also be nested...
|
||||
>> ...by using additional greater-than signs right next to each other...
|
||||
> > > ...or with spaces between arrows.
|
||||
|
||||
|
||||
## Lists
|
||||
|
||||
Unordered
|
||||
|
||||
+ Create a list by starting a line with `+`, `-`, or `*`
|
||||
+ Sub-lists are made by indenting 2 spaces:
|
||||
- Marker character change forces new list start:
|
||||
* Ac tristique libero volutpat at
|
||||
+ Facilisis in pretium nisl aliquet
|
||||
- Nulla volutpat aliquam velit
|
||||
+ Very easy!
|
||||
|
||||
Ordered
|
||||
|
||||
1. Lorem ipsum dolor sit amet
|
||||
2. Consectetur adipiscing elit
|
||||
3. Integer molestie lorem at massa
|
||||
|
||||
|
||||
1. You can use sequential numbers...
|
||||
1. ...or keep all the numbers as `1.`
|
||||
|
||||
Start numbering with offset:
|
||||
|
||||
57. foo
|
||||
1. bar
|
||||
|
||||
|
||||
## Code
|
||||
|
||||
Inline `code`
|
||||
|
||||
Indented code
|
||||
|
||||
// Some comments
|
||||
line 1 of code
|
||||
line 2 of code
|
||||
line 3 of code
|
||||
|
||||
|
||||
Block code "fences"
|
||||
|
||||
```
|
||||
Sample text here...
|
||||
```
|
||||
|
||||
Syntax highlighting
|
||||
|
||||
``` js
|
||||
var foo = function (bar) {
|
||||
return bar++;
|
||||
};
|
||||
|
||||
console.log(foo(5));
|
||||
```
|
||||
|
||||
## Tables
|
||||
|
||||
| Option | Description |
|
||||
| ------ | ----------- |
|
||||
| data | path to data files to supply the data that will be passed into templates. |
|
||||
| engine | engine to be used for processing templates. Handlebars is the default. |
|
||||
| ext | extension to be used for dest files. |
|
||||
|
||||
Right aligned columns
|
||||
|
||||
| Option | Description |
|
||||
| ------:| -----------:|
|
||||
| data | path to data files to supply the data that will be passed into templates. |
|
||||
| engine | engine to be used for processing templates. Handlebars is the default. |
|
||||
| ext | extension to be used for dest files. |
|
||||
|
||||
|
||||
## Links
|
||||
|
||||
[link text](http://dev.nodeca.com)
|
||||
|
||||
[link with title](http://nodeca.github.io/pica/demo/ "title text!")
|
||||
|
||||
Autoconverted link https://github.com/nodeca/pica (enable linkify to see)
|
||||
|
||||
|
||||
## Images
|
||||
|
||||

|
||||

|
||||
|
||||
Like links, Images also have a footnote style syntax
|
||||
|
||||
![Alt text][id]
|
||||
|
||||
With a reference later in the document defining the URL location:
|
||||
|
||||
[id]: https://octodex.github.com/images/dojocat.jpg "The Dojocat"
|
||||
|
||||
|
||||
## Plugins
|
||||
|
||||
The killer feature of `markdown-it` is very effective support of
|
||||
[syntax plugins](https://www.npmjs.org/browse/keyword/markdown-it-plugin).
|
||||
|
||||
|
||||
### [Emojies](https://github.com/markdown-it/markdown-it-emoji)
|
||||
|
||||
> Classic markup: :wink: :cry: :laughing: :yum:
|
||||
>
|
||||
> Shortcuts (emoticons): :-) :-( 8-) ;)
|
||||
|
||||
see [how to change output](https://github.com/markdown-it/markdown-it-emoji#change-output) with twemoji.
|
||||
|
||||
|
||||
### [Subscript](https://github.com/markdown-it/markdown-it-sub) / [Superscript](https://github.com/markdown-it/markdown-it-sup)
|
||||
|
||||
- 19^th^
|
||||
- H~2~O
|
||||
|
||||
|
||||
### [\<ins>](https://github.com/markdown-it/markdown-it-ins)
|
||||
|
||||
++Inserted text++
|
||||
|
||||
|
||||
### [\<mark>](https://github.com/markdown-it/markdown-it-mark)
|
||||
|
||||
==Marked text==
|
||||
|
||||
|
||||
### [Footnotes](https://github.com/markdown-it/markdown-it-footnote)
|
||||
|
||||
Footnote 1 link[^first].
|
||||
|
||||
Footnote 2 link[^second].
|
||||
|
||||
Inline footnote^[Text of inline footnote] definition.
|
||||
|
||||
Duplicated footnote reference[^second].
|
||||
|
||||
[^first]: Footnote **can have markup**
|
||||
|
||||
and multiple paragraphs.
|
||||
|
||||
[^second]: Footnote text.
|
||||
|
||||
|
||||
### [Definition lists](https://github.com/markdown-it/markdown-it-deflist)
|
||||
|
||||
Term 1
|
||||
|
||||
: Definition 1
|
||||
with lazy continuation.
|
||||
|
||||
Term 2 with *inline markup*
|
||||
|
||||
: Definition 2
|
||||
|
||||
{ some code, part of Definition 2 }
|
||||
|
||||
Third paragraph of definition 2.
|
||||
|
||||
_Compact style:_
|
||||
|
||||
Term 1
|
||||
~ Definition 1
|
||||
|
||||
Term 2
|
||||
~ Definition 2a
|
||||
~ Definition 2b
|
||||
|
||||
|
||||
### [Abbreviations](https://github.com/markdown-it/markdown-it-abbr)
|
||||
|
||||
This is HTML abbreviation example.
|
||||
|
||||
It converts "HTML", but keep intact partial entries like "xxxHTMLyyy" and so on.
|
||||
|
||||
*[HTML]: Hyper Text Markup Language
|
||||
|
||||
### [Custom containers](https://github.com/markdown-it/markdown-it-container)
|
||||
|
||||
::: warning
|
||||
*here be dragons*
|
||||
:::
|
||||
5
tests/data/toy_chat_fine_tuning.jsonl
Normal file
5
tests/data/toy_chat_fine_tuning.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
|
||||
@@ -59,7 +60,10 @@ def agent_state(client: LettaSDKClient):
|
||||
"file_path, expected_value, expected_label_regex",
|
||||
[
|
||||
("tests/data/test.txt", "test", r"test_[a-z0-9]+\.txt"),
|
||||
# ("tests/data/memgpt_paper.pdf", "MemGPT", r"memgpt_paper_[a-z0-9]+\.pdf"),
|
||||
("tests/data/memgpt_paper.pdf", "MemGPT", r"memgpt_paper_[a-z0-9]+\.pdf"),
|
||||
("tests/data/toy_chat_fine_tuning.jsonl", '{"messages"', r"toy_chat_fine_tuning_[a-z0-9]+\.jsonl"),
|
||||
("tests/data/test.md", "h2 Heading", r"test_[a-z0-9]+\.md"),
|
||||
("tests/data/test.json", "glossary", r"test_[a-z0-9]+\.json"),
|
||||
],
|
||||
)
|
||||
def test_file_upload_creates_source_blocks_correctly(
|
||||
@@ -99,20 +103,22 @@ def test_file_upload_creates_source_blocks_correctly(
|
||||
assert len(files) == 1
|
||||
assert files[0].source_id == source.id
|
||||
|
||||
# # Check that blocks were created
|
||||
# blocks = client.agents.retrieve(agent_id=agent_state.id)
|
||||
# assert len(blocks) == 2
|
||||
# assert any(expected_value in b.value for b in blocks)
|
||||
# assert any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
|
||||
#
|
||||
# # Remove file from source
|
||||
# client.sources.files.delete(source_id=source.id, file_id=files[0].id)
|
||||
#
|
||||
# # Confirm blocks were removed
|
||||
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
|
||||
# assert len(blocks) == 1
|
||||
# assert not any(expected_value in b.value for b in blocks)
|
||||
# assert not any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
|
||||
# Check that blocks were created
|
||||
agent_state = client.agents.retrieve(agent_id=agent_state.id)
|
||||
blocks = agent_state.memory.file_blocks
|
||||
assert len(blocks) == 1
|
||||
assert any(expected_value in b.value for b in blocks)
|
||||
assert any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
|
||||
|
||||
# Remove file from source
|
||||
client.sources.files.delete(source_id=source.id, file_id=files[0].id)
|
||||
|
||||
# Confirm blocks were removed
|
||||
agent_state = client.agents.retrieve(agent_id=agent_state.id)
|
||||
blocks = agent_state.memory.file_blocks
|
||||
assert len(blocks) == 0
|
||||
assert not any(expected_value in b.value for b in blocks)
|
||||
assert not any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
|
||||
|
||||
|
||||
def test_attach_existing_files_creates_source_blocks_correctly(client: LettaSDKClient, agent_state: AgentState):
|
||||
@@ -149,20 +155,22 @@ def test_attach_existing_files_creates_source_blocks_correctly(client: LettaSDKC
|
||||
# Attach after uploading the file
|
||||
client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)
|
||||
|
||||
# # Get the agent state, check blocks exist
|
||||
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
|
||||
# assert len(blocks) == 2
|
||||
# assert "test" in [b.value for b in blocks]
|
||||
# assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
# Get the agent state, check blocks exist
|
||||
agent_state = client.agents.retrieve(agent_id=agent_state.id)
|
||||
blocks = agent_state.memory.file_blocks
|
||||
assert len(blocks) == 1
|
||||
assert "test" in [b.value for b in blocks]
|
||||
assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
|
||||
# Detach the source
|
||||
client.agents.sources.detach(source_id=source.id, agent_id=agent_state.id)
|
||||
|
||||
# # Get the agent state, check blocks do NOT exist
|
||||
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
|
||||
# assert len(blocks) == 1
|
||||
# assert "test" not in [b.value for b in blocks]
|
||||
# assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
# Get the agent state, check blocks do NOT exist
|
||||
agent_state = client.agents.retrieve(agent_id=agent_state.id)
|
||||
blocks = agent_state.memory.file_blocks
|
||||
assert len(blocks) == 0
|
||||
assert "test" not in [b.value for b in blocks]
|
||||
assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
|
||||
|
||||
def test_delete_source_removes_source_blocks_correctly(client: LettaSDKClient, agent_state: AgentState):
|
||||
@@ -195,16 +203,18 @@ def test_delete_source_removes_source_blocks_correctly(client: LettaSDKClient, a
|
||||
print("Waiting for jobs to complete...", job.status)
|
||||
|
||||
# Get the agent state, check blocks exist
|
||||
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
|
||||
# assert len(blocks) == 2
|
||||
# assert "test" in [b.value for b in blocks]
|
||||
# assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
agent_state = client.agents.retrieve(agent_id=agent_state.id)
|
||||
blocks = agent_state.memory.file_blocks
|
||||
assert len(blocks) == 1
|
||||
assert "test" in [b.value for b in blocks]
|
||||
assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
|
||||
# Remove file from source
|
||||
client.sources.delete(source_id=source.id)
|
||||
|
||||
# Get the agent state, check blocks do NOT exist
|
||||
# blocks = client.agents.blocks.list(agent_id=agent_state.id)
|
||||
# assert len(blocks) == 1
|
||||
# assert "test" not in [b.value for b in blocks]
|
||||
# assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
agent_state = client.agents.retrieve(agent_id=agent_state.id)
|
||||
blocks = agent_state.memory.file_blocks
|
||||
assert len(blocks) == 0
|
||||
assert "test" not in [b.value for b in blocks]
|
||||
assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
|
||||
|
||||
Reference in New Issue
Block a user