feat: Add support for simple text files (#2630)

2025-06-04 14:36:25 -07:00
parent 82b3222a52
commit bbdfc3f2cd
8 changed files with 405 additions and 506 deletions
--- a/letta/server/rest_api/routers/v1/sources.py
+++ b/letta/server/rest_api/routers/v1/sources.py
@@ -27,6 +27,11 @@ from letta.utils import safe_create_task, sanitize_filename

 logger = get_logger(__name__)

+mimetypes.add_type("text/markdown", ".md")  # extra extension-to-type mappings for the newly accepted text formats
+mimetypes.add_type("text/markdown", ".markdown")
+mimetypes.add_type("application/jsonl", ".jsonl")
+mimetypes.add_type("application/x-jsonlines", ".jsonl")  # NOTE(review): same extension as the line above; per the mimetypes docs a later add_type() replaces the type an already-known extension maps to - confirm this order is intended
+

 router = APIRouter(prefix="/sources", tags=["sources"])

@@ -174,7 +179,15 @@ async def upload_file_to_source(
    """
    Upload a file to a data source.
    """
-    allowed_media_types = {"application/pdf", "text/plain", "application/json"}
+    allowed_media_types = {
+        "application/pdf",
+        "text/plain",
+        "text/markdown",
+        "text/x-markdown",
+        "application/json",
+        "application/jsonl",
+        "application/x-jsonlines",
+    }

    # Normalize incoming Content-Type header (strip charset or any parameters).
    raw_ct = file.content_type or ""
@@ -192,6 +205,9 @@ async def upload_file_to_source(
                ".pdf": "application/pdf",
                ".txt": "text/plain",
                ".json": "application/json",
+                ".md": "text/markdown",
+                ".markdown": "text/markdown",
+                ".jsonl": "application/jsonl",
            }
            media_type = ext_map.get(ext, media_type)

--- a/letta/services/file_processor/parser/mistral_parser.py
+++ b/letta/services/file_processor/parser/mistral_parser.py
@@ -9,6 +9,16 @@ from letta.settings import settings
 logger = get_logger(__name__)


+SIMPLE_TEXT_MIME_TYPES = {  # MIME types that MistralFileParser decodes directly as UTF-8 text and returns early for
+    "text/plain",  # the text/* entries are also matched by the startswith("text/") test in the same check
+    "text/markdown",
+    "text/x-markdown",
+    "application/json",
+    "application/jsonl",
+    "application/x-jsonlines",
+}
+
+
 class MistralFileParser(FileParser):
    """Mistral-based OCR extraction"""

@@ -23,7 +33,7 @@ class MistralFileParser(FileParser):

            # TODO: Kind of hacky...we try to exit early here?
            # TODO: Create our internal file parser representation we return instead of OCRResponse
-            if mime_type == "text/plain":
+            if mime_type in SIMPLE_TEXT_MIME_TYPES or mime_type.startswith("text/"):
                text = content.decode("utf-8", errors="replace")
                return OCRResponse(
                    model=self.model,
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,7 +73,7 @@ llama-index = "^0.12.2"
 llama-index-embeddings-openai = "^0.3.1"
 e2b-code-interpreter = {version = "^1.0.3", optional = true}
 anthropic = "^0.49.0"
-letta_client = "^0.1.143"
+letta_client = "^0.1.147"
 openai = "^1.60.0"
 opentelemetry-api = "1.30.0"
 opentelemetry-sdk = "1.30.0"
--- a/tests/data/test.json
+++ b/tests/data/test.json
@@ -0,0 +1,22 @@
+{
+    "glossary": {
+        "title": "example glossary",
+		"GlossDiv": {
+            "title": "S",
+			"GlossList": {
+                "GlossEntry": {
+                    "ID": "SGML",
+					"SortAs": "SGML",
+					"GlossTerm": "Standard Generalized Markup Language",
+					"Acronym": "SGML",
+					"Abbrev": "ISO 8879:1986",
+					"GlossDef": {
+                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
+						"GlossSeeAlso": ["GML", "XML"]
+                    },
+					"GlossSee": "markup"
+                }
+            }
+        }
+    }
+}
--- a/tests/data/test.md
+++ b/tests/data/test.md
@@ -0,0 +1,245 @@
+---
+__Advertisement :)__
+
+- __[pica](https://nodeca.github.io/pica/demo/)__ - high quality and fast image
+  resize in browser.
+- __[babelfish](https://github.com/nodeca/babelfish/)__ - developer friendly
+  i18n with plurals support and easy syntax.
+
+You will like those projects!
+
+---
+
+# h1 Heading 8-)
+## h2 Heading
+### h3 Heading
+#### h4 Heading
+##### h5 Heading
+###### h6 Heading
+
+
+## Horizontal Rules
+
+___
+
+---
+
+***
+
+
+## Typographic replacements
+
+Enable typographer option to see result.
+
+(c) (C) (r) (R) (tm) (TM) (p) (P) +-
+
+test.. test... test..... test?..... test!....
+
+!!!!!! ???? ,,  -- ---
+
+"Smartypants, double quotes" and 'single quotes'
+
+
+## Emphasis
+
+**This is bold text**
+
+__This is bold text__
+
+*This is italic text*
+
+_This is italic text_
+
+~~Strikethrough~~
+
+
+## Blockquotes
+
+
+> Blockquotes can also be nested...
+>> ...by using additional greater-than signs right next to each other...
+> > > ...or with spaces between arrows.
+
+
+## Lists
+
+Unordered
+
++ Create a list by starting a line with `+`, `-`, or `*`
++ Sub-lists are made by indenting 2 spaces:
+  - Marker character change forces new list start:
+    * Ac tristique libero volutpat at
+    + Facilisis in pretium nisl aliquet
+    - Nulla volutpat aliquam velit
++ Very easy!
+
+Ordered
+
+1. Lorem ipsum dolor sit amet
+2. Consectetur adipiscing elit
+3. Integer molestie lorem at massa
+
+
+1. You can use sequential numbers...
+1. ...or keep all the numbers as `1.`
+
+Start numbering with offset:
+
+57. foo
+1. bar
+
+
+## Code
+
+Inline `code`
+
+Indented code
+
+    // Some comments
+    line 1 of code
+    line 2 of code
+    line 3 of code
+
+
+Block code "fences"
+
+```
+Sample text here...
+```
+
+Syntax highlighting
+
+``` js
+var foo = function (bar) {
+  return bar++;
+};
+
+console.log(foo(5));
+```
+
+## Tables
+
+| Option | Description |
+| ------ | ----------- |
+| data   | path to data files to supply the data that will be passed into templates. |
+| engine | engine to be used for processing templates. Handlebars is the default. |
+| ext    | extension to be used for dest files. |
+
+Right aligned columns
+
+| Option | Description |
+| ------:| -----------:|
+| data   | path to data files to supply the data that will be passed into templates. |
+| engine | engine to be used for processing templates. Handlebars is the default. |
+| ext    | extension to be used for dest files. |
+
+
+## Links
+
+[link text](http://dev.nodeca.com)
+
+[link with title](http://nodeca.github.io/pica/demo/ "title text!")
+
+Autoconverted link https://github.com/nodeca/pica (enable linkify to see)
+
+
+## Images
+
+![Minion](https://octodex.github.com/images/minion.png)
+![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg "The Stormtroopocat")
+
+Like links, images also have a footnote-style syntax
+
+![Alt text][id]
+
+With a reference later in the document defining the URL location:
+
+[id]: https://octodex.github.com/images/dojocat.jpg  "The Dojocat"
+
+
+## Plugins
+
+The killer feature of `markdown-it` is very effective support of
+[syntax plugins](https://www.npmjs.org/browse/keyword/markdown-it-plugin).
+
+
+### [Emojis](https://github.com/markdown-it/markdown-it-emoji)
+
+> Classic markup: :wink: :cry: :laughing: :yum:
+>
+> Shortcuts (emoticons): :-) :-( 8-) ;)
+
+see [how to change output](https://github.com/markdown-it/markdown-it-emoji#change-output) with twemoji.
+
+
+### [Subscript](https://github.com/markdown-it/markdown-it-sub) / [Superscript](https://github.com/markdown-it/markdown-it-sup)
+
+- 19^th^
+- H~2~O
+
+
+### [\<ins>](https://github.com/markdown-it/markdown-it-ins)
+
+++Inserted text++
+
+
+### [\<mark>](https://github.com/markdown-it/markdown-it-mark)
+
+==Marked text==
+
+
+### [Footnotes](https://github.com/markdown-it/markdown-it-footnote)
+
+Footnote 1 link[^first].
+
+Footnote 2 link[^second].
+
+Inline footnote^[Text of inline footnote] definition.
+
+Duplicated footnote reference[^second].
+
+[^first]: Footnote **can have markup**
+
+    and multiple paragraphs.
+
+[^second]: Footnote text.
+
+
+### [Definition lists](https://github.com/markdown-it/markdown-it-deflist)
+
+Term 1
+
+:   Definition 1
+with lazy continuation.
+
+Term 2 with *inline markup*
+
+:   Definition 2
+
+        { some code, part of Definition 2 }
+
+    Third paragraph of definition 2.
+
+_Compact style:_
+
+Term 1
+  ~ Definition 1
+
+Term 2
+  ~ Definition 2a
+  ~ Definition 2b
+
+
+### [Abbreviations](https://github.com/markdown-it/markdown-it-abbr)
+
+This is HTML abbreviation example.
+
+It converts "HTML", but keeps partial entries like "xxxHTMLyyy" intact.
+
+*[HTML]: Hyper Text Markup Language
+
+### [Custom containers](https://github.com/markdown-it/markdown-it-container)
+
+::: warning
+*here be dragons*
+:::
--- a/tests/data/toy_chat_fine_tuning.jsonl
+++ b/tests/data/toy_chat_fine_tuning.jsonl
--- a/tests/test_sources.py
+++ b/tests/test_sources.py
@@ -1,4 +1,5 @@
 import os
+import re
 import threading
 import time

@@ -59,7 +60,10 @@ def agent_state(client: LettaSDKClient):
    "file_path, expected_value, expected_label_regex",
    [
        ("tests/data/test.txt", "test", r"test_[a-z0-9]+\.txt"),
-        # ("tests/data/memgpt_paper.pdf", "MemGPT", r"memgpt_paper_[a-z0-9]+\.pdf"),
+        ("tests/data/memgpt_paper.pdf", "MemGPT", r"memgpt_paper_[a-z0-9]+\.pdf"),
+        ("tests/data/toy_chat_fine_tuning.jsonl", '{"messages"', r"toy_chat_fine_tuning_[a-z0-9]+\.jsonl"),
+        ("tests/data/test.md", "h2 Heading", r"test_[a-z0-9]+\.md"),
+        ("tests/data/test.json", "glossary", r"test_[a-z0-9]+\.json"),
    ],
 )
 def test_file_upload_creates_source_blocks_correctly(
@@ -99,20 +103,22 @@ def test_file_upload_creates_source_blocks_correctly(
    assert len(files) == 1
    assert files[0].source_id == source.id

-    # # Check that blocks were created
-    # blocks = client.agents.retrieve(agent_id=agent_state.id)
-    # assert len(blocks) == 2
-    # assert any(expected_value in b.value for b in blocks)
-    # assert any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
-    #
-    # # Remove file from source
-    # client.sources.files.delete(source_id=source.id, file_id=files[0].id)
-    #
-    # # Confirm blocks were removed
-    # blocks = client.agents.blocks.list(agent_id=agent_state.id)
-    # assert len(blocks) == 1
-    # assert not any(expected_value in b.value for b in blocks)
-    # assert not any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
+    # Check that blocks were created
+    agent_state = client.agents.retrieve(agent_id=agent_state.id)
+    blocks = agent_state.memory.file_blocks
+    assert len(blocks) == 1
+    assert any(expected_value in b.value for b in blocks)
+    assert any(re.fullmatch(expected_label_regex, b.label) for b in blocks)
+
+    # Remove file from source
+    client.sources.files.delete(source_id=source.id, file_id=files[0].id)
+
+    # Confirm blocks were removed
+    agent_state = client.agents.retrieve(agent_id=agent_state.id)
+    blocks = agent_state.memory.file_blocks
+    assert len(blocks) == 0
+    assert not any(expected_value in b.value for b in blocks)
+    assert not any(re.fullmatch(expected_label_regex, b.label) for b in blocks)


 def test_attach_existing_files_creates_source_blocks_correctly(client: LettaSDKClient, agent_state: AgentState):
@@ -149,20 +155,22 @@ def test_attach_existing_files_creates_source_blocks_correctly(client: LettaSDKC
    # Attach after uploading the file
    client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)

-    # # Get the agent state, check blocks exist
-    # blocks = client.agents.blocks.list(agent_id=agent_state.id)
-    # assert len(blocks) == 2
-    # assert "test" in [b.value for b in blocks]
-    # assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
+    # Get the agent state, check blocks exist
+    agent_state = client.agents.retrieve(agent_id=agent_state.id)
+    blocks = agent_state.memory.file_blocks
+    assert len(blocks) == 1
+    assert "test" in [b.value for b in blocks]
+    assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)

    # Detach the source
    client.agents.sources.detach(source_id=source.id, agent_id=agent_state.id)

-    # # Get the agent state, check blocks do NOT exist
-    # blocks = client.agents.blocks.list(agent_id=agent_state.id)
-    # assert len(blocks) == 1
-    # assert "test" not in [b.value for b in blocks]
-    # assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
+    # Get the agent state, check blocks do NOT exist
+    agent_state = client.agents.retrieve(agent_id=agent_state.id)
+    blocks = agent_state.memory.file_blocks
+    assert len(blocks) == 0
+    assert "test" not in [b.value for b in blocks]
+    assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)


 def test_delete_source_removes_source_blocks_correctly(client: LettaSDKClient, agent_state: AgentState):
@@ -195,16 +203,18 @@ def test_delete_source_removes_source_blocks_correctly(client: LettaSDKClient, a
        print("Waiting for jobs to complete...", job.status)

    # Get the agent state, check blocks exist
-    # blocks = client.agents.blocks.list(agent_id=agent_state.id)
-    # assert len(blocks) == 2
-    # assert "test" in [b.value for b in blocks]
-    # assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
+    agent_state = client.agents.retrieve(agent_id=agent_state.id)
+    blocks = agent_state.memory.file_blocks
+    assert len(blocks) == 1
+    assert "test" in [b.value for b in blocks]
+    assert any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)

    # Remove file from source
    client.sources.delete(source_id=source.id)

    # Get the agent state, check blocks do NOT exist
-    # blocks = client.agents.blocks.list(agent_id=agent_state.id)
-    # assert len(blocks) == 1
-    # assert "test" not in [b.value for b in blocks]
-    # assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)
+    agent_state = client.agents.retrieve(agent_id=agent_state.id)
+    blocks = agent_state.memory.file_blocks
+    assert len(blocks) == 0
+    assert "test" not in [b.value for b in blocks]
+    assert not any(re.fullmatch(r"test_[a-z0-9]+\.txt", b.label) for b in blocks)