MarkItDown.convert() does blocking file I/O and CPU-intensive PDF parsing. This was blocking the event loop during file uploads. Now wraps the entire markitdown pipeline (tempfile write, convert, cleanup) in asyncio.to_thread() to run in thread pool. 🐾 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta <noreply@letta.com>
104 lines
4.0 KiB
Python
104 lines
4.0 KiB
Python
import logging
|
|
import os
|
|
import tempfile
|
|
|
|
from markitdown import MarkItDown
|
|
from mistralai import OCRPageObject, OCRResponse, OCRUsageInfo
|
|
|
|
from letta.log import get_logger
|
|
from letta.otel.tracing import trace_method
|
|
from letta.services.file_processor.file_types import is_simple_text_mime_type
|
|
from letta.services.file_processor.parser.base_parser import FileParser
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
# Suppress pdfminer warnings that occur during PDF processing
|
|
logging.getLogger("pdfminer.pdffont").setLevel(logging.ERROR)
|
|
logging.getLogger("pdfminer.pdfinterp").setLevel(logging.ERROR)
|
|
logging.getLogger("pdfminer.pdfpage").setLevel(logging.ERROR)
|
|
logging.getLogger("pdfminer.converter").setLevel(logging.ERROR)
|
|
|
|
|
|
class MarkitdownFileParser(FileParser):
|
|
"""Markitdown-based file parsing for documents"""
|
|
|
|
def __init__(self, model: str = "markitdown"):
|
|
self.model = model
|
|
|
|
@trace_method
|
|
async def extract_text(self, content: bytes, mime_type: str) -> OCRResponse:
|
|
"""Extract text using markitdown."""
|
|
import asyncio
|
|
|
|
try:
|
|
# Handle simple text files directly
|
|
if is_simple_text_mime_type(mime_type):
|
|
logger.info(f"Extracting text directly (no processing needed): {self.model}")
|
|
text = content.decode("utf-8", errors="replace")
|
|
return OCRResponse(
|
|
model=self.model,
|
|
pages=[
|
|
OCRPageObject(
|
|
index=0,
|
|
markdown=text,
|
|
images=[],
|
|
dimensions=None,
|
|
)
|
|
],
|
|
usage_info=OCRUsageInfo(pages_processed=1),
|
|
document_annotation=None,
|
|
)
|
|
|
|
logger.info(f"Extracting text using markitdown: {self.model}")
|
|
|
|
# Run CPU/IO-intensive markitdown processing in thread pool to avoid blocking event loop
|
|
def blocking_markitdown_convert():
|
|
# Create temporary file to pass to markitdown
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=self._get_file_extension(mime_type)) as temp_file:
|
|
temp_file.write(content)
|
|
temp_file_path = temp_file.name
|
|
|
|
try:
|
|
md = MarkItDown(enable_plugins=False)
|
|
result = md.convert(temp_file_path)
|
|
return result.text_content
|
|
finally:
|
|
# Clean up temporary file
|
|
os.unlink(temp_file_path)
|
|
|
|
# Run blocking operations in thread pool
|
|
text_content = await asyncio.to_thread(blocking_markitdown_convert)
|
|
|
|
return OCRResponse(
|
|
model=self.model,
|
|
pages=[
|
|
OCRPageObject(
|
|
index=0,
|
|
markdown=text_content,
|
|
images=[],
|
|
dimensions=None,
|
|
)
|
|
],
|
|
usage_info=OCRUsageInfo(pages_processed=1),
|
|
document_annotation=None,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Markitdown text extraction failed: {str(e)}")
|
|
raise
|
|
|
|
def _get_file_extension(self, mime_type: str) -> str:
|
|
"""Get file extension based on MIME type for markitdown processing."""
|
|
mime_to_ext = {
|
|
"application/pdf": ".pdf",
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
"application/vnd.ms-excel": ".xls",
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
"text/csv": ".csv",
|
|
"application/json": ".json",
|
|
"text/xml": ".xml",
|
|
"application/xml": ".xml",
|
|
}
|
|
return mime_to_ext.get(mime_type, ".txt")
|