import logging import os import tempfile from markitdown import MarkItDown from mistralai import OCRPageObject, OCRResponse, OCRUsageInfo from letta.log import get_logger from letta.otel.tracing import trace_method from letta.services.file_processor.file_types import is_simple_text_mime_type from letta.services.file_processor.parser.base_parser import FileParser logger = get_logger(__name__) # Suppress pdfminer warnings that occur during PDF processing logging.getLogger("pdfminer.pdffont").setLevel(logging.ERROR) logging.getLogger("pdfminer.pdfinterp").setLevel(logging.ERROR) logging.getLogger("pdfminer.pdfpage").setLevel(logging.ERROR) logging.getLogger("pdfminer.converter").setLevel(logging.ERROR) class MarkitdownFileParser(FileParser): """Markitdown-based file parsing for documents""" def __init__(self, model: str = "markitdown"): self.model = model @trace_method async def extract_text(self, content: bytes, mime_type: str) -> OCRResponse: """Extract text using markitdown.""" import asyncio try: # Handle simple text files directly if is_simple_text_mime_type(mime_type): logger.info(f"Extracting text directly (no processing needed): {self.model}") text = content.decode("utf-8", errors="replace") return OCRResponse( model=self.model, pages=[ OCRPageObject( index=0, markdown=text, images=[], dimensions=None, ) ], usage_info=OCRUsageInfo(pages_processed=1), document_annotation=None, ) logger.info(f"Extracting text using markitdown: {self.model}") # Run CPU/IO-intensive markitdown processing in thread pool to avoid blocking event loop def blocking_markitdown_convert(): # Create temporary file to pass to markitdown with tempfile.NamedTemporaryFile(delete=False, suffix=self._get_file_extension(mime_type)) as temp_file: temp_file.write(content) temp_file_path = temp_file.name try: md = MarkItDown(enable_plugins=False) result = md.convert(temp_file_path) return result.text_content finally: # Clean up temporary file os.unlink(temp_file_path) # Run blocking operations in thread pool text_content = await asyncio.to_thread(blocking_markitdown_convert) return OCRResponse( model=self.model, pages=[ OCRPageObject( index=0, markdown=text_content, images=[], dimensions=None, ) ], usage_info=OCRUsageInfo(pages_processed=1), document_annotation=None, ) except Exception as e: logger.error(f"Markitdown text extraction failed: {str(e)}") raise def _get_file_extension(self, mime_type: str) -> str: """Get file extension based on MIME type for markitdown processing.""" mime_to_ext = { "application/pdf": ".pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", "application/vnd.ms-excel": ".xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", "text/csv": ".csv", "application/json": ".json", "text/xml": ".xml", "application/xml": ".xml", } return mime_to_ext.get(mime_type, ".txt")