def read_pdf_in_chunks(file, chunk_size):
    """Yield the extractable text of a PDF, one page at a time.

    Args:
        file: Path of the PDF, passed straight to ``fitz.open``.
        chunk_size: Unused. Accepted so the signature lines up with
            ``read_in_chunks`` and ``read_in_rows_csv``; each page is
            emitted as a single chunk regardless of its length.

    Yields:
        The text of every page that contains at least one
        non-whitespace character.
    """
    doc = fitz.open(file)
    try:
        for page in doc:
            text = page.get_text()
            # Pages without any extractable text would otherwise be
            # yielded as empty strings, which makes a PDF with no text
            # at all look non-empty to callers that check for zero chunks.
            if text.strip():
                yield text
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the underlying document handle is always released.
        doc.close()