diff --git a/memgpt/utils.py b/memgpt/utils.py index d8308983..77658228 100644 --- a/memgpt/utils.py +++ b/memgpt/utils.py @@ -11,7 +11,6 @@ import faiss import tiktoken import glob import sqlite3 -import fitz from tqdm import tqdm from memgpt.openai_tools import async_get_embedding_with_backoff @@ -99,12 +98,6 @@ def read_in_chunks(file_object, chunk_size): break yield data -def read_pdf_in_chunks(file, chunk_size): - doc = fitz.open(file) - for page in doc: - text = page.get_text() - yield text - def read_in_rows_csv(file_object, chunk_size): csvreader = csv.reader(file_object) header = next(csvreader) @@ -130,11 +123,7 @@ def total_bytes(pattern): def chunk_file(file, tkns_per_chunk=300, model='gpt-4'): encoding = tiktoken.encoding_for_model(model) with open(file, 'r') as f: - if file.endswith('.pdf'): - lines = [l for l in read_pdf_in_chunks(file, tkns_per_chunk*8)] - if len(lines) == 0: - print(f"Warning: {file} did not have any extractable text.") - elif file.endswith('.csv'): + if file.endswith('.csv'): lines = [l for l in read_in_rows_csv(f, tkns_per_chunk*8)] else: lines = [l for l in read_in_chunks(f, tkns_per_chunk*4)] diff --git a/requirements.txt b/requirements.txt index 484bc1e2..6258b856 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,6 @@ geopy numpy openai pybars3 -pymupdf python-dotenv pytz rich