From e428e3666b8c8a0b9b38335e18e44a2266a7ac85 Mon Sep 17 00:00:00 2001 From: Charles Packer Date: Sat, 21 Oct 2023 14:18:43 -0700 Subject: [PATCH] Revert "Merge pull request #71 from cpacker/pdf-support" This reverts commit 1d566258642268084bf5029cb76c778cf19b9281, reversing changes made to b30c7836f011b3d2c336b5dd82c1136134ac58df. --- memgpt/utils.py | 13 +------------ requirements.txt | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/memgpt/utils.py b/memgpt/utils.py index d8308983..77658228 100644 --- a/memgpt/utils.py +++ b/memgpt/utils.py @@ -11,7 +11,6 @@ import faiss import tiktoken import glob import sqlite3 -import fitz from tqdm import tqdm from memgpt.openai_tools import async_get_embedding_with_backoff @@ -99,12 +98,6 @@ def read_in_chunks(file_object, chunk_size): break yield data -def read_pdf_in_chunks(file, chunk_size): - doc = fitz.open(file) - for page in doc: - text = page.get_text() - yield text - def read_in_rows_csv(file_object, chunk_size): csvreader = csv.reader(file_object) header = next(csvreader) @@ -130,11 +123,7 @@ def total_bytes(pattern): def chunk_file(file, tkns_per_chunk=300, model='gpt-4'): encoding = tiktoken.encoding_for_model(model) with open(file, 'r') as f: - if file.endswith('.pdf'): - lines = [l for l in read_pdf_in_chunks(file, tkns_per_chunk*8)] - if len(lines) == 0: - print(f"Warning: {file} did not have any extractable text.") - elif file.endswith('.csv'): + if file.endswith('.csv'): lines = [l for l in read_in_rows_csv(f, tkns_per_chunk*8)] else: lines = [l for l in read_in_chunks(f, tkns_per_chunk*4)] diff --git a/requirements.txt b/requirements.txt index 484bc1e2..6258b856 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,6 @@ geopy numpy openai pybars3 -pymupdf python-dotenv pytz rich