Revert "Revert "Merge pull request #71 from cpacker/pdf-support""

This reverts commit 0d4faa135783e152a43c370e7e8e2ed04090a661.
This commit is contained in:
Vivian Fang
2023-10-21 14:21:02 -07:00
parent e428e3666b
commit 1ae38e4bec
2 changed files with 13 additions and 1 deletions

View File

@@ -11,6 +11,7 @@ import faiss
import tiktoken
import glob
import sqlite3
import fitz
from tqdm import tqdm
from memgpt.openai_tools import async_get_embedding_with_backoff
@@ -98,6 +99,12 @@ def read_in_chunks(file_object, chunk_size):
break
yield data
def read_pdf_in_chunks(file, chunk_size):
    """Yield the extracted text of a PDF, one page at a time.

    Args:
        file: Path of the PDF, passed straight to ``fitz.open``.
        chunk_size: Unused. Kept so the signature parallels the other
            ``read_in_*`` readers in this module; pages are yielded whole.

    Yields:
        The string returned by ``page.get_text()`` for each page that
        contains at least one non-whitespace character.
    """
    # Use the document as a context manager so it is closed when the
    # generator is exhausted or discarded, rather than leaking the handle.
    with fitz.open(file) as doc:
        for page in doc:
            text = page.get_text()
            # A page with nothing extractable produces an empty string.
            # Skipping it means a PDF with no text at all yields nothing,
            # so a caller checking for an empty result can detect that case.
            if text.strip():
                yield text
def read_in_rows_csv(file_object, chunk_size):
csvreader = csv.reader(file_object)
header = next(csvreader)
@@ -123,7 +130,11 @@ def total_bytes(pattern):
def chunk_file(file, tkns_per_chunk=300, model='gpt-4'):
encoding = tiktoken.encoding_for_model(model)
with open(file, 'r') as f:
if file.endswith('.csv'):
if file.endswith('.pdf'):
lines = [l for l in read_pdf_in_chunks(file, tkns_per_chunk*8)]
if len(lines) == 0:
print(f"Warning: {file} did not have any extractable text.")
elif file.endswith('.csv'):
lines = [l for l in read_in_rows_csv(f, tkns_per_chunk*8)]
else:
lines = [l for l in read_in_chunks(f, tkns_per_chunk*4)]

View File

@@ -6,6 +6,7 @@ geopy
numpy
openai
pybars3
pymupdf
python-dotenv
pytz
rich