Revert "Revert "Merge pull request #71 from cpacker/pdf-support""
This reverts commit 0d4faa135783e152a43c370e7e8e2ed04090a661.
This commit is contained in:
@@ -11,6 +11,7 @@ import faiss
|
||||
import tiktoken
|
||||
import glob
|
||||
import sqlite3
|
||||
import fitz
|
||||
from tqdm import tqdm
|
||||
from memgpt.openai_tools import async_get_embedding_with_backoff
|
||||
|
||||
@@ -98,6 +99,12 @@ def read_in_chunks(file_object, chunk_size):
|
||||
break
|
||||
yield data
|
||||
|
||||
def read_pdf_in_chunks(file, chunk_size):
    """Yield the extracted text of a PDF, one page at a time.

    Args:
        file: The PDF to read; passed straight to ``fitz.open``.
        chunk_size: Accepted for signature parity with the other readers
            but not used here -- each yielded item is one whole page,
            whatever its length.

    Yields:
        The value of ``page.get_text()`` for every page that iterating
        the opened document produces.
    """
    doc = fitz.open(file)
    try:
        for page in doc:
            yield page.get_text()
    finally:
        # Runs when the pages are exhausted, when extraction raises, or
        # when the caller drops the generator early, so the underlying
        # document handle is always released.
        doc.close()
|
||||
|
||||
def read_in_rows_csv(file_object, chunk_size):
|
||||
csvreader = csv.reader(file_object)
|
||||
header = next(csvreader)
|
||||
@@ -123,7 +130,11 @@ def total_bytes(pattern):
|
||||
def chunk_file(file, tkns_per_chunk=300, model='gpt-4'):
|
||||
encoding = tiktoken.encoding_for_model(model)
|
||||
with open(file, 'r') as f:
|
||||
if file.endswith('.csv'):
|
||||
if file.endswith('.pdf'):
|
||||
lines = [l for l in read_pdf_in_chunks(file, tkns_per_chunk*8)]
|
||||
if len(lines) == 0:
|
||||
print(f"Warning: {file} did not have any extractable text.")
|
||||
elif file.endswith('.csv'):
|
||||
lines = [l for l in read_in_rows_csv(f, tkns_per_chunk*8)]
|
||||
else:
|
||||
lines = [l for l in read_in_chunks(f, tkns_per_chunk*4)]
|
||||
|
||||
@@ -6,6 +6,7 @@ geopy
|
||||
numpy
|
||||
openai
|
||||
pybars3
|
||||
pymupdf
|
||||
python-dotenv
|
||||
pytz
|
||||
rich
|
||||
|
||||
Reference in New Issue
Block a user