Co-authored-by: Charles Packer <packercharles@gmail.com> Co-authored-by: cthomas <caren@letta.com> Co-authored-by: Shubham Naik <shubham.naik10@gmail.com> Co-authored-by: Shubham Naik <shub@memgpt.ai> Co-authored-by: mlong93 <35275280+mlong93@users.noreply.github.com> Co-authored-by: Mindy Long <mindy@letta.com> Co-authored-by: Kevin Lin <klin5061@gmail.com> Co-authored-by: Stephan Fitzpatrick <stephan@knowsuchagency.com> Co-authored-by: dboyliao <qmalliao@gmail.com> Co-authored-by: Sarah Wooders <sarahwooders@gmail.com> Co-authored-by: Jyotirmaya Mahanta <jyotirmaya.mahanta@gmail.com> Co-authored-by: Nicholas <102550462+ndisalvio3@users.noreply.github.com> Co-authored-by: tarunkumark <tkksctwo@gmail.com> Co-authored-by: Miao <one.lemorage@gmail.com> Co-authored-by: Krishnakumar R (KK) <65895020+kk-src@users.noreply.github.com> Co-authored-by: Shubham Naik <shub@letta.com> Co-authored-by: Will Sargent <will.sargent@gmail.com>
98 lines
3.3 KiB
Python
98 lines
3.3 KiB
Python
import mimetypes
|
|
import os
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
|
|
|
|
def extract_file_metadata(file_path) -> dict:
    """Collect basic filesystem metadata for one path.

    Args:
        file_path: Path of the file to inspect. It is stored unchanged under
            the ``"file_path"`` key.

    Returns:
        dict: ``file_name``, ``file_path``, ``file_type`` (guessed MIME type,
        or ``"unknown"`` when no guess is available), ``file_size`` in bytes,
        and ``file_creation_date`` / ``file_last_modified_date`` formatted as
        ``YYYY-MM-DD``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    date_format = "%Y-%m-%d"
    guessed_type, _encoding = mimetypes.guess_type(file_path)
    # NOTE: the meaning of getctime is platform-dependent per the os.path docs
    # (creation time on Windows, last metadata change on Unix).
    created_at = datetime.fromtimestamp(os.path.getctime(file_path))
    modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))

    return {
        "file_name": os.path.basename(file_path),
        "file_path": file_path,
        "file_type": guessed_type or "unknown",
        "file_size": os.path.getsize(file_path),
        "file_creation_date": created_at.strftime(date_format),
        "file_last_modified_date": modified_at.strftime(date_format),
    }
|
|
|
|
|
|
def extract_metadata_from_files(file_list):
|
|
"""Extracts metadata for a list of files."""
|
|
metadata = []
|
|
for file_path in file_list:
|
|
file_metadata = extract_file_metadata(file_path)
|
|
if file_metadata:
|
|
metadata.append(file_metadata)
|
|
return metadata
|
|
|
|
|
|
def get_filenames_in_dir(
    input_dir: str, recursive: bool = True, required_exts: Optional[List[str]] = None, exclude: Optional[List[str]] = None
) -> List[str]:
    """
    Lists files under a directory, applying required_exts and exclude filters.

    Only paths are collected; file contents are never read.

    Args:
        input_dir (str): The directory to scan for files.
        recursive (bool): Whether to scan sub-directories as well.
        required_exts (list): List of file extensions to include
                              (e.g., ['pdf', 'txt']). A leading dot is
                              accepted too (e.g., ['.pdf']).
                              If None or empty, matches any file extension.
        exclude (list): List of file patterns to exclude (e.g., ['*png', '*jpg']).
                        Patterns are matched against the file name only.

    Returns:
        list: A list of matching file paths (as strings), in the order
              produced by the directory glob.

    Raises:
        ValueError: If the same string appears in both required_exts and
                    exclude. Only the raw strings are compared.
    """
    required_exts = required_exts or []
    exclude = exclude or []

    # Ensure required_exts and exclude do not overlap (raw string comparison)
    overlap = set(required_exts) & set(exclude)
    if overlap:
        raise ValueError(f"Extensions in required_exts and exclude overlap: {overlap}")

    # File suffixes are compared without their leading dot, so normalise the
    # requested extensions the same way; otherwise '.pdf' could never match.
    allowed_exts = {ext.lstrip(".") for ext in required_exts}

    search_pattern = "**/*" if recursive else "*"
    files = []

    for file_path in Path(input_dir).glob(search_pattern):
        if not file_path.is_file():
            continue

        # Exclusion patterns apply to the bare file name, not the full path.
        name_only = Path(file_path.name)
        if any(name_only.match(pattern) for pattern in exclude):
            continue

        # If no extensions were requested, match any file
        if not allowed_exts or file_path.suffix.lstrip(".") in allowed_exts:
            files.append(str(file_path))

    return files
|
|
|
|
|
|
def assert_all_files_exist_locally(file_paths: List[str]) -> bool:
    """
    Verifies that every path in the given list exists on the local filesystem.

    Args:
        file_paths (List[str]): Paths to check.

    Returns:
        bool: True when every path exists.

    Raises:
        FileNotFoundError: If at least one path is missing; the exception's
            single argument is the list of missing paths.
    """
    absent = []
    for candidate in file_paths:
        if Path(candidate).exists():
            continue
        absent.append(candidate)

    if not absent:
        return True

    raise FileNotFoundError(absent)
|