117 lines
4.5 KiB
Python
117 lines
4.5 KiB
Python
"""
|
|
This file contains functions for loading data into Letta's archival storage.
|
|
|
|
Data can be loaded with the following command, once a load function is defined:
|
|
```
|
|
letta load <data-connector-type> --name <dataset-name> [ADDITIONAL ARGS]
|
|
```
|
|
|
|
"""
|
|
|
|
import uuid
|
|
from typing import Annotated, List, Optional
|
|
|
|
import typer
|
|
|
|
from letta import create_client
|
|
from letta.data_sources.connectors import DirectoryConnector
|
|
|
|
# Typer application object; the data-connector commands in this file register on it via @app.command(...).
app = typer.Typer()
|
|
|
|
|
|
# Comma-separated file suffixes used as the default for the `extensions` option of the `directory` command.
default_extensions = ".txt,.md,.pdf"
|
|
|
|
|
|
@app.command("directory")
def load_directory(
    name: Annotated[str, typer.Option(help="Name of dataset to load.")],
    input_dir: Annotated[Optional[str], typer.Option(help="Path to directory containing dataset.")] = None,
    input_files: Annotated[List[str], typer.Option(help="List of paths to files containing dataset.")] = [],
    recursive: Annotated[bool, typer.Option(help="Recursively search for files in directory.")] = False,
    extensions: Annotated[str, typer.Option(help="Comma separated list of file extensions to load")] = default_extensions,
    user_id: Annotated[Optional[uuid.UUID], typer.Option(help="User ID to associate with dataset.")] = None,  # TODO: remove
    description: Annotated[Optional[str], typer.Option(help="Description of the source.")] = None,
):
    """Load files from a directory and/or an explicit list of files into a new source.

    A source with the given name is created and the selected files are loaded into it.
    If loading fails, the error is printed and the newly created source is deleted.
    """
    # NOTE: `user_id` and `description` are accepted as options but are not read anywhere in this function.
    # NOTE: the `[]` default of `input_files` is only forwarded to the connector; it is not mutated here.
    client = create_client()

    # create connector
    # Built before the source is created, so an error raised here cannot leave an empty source behind.
    # `extensions` is forwarded as the raw comma-separated string; it is not split in this function.
    connector = DirectoryConnector(input_files=input_files, input_directory=input_dir, recursive=recursive, extensions=extensions)

    # create source
    source = client.create_source(name=name)

    # load data
    try:
        client.load_data(connector, source_name=name)
    except Exception as e:
        # Report the failure, then remove the source created above so a failed load does not leave it behind.
        typer.secho(f"Failed to load data from provided information.\n{e}", fg=typer.colors.RED)
        client.delete_source(source.id)
|
|
|
|
|
|
# @app.command("webpage")
|
|
# def load_webpage(
|
|
# name: Annotated[str, typer.Option(help="Name of dataset to load.")],
|
|
# urls: Annotated[List[str], typer.Option(help="List of urls to load.")],
|
|
# ):
|
|
# try:
|
|
# from llama_index.readers.web import SimpleWebPageReader
|
|
#
|
|
# docs = SimpleWebPageReader(html_to_text=True).load_data(urls)
|
|
# store_docs(name, docs)
|
|
#
|
|
# except ValueError as e:
|
|
# typer.secho(f"Failed to load webpage from provided information.\n{e}", fg=typer.colors.RED)
|
|
|
|
|
|
@app.command("vector-database")
def load_vector_database(
    name: Annotated[str, typer.Option(help="Name of dataset to load.")],
    uri: Annotated[str, typer.Option(help="Database URI.")],
    table_name: Annotated[str, typer.Option(help="Name of table containing data.")],
    text_column: Annotated[str, typer.Option(help="Name of column containing text.")],
    embedding_column: Annotated[str, typer.Option(help="Name of column containing embedding.")],
    user_id: Annotated[Optional[uuid.UUID], typer.Option(help="User ID to associate with dataset.")] = None,
):
    """Load pre-computed embeddings into Letta from a database.

    Not implemented yet: invoking this command always raises NotImplementedError.
    """
    # Every option above is parsed but unused; the function raises unconditionally.
    # The message tells the user why the command failed instead of surfacing a bare exception name.
    raise NotImplementedError("Loading from a vector database is not implemented yet.")
|
|
# try:
|
|
# config = LettaConfig.load()
|
|
# connector = VectorDBConnector(
|
|
# uri=uri,
|
|
# table_name=table_name,
|
|
# text_column=text_column,
|
|
# embedding_column=embedding_column,
|
|
# embedding_dim=config.default_embedding_config.embedding_dim,
|
|
# )
|
|
# if not user_id:
|
|
# user_id = uuid.UUID(config.anon_clientid)
|
|
|
|
# ms = MetadataStore(config)
|
|
# source = Source(
|
|
# name=name,
|
|
# user_id=user_id,
|
|
# embedding_model=config.default_embedding_config.embedding_model,
|
|
# embedding_dim=config.default_embedding_config.embedding_dim,
|
|
# )
|
|
# ms.create_source(source)
|
|
# passage_storage = StorageConnector.get_storage_connector(TableType.PASSAGES, config, user_id)
|
|
# # TODO: also get document store
|
|
|
|
# # ingest data into passage/document store
|
|
# try:
|
|
# num_passages, num_documents = load_data(
|
|
# connector=connector,
|
|
# source=source,
|
|
# embedding_config=config.default_embedding_config,
|
|
# document_store=None,
|
|
# passage_store=passage_storage,
|
|
# )
|
|
# print(f"Loaded {num_passages} passages and {num_documents} documents from {name}")
|
|
# except Exception as e:
|
|
# typer.secho(f"Failed to load data from provided information.\n{e}", fg=typer.colors.RED)
|
|
# ms.delete_source(source_id=source.id)
|
|
|
|
# except ValueError as e:
|
|
# typer.secho(f"Failed to load VectorDB from provided information.\n{e}", fg=typer.colors.RED)
|
|
# raise
|