Skip to content

lancedb

langroid/vector_store/lancedb.py

LanceDB(config=LanceDBConfig())

Bases: VectorStore

Source code in langroid/vector_store/lancedb.py
def __init__(self, config: LanceDBConfig = LanceDBConfig()):
    """
    Set up the embedding function and connect to a local LanceDB database.

    Args:
        config (LanceDBConfig): Settings for this store. The fields read here
            are `embedding`, `host`, `port`, `cloud` and `storage_path`.
            NOTE: the default instance is created once, when the function is
            defined, so it is shared by every call that omits `config`.

    Raises:
        LangroidImportError: If the optional `lancedb` package is not
            available (`has_lancedb` is falsy).
    """
    super().__init__(config)
    if not has_lancedb:
        raise LangroidImportError("lancedb", "lancedb")

    self.config: LanceDBConfig = config
    emb_model = EmbeddingModel.create(config.embedding)
    self.embedding_fn: EmbeddingFunction = emb_model.embedding_fn()
    self.embedding_dim = emb_model.embedding_dims
    self.host = config.host
    self.port = config.port
    self.is_from_dataframe = False  # were docs ingested from a dataframe?
    self.df_metadata_columns: List[str] = []  # metadata columns from dataframe

    load_dotenv()
    if self.config.cloud:
        logger.warning(
            "LanceDB Cloud is not available yet. Switching to local storage."
        )
        config.cloud = False

    # Always connect to local storage. Previously this lived in an `else`
    # branch, so a config with cloud=True logged the "switching to local
    # storage" warning but never assigned `self.client`.
    try:
        self.client = lancedb.connect(
            uri=config.storage_path,
        )
    except Exception as e:
        # Best-effort fallback: retry against a sibling path so a broken
        # store at `storage_path` does not prevent start-up.
        new_storage_path = config.storage_path + ".new"
        logger.warning(
            f"""
                Error connecting to local LanceDB at {config.storage_path}:
                {e}
                Switching to {new_storage_path}
                """
        )
        self.client = lancedb.connect(
            uri=new_storage_path,
        )

clear_all_collections(really=False, prefix='')

Clear all collections with the given prefix.

Source code in langroid/vector_store/lancedb.py
def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
    """
    Drop every collection whose name starts with `prefix`.

    Args:
        really (bool): Safety switch; nothing is deleted unless this is True.
        prefix (str): Only collections whose names start with this string
            are dropped. The default "" matches every collection.

    Returns:
        int: Number of collections dropped (0 if `really` is False or no
            collection matches).
    """
    if not really:
        logger.warning("Not deleting all collections, set really=True to confirm")
        return 0

    # empty=True so that empty tables are candidates for deletion as well
    targets = [
        name
        for name in self.list_collections(empty=True)
        if name.startswith(prefix)
    ]
    if not targets:
        logger.warning(f"No collections found with prefix {prefix}")
        return 0

    n_empty_deletes = 0
    n_non_empty_deletes = 0
    for name in targets:
        # probe at most one row to classify the table before dropping it
        row_count = self.client.open_table(name).head(1).shape[0]
        if row_count == 0:
            n_empty_deletes += 1
        elif row_count > 0:
            n_non_empty_deletes += 1
        self.client.drop_table(name)

    logger.warning(
        f"""
        Deleted {n_empty_deletes} empty collections and 
        {n_non_empty_deletes} non-empty collections.
        """
    )
    return n_empty_deletes + n_non_empty_deletes

list_collections(empty=False)

Returns:

Type Description
List[str]

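List of collection names. Only collections with at least one vector are returned, unless `empty` is True, in which case empty collections are included as well.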

Parameters:

Name Type Description Default
empty bool

Whether to include empty collections.

False
Source code in langroid/vector_store/lancedb.py
def list_collections(self, empty: bool = False) -> List[str]:
    """
    List the names of the tables (collections) known to the LanceDB client.

    Args:
        empty (bool, optional): If True, return every table name, including
            tables that hold no rows. If False (the default), return only
            tables for which a one-row probe comes back non-empty.

    Returns:
        List[str]: The selected collection names, in the order the client
            reported them.
    """
    table_names = self.client.table_names(limit=None)
    if len(table_names) == 0:
        return []
    if empty:
        # caller wants everything, so the per-table row probe is skipped
        return table_names  # type: ignore

    non_empty: List[str] = []
    for table_name in table_names:
        # head(1) asks for at most one row: enough to tell empty from non-empty
        first_rows = self.client.open_table(table_name).head(1)
        if first_rows.shape[0] > 0:
            non_empty.append(table_name)
    return non_empty

add_dataframe(df, content='content', metadata=[])

Add a dataframe to the collection.

Args:

- df (pd.DataFrame): A dataframe.
- content (str): The name of the column in the dataframe that contains the text content to be embedded using the embedding model.
- metadata (List[str]): A list of column names in the dataframe that contain metadata to be stored in the database. Defaults to [].

Source code in langroid/vector_store/lancedb.py
def add_dataframe(
    self,
    df: pd.DataFrame,
    content: str = "content",
    metadata: List[str] = [],
) -> None:
    """
    Embed the text column of a dataframe and store its rows in the collection.

    The caller's dataframe is left untouched: the "vector" column (and the
    "id" column, when one has to be generated) are added to a private copy.

    Args:
        df (pd.DataFrame): A dataframe
        content (str): The name of the column in the dataframe that contains the
            text content to be embedded using the embedding model.
        metadata (List[str]): A list of column names in the dataframe that contain
            metadata to be stored in the database. Defaults to []. The list
            passed in is copied, never modified.
    """
    self.is_from_dataframe = True
    actual_metadata = metadata.copy()
    # Same list object as `actual_metadata`, so the in-place "id" append
    # below is visible through this attribute too.
    self.df_metadata_columns = actual_metadata  # could be updated below
    # get content column
    content_values = df[content].values.tolist()
    embedding_vecs = self.embedding_fn(content_values)

    # Work on our own frame so the caller's dataframe does not silently gain
    # "vector"/"id" columns.
    if content != "content":
        # rename content column to "content"; rename returns a new frame
        df = df.rename(columns={content: "content"}, inplace=False)
    else:
        df = df.copy()

    # add vector column
    df["vector"] = embedding_vecs

    if "id" not in df.columns:
        docs = dataframe_to_documents(df, content="content", metadata=metadata)
        ids = [str(d.id()) for d in docs]
        df["id"] = ids

    if "id" not in actual_metadata:
        actual_metadata += ["id"]

    colls = self.list_collections(empty=True)
    coll_name = self.config.collection_name
    if (
        coll_name not in colls
        or self.client.open_table(coll_name).head(1).shape[0] == 0
    ):
        # collection either doesn't exist or is empty, so replace it
        # and set new schema from df
        self.client.create_table(
            coll_name,
            data=df,
            mode="overwrite",
        )
        # NOTE(review): `content` is the caller's original column name, but
        # by now that column has been renamed to "content" in `df` -- confirm
        # that dataframe_to_document_model copes with this.
        doc_cls = dataframe_to_document_model(
            df,
            content=content,
            metadata=actual_metadata,
            exclude=["vector"],
        )
        self.config.document_class = doc_cls  # type: ignore
    else:
        # collection exists and is not empty, so append to it
        tbl = self.client.open_table(coll_name)
        tbl.add(df)