base

`VectorStore(config)` ¶

Bases: ABC

Abstract base class for a vector store.

Source code in langroid/vector_store/base.py

def __init__(self, config: VectorStoreConfig):
    """Initialize the vector store from its configuration.

    The embedding model supplied on the config is used when present;
    otherwise one is created from `config.embedding`. The model's
    embedding function is stored on the instance as `embedding_fn`.

    Args:
        config (VectorStoreConfig): Settings for this vector store.
    """
    self.config = config
    model = config.embedding_model
    if model is None:
        # no ready-made model on the config: build one from the embedding settings
        model = EmbeddingModel.create(config.embedding)
    self.embedding_model = model
    self.embedding_fn: EmbeddingFunction = self.embedding_model.embedding_fn()

`clear_empty_collections()` `abstractmethod` ¶

Clear all empty collections in the vector store. Returns the number of collections deleted.

Source code in langroid/vector_store/base.py

@abstractmethod
def clear_empty_collections(self) -> int:
    """Delete every empty collection in the vector store.

    Returns:
        int: Number of collections deleted.
    """
    ...

`clear_all_collections(really=False, prefix='')` `abstractmethod` ¶

Clear all collections in the vector store.

Parameters:

Name	Type	Description	Default
`really`	`bool`	Whether to really clear all collections. Defaults to False.	`False`
`prefix`	`str`	Prefix of collections to clear.	`''`

Returns: int: Number of collections deleted.

Source code in langroid/vector_store/base.py

@abstractmethod
def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
    """Clear all collections in the vector store.

    Args:
        really (bool, optional): Whether to really clear all collections.
            Defaults to False.
        prefix (str, optional): Prefix of collections to clear.
            Defaults to "".

    Returns:
        int: Number of collections deleted.
    """
    ...

`list_collections(empty=False)` `abstractmethod` ¶

List all collections in the vector store (only non-empty collections if `empty=False`).

Source code in langroid/vector_store/base.py

@abstractmethod
def list_collections(self, empty: bool = False) -> List[str]:
    """List the collections in the vector store.

    Args:
        empty (bool, optional): When False, only non-empty collections
            are listed. Defaults to False.

    Returns:
        List[str]: The listed collections.
    """
    ...

`set_collection(collection_name, replace=False)` ¶

Set the current collection to the given collection name.

Args: `collection_name` (str): Name of the collection. `replace` (bool, optional): Whether to replace the collection if it already exists. Defaults to False.

Source code in langroid/vector_store/base.py

def set_collection(self, collection_name: str, replace: bool = False) -> None:
    """
    Make `collection_name` the current collection.

    The name and the replace flag are recorded on `self.config`. When
    `replace` is True the collection is also (re-)created via
    `create_collection(collection_name, replace=True)`.

    Args:
        collection_name (str): Name of the collection.
        replace (bool, optional): Whether to replace the collection if it
            already exists. Defaults to False.
    """
    cfg = self.config
    cfg.collection_name = collection_name
    cfg.replace_collection = replace
    if not replace:
        # nothing to (re-)create; only the config is updated
        return
    self.create_collection(collection_name, replace=True)

`create_collection(collection_name, replace=False)` `abstractmethod` ¶

Create a collection with the given name.

Args: `collection_name` (str): Name of the collection. `replace` (bool, optional): Whether to replace the collection if it already exists. Defaults to False.

Source code in langroid/vector_store/base.py

@abstractmethod
def create_collection(self, collection_name: str, replace: bool = False) -> None:
    """Create a collection with the given name.

    Args:
        collection_name (str): Name of the collection.
        replace (bool, optional): Whether to replace the collection
            if it already exists. Defaults to False.
    """
    ...

`compute_from_docs(docs, calc)` ¶

Compute a result on a set of documents, using a dataframe calc string like df.groupby('state')['income'].mean().

If the store's `config.full_eval` is False (default), the input expression is sanitized to prevent most common code injection attack vectors. If `config.full_eval` is True, sanitization is bypassed - use only with trusted input!

Source code in langroid/vector_store/base.py

def compute_from_docs(self, docs: List[Document], calc: str) -> str:
    """Evaluate a dataframe expression over a set of documents.

    Each document is flattened to a dict (nested fields become dotted-path
    keys) and the dicts are loaded into a pandas DataFrame that the
    expression can reference as `df`, e.g.
    `df.groupby('state')['income'].mean()`.

    Unless `self.config.full_eval` is set, `calc` is first passed through
    `sanitize_command` to prevent most common code injection attack
    vectors. With `full_eval` enabled the expression is evaluated as-is -
    use only with trusted input!

    Args:
        docs (List[Document]): Documents to compute over.
        calc (str): Expression to evaluate against `df`.

    Returns:
        str: The stringified result, or an error message describing why
            the evaluation failed.
    """
    # one row per doc; nested fields are addressed by dotted paths
    rows = [flatten_dict(doc.dict(by_alias=True)) for doc in docs]
    frame = pd.DataFrame(rows)

    try:
        # SECURITY MITIGATION: Eval input is sanitized to prevent most common
        # code injection attack vectors when full_eval is False.
        eval_globals = {"df": frame}
        expression = calc if self.config.full_eval else sanitize_command(calc)
        compiled = compile(expression, "<calc>", "eval")
        result = eval(compiled, eval_globals, {})
    except Exception as e:
        # hand the error text back so the LLM can fix the calc string if needed
        err = f"""
        Error encountered in pandas eval: {str(e)}
        """
        missing_index_key = isinstance(e, KeyError) and "not in index" in str(e)
        if missing_index_key:
            # Pd.eval sometimes fails on a perfectly valid exprn like
            # df.loc[..., 'column'] with a KeyError.
            err += """
            Maybe try a different way, e.g. 
            instead of df.loc[..., 'column'], try df.loc[...]['column']
            """
        return err
    return stringify(result)

`maybe_add_ids(documents)` ¶

Add ids to metadata if absent, since some vecdbs don't like having blank ids.

Source code in langroid/vector_store/base.py

def maybe_add_ids(self, documents: Sequence[Document]) -> None:
    """Fill in missing document ids in place.

    Any document whose `metadata.id` is None or the empty string gets a
    fresh id from `ObjectRegistry.new_id()`, since some vecdbs don't like
    having blank ids.

    Args:
        documents (Sequence[Document]): Documents to check; modified in place.
    """
    for doc in documents:
        meta = doc.metadata
        if meta.id in (None, ""):
            meta.id = ObjectRegistry.new_id()

`similar_texts_with_scores(text, k=1, where=None)` `abstractmethod` ¶

Find k most similar texts to the given text, in terms of vector distance metric (e.g., cosine similarity).

Parameters:

Name	Type	Description	Default
`text`	`str`	The text to find similar texts for.	required
`k`	`int`	Number of similar texts to retrieve. Defaults to 1.	`1`
`where`	`Optional[str]`	Where clause to filter the search.	`None`

Returns:

Type	Description
`List[Tuple[Document, float]]`	List[Tuple[Document,float]]: List of (Document, score) tuples.

Source code in langroid/vector_store/base.py

@abstractmethod
def similar_texts_with_scores(
    self,
    text: str,
    k: int = 1,
    where: Optional[str] = None,
) -> List[Tuple[Document, float]]:
    """
    Retrieve the k texts most similar to `text`, as measured by a vector
    distance metric (e.g., cosine similarity).

    Args:
        text (str): The text to find similar texts for.
        k (int, optional): Number of similar texts to retrieve. Defaults to 1.
        where (Optional[str], optional): Where clause to filter the search.
            Defaults to None.

    Returns:
        List[Tuple[Document,float]]: List of (Document, score) tuples.
    """
    ...

`add_context_window(docs_scores, neighbors=0)` ¶

In each doc's metadata, there may be a window_ids field indicating the ids of the chunks around the current chunk. These window_ids may overlap, so we (1) coalesce each group of overlapping windows into a single window (maintaining ordering), and (2) create a new document for each coalesced window, preserving metadata.

We may have stored a longer set of window_ids than we need during chunking. Now, we just want neighbors on each side of the center of the window_ids list.

Parameters:

Name	Type	Description	Default
`docs_scores`	`List[Tuple[Document, float]]`	List of pairs of documents to add context windows to together with their match scores.	required
`neighbors`	`int`	Number of neighbors on "each side" of match to retrieve. Defaults to 0. "Each side" here means before and after the match, in the original text.	`0`

Returns:

Type	Description
`List[Tuple[Document, float]]`	List[Tuple[Document, float]]: List of (Document, score) tuples.

Source code in langroid/vector_store/base.py

def add_context_window(
    self, docs_scores: List[Tuple[Document, float]], neighbors: int = 0
) -> List[Tuple[Document, float]]:
    """
    Expand each matched chunk into a window of surrounding chunks.

    A doc's metadata may carry a `window_ids` field listing the ids of the
    chunks around it. Windows of different matches may overlap, so we
    - coalesce each group of overlapping windows into a single window
      (maintaining ordering),
    - create a new document for each coalesced window, preserving metadata.

    The stored `window_ids` may be longer than needed; only `neighbors`
    ids on each side of the matched chunk are kept.

    Args:
        docs_scores (List[Tuple[Document, float]]): List of pairs of documents
            to add context windows to together with their match scores.
        neighbors (int, optional): Number of neighbors on "each side" of match
            to retrieve. Defaults to 0.
            "Each side" here means before and after the match,
            in the original text.

    Returns:
        List[Tuple[Document, float]]: List of (Document, score) tuples.
            The input list itself is returned unchanged when `neighbors`
            is 0 or none of the docs is a chunk.
    """
    matched_docs = [doc for doc, _ in docs_scores]
    match_scores = [score for _, score in docs_scores]
    if neighbors == 0:
        return docs_scores
    chunk_docs = [doc for doc in matched_docs if doc.metadata.is_chunk]
    if not chunk_docs:
        return docs_scores

    windows = []
    id2metadata = {}
    # id -> highest score of a doc it appears in
    id2max_score: Dict[int | str, float] = {}
    for doc, score in zip(matched_docs, match_scores):
        surrounding_ids = doc.metadata.window_ids
        if len(surrounding_ids) == 0:
            # no stored window: the doc is its own single-element window
            surrounding_ids = [doc.id()]
        for chunk_id in surrounding_ids:
            id2metadata[chunk_id] = doc.metadata
            # NOTE(review): the default of 0 means a negative score is
            # recorded as 0 - confirm scores are always non-negative.
            id2max_score[chunk_id] = max(id2max_score.get(chunk_id, 0), score)

        # keep only `neighbors` ids on each side of the matched chunk
        center = surrounding_ids.index(doc.id())
        start = max(0, center - neighbors)
        stop = min(len(surrounding_ids), center + neighbors + 1)
        windows.append(surrounding_ids[start:stop])

    # windows could come from different docs and may overlap,
    # so overlapping groups are coalesced into separate windows.
    merged_windows = self.remove_overlaps(windows)

    results = []
    for window in merged_windows:
        window_metadata = copy.deepcopy(id2metadata[window[0]])
        window_metadata.window_ids = window
        pieces = self.get_documents_by_ids(window)
        combined = Document(
            content="".join(piece.content for piece in pieces),
            metadata=window_metadata,
        )
        # make a fresh id since content is in general different
        combined.metadata.id = ObjectRegistry.new_id()
        best_score = max(id2max_score[chunk_id] for chunk_id in window)
        results.append((combined, best_score))
    return results

`remove_overlaps(windows)` `staticmethod` ¶

Given a collection of windows, where each window is a sequence of ids, identify groups of overlapping windows, and for each overlapping group, order the chunk-ids using topological sort so they appear in the original order in the text.

Parameters:

Name	Type	Description	Default
`windows`	`List[int \| str]`	List of windows, where each window is a sequence of ids.	required

Returns:

Type	Description
`List[List[str]]`	List[List[str]]: List of windows, where each window is a sequence of ids, and no two windows overlap.

Source code in langroid/vector_store/base.py

@staticmethod
def remove_overlaps(windows: List[List[str]]) -> List[List[str]]:
    """
    Merge overlapping windows of chunk ids into non-overlapping ones.

    Windows that share at least one id are grouped together (like connected
    components of a graph). Within a group the windows are put in order
    with a topological sort, so that the chunk ids appear in their original
    order in the text, and are then concatenated with duplicate ids removed.

    Args:
        windows (List[List[str]]): List of windows, where each window is a
            sequence of ids.

    Returns:
        List[List[str]]: List of windows, where each window is a sequence
            of ids, and no two windows overlap.
    """
    # id -> {window index -> position of the id inside that window}
    id2win2pos: Dict[str, Dict[int, int]] = {}
    for win_idx, window in enumerate(windows):
        for pos, chunk_id in enumerate(window):
            id2win2pos.setdefault(chunk_id, {})[win_idx] = pos

    num_windows = len(windows)
    # pairwise relation between windows; 0 means the two do not overlap
    order = np.zeros((num_windows, num_windows), dtype=np.int8)
    for a, win_a in enumerate(windows):
        ids_a = set(win_a)
        for b, win_b in enumerate(windows):
            if a == b:
                continue
            shared = ids_a.intersection(win_b)
            if not shared:
                continue
            pivot = next(iter(shared))  # any common id
            pivot_pos = id2win2pos[pivot]
            # the shared id sits later in the window that starts earlier
            if pivot_pos[a] > pivot_pos[b]:
                order[a, b] = -1  # win a is before win b
            else:
                order[a, b] = 1  # win a is after win b

    # find groups of windows that overlap, like connected components in a graph
    groups = components(np.abs(order))

    merged_windows = []
    for group in groups:
        # find total ordering among windows in group based on order matrix
        # (this is a topological sort)
        members = np.array(group)
        group_order = order[members][:, members]
        sorted_positions = topological_sort(group_order)
        ordered_windows = [windows[k] for k in members[sorted_positions]]
        # Note we are not going to split these, and instead we'll return
        # larger windows from concatenating the connected groups.
        # This ensures context is retained for LLM q/a
        unique_ids = list(
            dict.fromkeys(cid for window in ordered_windows for cid in window)
        )
        merged_windows.append(unique_ids)

    return merged_windows

`get_all_documents(where='')` `abstractmethod` ¶

Get all documents in the current collection, possibly filtered by where.

Source code in langroid/vector_store/base.py

@abstractmethod
def get_all_documents(self, where: str = "") -> List[Document]:
    """
    Return all documents in the current collection, possibly filtered
    by `where`.

    Args:
        where (str, optional): Filter applied to the documents.
            Defaults to "".

    Returns:
        List[Document]: List of documents.
    """
    ...

`get_documents_by_ids(ids)` `abstractmethod` ¶

Get documents by their ids.

Args: `ids` (List[str]): List of document ids.

Returns:

Type	Description
`List[Document]`	List[Document]: List of documents

Source code in langroid/vector_store/base.py

@abstractmethod
def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
    """
    Look up documents by their ids.

    Args:
        ids (List[str]): List of document ids.

    Returns:
        List[Document]: List of documents
    """
    ...

base

VectorStore(config) ¶

clear_empty_collections() abstractmethod ¶

clear_all_collections(really=False, prefix='') abstractmethod ¶

list_collections(empty=False) abstractmethod ¶

set_collection(collection_name, replace=False) ¶

create_collection(collection_name, replace=False) abstractmethod ¶

compute_from_docs(docs, calc) ¶

maybe_add_ids(documents) ¶

similar_texts_with_scores(text, k=1, where=None) abstractmethod ¶

add_context_window(docs_scores, neighbors=0) ¶

remove_overlaps(windows) staticmethod ¶

get_all_documents(where='') abstractmethod ¶

get_documents_by_ids(ids) abstractmethod ¶

`VectorStore(config)` ¶

`clear_empty_collections()` `abstractmethod` ¶

`clear_all_collections(really=False, prefix='')` `abstractmethod` ¶

`list_collections(empty=False)` `abstractmethod` ¶

`set_collection(collection_name, replace=False)` ¶

`create_collection(collection_name, replace=False)` `abstractmethod` ¶

`compute_from_docs(docs, calc)` ¶

`maybe_add_ids(documents)` ¶

`similar_texts_with_scores(text, k=1, where=None)` `abstractmethod` ¶

`add_context_window(docs_scores, neighbors=0)` ¶

`remove_overlaps(windows)` `staticmethod` ¶

`get_all_documents(where='')` `abstractmethod` ¶

`get_documents_by_ids(ids)` `abstractmethod` ¶