Skip to content

pineconedb

langroid/vector_store/pineconedb.py

PineconeDB(config=PineconeDBConfig())

Bases: VectorStore

Source code in langroid/vector_store/pineconedb.py
def __init__(self, config: PineconeDBConfig = PineconeDBConfig()):
    """
    Set up the Pinecone client and, if a collection name is configured,
    create (or reuse) that collection.

    Args:
        config: settings for this vector store.
            NOTE(review): the default instance is built once, when the
            function is defined, so every call that omits `config` shares it;
            `create_collection` assigns `config.collection_name` - confirm
            that sharing is intended.

    Raises:
        LangroidImportError: if `has_pinecone` is falsy.
        ValueError: if the PINECONE_API_KEY environment variable is unset
            or empty.
    """
    super().__init__(config)
    if not has_pinecone:
        raise LangroidImportError("pinecone", "pinecone")
    self.config: PineconeDBConfig = config

    # load_dotenv() runs before the environment lookup below.
    load_dotenv()
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise ValueError("PINECONE_API_KEY not set, could not instantiate client")
    self.client = Pinecone(api_key=api_key)

    # Without a configured collection name there is nothing more to set up.
    if not config.collection_name:
        return
    self.create_collection(
        collection_name=config.collection_name,
        replace=config.replace_collection,
    )

clear_all_collections(really=False, prefix='')

Returns:

Type Description
int

Number of Pinecone indexes that were deleted

Parameters:

Name Type Description Default
really bool

Optional[bool] - whether to really delete all Pinecone collections

False
prefix str

Optional[str] - string to match potential Pinecone indexes for deletion

''
Source code in langroid/vector_store/pineconedb.py
def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
    """
    Delete every Pinecone index whose name starts with `prefix`.

    Args:
        really: safety switch; nothing is deleted unless this is True.
        prefix: only indexes whose name starts with this string are
            deleted. The default empty string matches every index.

    Returns:
        Number of Pinecone indexes that were deleted (0 when `really` is
        False or when no index name matches `prefix`).
    """
    if not really:
        logger.warning("Not deleting all collections, set really=True to confirm")
        return 0

    matching = []
    for meta in self._list_index_metas(empty=True):
        if meta.name.startswith(prefix):
            matching.append(meta)
    if not matching:
        logger.warning(f"No collections found with prefix {prefix}")
        return 0

    # Empty and non-empty indexes are tallied separately for the log message.
    empty_count = 0
    non_empty_count = 0
    for meta in matching:
        self.delete_collection(collection_name=meta.name)
        if meta.total_vector_count == 0:
            empty_count += 1
        elif meta.total_vector_count > 0:
            non_empty_count += 1
    logger.warning(
        f"""
        Deleted {empty_count} empty indexes and
        {non_empty_count} non-empty indexes
        """
    )
    return empty_count + non_empty_count

list_collections(empty=False)

Returns:

Type Description
List[str]

List of Pinecone index names. Only indexes that have at least one vector are included, unless empty is True, in which case all index names are returned.

Parameters:

Name Type Description Default
empty bool

Optional[bool] - whether to include empty collections

False
Source code in langroid/vector_store/pineconedb.py
def list_collections(self, empty: bool = False) -> List[str]:
    """
    List the names of the Pinecone indexes visible to the client.

    Args:
        empty: when True, return every index name; when False (default),
            return only the indexes whose stats report a
            `total_vector_count` greater than zero.

    Returns:
        The index names, in the order reported by the client.
    """
    index_list = self.client.list_indexes()
    if empty:
        return list(index_list.names())

    # One stats request per index is needed to find out whether it holds vectors.
    return [
        name
        for name in index_list.names()
        if self.client.Index(name=name)
        .describe_index_stats()
        .get("total_vector_count", 0)
        > 0
    ]

create_collection(collection_name, replace=False)

Create a collection with the given name, optionally replacing an existing collection if replace is True.

Parameters:

Name Type Description Default
collection_name str

str - Name of the collection (Pinecone index) to create.

required
replace bool

Optional[Bool] - Whether to replace an existing collection with the same name. Defaults to False.

False
Source code in langroid/vector_store/pineconedb.py
def create_collection(self, collection_name: str, replace: bool = False) -> None:
    """
    Create a collection (Pinecone index) with the given name, optionally
    replacing an existing collection if `replace` is True.

    If an index with this name already exists, is ready and holds at least
    one vector, it is left untouched unless `replace` is True. An existing
    index that is empty or not ready is always deleted and recreated.
    A `PineconeApiException` raised while creating the index is logged and
    not re-raised.

    Args:
        collection_name: name of the collection to create; may only contain
            lowercase letters, digits and '-'.
        replace: whether to replace an existing non-empty collection
            with the same name. Defaults to False.

    Raises:
        ValueError: if `collection_name` is empty or contains anything other
            than lowercase alphanumeric characters or '-'.
    """
    # fullmatch rather than match with a `$` anchor: `$` also matches just
    # before a trailing newline, which let names such as "abc\n" through.
    if not re.fullmatch(r"[a-z0-9-]+", collection_name):
        raise ValueError(
            "Pinecone index names must be lowercase alphanumeric characters or '-'"
        )
    self.config.collection_name = collection_name
    if collection_name in self.list_collections(empty=True):
        index = self.client.Index(name=collection_name)
        stats = index.describe_index_stats()
        status = self.client.describe_index(name=collection_name)
        if status["status"]["ready"] and stats["total_vector_count"] > 0:
            logger.warning(f"Non-empty collection {collection_name} already exists")
            if not replace:
                logger.warning("Not replacing collection")
                return
            else:
                logger.warning("Recreating fresh collection")
        # Reached for empty/not-ready indexes and for confirmed replacements.
        self.delete_collection(collection_name=collection_name)

    payload = {
        "name": collection_name,
        "dimension": self.embedding_dim,
        "spec": self.config.spec,
        "metric": self.config.metric,
        "timeout": self.config.timeout,
    }

    # Only forwarded when configured with a truthy value.
    if self.config.deletion_protection:
        payload["deletion_protection"] = self.config.deletion_protection

    try:
        self.client.create_index(**payload)
    except PineconeApiException as e:
        logger.error(e)

get_all_documents(prefix='', namespace='')

Returns:

Type Description
List[Document]

All documents for the collection currently defined in the configuration object

Parameters:

Name Type Description Default
prefix str

str - document id prefix to search for

''
namespace str

str - partition of vectors to search within the index

''
Source code in langroid/vector_store/pineconedb.py
def get_all_documents(
    self, prefix: str = "", namespace: str = ""
) -> List[Document]:
    """
    Retrieve the documents of the collection named in the configuration
    object, paging through the index's vector ids.

    Args:
        prefix: document id prefix to search for; not sent when empty.
        namespace: partition of vectors to search within the index;
            not sent when empty.

    Returns:
        Documents gathered from every page of ids, in listing order.

    Raises:
        ValueError: if no collection name is set in the configuration.
    """
    collection = self.config.collection_name
    if collection is None:
        raise ValueError("No collection name set, cannot retrieve docs")

    list_kwargs: Dict[str, Union[str, int]] = {
        "limit": self.config.pagination_size
    }
    # Optional filters are only included when they carry a value.
    for option, value in (("prefix", prefix), ("namespace", namespace)):
        if value:
            list_kwargs[option] = value

    index = self.client.Index(name=collection)
    collected = []

    while True:
        page = index.list_paginated(**list_kwargs)
        page_vectors = page.get("vectors", [])

        if not page_vectors:
            logger.warning("Received empty list while requesting for vector ids")
            logger.warning("Halting fetch requests")
            if settings.debug:
                logger.debug(f"Request for failed fetch was: {list_kwargs}")
            break

        page_ids = [vector.get("id") for vector in page_vectors]
        collected.extend(
            self.get_documents_by_ids(ids=page_ids, namespace=namespace or "")
        )

        # A missing or empty "next" token marks the last page.
        next_token = page.get("pagination", {}).get("next", None)
        if not next_token:
            break
        list_kwargs["pagination_token"] = next_token

    return collected

get_documents_by_ids(ids, namespace='')

Returns:

Type Description
List[Document]

Fetches document text embedded in Pinecone index metadata

Parameters:

Name Type Description Default
ids List[str]

List[str] - vector data object ids to retrieve

required
namespace str

str - partition of vectors to search within the index

''
Source code in langroid/vector_store/pineconedb.py
def get_documents_by_ids(
    self, ids: List[str], namespace: str = ""
) -> List[Document]:
    """
    Fetch the given vector ids from the configured index and turn each
    record's metadata into a document.

    Args:
        ids: vector data object ids to retrieve.
        namespace: partition of vectors to search within the index;
            not sent when empty.

    Returns:
        One document per requested id that was found, in the order of
        `ids`; ids missing from the fetch response are skipped.

    Raises:
        ValueError: if no collection name is set in the configuration.
    """
    if self.config.collection_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    index = self.client.Index(name=self.config.collection_name)

    fetch_kwargs = {"ids": ids}
    if namespace:
        fetch_kwargs["namespace"] = namespace
    records = index.fetch(**fetch_kwargs)

    # Re-order by the caller's id list rather than by the response's own order.
    fetched = dict(records["vectors"].items())
    found = [fetched[doc_id] for doc_id in ids if doc_id in fetched]

    documents = []
    for record in found:
        documents.append(self.transform_pinecone_vector(record.get("metadata", {})))
    return documents

transform_pinecone_vector(metadata_dict)

Parses the metadata response from the Pinecone vector query and formats it into a dictionary that can be parsed by the Document class associated with the PineconeDBConfig class

Returns:

Type Description
Document

Document built from the well-formed metadata dictionary

Parameters:

Name Type Description Default
metadata_dict Dict[str, Any]

Dict - the metadata dictionary from the Pinecone vector query match

required
Source code in langroid/vector_store/pineconedb.py
def transform_pinecone_vector(self, metadata_dict: Dict[str, Any]) -> Document:
    """
    Build an instance of the configured document class from the metadata
    of a Pinecone vector.

    Every metadata entry is passed as a keyword argument, and a copy of the
    whole metadata dictionary is passed as the `metadata` argument
    (overriding any `metadata` entry in the input).

    Args:
        metadata_dict: the metadata dictionary from the Pinecone
            vector query match.

    Returns:
        The object produced by `self.config.document_class`.
    """
    fields = dict(metadata_dict)
    fields["metadata"] = dict(metadata_dict)
    return self.config.document_class(**fields)