Skip to content

models

langroid/embedding_models/models.py

FastEmbedEmbeddingsConfig

Bases: EmbeddingModelsConfig

Config for qdrant/fastembed embeddings, see here: https://github.com/qdrant/fastembed

EmbeddingFunctionCallable(model, batch_size=512)

A callable class designed to generate embeddings for a list of texts using the OpenAI API, with automatic retries on failure.

Attributes:

Name Type Description
model OpenAIEmbeddings

An instance of OpenAIEmbeddings that provides configuration and utilities for generating embeddings.

Methods:

Name Description
__call__

(texts: List[str]) -> Embeddings: Generate embeddings for a list of input texts.

Parameters:

Name Type Description Default
model OpenAIEmbeddings

An instance of OpenAIEmbeddings to use for generating embeddings.

required
batch_size int

Batch size

512
Source code in langroid/embedding_models/models.py
def __init__(self, model: "OpenAIEmbeddings", batch_size: int = 512):
    """
    Initialize the EmbeddingFunctionCallable with a specific model.

    Args:
        model (OpenAIEmbeddings): An instance of OpenAIEmbeddings to use for
        generating embeddings.
        batch_size (int): Batch size
    """
    self.model = model
    self.batch_size = batch_size

OpenAIEmbeddings(config=OpenAIEmbeddingsConfig())

Bases: EmbeddingModel

Source code in langroid/embedding_models/models.py
def __init__(self, config: "OpenAIEmbeddingsConfig | None" = None):
    """
    Initialize the OpenAI embeddings client from config and environment.

    Args:
        config: configuration for the embeddings model; defaults to a
            fresh ``OpenAIEmbeddingsConfig()``.

    Raises:
        ValueError: if the OPENAI_API_KEY environment variable is not set.
    """
    super().__init__()
    # Build a fresh config per instance when none is given. (A default of
    # ``config=OpenAIEmbeddingsConfig()`` would be a single shared mutable
    # instance: the env-var assignments below would then mutate the same
    # object across every OpenAIEmbeddings constructed with the default.)
    self.config = OpenAIEmbeddingsConfig() if config is None else config
    load_dotenv()
    # Environment variables override whatever the caller put in the config.
    self.config.api_key = os.getenv("OPENAI_API_KEY", "")
    self.config.organization = os.getenv("OPENAI_ORGANIZATION", "")
    if self.config.api_key == "":
        raise ValueError(
            """OPENAI_API_KEY env variable must be set to use 
            OpenAIEmbeddings. Please set the OPENAI_API_KEY value 
            in your .env file.
            """
        )
    self.client = OpenAI(base_url=self.config.api_base, api_key=self.config.api_key)
    self.tokenizer = tiktoken.encoding_for_model(self.config.model_name)

truncate_texts(texts)

Truncate texts to the embedding model's context length. TODO: Maybe we should show warning, and consider doing T5 summarization?

Source code in langroid/embedding_models/models.py
def truncate_texts(self, texts: List[str]) -> List[List[int]]:
    """
    Tokenize each text and clip it to the embedding model's context length.

    Args:
        texts: the input strings.

    Returns:
        One token-id list per input text, each at most
        ``self.config.context_length`` tokens long.

    TODO: Maybe we should show warning, and consider doing T5 summarization?
    """
    limit = self.config.context_length
    truncated: List[List[int]] = []
    for text in texts:
        token_ids = self.tokenizer.encode(text, disallowed_special=())
        truncated.append(token_ids[:limit])
    return truncated

embedding_model(embedding_fn_type='openai')

Parameters:

Name Type Description Default
embedding_fn_type str

"openai" or "sentencetransformer" # others soon

'openai'

Returns: EmbeddingModel

Source code in langroid/embedding_models/models.py
def embedding_model(embedding_fn_type: str = "openai") -> EmbeddingModel:
    """
    Map an embedding-function type name to its EmbeddingModel class.

    Args:
        embedding_fn_type: one of "openai", "fastembed", or
            "sentencetransformer"; any value other than "openai" or
            "fastembed" falls through to the sentence-transformer model.
    Returns:
        The EmbeddingModel *class* (not an instance) for the given type.
    """
    if embedding_fn_type == "openai":
        return OpenAIEmbeddings  # type: ignore
    elif embedding_fn_type == "fastembed":
        return FastEmbedEmbeddings  # type: ignore
    else:  # default: sentence transformer
        return SentenceTransformerEmbeddings  # type: ignore