Skip to content

embedding_models

langroid/embedding_models/__init__.py

EmbeddingModel

Bases: ABC

Abstract base class for an embedding model.

similarity(text1, text2)

Compute cosine similarity between two texts.

Source code in langroid/embedding_models/base.py
def similarity(self, text1: str, text2: str) -> float:
    """
    Compute cosine similarity between two texts.

    Both texts are embedded in a single call to the function returned by
    `self.embedding_fn()`; the result is the dot product of the two
    embeddings divided by the product of their norms.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two embeddings, or 0.0 when either
        embedding has zero norm (the cosine is undefined there, and an
        unguarded division would produce NaN).
    """
    emb1, emb2 = self.embedding_fn()([text1, text2])
    # Convert once so the dot product and both norms share the same arrays.
    vec1 = np.asarray(emb1)
    vec2 = np.asarray(emb2)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return float(vec1 @ vec2 / denom)

OpenAIEmbeddings(config=OpenAIEmbeddingsConfig())

Bases: EmbeddingModel

Source code in langroid/embedding_models/models.py
def __init__(self, config: OpenAIEmbeddingsConfig = OpenAIEmbeddingsConfig()):
    """
    Set up the OpenAI client and tokenizer for this embedding model.

    After loading any `.env` file, the `OPENAI_API_KEY` and
    `OPENAI_ORGANIZATION` environment variables are written onto `config`,
    replacing whatever values it held.

    Args:
        config: Settings for the model. `api_base` and `model_name` are
            read from it; `api_key` and `organization` are overwritten.

    Raises:
        ValueError: If `OPENAI_API_KEY` is unset or empty.
    """
    super().__init__()
    self.config = config
    cfg = self.config

    # The .env file has to be loaded before the environment is consulted.
    load_dotenv()
    cfg.api_key = os.getenv("OPENAI_API_KEY", "")
    # Stored on the config only; it is not handed to the client built below.
    cfg.organization = os.getenv("OPENAI_ORGANIZATION", "")

    if cfg.api_key == "":
        raise ValueError(
            """OPENAI_API_KEY env variable must be set to use 
            OpenAIEmbeddings. Please set the OPENAI_API_KEY value 
            in your .env file.
            """
        )

    self.client = OpenAI(base_url=cfg.api_base, api_key=cfg.api_key)
    self.tokenizer = tiktoken.encoding_for_model(cfg.model_name)

truncate_texts(texts)

Truncate texts to the embedding model's context length. TODO: Maybe we should show a warning, and consider doing T5 summarization?

Source code in langroid/embedding_models/models.py
def truncate_texts(self, texts: List[str]) -> List[List[int]]:
    """
    Tokenize each text and cap it at the embedding model's context length.

    Args:
        texts: Texts to tokenize.

    Returns:
        One token list per input text, in input order, each holding at most
        `self.config.context_length` tokens.

    TODO: Maybe we should show warning, and consider doing T5 summarization?
    """
    truncated: List[List[int]] = []
    for text in texts:
        # NOTE(review): with a tiktoken tokenizer, an empty
        # `disallowed_special` presumably lets special-token text be encoded
        # as ordinary text instead of raising; confirm against tiktoken docs.
        token_ids = self.tokenizer.encode(text, disallowed_special=())
        truncated.append(token_ids[: self.config.context_length])
    return truncated

embedding_model(embedding_fn_type='openai')

Parameters:

Name Type Description Default
embedding_fn_type str

Type of embedding model to use. Options are: "openai", "azure-openai", "fastembed", "llamacppserver", or "sentencetransformer" (others may be added in the future). Any unrecognized value falls back to the sentence-transformer model.

'openai'

Returns: EmbeddingModel: The corresponding embedding model class.

Source code in langroid/embedding_models/models.py
def embedding_model(embedding_fn_type: str = "openai") -> EmbeddingModel:
    """
    Look up the embedding model class for a given type name.

    Args:
        embedding_fn_type: Name of the embedding model type. Recognized names:
         - "openai",
         - "azure-openai",
         - "fastembed",
         - "llamacppserver".
            Any other value (e.g. "sentencetransformer") selects the
            sentence-transformer model.
    Returns:
        EmbeddingModel: The corresponding embedding model class (the class
            itself is returned, not an instance).
    """
    # Guard clauses: each class name is only resolved on the branch taken.
    if embedding_fn_type == "openai":
        return OpenAIEmbeddings  # type: ignore
    if embedding_fn_type == "azure-openai":
        return AzureOpenAIEmbeddings  # type: ignore
    if embedding_fn_type == "fastembed":
        return FastEmbedEmbeddings  # type: ignore
    if embedding_fn_type == "llamacppserver":
        return LlamaCppServerEmbeddings  # type: ignore
    # Anything unrecognized falls through to the sentence-transformer model.
    return SentenceTransformerEmbeddings  # type: ignore