Skip to content

langroid

embedding_models

langroid/langroid

embedding_models

langroid/embedding_models/\_\_init\_\_.py

`EmbeddingModel` ¶

Bases: ABC

Abstract base class for an embedding model.

`clone()` ¶

Return a copy of this embedding model suitable for use in cloned agents. Default behaviour attempts to deep-copy the model configuration and instantiate a fresh model of the same type; if that is not possible, the original instance is reused.

Source code in langroid/embedding_models/base.py

def clone(self) -> "EmbeddingModel":
    """
    Produce an embedding model that a cloned agent can use.

    When this instance carries a ``config`` that exposes ``model_copy``, a
    new model of the same concrete type is built from a deep copy of that
    config. If there is no such config, or building the new model raises,
    this very instance is returned instead.
    """
    cfg = getattr(self, "config", None)
    if cfg is None or not hasattr(cfg, "model_copy"):
        return self
    try:
        duplicate = type(self)(cfg.model_copy(deep=True))  # type: ignore[call-arg]
    except Exception:
        # Best-effort: if copying or re-instantiating fails, share the original.
        return self
    return duplicate

`similarity(text1, text2)` ¶

Compute cosine similarity between two texts.

Source code in langroid/embedding_models/base.py

def similarity(self, text1: str, text2: str) -> float:
    """
    Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded with a single call to the function returned by
    ``self.embedding_fn()``. The result is the dot product of the two
    vectors divided by the product of their norms.
    """
    embed = self.embedding_fn()
    emb1, emb2 = embed([text1, text2])
    dot_product = np.array(emb1) @ np.array(emb2)
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return float(dot_product / norm_product)

`OpenAIEmbeddings(config=OpenAIEmbeddingsConfig())` ¶

Bases: EmbeddingModel

Source code in langroid/embedding_models/models.py

def __init__(self, config: OpenAIEmbeddingsConfig = OpenAIEmbeddingsConfig()):
    """
    Set up an OpenAI-compatible embeddings client and its tokenizer.

    The passed ``config`` is stored on the instance and updated in place:
    ``model_name``, ``api_base``, ``api_key`` and ``organization`` may all
    be rewritten below.

    Args:
        config: embedding settings. A ``model_name`` starting with
            ``"langdb/"`` routes requests through LangDB using
            ``config.langdb_params``.

    Raises:
        ValueError: if no API key is available after consulting the config
            and the ``OPENAI_API_KEY`` environment variable.
    """
    super().__init__()
    self.config = config
    load_dotenv()
    cfg = self.config

    # A "langdb/" prefix on the model name selects the LangDB route.
    self.is_langdb = cfg.model_name.startswith("langdb/")

    if self.is_langdb:
        cfg.model_name = cfg.model_name.replace("langdb/", "")
        langdb = cfg.langdb_params
        cfg.api_base = langdb.base_url
        project_id = langdb.project_id
        if project_id:
            cfg.api_base = cfg.api_base + "/" + project_id + "/v1"
        cfg.api_key = langdb.api_key

    # NOTE(review): this fallback also applies on the LangDB path when
    # langdb_params.api_key is empty -- confirm that is intended.
    if not cfg.api_key:
        cfg.api_key = os.getenv("OPENAI_API_KEY", "")

    # NOTE(review): the organization is always taken from the environment,
    # replacing whatever the config held -- confirm that is intended.
    cfg.organization = os.getenv("OPENAI_ORGANIZATION", "")

    if cfg.api_key == "":
        if self.is_langdb:
            raise ValueError(
                """
                LANGDB_API_KEY must be set in .env or your environment 
                to use OpenAIEmbeddings via LangDB.
                """
            )
        else:
            raise ValueError(
                """
                OPENAI_API_KEY must be set in .env or your environment 
                to use OpenAIEmbeddings.
                """
            )

    self.client = OpenAI(
        base_url=cfg.api_base,
        api_key=cfg.api_key,
        organization=cfg.organization,
    )
    # Strip "openai/" from the model name before looking up the tokenizer.
    if cfg.model_name.startswith("openai/"):
        cfg.model_name = cfg.model_name.replace("openai/", "")
    self.tokenizer = tiktoken.encoding_for_model(cfg.model_name)

`truncate_texts(texts)` ¶

Truncate texts to the embedding model's context length. TODO: Maybe we should show a warning, and consider doing T5 summarization?

Source code in langroid/embedding_models/models.py

def truncate_texts(self, texts: List[str]) -> List[str] | List[List[int]]:
    """
    Truncate texts to the embedding model's context length.
    TODO: Maybe we should show warning, and consider doing T5 summarization?
    """
    truncated_tokens = [
        self.tokenizer.encode(text, disallowed_special=())[
            : self.config.context_length
        ]
        for text in texts
    ]

    if self.is_langdb:
        # LangDB embedding endpt only works with strings, not tokens
        return [self.tokenizer.decode(tokens) for tokens in truncated_tokens]
    return truncated_tokens

`LlamaCppServerEmbeddings(config=LCSEC())` ¶

Bases: EmbeddingModel

Source code in langroid/embedding_models/models.py

def __init__(self, config: LCSEC = LCSEC()):
    """
    Store the config and derive the server endpoint URLs from its base URL.

    Args:
        config: settings whose ``api_base`` is the server's base URL.

    Raises:
        ValueError: if ``config.api_base`` is the empty string.
    """
    super().__init__()
    self.config = config

    base_url = self.config.api_base
    if base_url == "":
        raise ValueError(
            """Api Base MUST be set for Llama Server Embeddings.
            """
        )

    # Each endpoint is the base URL with a fixed path appended.
    self.tokenize_url = base_url + "/tokenize"
    self.detokenize_url = base_url + "/detokenize"
    self.embedding_url = base_url + "/embeddings"

`GeminiEmbeddings(config=GeminiEmbeddingsConfig())` ¶

Bases: EmbeddingModel

Source code in langroid/embedding_models/models.py

def __init__(self, config: GeminiEmbeddingsConfig = GeminiEmbeddingsConfig()):
    """
    Create a Gemini embeddings client.

    Args:
        config: embedding settings. ``config.api_key`` is used when it is
            set; otherwise it is filled in from the ``GEMINI_API_KEY``
            environment variable.

    Raises:
        LangroidImportError: if ``from google import genai`` fails.
        ValueError: if no API key is found in the config or the environment.
    """
    try:
        from google import genai
    except ImportError as e:
        # Chain the original error so the underlying import failure is kept.
        raise LangroidImportError(extra="google-genai", error=str(e)) from e
    super().__init__()
    self.config = config
    load_dotenv()
    # Only fall back to the environment when the config carries no key, so
    # an explicitly configured key is not overwritten.
    if not getattr(self.config, "api_key", ""):
        self.config.api_key = os.getenv("GEMINI_API_KEY", "")

    if self.config.api_key == "":
        raise ValueError(
            """
            GEMINI_API_KEY env variable must be set to use GeminiEmbeddings.
            """
        )
    self.client = genai.Client(api_key=self.config.api_key)

`generate_embeddings(texts)` ¶

Generates embeddings for a list of input texts.

Source code in langroid/embedding_models/models.py

def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
    """
    Embed ``texts`` in batches and return one vector per input text.

    The texts are split into batches of ``self.config.batch_size`` and each
    batch is sent to ``self.client.models.embed_content``. The ``values``
    of every returned embedding are collected in order.

    Raises:
        ValueError: if a response has no ``embeddings`` attribute or that
            attribute is not a list.
    """
    collected: List[List[float]] = []

    for chunk in batched(texts, self.config.batch_size):
        response = self.client.models.embed_content(  # type: ignore[attr-defined]
            model=self.config.model_name,
            contents=chunk,  # type: ignore
        )

        # A missing attribute yields None, which fails the list check too.
        embeddings = getattr(response, "embeddings", None)
        if not isinstance(embeddings, list):
            raise ValueError(
                "Unexpected format for embeddings: missing or incorrect type"
            )

        # Extract .values from ContentEmbedding objects
        for embedding in embeddings:
            collected.append(embedding.values)  # type: ignore

    return collected

`embedding_model(embedding_fn_type='openai')` ¶

Parameters:

Name	Type	Description	Default
`embedding_fn_type`	`str`	Type of embedding model to use. Options are: - "openai", - "azure-openai", - "fastembed", - "llamacppserver", - "gemini", or - "sentencetransformer" (also the fallback for any unrecognized value). (others may be added in the future)	`'openai'`

Returns: EmbeddingModel: The corresponding embedding model class.

Source code in langroid/embedding_models/models.py

def embedding_model(embedding_fn_type: str = "openai") -> EmbeddingModel:
    """
    Look up the embedding model class for a type name.

    Args:
        embedding_fn_type: Type of embedding model to use. Recognized
            values are "openai", "azure-openai", "fastembed",
            "llamacppserver" and "gemini". Any other value (including
            "sentencetransformer") selects the sentence-transformer model.
    Returns:
        EmbeddingModel: The corresponding embedding model class.
    """
    if embedding_fn_type == "openai":
        return OpenAIEmbeddings  # type: ignore
    if embedding_fn_type == "azure-openai":
        return AzureOpenAIEmbeddings  # type: ignore
    if embedding_fn_type == "fastembed":
        return FastEmbedEmbeddings  # type: ignore
    if embedding_fn_type == "llamacppserver":
        return LlamaCppServerEmbeddings  # type: ignore
    if embedding_fn_type == "gemini":
        return GeminiEmbeddings  # type: ignore
    # Anything else falls back to the sentence-transformer model.
    return SentenceTransformerEmbeddings  # type: ignore