Source code for src.clients.huggingface_client

"""Hugging Face embedding client using transformers library.

This module provides functions for generating embeddings using generic
Hugging Face models with mean pooling and normalization.

Example:
    Generate embeddings using a Hugging Face model::

        from src.clients.huggingface_client import get_huggingface_embeddings

        embeddings = get_huggingface_embeddings(
            texts=["Hello world"],
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
"""

from typing import List

import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


def mean_pooling(
    model_output: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Average token embeddings into a single sentence embedding.

    Only positions marked valid by ``attention_mask`` contribute to the
    average; padded positions are zeroed out before summing.

    Args:
        model_output: Model output whose first element holds the token
            embeddings (batch, seq_len, hidden).
        attention_mask: Attention mask for the input tokens.

    Returns:
        Mean-pooled sentence embeddings tensor.
    """
    # The first element of the model output carries all token embeddings.
    token_embeddings = model_output[0]
    # Broadcast the mask across the hidden dimension so padding tokens
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp guards against division by zero for all-padding rows.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
[docs] def get_huggingface_embeddings( texts: List[str], model_name: str, expected_dimension: int | None = None ) -> List[List[float]]: """Generate embeddings using a generic Hugging Face model. Loads the model and tokenizer, processes texts, and applies mean pooling with L2 normalization. Args: texts: List of texts to embed. model_name: Name of the Hugging Face model to use. expected_dimension: Expected embedding dimension for validation. Returns: List of normalized embedding vectors as lists of floats. Raises: ValueError: If embedding dimension doesn't match expected. """ try: tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModel.from_pretrained(model_name) # For instruction-tuned models, add a prefix if "instruct" in model_name: texts = [f"passage: {text}" for text in texts] encoded_input = tokenizer( texts, padding=True, truncation=True, return_tensors="pt" ) with torch.no_grad(): model_output = model(**encoded_input) sentence_embeddings = mean_pooling( model_output, encoded_input["attention_mask"] ) # Normalize embeddings normalized_embeddings = torch.nn.functional.normalize( sentence_embeddings, p=2, dim=1 ) if expected_dimension and normalized_embeddings.shape[0] > 0: actual_dimension = normalized_embeddings.shape[1] if actual_dimension != expected_dimension: raise ValueError( f"Expected dimension {expected_dimension}, but got {actual_dimension} for model {model_name}" ) return normalized_embeddings.tolist() except Exception as e: tqdm.write(f"❌ Error getting Hugging Face embeddings for {model_name}: {e}") return []