Source code for src.clients.huggingface_client

"""Hugging Face embedding client using transformers library.

This module provides functions for generating embeddings using generic
Hugging Face models with mean pooling and normalization.

Example:
    Generate embeddings using a Hugging Face model::

        from src.clients.huggingface_client import get_huggingface_embeddings

        embeddings = get_huggingface_embeddings(
            texts=["Hello world"],
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
"""

from typing import List

import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


def mean_pooling(
    model_output: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Average token embeddings into a single sentence embedding.

    Only positions marked valid by ``attention_mask`` contribute to the
    average; padded positions are zeroed out before summing.

    Args:
        model_output: Model output whose first element holds the token
            embeddings (batch, seq_len, hidden).
        attention_mask: Attention mask for the input tokens.

    Returns:
        Mean-pooled sentence embeddings tensor.
    """
    # The first element of the model output carries all token embeddings.
    token_embeddings = model_output[0]
    # Broadcast the mask across the hidden dimension so padding tokens
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp guards against division by zero for all-padding rows.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
[docs] def get_huggingface_embeddings( texts: List[str], model_name: str, expected_dimension: int | None = None ) -> List[List[float]]: """Generate embeddings using a generic Hugging Face model. Loads the model and tokenizer, processes texts, and applies mean pooling with L2 normalization. Args: texts: List of texts to embed. model_name: Name of the Hugging Face model to use. expected_dimension: Expected embedding dimension for validation. Returns: List of normalized embedding vectors as lists of floats. Raises: ValueError: If embedding dimension doesn't match expected. """ try: tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModel.from_pretrained(model_name) # For instruction-tuned models, add a prefix if "instruct" in model_name: texts = [f"passage: {text}" for text in texts] encoded_input = tokenizer( texts, padding=True, truncation=True, return_tensors="pt" ) with torch.no_grad(): model_output = model(**encoded_input) sentence_embeddings = mean_pooling( model_output, encoded_input["attention_mask"] ) # Normalize embeddings normalized_embeddings = torch.nn.functional.normalize( sentence_embeddings, p=2, dim=1 ) if expected_dimension and normalized_embeddings.shape[0] > 0: actual_dimension = normalized_embeddings.shape[1] if actual_dimension != expected_dimension: raise ValueError( f"Expected dimension {expected_dimension}, but got {actual_dimension} for model {model_name}" ) return normalized_embeddings.tolist() except Exception as e: tqdm.write(f"❌ Error getting Hugging Face embeddings for {model_name}: {e}") return []