# Source code for src.utils.models

"""SQLAlchemy ORM models for the ForzaEmbed database.

This module defines all database models used for storing embedding results,
metrics, and metadata. Uses SQLAlchemy 2.0 declarative mapping with type
annotations.

Models:
    Model: Stores model run configurations.
    EvaluationMetric: Stores evaluation metrics for each model run.
    GeneratedFile: Tracks generated output files.
    GlobalChart: Stores paths to global chart images.
    ProcessingResult: Stores detailed processing results per file.
    EmbeddingCache: Caches computed embeddings for reuse.
    TSNECoordinate: Caches t-SNE coordinate calculations.
"""

from datetime import datetime, timezone
from typing import Optional

from sqlalchemy import (
    BLOB,
    Column,
    DateTime,
    Float,
    ForeignKey,
    Integer,
    String,
    UniqueConstraint,
)
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


class Base(DeclarativeBase):
    """Declarative base shared by every ORM model in this module.

    All model classes below inherit from this class so that they are
    registered on the same SQLAlchemy metadata.
    """
class Model(Base):
    """Stores model run configuration and metadata.

    Each row describes one run, identified by its unique ``name``. The
    ``evaluation_metrics`` and ``generated_files`` tables reference
    ``models.name`` as their foreign key.

    Attributes:
        id: Primary key.
        name: Unique run name identifier.
        base_model_name: The underlying model name.
        type: Model type (api, fastembed, sentence_transformers, etc.).
        chunk_size: Chunk size used in this run.
        chunk_overlap: Chunk overlap used in this run.
        theme_name: Theme set name used.
        chunking_strategy: Chunking strategy used.
        similarity_metric: Similarity metric used (nullable).
        created_at: Naive UTC timestamp set when the row is inserted.
        metrics: Related evaluation metrics.
        generated_files: Related generated files.
    """

    __tablename__ = "models"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    name: Mapped[str] = mapped_column(String, unique=True, nullable=False)
    base_model_name: Mapped[str] = mapped_column(String, nullable=False)
    type: Mapped[str] = mapped_column(String, nullable=False)
    chunk_size: Mapped[int] = mapped_column(Integer, nullable=False)
    chunk_overlap: Mapped[int] = mapped_column(Integer, nullable=False)
    theme_name: Mapped[str] = mapped_column(String, nullable=False)
    chunking_strategy: Mapped[str] = mapped_column(String, nullable=False)
    similarity_metric: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime,
    # exactly what utcnow() used to produce.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )

    # Relationships
    # NOTE(review): the scalar (non-list) annotation makes SQLAlchemy treat
    # ``metrics`` as a single object, while evaluation_metrics.model_name
    # carries no unique constraint -- confirm one metric row per run is
    # the intended contract.
    metrics: Mapped["EvaluationMetric"] = relationship(
        "EvaluationMetric", back_populates="model", cascade="all, delete-orphan"
    )
    generated_files: Mapped[list["GeneratedFile"]] = relationship(
        "GeneratedFile", back_populates="model", cascade="all, delete-orphan"
    )
class EvaluationMetric(Base):
    """Stores evaluation metrics for a model run.

    Attributes:
        id: Primary key.
        model_name: Foreign key to ``models.name``.
        silhouette_score: Silhouette clustering score (nullable).
        intra_cluster_distance_normalized: Normalized intra-cluster distance
            (nullable).
        inter_cluster_distance_normalized: Normalized inter-cluster distance
            (nullable).
        embedding_computation_time: Time taken to compute embeddings
            (nullable).
        created_at: Naive UTC timestamp set when the row is inserted.
        model: Related model instance.
    """

    __tablename__ = "evaluation_metrics"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    model_name: Mapped[str] = mapped_column(ForeignKey("models.name"), nullable=False)
    silhouette_score: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    intra_cluster_distance_normalized: Mapped[Optional[float]] = mapped_column(
        Float, nullable=True
    )
    inter_cluster_distance_normalized: Mapped[Optional[float]] = mapped_column(
        Float, nullable=True
    )
    embedding_computation_time: Mapped[Optional[float]] = mapped_column(
        Float, nullable=True
    )
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )

    model: Mapped["Model"] = relationship("Model", back_populates="metrics")
class GeneratedFile(Base):
    """Tracks generated output files for a model run.

    Attributes:
        id: Primary key.
        model_name: Foreign key to ``models.name``.
        file_type: Type of the generated file.
        file_path: Path to the generated file.
        created_at: Naive UTC timestamp set when the row is inserted.
        model: Related model instance.
    """

    __tablename__ = "generated_files"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    model_name: Mapped[str] = mapped_column(ForeignKey("models.name"), nullable=False)
    file_type: Mapped[str] = mapped_column(String, nullable=False)
    file_path: Mapped[str] = mapped_column(String, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )

    model: Mapped["Model"] = relationship("Model", back_populates="generated_files")
class GlobalChart(Base):
    """Stores paths to global chart images.

    Attributes:
        id: Primary key.
        chart_type: Type identifier for the chart.
        file_path: Path to the chart image file.
        created_at: Naive UTC timestamp set when the row is inserted.
    """

    __tablename__ = "global_charts"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    chart_type: Mapped[str] = mapped_column(String, nullable=False)
    file_path: Mapped[str] = mapped_column(String, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
class ProcessingResult(Base):
    """Stores detailed processing results for each file.

    A unique constraint (``uq_model_file``) allows at most one row per
    ``(model_name, file_id)`` pair.

    Attributes:
        id: Primary key.
        model_name: The model run name (plain string column, no foreign key).
        file_id: Identifier for the processed file.
        results_blob: Serialized results data.
        created_at: Naive UTC timestamp set when the row is inserted.
    """

    __tablename__ = "processing_results"
    __table_args__ = (UniqueConstraint("model_name", "file_id", name="uq_model_file"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    model_name: Mapped[str] = mapped_column(String, nullable=False)
    file_id: Mapped[str] = mapped_column(String, nullable=False)
    results_blob: Mapped[bytes] = mapped_column(BLOB, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
class EmbeddingCache(Base):
    """Caches computed embeddings for reuse.

    Rows are keyed by the composite primary key ``(model_name, text_hash)``.

    Attributes:
        model_name: The model name (part of composite primary key).
        text_hash: Hash of the embedded text (part of composite primary key).
        vector: Serialized embedding vector.
        dimension: Dimension of the embedding vector.
        created_at: Naive UTC timestamp set when the row is inserted.
    """

    __tablename__ = "embedding_cache"

    # Composite primary key as defined in original schema
    model_name: Mapped[str] = mapped_column(String, primary_key=True)
    text_hash: Mapped[str] = mapped_column(String, primary_key=True)
    vector: Mapped[bytes] = mapped_column(BLOB, nullable=False)
    dimension: Mapped[int] = mapped_column(Integer, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
class TSNECoordinate(Base):
    """Caches t-SNE coordinate calculations.

    A unique constraint (``uq_tsne_file``) allows at most one row per
    ``(tsne_key, file_id)`` pair.

    Attributes:
        id: Primary key.
        tsne_key: Key for the t-SNE configuration.
        file_id: Identifier for the file.
        coordinates: Serialized coordinate data.
        created_at: Naive UTC timestamp set when the row is inserted.
    """

    __tablename__ = "tsne_coordinates"
    __table_args__ = (UniqueConstraint("tsne_key", "file_id", name="uq_tsne_file"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    tsne_key: Mapped[str] = mapped_column(String, nullable=False)
    file_id: Mapped[str] = mapped_column(String, nullable=False)
    coordinates: Mapped[bytes] = mapped_column(BLOB, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Take the aware
    # "now" in UTC and drop tzinfo so the value stays a naive UTC datetime.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )