# Source code for src.utils.models

"""SQLAlchemy ORM models for the ForzaEmbed database.

This module defines all database models used for storing embedding results,
metrics, and metadata. Uses SQLAlchemy 2.0 declarative mapping with type
annotations.

Models:
    Model: Stores model run configurations.
    ProcessingResult: Stores detailed processing results per file.
    EmbeddingCache: Caches computed embeddings for reuse.
    ProjectionCoordinate: Caches dimensional reduction coordinates.
"""

from datetime import datetime, timezone
from typing import Optional

from sqlalchemy import (
    BLOB,
    Column,
    DateTime,
    Float,
    ForeignKey,
    Integer,
    String,
    UniqueConstraint,
)
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


class Base(DeclarativeBase):
    """Declarative root class that every ORM model in this module inherits from."""
class Model(Base):
    """Stores model run configuration and metadata.

    Maps to the ``models`` table. Every column except ``similarity_metric``
    is declared NOT NULL.

    Attributes:
        id: Primary key (auto-incrementing integer).
        name: Unique run name identifier.
        base_model_name: The underlying model name.
        type: Model type (api, fastembed, sentence_transformers, etc.).
        chunk_size: Chunk size used in this run.
        chunk_overlap: Chunk overlap used in this run.
        theme_name: Theme set name used.
        chunking_strategy: Chunking strategy used.
        similarity_metric: Similarity metric used; may be NULL.
        created_at: Timestamp of creation. Filled with the current UTC time
            as a naive ``datetime`` when no value is supplied.
    """

    __tablename__ = "models"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    name: Mapped[str] = mapped_column(String, unique=True, nullable=False)
    base_model_name: Mapped[str] = mapped_column(String, nullable=False)
    type: Mapped[str] = mapped_column(String, nullable=False)
    chunk_size: Mapped[int] = mapped_column(Integer, nullable=False)
    chunk_overlap: Mapped[int] = mapped_column(Integer, nullable=False)
    theme_name: Mapped[str] = mapped_column(String, nullable=False)
    chunking_strategy: Mapped[str] = mapped_column(String, nullable=False)
    similarity_metric: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    # datetime.utcnow() is deprecated since Python 3.12. Build the same
    # naive-UTC value from the timezone-aware API so stored timestamps keep
    # their existing (tz-less) representation.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
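# Relationships: none are declared in this module. ProcessingResult.model_name
# is a plain string column (no ForeignKey to models.name).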
class ProcessingResult(Base):
    """Stores detailed processing results for each file.

    Maps to the ``processing_results`` table. The ``uq_model_file`` unique
    constraint allows at most one row per ``(model_name, file_id)`` pair.

    Attributes:
        id: Primary key (auto-incrementing integer).
        model_name: The model run name.
        file_id: Identifier for the processed file.
        results_blob: Serialized results data, stored as a BLOB.
        created_at: Timestamp of creation. Filled with the current UTC time
            as a naive ``datetime`` when no value is supplied.
    """

    __tablename__ = "processing_results"
    __table_args__ = (
        UniqueConstraint("model_name", "file_id", name="uq_model_file"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    model_name: Mapped[str] = mapped_column(String, nullable=False)
    file_id: Mapped[str] = mapped_column(String, nullable=False)
    results_blob: Mapped[bytes] = mapped_column(BLOB, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Build the same
    # naive-UTC value from the timezone-aware API so stored timestamps keep
    # their existing (tz-less) representation.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
class EmbeddingCache(Base):
    """Caches computed embeddings for reuse.

    Maps to the ``embedding_cache`` table, keyed by the composite primary
    key ``(model_name, text_hash)``.

    Attributes:
        model_name: The model name (part of composite primary key).
        text_hash: Hash of the embedded text (part of composite primary key).
        vector: Serialized embedding vector, stored as a BLOB.
        dimension: Dimension of the embedding vector.
        created_at: Timestamp of creation. Filled with the current UTC time
            as a naive ``datetime`` when no value is supplied.
    """

    __tablename__ = "embedding_cache"

    # Composite primary key as defined in original schema
    model_name: Mapped[str] = mapped_column(String, primary_key=True)
    text_hash: Mapped[str] = mapped_column(String, primary_key=True)
    vector: Mapped[bytes] = mapped_column(BLOB, nullable=False)
    dimension: Mapped[int] = mapped_column(Integer, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Build the same
    # naive-UTC value from the timezone-aware API so stored timestamps keep
    # their existing (tz-less) representation.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
class ProjectionCoordinate(Base):
    """Caches dimensional reduction coordinate calculations (t-SNE, UMAP, PCA).

    Maps to the ``projection_coordinates`` table. The ``uq_projection_file``
    unique constraint allows at most one row per
    ``(projection_key, file_id)`` pair.

    Attributes:
        id: Primary key (auto-incrementing integer).
        projection_key: Unique key for the projection configuration.
        file_id: Identifier for the file.
        coordinates: Serialized coordinate data, stored as a BLOB.
        created_at: Timestamp of creation. Filled with the current UTC time
            as a naive ``datetime`` when no value is supplied.
    """

    __tablename__ = "projection_coordinates"
    __table_args__ = (
        UniqueConstraint("projection_key", "file_id", name="uq_projection_file"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    projection_key: Mapped[str] = mapped_column(String, nullable=False)
    file_id: Mapped[str] = mapped_column(String, nullable=False)
    coordinates: Mapped[bytes] = mapped_column(BLOB, nullable=False)
    # datetime.utcnow() is deprecated since Python 3.12. Build the same
    # naive-UTC value from the timezone-aware API so stored timestamps keep
    # their existing (tz-less) representation.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )