# Source code for src.utils.data_loader

"""Data loading utilities for ForzaEmbed.

This module provides functions for loading markdown content from various
sources including directories and lists of strings. It handles file I/O
and content extraction.

Example:
    Load markdown files from a directory::

        from src.utils.data_loader import load_markdown_files

        files = load_markdown_files("markdowns/")
        for name, content in files:
            print(f"Loaded: {name}")
"""

import logging
from pathlib import Path
from typing import List, Tuple, Union



# [docs]
def load_markdown_files(
    data_source: Union[str, Path, List[str]],
) -> List[Tuple[str, str]]:
    """Load markdown content from a directory or from in-memory strings.

    Supports loading from:
    1. A directory path (str or Path): every regular ``*.md`` file directly
       inside the directory is read as UTF-8. Sub-directories are not
       searched.
    2. A list of strings where each string is markdown content.

    Args:
        data_source: The source of markdown data. Can be a directory path
            or a list of markdown content strings.

    Returns:
        List of tuples containing (name, content) pairs. For a directory,
        ``name`` is the file name without its extension and the entries are
        sorted by path so the result is reproducible. For a list of strings,
        ``name`` is ``"Text Content N"`` with ``N`` counting from 1 in input
        order. If a path is given that is not an existing directory, an
        error is logged and an empty list is returned.

    Raises:
        TypeError: If data_source is not a supported type (including a list
            that contains non-string items).
    """
    if isinstance(data_source, (str, Path)):
        directory = Path(data_source)
        if not directory.is_dir():
            # Best-effort contract: report the problem and return nothing
            # instead of raising, so callers can carry on with an empty set.
            logging.error("Data source is not a valid directory: %s", directory)
            return []
        # glob() yields entries in filesystem order, which differs between
        # platforms and runs; sorting keeps the output deterministic.
        # is_file() skips directories that happen to be named "*.md", which
        # would otherwise fail when opened for reading.
        return [
            # Use the filename stem as a generic name
            (file_path.stem, file_path.read_text(encoding="utf-8"))
            for file_path in sorted(directory.glob("*.md"))
            if file_path.is_file()
        ]

    if isinstance(data_source, list) and all(
        isinstance(item, str) for item in data_source
    ):
        return [
            (f"Text Content {position}", content)
            for position, content in enumerate(data_source, start=1)
        ]

    raise TypeError(
        "Unsupported data_source type. "
        "Please provide a directory path or a list of markdown strings."
    )