"""Data loading module for HuggingFace datasets.""" from datasets import load_dataset from functools import cache from typing import Any import logging # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @cache def load_sample_dataset() -> Any: """Load the essential-web dataset sample.""" try: logger.info("Loading dataset from HuggingFace...") dataset = load_dataset("sumuks/essential-web-v1.0-sample-1M-with-cleaned-text") logger.info(f"Dataset loaded successfully with {len(dataset['train'])} samples") return dataset except Exception as e: logger.error(f"Failed to load dataset: {e}") raise def get_dataset_size() -> int: """Get total number of samples in the dataset.""" dataset = load_sample_dataset() return len(dataset['train']) def get_sample(index: int) -> tuple[str, str]: """Get original and cleaned text for a specific sample.""" dataset = load_sample_dataset() sample = dataset['train'][index] return sample['text'], sample['cleaned_text']