view_essentialweb_cleaned / data_loader.py
sumuks's picture
sumuks HF Staff
Create data_loader.py
3a6b206 verified
"""Data loading module for HuggingFace datasets."""
from datasets import load_dataset
from functools import cache
from typing import Any
import logging
# Set up logging: this runs at import time. basicConfig requests INFO-level
# output on the root logger, and `logger` is the module-specific logger used
# by the functions in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@cache
def load_sample_dataset() -> Any:
    """Fetch the essential-web sample dataset from HuggingFace.

    The function is wrapped in ``functools.cache``, so after one successful
    call the same object is handed back on every later call.

    Returns:
        The object produced by ``load_dataset`` for the
        ``sumuks/essential-web-v1.0-sample-1M-with-cleaned-text`` repository.

    Raises:
        Exception: Any error raised while loading the dataset or while
            measuring its ``'train'`` split is logged and then re-raised
            unchanged.
    """
    try:
        logger.info("Loading dataset from HuggingFace...")
        loaded = load_dataset("sumuks/essential-web-v1.0-sample-1M-with-cleaned-text")
        # Measured inside the try so a missing 'train' split is reported
        # through the same error path as a failed download.
        train_count = len(loaded['train'])
        logger.info(f"Dataset loaded successfully with {train_count} samples")
    except Exception as err:
        logger.error(f"Failed to load dataset: {err}")
        raise
    else:
        return loaded
def get_dataset_size() -> int:
    """Return the number of rows in the ``'train'`` split of the sample dataset."""
    train_split = load_sample_dataset()['train']
    return len(train_split)
def get_sample(index: int) -> tuple[str, str]:
    """Look up one row of the ``'train'`` split and return its two text fields.

    Args:
        index: Position of the row within the ``'train'`` split.

    Returns:
        A ``(text, cleaned_text)`` pair read from the row's ``'text'`` and
        ``'cleaned_text'`` fields.
    """
    row = load_sample_dataset()['train'][index]
    original_text = row['text']
    cleaned_text = row['cleaned_text']
    return original_text, cleaned_text