Spaces:
Sleeping
Sleeping
"""Data loading module for HuggingFace datasets.""" | |
from datasets import load_dataset | |
from functools import cache | |
from typing import Any | |
import logging | |
# Module-level logger, named after this module; the root logger is
# configured at INFO level as an import-time side effect.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@cache
def load_sample_dataset() -> Any:
    """Load the essential-web sample dataset from HuggingFace.

    The result is memoized with ``functools.cache``: the first successful
    call performs the load, and every later call in the same process returns
    the same object instead of invoking ``load_dataset`` again. A failed load
    is not cached (the exception propagates), so the next call retries.

    Returns:
        Whatever ``load_dataset`` returns for this dataset id. It is indexed
        with ``'train'`` below, so a split-keyed container is expected.

    Raises:
        Exception: Any error raised during loading is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Loading dataset from HuggingFace...")
        dataset = load_dataset("sumuks/essential-web-v1.0-sample-1M-with-cleaned-text")
        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.info(
            "Dataset loaded successfully with %d samples", len(dataset['train'])
        )
        return dataset
    except Exception as e:
        logger.error("Failed to load dataset: %s", e)
        raise
def get_dataset_size() -> int:
    """Return the number of entries in the dataset's 'train' split."""
    train_split = load_sample_dataset()['train']
    return len(train_split)
def get_sample(index: int) -> tuple[str, str]:
    """Return the ``(text, cleaned_text)`` pair of one 'train' entry.

    Args:
        index: Position of the entry within the 'train' split.

    Returns:
        A 2-tuple holding the entry's ``'text'`` field followed by its
        ``'cleaned_text'`` field.
    """
    record = load_sample_dataset()['train'][index]
    original_text = record['text']
    cleaned_text = record['cleaned_text']
    return original_text, cleaned_text