# train.py
from datasets import load_dataset
from transformers import AutoTokenizer

# Load FineWeb
dataset = load_dataset("HuggingFaceFW/fineweb", split="train[:1%]")  # start small for testing
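# Note: even "train[:1%]" of the full FineWeb dump is still very large, and percent
# slicing only applies after the whole split has been downloaded and prepared.
# For a quicker smoke test, one of the published subset configs may be easier to
# work with (a sketch; assumes the "sample-10BT" config is available on the Hub):
# dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train[:1%]")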

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token  # Mistral's tokenizer has no pad token; reuse EOS so padding works

# Tokenize
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
tokenized.save_to_disk("tokenized_dataset")
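
# Quick reload check (a minimal sketch; assumes the lines above have run and
# written "tokenized_dataset" into the current working directory):
from datasets import load_from_disk

reloaded = load_from_disk("tokenized_dataset")
print(reloaded)            # row count plus the tokenized columns
print(reloaded[0].keys())  # expect input_ids and attention_mask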