{
  "base_tokenizer": "HuggingFaceTB/SmolLM2-135M",
  "data_splits": [
    {
      "dataset_url": "thiomajid/english_hausa_mix",
      "text_column": "text",
      "split": "train",
      "samples": "all",
      "subset": null
    },
    {
      "dataset_url": "thiomajid/english_hausa_mix",
      "text_column": "text",
      "split": "test",
      "samples": "all",
      "subset": null
    }
  ],
  "text_column": "text",
  "base_output_dir": "./base_tokenizer",
  "vocab_size": 64000,
  "output_dir": "./merged_tokenizer",
  "batch_size": 1000,
  "hub_id": "thiomajid/hausa_english_blend"
}