"""Train a byte-level BPE tokenizer on several concatenated text corpora.

The script loads the ``train`` split of OSCAR, CC100 and mC4 for
``dataset_language``, adds local ``*.txt`` files from ``../docs``, holds out
``validation_split_percentage`` percent of the rows, trains a
``ByteLevelBPETokenizer`` on the remaining rows and writes the result to
``tokenizer.json`` in ``model_dir``.
"""
from pathlib import Path

from datasets import concatenate_datasets, load_dataset
from tokenizers import ByteLevelBPETokenizer

dataset_language = "su"
validation_split_percentage = 10

# Load datasets: only the train subset is needed for tokenizer training.
oscar = load_dataset(
    "oscar",
    f"unshuffled_deduplicated_{dataset_language}",
    split="train",
)
cc100 = load_dataset("cc100", lang=dataset_language, split="train")
mc4 = load_dataset("mc4", dataset_language, split="train")
wiki_files = [str(x) for x in Path("../docs").glob("*.txt")]
wiki = load_dataset("text", data_files=wiki_files)

# Keep only the "text" column so every corpus has the same schema before
# they are concatenated.
oscar = oscar.remove_columns("id")
mc4 = mc4.remove_columns(["url", "timestamp"])
cc100 = cc100.remove_columns("id")

dataset = concatenate_datasets([oscar, mc4, cc100, wiki["train"]])
# NOTE: from here on ``dataset`` is a mapping of splits ("train" / "test"),
# not a single dataset.
dataset = dataset.train_test_split(
    test_size=validation_split_percentage / 100, seed=42
)

# Instantiate tokenizer
tokenizer = ByteLevelBPETokenizer()


def batch_iterator(batch_size=10000):
    """Yield the training texts in consecutive lists of ``batch_size`` rows.

    Args:
        batch_size: Maximum number of rows per yielded batch; the final
            batch may be shorter.

    Yields:
        The ``"text"`` values of each slice of ``dataset["train"]``.
    """
    train_split = dataset["train"]
    # Iterate over the number of training rows. ``len(dataset)`` would be the
    # number of splits (2) after ``train_test_split``, which stops the loop
    # after the very first batch.
    for start in range(0, len(train_split), batch_size):
        yield train_split[start : start + batch_size]["text"]


# Customized training
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    # NOTE(review): the original list held five empty strings, which looks
    # like angle-bracket tokens lost in extraction. Restored to the RoBERTa
    # special-token set that matches vocab_size=50265 -- confirm against the
    # model config this tokenizer is meant for.
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk
model_dir = "."
tokenizer.save(f"{model_dir}/tokenizer.json")