w11wo committed on
Commit
cd9ad5b
1 Parent(s): 4296aa1

oscar tokenizers

Browse files
Files changed (2) hide show
  1. tokenizer.json +0 -0
  2. train_tokenizer.py +1 -1
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
train_tokenizer.py CHANGED
@@ -2,7 +2,7 @@ from datasets import load_dataset
2
  from tokenizers import ByteLevelBPETokenizer
3
 
4
  # load dataset
5
- dataset = load_dataset("mc4", "id", split="train")
6
 
7
  # Instantiate tokenizer
8
  tokenizer = ByteLevelBPETokenizer()
 
2
  from tokenizers import ByteLevelBPETokenizer
3
 
4
  # load dataset
5
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_id", split="train")
6
 
7
  # Instantiate tokenizer
8
  tokenizer = ByteLevelBPETokenizer()