acul3 committed
Commit 9e8706c
1 Parent(s): 1a41b06

keep memory

Files changed (2):
  1. run.sh +2 -3
  2. run_mlm_flax.py +3 -0
run.sh CHANGED
@@ -6,18 +6,17 @@ python3 run_mlm_flax.py \
     --config_name="./configs/base" \
     --tokenizer_name="./" \
     --dataset_name="munggok/KoPI" \
-    --cache_dir="/data/cache" \
     --dataset_config_name="full" \
     --max_seq_length="512" \
     --pad_to_max_length \
     --per_device_train_batch_size="64" \
     --per_device_eval_batch_size="64" \
-    --preprocessing_num_workers="96" \
+    --preprocessing_num_workers="64" \
     --adam_beta1="0.9" \
     --adam_beta2="0.98" \
     --adam_epsilon="1e-6" \
     --learning_rate="8e-5" \
-    --num_train_epochs="15" \
+    --num_train_epochs="20" \
     --weight_decay="0.01" \
     --save_strategy="steps" \
     --save_steps="10000" \
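
The dropped --cache_dir presumably goes hand in hand with the keep_in_memory change below, so preprocessed data no longer needs a dedicated on-disk cache directory. A minimal sketch of what the flag controlled, assuming the script forwards it to datasets.load_dataset as the upstream run_mlm_flax.py example does (running this downloads the full corpus):

# Sketch of what the removed --cache_dir flag controlled. Assumption: the
# script passes it through to datasets.load_dataset, as the upstream
# run_mlm_flax.py example does. Running this downloads the whole corpus.
from datasets import load_dataset

# With the flag, raw Arrow files land under the explicit directory:
raw_datasets = load_dataset("munggok/KoPI", "full", cache_dir="/data/cache")

# Without it, datasets falls back to its default cache location
# (~/.cache/huggingface/datasets, or $HF_DATASETS_CACHE if set).
print(raw_datasets)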
run_mlm_flax.py CHANGED
@@ -648,6 +648,7 @@ def main():
             tokenize_function,
             input_columns=[text_column_name],
             batched=True,
+            keep_in_memory=True,
             num_proc=data_args.preprocessing_num_workers,
             remove_columns=column_names,
             load_from_cache_file=not data_args.overwrite_cache,
@@ -663,6 +664,7 @@ def main():
         tokenized_datasets = datasets.map(
             tokenize_function,
             batched=True,
+            keep_in_memory=True,
             num_proc=data_args.preprocessing_num_workers,
             remove_columns=column_names,
             load_from_cache_file=not data_args.overwrite_cache,
@@ -694,6 +696,7 @@ def main():
         tokenized_datasets = tokenized_datasets.map(
             group_texts,
             batched=True,
+            keep_in_memory=True,
             num_proc=data_args.preprocessing_num_workers,
             load_from_cache_file=not data_args.overwrite_cache,
         )
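
For reference, keep_in_memory=True tells datasets.Dataset.map to hold the mapped result in RAM instead of materializing an Arrow cache file on disk, which matches the commit message. A minimal, self-contained sketch of the pattern (the toy rows and the stand-in tokenize_function are illustrative assumptions, not the script's real tokenizer):

# Minimal sketch of the keep_in_memory pattern this commit applies.
# Assumption: the toy data and this stand-in tokenize_function replace the
# script's real Hugging Face tokenizer call for the sake of a runnable demo.
from datasets import Dataset

raw = Dataset.from_dict({"text": ["a", "bb", "ccc", "dddd"]})

def tokenize_function(examples):
    # Batched map: receives lists, returns columns of the same batch length.
    return {"num_chars": [len(t) for t in examples["text"]]}

tokenized = raw.map(
    tokenize_function,
    batched=True,
    keep_in_memory=True,  # keep the result in RAM; skip writing an Arrow cache file
    num_proc=2,           # parallel workers, like preprocessing_num_workers
    remove_columns=["text"],
)
print(tokenized[0])  # {'num_chars': 1}

The trade-off is RAM usage proportional to the tokenized corpus, which presumably also motivates the lower preprocessing_num_workers in run.sh.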