Untitled
ptrdung
plain_text
a year ago
4.1 kB
6
Indexable
import pandas as pd data = pd.read_parquet('/home/t12/QuangNV/mini_project_stp/notebooks/res/0000.parquet', engine='pyarrow') with open('/home/t12/QuangNV/mini_project_stp/data/raw/train/train.en', 'a') as file: for i in range(len(data)): file.write(data['translation'][i]['en']+'\n') with open('/home/t12/QuangNV/mini_project_stp/data/raw/train/train.vi', 'a') as file: for i in range(len(data)): file.write(data['translation'][i]['vi']+'\n') data = pd.read_parquet('/home/t12/QuangNV/mini_project_stp/notebooks/res/iwslt2015-en-vi_validation_0000.parquet', engine='pyarrow') with open('/home/t12/QuangNV/mini_project_stp/data/raw/val/val.en', 'a') as file: for i in range(len(data)): file.write(data['translation'][i]['en']+'\n') with open('/home/t12/QuangNV/mini_project_stp/data/raw/val/val.vi', 'a') as file: for i in range(len(data)): file.write(data['translation'][i]['vi']+'\n') data = pd.read_parquet('/home/t12/QuangNV/mini_project_stp/notebooks/res/iwslt2015-en-vi_test_0000.parquet', engine='pyarrow') with open('/home/t12/QuangNV/mini_project_stp/data/raw/test/test.en', 'a') as file: for i in range(len(data)): file.write(data['translation'][i]['en']+'\n') with open('/home/t12/QuangNV/mini_project_stp/data/raw/test/test.vi', 'a') as file: for i in range(len(data)): file.write(data['translation'][i]['vi']+'\n') RAW='/home/t12/QuangNV/mini_project_stp/data/raw' TMP='/home/t12/QuangNV/mini_project_stp/data/tmp' DUMP='/home/t12/QuangNV/mini_project_stp/data/dump' !python3 /home/t12/QuangNV/mini_project_stp/src/opennmt/scripts/bert_tokenize.py \ --bert bert-base-multilingual-cased \ --prefixes $RAW/train/train.en $RAW/train/train.vi $RAW/val/val.en $RAW/val/val.vi $RAW/test/test.en $RAW/test/test.vi \ --output_dir $TMP !python3 /home/t12/QuangNV/mini_project_stp/src/opennmt/scripts/bert_prepro.py --src $TMP/train.vi.bert \ --tgt $TMP/train.en.bert \ --output $DUMP/VIEN.db VSIZE=200000 FREQ=0 SHARD_SIZE=200000 !python3 /home/t12/QuangNV/mini_project_stp/src/opennmt/preprocess.py \ -train_src $TMP/train.vi.bert \ -train_tgt $TMP/train.en.bert \ -valid_src $TMP/val.vi.bert \ -valid_tgt $TMP/val.en.bert \ -save_data $DUMP/VIEN \ -src_seq_length 150 \ -tgt_seq_length 150 \ -src_vocab_size $VSIZE \ -tgt_vocab_size $VSIZE \ -vocab_size_multiple 8 \ -src_words_min_frequency $FREQ \ -tgt_words_min_frequency $FREQ \ -share_vocab \ -shard_size $SHARD_SIZE !mv $TMP/val.en.bert $DUMP/dev.en.bert !mv $TMP/test.en.bert $DUMP/test.en.bert !mv $TMP/val.vi.bert $DUMP/dev.vi.bert !mv $TMP/test.vi.bert $DUMP/test.vi.bert !cp /home/t12/QuangNV/mini_project_stp/data/raw/val/val.en /home/t12/QuangNV/mini_project_stp/data/dump/ref/dev.en !cp /home/t12/QuangNV/mini_project_stp/data/raw/test/test.en /home/t12/QuangNV/mini_project_stp/data/dump/ref/test.en !python3 /home/t12/QuangNV/mini_project_stp/src/run_cmlm_finetuning.py --train_file /home/t12/QuangNV/mini_project_stp/data/dump/VIEN.db \ --vocab_file /home/t12/QuangNV/mini_project_stp/data/dump/VIEN.vocab.pt \ --valid_src /home/t12/QuangNV/mini_project_stp/data/dump/dev.vi.bert \ --valid_tgt /home/t12/QuangNV/mini_project_stp/data/dump/dev.en.bert \ --bert_model bert-base-multilingual-cased \ --output_dir /home/t12/QuangNV/mini_project_stp/output/out \ --train_batch_size 2048 \ --learning_rate 5e-5 \ --valid_steps 5000 \ --num_train_steps 41000 \ --warmup_proportion 0.05 \ --gradient_accumulation_steps 1 \ # --fp16
Editor is loading...
Leave a Comment