# --- Pastebin export metadata (preserved from the original paste) ---
# title: data1 | author: neyjrxdung | format: plain_text
# posted: 2 years ago | size: 4.2 kB | views: 13 | Indexable
import pandas as pd
# Load the IWSLT'15 en-vi TRAIN split from parquet and dump it to two
# line-aligned plain-text corpus files (one sentence per line; line i of
# train.en is the translation of line i of train.vi).
data = pd.read_parquet(
    '/home/t12/QuangNV/mini_project_stp/notebooks/res/0000.parquet',
    engine='pyarrow',
)
# NOTE(review): files are opened in append mode ('a'), so re-running this
# cell duplicates the corpus — delete the outputs first, or switch to 'w'.
# encoding='utf-8' is explicit because the Vietnamese side is non-ASCII.
with open('/home/t12/QuangNV/mini_project_stp/data/raw/train/train.en', 'a', encoding='utf-8') as file:
    file.writelines(pair['en'] + '\n' for pair in data['translation'])
with open('/home/t12/QuangNV/mini_project_stp/data/raw/train/train.vi', 'a', encoding='utf-8') as file:
    file.writelines(pair['vi'] + '\n' for pair in data['translation'])
# Load the IWSLT'15 en-vi VALIDATION split and dump it to line-aligned
# plain-text files, mirroring the train-split extraction above.
data = pd.read_parquet(
    '/home/t12/QuangNV/mini_project_stp/notebooks/res/iwslt2015-en-vi_validation_0000.parquet',
    engine='pyarrow',
)
# NOTE(review): append mode duplicates output on re-run — see train cell.
with open('/home/t12/QuangNV/mini_project_stp/data/raw/val/val.en', 'a', encoding='utf-8') as file:
    file.writelines(pair['en'] + '\n' for pair in data['translation'])
with open('/home/t12/QuangNV/mini_project_stp/data/raw/val/val.vi', 'a', encoding='utf-8') as file:
    file.writelines(pair['vi'] + '\n' for pair in data['translation'])
# Load the IWSLT'15 en-vi TEST split and dump it to line-aligned
# plain-text files, mirroring the train-split extraction above.
data = pd.read_parquet(
    '/home/t12/QuangNV/mini_project_stp/notebooks/res/iwslt2015-en-vi_test_0000.parquet',
    engine='pyarrow',
)
# NOTE(review): append mode duplicates output on re-run — see train cell.
with open('/home/t12/QuangNV/mini_project_stp/data/raw/test/test.en', 'a', encoding='utf-8') as file:
    file.writelines(pair['en'] + '\n' for pair in data['translation'])
with open('/home/t12/QuangNV/mini_project_stp/data/raw/test/test.vi', 'a', encoding='utf-8') as file:
    file.writelines(pair['vi'] + '\n' for pair in data['translation'])
# Absolute project data directories. These are interpolated into the
# IPython "!" shell commands below via $VAR substitution.
RAW='/home/t12/QuangNV/mini_project_stp/data/raw'  # plain-text corpora written above
TMP='/home/t12/QuangNV/mini_project_stp/data/tmp'  # intermediate BERT-tokenized files
DUMP='/home/t12/QuangNV/mini_project_stp/data/dump'  # final preprocessed artifacts
# Subword-tokenize all six corpus files with the multilingual BERT
# tokenizer; outputs *.bert files into $TMP. (IPython "!" shell escape —
# this cell only runs inside IPython/Jupyter, not under plain python3.)
!python3 /home/t12/QuangNV/mini_project_stp/src/opennmt/scripts/bert_tokenize.py \
--bert bert-base-multilingual-cased \
--prefixes $RAW/train/train.en $RAW/train/train.vi $RAW/val/val.en $RAW/val/val.vi $RAW/test/test.en $RAW/test/test.vi \
--output_dir $TMP
# Pair the tokenized vi (source) and en (target) training files into a
# single on-disk DB ($DUMP/VIEN.db) consumed by the fine-tuning step below.
!python3 /home/t12/QuangNV/mini_project_stp/src/opennmt/scripts/bert_prepro.py --src $TMP/train.vi.bert \
--tgt $TMP/train.en.bert \
--output $DUMP/VIEN.db
# OpenNMT preprocessing hyperparameters.
VSIZE=200000  # max source/target vocabulary size
FREQ=0  # minimum word frequency (0 = keep everything up to VSIZE)
SHARD_SIZE=200000  # examples per on-disk shard
# Build OpenNMT training shards and the shared vi/en vocabulary
# ($DUMP/VIEN.* including VIEN.vocab.pt). Sentences longer than 150
# tokens on either side are dropped; vocab size is padded to a multiple
# of 8 (friendly to tensor-core kernels).
!python3 /home/t12/QuangNV/mini_project_stp/src/opennmt/preprocess.py \
-train_src $TMP/train.vi.bert \
-train_tgt $TMP/train.en.bert \
-valid_src $TMP/val.vi.bert \
-valid_tgt $TMP/val.en.bert \
-save_data $DUMP/VIEN \
-src_seq_length 150 \
-tgt_seq_length 150 \
-src_vocab_size $VSIZE \
-tgt_vocab_size $VSIZE \
-vocab_size_multiple 8 \
-src_words_min_frequency $FREQ \
-tgt_words_min_frequency $FREQ \
-share_vocab \
-shard_size $SHARD_SIZE
# Stage evaluation inputs: move tokenized val/test files into $DUMP
# (renaming val -> dev), and copy the raw English references into
# $DUMP/ref for BLEU scoring.
# NOTE(review): assumes the ref/ directory already exists — cp fails otherwise.
!mv $TMP/val.en.bert $DUMP/dev.en.bert
!mv $TMP/test.en.bert $DUMP/test.en.bert
!mv $TMP/val.vi.bert $DUMP/dev.vi.bert
!mv $TMP/test.vi.bert $DUMP/test.vi.bert
!cp /home/t12/QuangNV/mini_project_stp/data/raw/val/val.en /home/t12/QuangNV/mini_project_stp/data/dump/ref/dev.en
!cp /home/t12/QuangNV/mini_project_stp/data/raw/test/test.en /home/t12/QuangNV/mini_project_stp/data/dump/ref/test.en
!python3 /home/t12/QuangNV/mini_project_stp/src/run_cmlm_finetuning.py --train_file /home/t12/QuangNV/mini_project_stp/data/dump/VIEN.db \
--vocab_file /home/t12/QuangNV/mini_project_stp/data/dump/VIEN.vocab.pt \
--valid_src /home/t12/QuangNV/mini_project_stp/data/dump/dev.vi.bert \
--valid_tgt /home/t12/QuangNV/mini_project_stp/data/dump/dev.en.bert \
--bert_model bert-base-multilingual-cased \
--output_dir /home/t12/QuangNV/mini_project_stp/output/out \
--train_batch_size 16384 \
--learning_rate 5e-5 \
--valid_steps 5000 \
--num_train_steps 100000 \
--warmup_proportion 0.05 \
--gradient_accumulation_steps 1 \
--fp16Editor is loading...
# (Pastebin footer removed: "Leave a Comment")