Transformer for Machine Translation
Transformer
Machine translation is the process of using a computer to convert text in one natural language into another natural language. Here, a natural language means a language used by humans in everyday life (such as Chinese or English), as opposed to a language artificially created for a specific purpose (such as a computer programming language).
Machine translation is a research branch of natural language processing (NLP) and one of the ultimate goals of artificial intelligence, with significant scientific research value.
In 2017, Google proposed the Transformer model in the paper "Attention Is All You Need". Built on the self-attention mechanism, the model handles sequence-modeling problems such as machine translation very well. Applied to machine translation, the Transformer not only improves translation quality but, thanks to its highly parallelizable design, also greatly improves training efficiency.
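To make the self-attention idea concrete, here is a minimal PyTorch sketch of scaled dot-product self-attention, the core operation of the Transformer. It is illustrative only; the actual training and inference in this case use fairseq's Transformer implementation, as shown below.

import torch
import torch.nn.functional as F

def self_attention(x, w_q, w_k, w_v):
    """x: (seq_len, d_model); w_q / w_k / w_v: (d_model, d_k) projection matrices."""
    q, k, v = x @ w_q, x @ w_k, x @ w_v                       # project inputs to queries, keys, values
    scores = q @ k.transpose(-2, -1) / (k.shape[-1] ** 0.5)   # compare every position with every other
    weights = F.softmax(scores, dim=-1)                        # attention weights sum to 1 per position
    return weights @ v                                         # each output mixes information from all positions

x = torch.randn(5, 16)                                         # 5 tokens, model dimension 16
w_q, w_k, w_v = (torch.randn(16, 16) for _ in range(3))
out = self_attention(x, w_q, w_k, w_v)                         # -> shape (5, 16)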
This case study is a hands-on reproduction of the "Attention Is All You Need" paper; the dataset is the WMT2014 English-German dataset.
Algorithm details: https://marketplace.huaweicloud.cn/markets/aihub/modelhub/detail/?id=ba3ab3ce-bc77-4b44-af38-e6b6413bd836
Notes:
1. Framework used in this case: PyTorch 1.4.0
2. Hardware used in this case: GPU: 1*NVIDIA-V100NV32 (32GB) | CPU: 8 cores, 64GB
3. How to run the code: click the triangular Run button in the menu bar at the top of this page, or press Ctrl+Enter, to run the code in each cell
4. For detailed JupyterLab usage, see the《ModelArts JupyterLab User Guide》
5. If you run into problems, see the《ModelArts JupyterLab Troubleshooting Guide》
1. Download the Code and Data
import moxing as mox
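# Copy the case code and the WMT2014 data from OBS to the local working directory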
mox.file.copy_parallel('obs://obs-aigallery-zc/algorithm/Transformer_translate','./Transformer_translate')
INFO:root:Using MoXing-v1.17.3-43fbf97f
INFO:root:Using OBS-Python-SDK-3.20.7
2. Model Training
2.1 Installing and Loading Dependencies
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
root_path = './Transformer_translate'
os.chdir(root_path)
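# Install the Python packages required for fairseq training and inference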
os.system('pip install --upgrade numpy')
os.system('pip install --ignore-installed PyYAML==5.4.1')
os.system('pip install fairseq')
os.system('pip install sacremoses')
os.system('pip install nltk')
os.system('pip install subword-nmt')
os.system('pip install tqdm')
0
2.2 Parameter Settings
import tqdm
import argparse
parser = argparse.ArgumentParser(description='Training')
parser.add_argument('--resume_net', default='no', type=str, help='resume net for retraining')
parser.add_argument('--training_dataset', default='./data/data-bin', help='Training dataset directory')
parser.add_argument('--save_folder', default='./outputs', help='Location to save checkpoint models')
parser.add_argument('--max_tokens', default=4096, type=int)
parser.add_argument('--optimizer', default='adam', type=str)
parser.add_argument('--weight_decay', default=0.0, type=float)
parser.add_argument('--lr', default=0.0007, type=float)
parser.add_argument('--clip_norm', default=0, type=float)
parser.add_argument('--load_weights', type=str, default='./model/model.pth',
metavar='PATH')
parser.add_argument('--eval', default='False', type=str, choices=['True', 'False'])
parser.add_argument('--restore', default='True', type=str, choices=['True', 'False'])
# For evaluation
parser.add_argument('--eval_result_path', default='./outputs/eval')
args, unknown = parser.parse_known_args()
import shutil
def get_files_tree(src="src_path"):
    req_files = []
    for r, d, files in os.walk(src):
        for file in files:
            src_file = os.path.join(r, file)
            src_file = src_file.replace('\\', '/')
            if src_file.endswith('.db'):
                continue
            req_files.append(src_file)
    return req_files
def copy_tree_force(src_path="", dest_path=""):
    """
    Make sure that all the paths have correct slash characters.
    """
    for cf in get_files_tree(src=src_path):
        df = cf.replace(src_path, dest_path)
        if not os.path.exists(os.path.dirname(df)):
            os.makedirs(os.path.dirname(df))
        shutil.copy2(cf, df)
train_input_dir = args.training_dataset
if not os.path.exists(train_input_dir):
    os.makedirs(train_input_dir)
else:
    print(train_input_dir, 'already exists')
if not os.path.exists(args.eval_result_path):
    os.makedirs(args.eval_result_path)
model_path=args.load_weights
# Train
command='''
CUDA_VISIBLE_DEVICES=0 fairseq-train '''+ train_input_dir +''' \
--restore-file '''+ model_path +''' \
--save-dir '''+ args.save_folder +''' \
--optimizer '''+ args.optimizer +''' \
--adam-betas '(0.9, 0.98)' \
--clip-norm '''+ str(args.clip_norm) +''' \
--lr-scheduler inverse_sqrt \
--warmup-init-lr 1e-07 \
--warmup-updates 4000 \
--lr '''+ str(args.lr) +''' \
--min-lr 1e-09 \
--criterion label_smoothed_cross_entropy \
--label-smoothing 0.1 \
--weight-decay '''+ str(args.weight_decay) +''' \
--max-tokens '''+ str(args.max_tokens) +''' \
--save-interval-updates 50 \
--max-update 50 \
--keep-interval-updates 20 \
--decoder-attention-heads 16 \
--decoder-embed-dim 1024 \
--decoder-ffn-embed-dim 4096 \
--decoder-layerdrop 0 \
--decoder-layers 6 \
--decoder-output-dim 1024 \
--encoder-attention-heads 16 \
--encoder-embed-dim 1024 \
--encoder-ffn-embed-dim 4096 \
--encoder-layerdrop 0 \
--encoder-layers 6 \
--source-lang en \
--share-decoder-input-output-embed \
--target-lang de \
--optimizer adam \
--optimizer-overrides {} \
--reset-optimizer \
--save-interval 0 \
--keep-last-epochs 1 \
--max-epoch 31 \
--max-source-positions 1024 \
--max-target-positions 1024 \
--max-tokens 3584 \
--min-loss-scale 0.0001 \
--min-lr 1e-09 \
--stop-time-hours 0.1 \
--arch transformer_wmt_en_de
'''
./data/data-bin already exists
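The command above selects fairseq's inverse_sqrt learning-rate scheduler with --warmup-init-lr 1e-07, --warmup-updates 4000 and --lr 0.0007. The sketch below approximates that schedule for illustration; fairseq computes it internally.

def inverse_sqrt_lr(step, base_lr=0.0007, warmup_init_lr=1e-07, warmup_updates=4000):
    if step < warmup_updates:
        # linear warmup from warmup_init_lr up to base_lr
        return warmup_init_lr + (base_lr - warmup_init_lr) * step / warmup_updates
    # afterwards the rate decays with the inverse square root of the update number
    return base_lr * (warmup_updates ** 0.5) / (step ** 0.5)

for step in (1, 2000, 4000, 16000, 64000):
    print(step, inverse_sqrt_lr(step))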
2.3 Start Training
print(command)
ret = os.system(command)
copy_tree_force('./model', args.save_folder)
print('training end')
CUDA_VISIBLE_DEVICES=0 fairseq-train ./data/data-bin --restore-file ./model/model.pth --save-dir ./outputs --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 --lr 0.0007 --min-lr 1e-09 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --weight-decay 0.0 --max-tokens 4096 --save-interval-updates 50 --max-update 50 --keep-interval-updates 20 --decoder-attention-heads 16 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-layerdrop 0 --decoder-layers 6 --decoder-output-dim 1024 --encoder-attention-heads 16 --encoder-embed-dim 1024 --encoder-ffn-embed-dim 4096 --encoder-layerdrop 0 --encoder-layers 6 --source-lang en --share-decoder-input-output-embed --target-lang de --optimizer adam --optimizer-overrides {} --reset-optimizer --save-interval 0 --keep-last-epochs 1 --max-epoch 31 --max-source-positions 1024 --max-target-positions 1024 --max-tokens 3584 --min-loss-scale 0.0001 --min-lr 1e-09 --stop-time-hours 0.1 --arch transformer_wmt_en_de
0
[]
training end
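The training criterion, label_smoothed_cross_entropy with --label-smoothing 0.1, gives a small share of the target probability mass to all vocabulary tokens instead of putting it all on the reference token. A minimal sketch of the idea (a simplified illustration, not fairseq's implementation):

import torch
import torch.nn.functional as F

def label_smoothed_loss(logits, target, eps=0.1):
    """logits: (num_tokens, vocab); target: (num_tokens,) reference token ids."""
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(1, target.unsqueeze(1)).squeeze(1)  # loss on the reference token only
    smooth = -log_probs.mean(dim=-1)                            # average loss over the whole vocabulary
    return ((1.0 - eps) * nll + eps * smooth).mean()

logits = torch.randn(4, 32768)                 # 4 target tokens, vocabulary of 32768 types as in this case
target = torch.randint(0, 32768, (4,))
print(label_smoothed_loss(logits, target))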
3. Model Testing
# -*- coding: utf-8 -*-
from tqdm import tqdm
from fairseq.models.transformer import TransformerModel
from nltk.translate.bleu_score import sentence_bleu
import os
import torch
import numpy as np
from PIL import Image
from io import BytesIO
from collections import OrderedDict
import torch.backends.cudnn as cudnn
cudnn.benchmark = True
class ModelClass():
    def __init__(self, model_path):
        self.model_path = model_path  # this line must be kept and does not need to be modified
        self.device = torch.device("cpu")
        print(self.model_path)
        path = os.getcwd()
        self.model = TransformerModel.from_pretrained(
            model_name_or_path='./',
            checkpoint_file='outputs/model.pth',
            data_name_or_path='outputs/newstest2014',
            bpe='subword_nmt',
            bpe_codes='outputs/bpecodes'
        )
        self.model.eval()
        self.model = self.model.to(self.device)
        print('load model success')

    def translate(self, data):
        pre_text = self.model.translate(data)
        return pre_text
The input text to be translated must be English and the output is German; you can change it to any sentence you want to translate.
text_input = "I am lucky."
Translator = ModelClass('./outputs/model.pth')
result = Translator.translate(text_input)
print(result)
INFO:fairseq.file_utils:loading archive file ./
INFO:fairseq.file_utils:loading archive file outputs/newstest2014
./outputs/model.pth
INFO:fairseq.tasks.translation:[en] dictionary: 32768 types
INFO:fairseq.tasks.translation:[de] dictionary: 32768 types
INFO:fairseq.models.fairseq_model:Namespace(activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer_wmt_en_de_big', attention_dropout=0.1, batch_size=None, bpe='subword_nmt', bpe_codes='outputs/bpecodes', bpe_separator='@@', clip_norm=0.0, criterion='label_smoothed_cross_entropy', cross_self_attention=False, data='outputs/newstest2014', decoder_attention_heads=16, decoder_embed_dim=1024, decoder_embed_path=None, decoder_ffn_embed_dim=4096, decoder_input_dim=1024, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=1024, device_id=0, distributed_backend='nccl', distributed_init_method='tcp://learnfair0487:59946', distributed_port=59946, distributed_rank=0, distributed_world_size=128, dropout=0.3, encoder_attention_heads=16, encoder_embed_dim=1024, encoder_embed_path=None, encoder_ffn_embed_dim=4096, encoder_layerdrop=0, encoder_layers=6, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eval_bleu_detok='space', eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fp16=True, ignore_prefix_size=0, label_smoothing=0.1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, log_format='json', log_interval=10, lr=[0.001], lr_scheduler='inverse_sqrt', lr_shrink=0.1, max_epoch=0, max_sentences=None, max_sentences_valid=None, max_source_positions=1024, max_target_positions=1024, max_tokens=3584, max_update=300000, min_lr=1e-09, momentum=0.99, no_cross_attention=False, no_epoch_checkpoints=False, no_progress_bar=False, no_save=False, no_scale_embedding=False, no_token_positional_embeddings=False, num_batch_buckets=0, optimizer='adam', quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, relu_dropout=0.0, restore_file='checkpoint_last.pt', sample_without_replacement=256000, save_dir='/checkpoint02/myleott/2018-05-18/paracrawl_en_de.fp16.maxupd300000.upsamplewmt31.samp_wo_repl256000.transformer_wmt_en_de_big.shareemb.adam.beta0.9,0.98.initlr1e-07.warmup4000.lr0.001.clip0.0.drop0.3.wd0.0.ls0.1.maxtok3584.seed2.ngpu128', save_interval=1, secondary_train_data='/private/home/myleott/data/paracrawl/en-de/paracrawl-release1.en-de.no_url.shuf_uniq_norm.scored.filtered.preprocessed', seed=2, sentence_avg=False, share_all_embeddings=True, share_decoder_input_output_embed=True, skip_invalid_size_inputs_valid_test=False, source_lang='en', target_lang='de', task='translation', tie_adaptive_weights=False, train_subset='train', truncate_source=False, update_freq=[1.0], upsample_primary=31, use_old_adam=False, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0)
load model success
Ich bin Glück .
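The testing cell imports nltk's sentence_bleu but does not use it. If you have a reference translation, you could score the model output along the following lines; the German reference sentence here is hypothetical and only serves as an illustration.

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Hypothetical reference translation of "I am lucky." -- for illustration only.
reference = "Ich habe Glück .".split()
hypothesis = result.split()                    # `result` is the model output printed above
smooth = SmoothingFunction().method1           # smoothing avoids zero scores on very short sentences
print('BLEU:', sentence_bleu([reference], hypothesis, smoothing_function=smooth))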