文章目录
1. PPT讲解部分
1.1 应用
1.2 seq2seq
1.3 解码器和编码器的细节
1.4 训练
1.5 衡量生成序列的好坏的BLEU
1.6 小结
2. seq2seq代码
1. PPT讲解部分
1.1 应用
机器翻译 (1)给定一个源语言的句子,自动翻译成目标语言 (2)这两个句子可以有不同的长度
1.2 seq2seq
Encoder 编码器是一个RNN,读取输入句子,可以是双向的
Decoder 解码器是另外一个RNN,用来输出句子
Decoder 接受两个数据来预测下一个词元:(1)Encoder 编码器输出的隐状态值 (2)输出序列中已经看见的或者已经生成的词元。
1.3 解码器和编码器的细节
Encoder编码器是没有输出的RNN
Encoder编码器最后时间步的隐状态用作Decoder解码器的初始隐状态
1.4 训练
训练时Decoder解码器使用目标句子
作为输入;在训练的时候,我们是知道目标句子的
推理: 推理时没有目标句子,解码器只能把上一个时间步自己预测出的词元作为下一个时间步的输入。
1.5 衡量生成序列的好坏的BLEU
$p_n$ 是预测中所有 n-gram 的精度
标签序列 ABCDEF 和预测序列 ABBCD,有 $p_1=4/5,\ p_2=3/4,\ p_3=1/3,\ p_4=0$:
$p_1:(A,B,C,D)/(A,B,B,C,D)=4/5$
$p_2:(AB,BC,CD)/(AB,BB,BC,CD)=3/4$
$p_3:(BCD)/(ABB,BBC,BCD)=1/3$
$p_4:0/(ABBC,BBCD)=0$
BLEU定义
$$BLEU=\exp\left\{\min\left(0,\,1-\frac{len(label)}{len(pred)}\right)\right\}\prod_{n=1}^{k}p_n^{1/2^n}\tag{1}$$
惩罚过短的预测:
$$\exp\left\{\min\left(0,\,1-\frac{len(label)}{len(pred)}\right)\right\}\tag{2}$$
长匹配有高权重:
$$\prod_{n=1}^{k}p_n^{1/2^n}\tag{3}$$
BLEU=0表示预测最差,BLEU=1表示预测最好
1.6 小结
Seq2seq从一个句子生成另一个句子
Encoder编码器和Decoder解码器都是RNN
用编码器最后时间步的隐状态来初始化解码器的隐状态,从而完成信息传递
常用BLEU来衡量生成序列的好坏
2. seq2seq代码
import collections
import math
import torch
from torch import nn
from d2l import torch as d2l
import matplotlib.pyplot as plt


class Seq2SeqEncoder(d2l.Encoder):
    """GRU encoder for sequence-to-sequence learning.

    Token indices are embedded and fed through a multi-layer GRU. The GRU
    is built with PyTorch's default time-major layout, so ``forward`` moves
    the second axis of the embedded input to the front before the
    recurrent pass.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Width of each embedding vector (GRU input size).
        num_hiddens: GRU hidden size.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability passed to ``nn.GRU``.
        **kwargs: Forwarded to ``d2l.Encoder.__init__``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers,
                          dropout=dropout)

    def forward(self, X, *args):
        """Encode a batch of token indices.

        Args:
            X: Integer tensor of token indices; the demo below uses shape
                (4, 7), i.e. (batch, steps).
            *args: Ignored.

        Returns:
            ``(output, state)`` exactly as returned by ``nn.GRU``: the
            per-step outputs of the top layer and the final hidden state
            of every layer.
        """
        # Embedding appends a trailing axis of size embed_size.
        X = self.embedding(X)
        # Swap the first two axes so the step axis comes first, which is
        # what nn.GRU expects when batch_first is left at its default.
        X = X.permute(1, 0, 2)
        # No initial state is passed, so the GRU starts from its default.
        output, state = self.rnn(X)
        return output, state


# Smoke test: build a tiny encoder and switch it to eval mode so that
# dropout is inactive for the shape checks that follow.
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
                         num_layers=2)
encoder.eval()
# Dummy batch of 4 sequences with 7 token indices each (all index 0).
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)


class Seq2SeqDecoder(d2l.Decoder):
    """GRU decoder for sequence-to-sequence learning.

    At every step the decoder input is the embedding of the current token
    concatenated with a context vector taken from the last layer of the
    hidden state it is called with. This is why the GRU input size is
    ``embed_size + num_hiddens``.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and
            width of the output layer).
        embed_size: Width of each embedding vector.
        num_hiddens: GRU hidden size.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability passed to ``nn.GRU``.
        **kwargs: Forwarded to ``d2l.Decoder.__init__``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers,
                          dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        """Return the second element of the encoder's return value.

        For ``Seq2SeqEncoder`` that element is the final GRU hidden state.
        """
        return enc_outputs[1]

    def forward(self, X, state):
        """Decode a batch of token indices.

        Args:
            X: Integer tensor of token indices; the demo below uses shape
                (4, 7), i.e. (batch, steps).
            state: GRU hidden state to start from.

        Returns:
            ``(output, state)`` where ``output`` holds ``vocab_size``
            scores per position with the first two axes swapped back to
            the layout of ``X``, and ``state`` is the updated GRU hidden
            state.
        """
        # Embed, then put the step axis first for the time-major GRU.
        X = self.embedding(X).permute(1, 0, 2)
        # Repeat the last layer's hidden state once per step so it can be
        # concatenated with the input at every step.
        context = state[-1].repeat(X.shape[0], 1, 1)
        X_and_context = torch.cat((X, context), 2)
        output, state = self.rnn(X_and_context, state)
        # Project to vocabulary scores and swap the first two axes back.
        output = self.dense(output).permute(1, 0, 2)
        return output, state


# Smoke test: run the decoder on the same dummy batch, starting from the
# encoder's final hidden state.
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
                         num_layers=2)
decoder.eval()
state_decoder = decoder.init_state(encoder(X))
output_decoder, output_state = decoder(X, state_decoder)
print(f"output_decoder.shape={output_decoder.shape}")
print(f"output_state.shape={output_state.shape}")


def sequence_mask(X, valid_len, value=0):
    """Overwrite the entries of ``X`` that lie beyond each row's valid length.

    Position ``j`` of row ``i`` is kept when ``j < valid_len[i]`` and set
    to ``value`` otherwise.

    Args:
        X: Tensor with at least two axes; axis 1 is the one being masked.
        valid_len: 1-D tensor with one valid length per row of ``X``.
        value: Fill value for the masked entries.

    Returns:
        ``X`` itself. The tensor is modified in place.
    """
    maxlen = X.size(1)
    # Broadcast a (1, maxlen) position row against a (rows, 1) length
    # column to get a boolean keep-mask with one row per row of X.
    positions = torch.arange(maxlen, dtype=torch.float32, device=X.device)
    mask = positions[None, :] < valid_len[:, None]
    X[~mask] = value
    return X


class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy loss that ignores positions past ``valid_len``."""

    def forward(self, pred, label, valid_len):
        """Compute the masked loss, averaged over axis 1.

        Args:
            pred: Scores with the class axis last; the demo below uses
                shape (3, 4, 10).
            label: Integer class indices; the demo below uses shape (3, 4).
            valid_len: 1-D tensor with the valid length of each row of
                ``label``.

        Returns:
            One loss value per row of ``label``. Masked positions
            contribute zero, but the mean is still taken over the full
            length of axis 1.
        """
        # Weight 1 for valid positions, 0 for the rest.
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        # Per-position losses are needed so they can be weighted below.
        self.reduction = 'none'
        # nn.CrossEntropyLoss expects the class axis at position 1.
        unweighted_loss = super().forward(pred.permute(0, 2, 1), label)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss


# Sanity check with uniform scores over 10 classes: every position costs
# ln(10) ~= 2.3026, so valid lengths 4, 2 and 0 out of 4 positions give
# roughly 2.3026, 1.1513 and 0.
loss = MaskedSoftmaxCELoss()
y_loss = loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long),
              torch.tensor([4, 2, 0]))
print(f"y_loss = {y_loss}")


def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train an encoder-decoder model with teacher forcing.

    Args:
        net: Model called as ``net(X, dec_input, X_valid_len)`` and
            returning a pair whose first element is the prediction.
        data_iter: Iterable of batches, each unpacking into
            ``(X, X_valid_len, Y, Y_valid_len)`` tensors.
        lr: Learning rate for the Adam optimizer.
        num_epochs: Number of passes over ``data_iter``.
        tgt_vocab: Mapping that yields the index of ``'<bos>'``.
        device: Device the model and every batch are moved to.
    """

    def xavier_init_weights(m):
        # Xavier-initialise the weight matrices of Linear and GRU modules;
        # biases keep their default initialisation.
        if type(m) is nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) is nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        # metric[0]: summed loss, metric[1]: number of valid target tokens.
        metric = d2l.Accumulator(2)
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            # Teacher forcing: the decoder input is the target sequence
            # with '<bos>' prepended and its last token dropped.
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)
            Y_hat, _ = net(X, dec_input, X_valid_len)
            batch_loss = loss(Y_hat, Y, Y_valid_len)
            # The loss has one entry per sequence; sum to get a scalar.
            batch_loss.sum().backward()
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(batch_loss.sum(), num_tokens)
        # Plot the per-token loss every 10th epoch.
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1],))
    # Report the statistics of the final epoch.
    print(f'loss {metric[0] / metric[1]:.3f}, '
          f'{metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')


embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
# Data and optimisation hyper-parameters for the training run.
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 300, d2l.try_gpu()

# Load the translation dataset together with its source/target vocabularies.
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)

# Build the encoder-decoder model; each side is sized by its own vocabulary.
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers,
                         dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers,
                         dropout)
net = d2l.EncoderDecoder(encoder, decoder)

train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
# Display the figure produced during training.
plt.show()
loss 0.020, 14236.7 tokens/sec on cuda:0