今天我们进入 Seq2Seq 的领域,了解这种更为复杂且功能强大的模型,它不仅能理解词汇(Word2Vec),还能把这些词汇串联成完整的句子。
Seq2Seq(Sequence-to-Sequence),就是从一个序列到另一个序列的转换。它不仅仅能理解单词之间的关系,而且还能把整个句子的意思打包,并解压成另一种形式的表达。
Seq2Seq 是一种神经网络架构,由 encoder(编码器)和 decoder(解码器)两个 RNN 组成。其中 encoder 负责理解输入句子,并将其转化为 context vector;decoder 负责对这个向量进行处理、解码,得到输出。
Seq2Seq 模型中的 encoder 接受一个长度为 M 的序列,得到 1 个 context vector,之后 decoder 把这个 context vector 转化为长度为 N 的序列作为输出,从而构成一个 M to N 的模型,能够处理很多不定长输入输出的问题,比如:文本翻译、问答、文章摘要、关键字写诗等等。
可以加入注意力机制(Attention Mechanism):使解码器能够在生成每个输出元素时“关注”输入序列中的不同部分,从而提高模型处理长序列和捕捉复杂依赖关系的能力。
任务:
完成一个模型,实现往模型输入一串数字,输出这串数字+0。例如输入 12345678,输出 123456780。
Dataloader
训练时可以使用GPU训练:
# Pick the training device: CUDA when torch reports it as available, otherwise the CPU.
# NOTE(review): none of the code visible in this listing moves the model or the
# batches onto `device` — confirm whether that happens elsewhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report the chosen device (the message is Chinese for "training device is:").
print("训练设备为:", device)
由于输入的是数字,为了把这些数字和词典中的真实数字进行对应,可以把这些数字理解为字符串
class NumSequence:
    """Vocabulary mapping digit characters to integer ids and back.

    Ids 0-3 are reserved for the special tokens UNK, PAD, EOS and SOS; the
    characters "0".."9" are assigned ids 4..13 in that order.
    """

    UNK_TAG = "UNK"  # unknown character
    PAD_TAG = "PAD"  # padding filler
    EOS_TAG = "EOS"  # end of sentence
    SOS_TAG = "SOS"  # start of sentence
    UNK = 0
    PAD = 1
    EOS = 2
    SOS = 3

    def __init__(self):
        # Token -> id table, seeded with the four special tokens.
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
            self.EOS_TAG: self.EOS,
            self.SOS_TAG: self.SOS,
        }
        # Each digit character takes the next free id.
        for digit in range(10):
            self.dict[str(digit)] = len(self.dict)
        # Reverse table: id -> token.
        self.index2word = dict(zip(self.dict.values(), self.dict.keys()))

    def __len__(self):
        """Return the vocabulary size (special tokens included)."""
        return len(self.dict)

    def transform(self, sequence, max_len=None, add_eos=False):
        """Encode ``sequence`` as a list of token ids.

        Args:
            sequence: Value to encode; it is converted with ``str()`` and
                split into single characters. Characters missing from the
                vocabulary map to ``UNK``.
            max_len: When given, the result has exactly this many ids:
                shorter sequences are right-padded with ``PAD`` and, without
                ``add_eos``, longer ones are truncated.
            add_eos: Append ``EOS`` after the last character.

        Returns:
            The list of ids.

        Raises:
            AssertionError: ``add_eos`` is set and ``max_len`` cannot hold
                the sequence plus the EOS marker.
        """
        sequence_list = list(str(sequence))
        seq_len = len(sequence_list) + 1 if add_eos else len(sequence_list)
        if add_eos and max_len is not None and max_len < seq_len:
            # Raised explicitly (instead of an `assert` statement) so the
            # check survives `python -O`; the exception type is unchanged.
            raise AssertionError("max_len 应该大于seq+eos的长度")

        sequence_index = [self.dict.get(ch, self.UNK) for ch in sequence_list]
        if add_eos:
            sequence_index.append(self.EOS)
        if max_len is None:
            return sequence_index

        # Slice-assigning an over-long list into a PAD buffer would silently
        # grow the buffer past max_len, so truncate first and then pad.
        sequence_index = sequence_index[:max_len]
        padding = [self.PAD] * (max_len - len(sequence_index))
        return sequence_index + padding

    def inverse_transform(self, sequence_index):
        """Decode ids back to tokens, stopping at the first ``EOS``.

        Ids that are not in the vocabulary decode to ``UNK_TAG``. The EOS
        marker itself is not included in the result.
        """
        result = []
        for index in sequence_index:
            if index == self.EOS:
                break
            result.append(self.index2word.get(int(index), self.UNK_TAG))
        return result
# Module-level vocabulary instance; the other listings import it with
# `from word_sequence import num_sequence`.
num_sequence = NumSequence()

if __name__ == '__main__':
    # Manual smoke check: show both lookup tables and one encoded sample.
    num_sequence = NumSequence()
    forward_table, reverse_table = num_sequence.dict, num_sequence.index2word
    print(forward_table)
    print(reverse_table)
    encoded_sample = num_sequence.transform("232356", add_eos=True)
    print(encoded_sample)
Dataset
from torch.utils.data import Dataset,DataLoader
import numpy as np
from word_sequence import num_sequence
import torch
import config
class RandomDataset(Dataset):
    """Synthetic dataset of random integers paired with "number + '0'" targets.

    Args:
        total_data_size: Number of samples to generate (default 500000).
        seed: Seed for NumPy's random generator (default 10), so the same
            arguments always produce the same samples.
    """

    def __init__(self, total_data_size=500000, seed=10):
        super(RandomDataset, self).__init__()
        self.total_data_size = total_data_size
        # NOTE: this reseeds NumPy's *global* generator, exactly as the
        # original hard-coded call did.
        np.random.seed(seed)
        # Integers drawn from [1, 100000000), i.e. at most 8 digits each.
        self.total_data = np.random.randint(1, 100000000, size=[self.total_data_size])

    def __getitem__(self, idx):
        """Return ``(source, target, len(source), len(target))`` for sample ``idx``.

        ``source`` is the decimal string of the stored integer and ``target``
        is that string with a trailing "0" appended.
        """
        source = str(self.total_data[idx])
        return source, source + "0", len(source), len(source) + 1

    def __len__(self):
        """Return the number of samples."""
        return self.total_data_size
DataLoader
在准备 DataLoader 的过程中,可以通过自定义的 collate_fn 来实现对 dataset 中 batch 数据的处理:
def collate_fn(batch):
    """Turn a list of dataset samples into padded id tensors.

    Samples are ordered by their fourth field (the target length), longest
    first. Source and target strings are encoded with
    ``num_sequence.transform`` and padded to ``config.max_len``; targets also
    receive an EOS marker.

    Returns:
        ``(input, target, input_length, target_length)`` as LongTensors.
    """
    ordered = sorted(batch, key=lambda sample: sample[3], reverse=True)
    sources, targets, source_lengths, target_lengths = zip(*ordered)

    encoded_sources = torch.LongTensor(
        [num_sequence.transform(text, max_len=config.max_len) for text in sources]
    )
    encoded_targets = torch.LongTensor(
        [num_sequence.transform(text, max_len=config.max_len, add_eos=True) for text in targets]
    )
    return (
        encoded_sources,
        encoded_targets,
        torch.LongTensor(source_lengths),
        torch.LongTensor(target_lengths),
    )
# Module-level loader over a freshly built RandomDataset. Batches are assembled
# by collate_fn; drop_last=True discards a final incomplete batch, so every
# batch holds exactly config.batch_size samples.
data_loader = DataLoader(dataset=RandomDataset(),batch_size=config.batch_size,collate_fn=collate_fn,drop_last=True)
编码器(Encoder)的目的是对文本进行编码,把编码后的结果交给后续的程序使用,这里使用 Embedding+GRU 实现
import torch.nn as nn
from word_sequence import num_sequence
import config
class NumEncoder(nn.Module):
    """Encoder: an embedding layer followed by a single-layer GRU."""

    def __init__(self):
        super(NumEncoder, self).__init__()
        self.vocab_size = len(num_sequence)
        # Stored from config; not applied anywhere inside this class.
        self.dropout = config.dropout
        # padding_idx keeps the PAD embedding fixed at zeros.
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=config.embedding_dim,
            padding_idx=num_sequence.PAD,
        )
        self.gru = nn.GRU(
            input_size=config.embedding_dim,
            hidden_size=config.hidden_size,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, input, input_length):
        """Encode a batch of padded id sequences.

        Args:
            input: Batch-first tensor of token ids.
            input_length: True (unpadded) length of each sequence. Because
                ``pack_padded_sequence`` is called with its default
                ``enforce_sorted=True``, the batch must be ordered by
                decreasing length.

        Returns:
            ``(out, hidden)``: the GRU outputs re-padded to the longest
            sequence in the batch, and the final GRU hidden state.
        """
        embedded = self.embedding(input)
        # Packing lets the GRU skip the padded time steps.
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths=input_length, batch_first=True)
        packed_out, hidden = self.gru(packed)
        # Padded positions of the float outputs are filled with 0.0. The
        # vocabulary index num_sequence.PAD (=1) is an id, not a feature
        # value, and must not be written into the hidden features.
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True, padding_value=0.0)
        return out, hidden
解码器(Decoder)主要负责实现对编码之后结果的处理,得到预测值
import torch
import torch.nn as nn
import config
import random
import torch.nn.functional as F
from word_sequence import num_sequence
class NumDecoder(nn.Module):
    """GRU decoder that emits per-step log-probabilities over the vocabulary."""

    def __init__(self):
        super(NumDecoder, self).__init__()
        self.max_seq_len = config.max_len
        self.vocab_size = len(num_sequence)
        self.embedding_dim = config.embedding_dim
        self.dropout = config.dropout
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=num_sequence.PAD,
        )
        # NOTE: nn.GRU applies dropout only between stacked layers, so with
        # num_layers=1 this argument has no effect.
        self.gru = nn.GRU(
            input_size=self.embedding_dim,
            hidden_size=config.hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=self.dropout,
        )
        # Not used by forward/forward_step (they call F.log_softmax); kept for
        # compatibility, with an explicit dim instead of the implicit one.
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.fc = nn.Linear(config.hidden_size, self.vocab_size)

    def forward(self, encoder_hidden, target, target_length):
        """Decode ``config.max_len`` steps starting from the encoder state.

        Args:
            encoder_hidden: Final encoder GRU hidden state; used as the
                initial decoder state. Its second dimension is the batch size.
            target: Batch-first tensor of target ids, used for teacher forcing.
            target_length: Accepted for interface compatibility; not used.

        Returns:
            ``(decoder_outputs, decoder_hidden)`` where ``decoder_outputs`` is
            ``[batch, config.max_len, vocab_size]`` log-probabilities.
        """
        # Take batch size and device from the incoming state rather than from
        # config.batch_size / the CPU default, so any batch size and device work.
        batch_size = encoder_hidden.size(1)
        device = encoder_hidden.device

        # Every sequence starts from the SOS token.
        decoder_input = torch.full(
            (batch_size, 1), num_sequence.SOS, dtype=torch.long, device=device
        )
        decoder_outputs = torch.zeros(
            batch_size, config.max_len, self.vocab_size, device=device
        )
        decoder_hidden = encoder_hidden

        for t in range(config.max_len):
            decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs[:, t, :] = decoder_output_t

            # Teacher forcing is chosen independently at every step.
            use_teacher_forcing = random.random() > 0.5
            if use_teacher_forcing:
                # Feed the ground-truth token for this step.
                decoder_input = target[:, t].unsqueeze(1)
            else:
                # Feed back the model's own most likely token: [batch, 1].
                _, index = torch.topk(decoder_output_t, 1)
                decoder_input = index
        return decoder_outputs, decoder_hidden

    def forward_step(self, decoder_input, decoder_hidden):
        """Run one decoding step.

        Args:
            decoder_input: ``[batch, 1]`` tensor of token ids.
            decoder_hidden: Current GRU hidden state.

        Returns:
            ``(log_probs, decoder_hidden)`` with ``log_probs`` of shape
            ``[batch, vocab_size]``.
        """
        embedded = self.embedding(decoder_input)
        out, decoder_hidden = self.gru(embedded, decoder_hidden)
        # Drop the length-1 time axis explicitly (batch_first layout).
        out = out.squeeze(1)
        out = F.log_softmax(self.fc(out), dim=-1)
        return out, decoder_hidden
完成模型的搭建
import torch
import torch.nn as nn
class Seq2Seq(nn.Module):
    """Container that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input, target, input_length, target_length):
        """Encode ``input`` and decode from the encoder's second return value.

        The encoder is called with ``(input, input_length)``; only the second
        element of its result is passed to the decoder, together with
        ``target`` and ``target_length``. The decoder's two return values are
        returned unchanged.
        """
        encoded = self.encoder(input, input_length)
        _unused_outputs, context = encoded
        decoded = self.decoder(context, target, target_length)
        outputs, final_hidden = decoded
        return outputs, final_hidden
完成训练:
import torch
import config
from torch import optim
import torch.nn as nn
from encoder import NumEncoder
from decoder import NumDecoder
from seq2seq import Seq2Seq
from dataset import data_loader as train_dataloader
from word_sequence import num_sequence
# Build the encoder/decoder pair and wrap them in the Seq2Seq container.
encoder = NumEncoder()
decoder = NumDecoder()
model = Seq2Seq(encoder,decoder)
# Print the module tree.
print(model)
# Adam with its default hyperparameters over every parameter of the combined model.
optimizer = optim.Adam(model.parameters())
# Negative log-likelihood loss, averaged; positions whose target id is PAD are ignored.
criterion= nn.NLLLoss(ignore_index=num_sequence.PAD,reduction="mean")
def get_loss(decoder_outputs, target):
    """Compute the loss between decoder outputs and target ids.

    Args:
        decoder_outputs: Tensor whose last dimension is the vocabulary size,
            e.g. ``[batch, seq_len, vocab_size]``.
        target: Tensor of target ids with one id per output position.

    Returns:
        The value of the module-level ``criterion`` on the flattened inputs.
    """
    # Flatten by the tensors' own shapes rather than by
    # config.batch_size * config.max_len, so any batch size works.
    flat_target = target.reshape(-1)
    flat_outputs = decoder_outputs.reshape(-1, decoder_outputs.size(-1))
    return criterion(flat_outputs, flat_target)
# Run one pass over train_dataloader: forward, loss, backward, optimizer step.
# `epoch` is only used in the progress message.
# NOTE(review): this listing lost its indentation, so it is not visible here
# whether the progress print and the two torch.save calls run for every batch
# or once after the loop — confirm against the real file.
def train(epoch):
for idx,(input,target,input_length,target_len) in enumerate(train_dataloader):
# Clear gradients left over from the previous step.
optimizer.zero_grad()
## decoder_outputs: [batch_size, max_len, vocab_size]; target: [batch_size, max_len]
decoder_outputs,decoder_hidden = model(input,target,input_length,target_len)
loss = get_loss(decoder_outputs,target)
loss.backward()
optimizer.step()
# Progress line: epoch, samples seen so far / dataset size, percent of batches, current loss.
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, idx * len(input), len(train_dataloader.dataset),
100. * idx / len(train_dataloader), loss.item()))
# Checkpoint model and optimizer state.
# NOTE(review): assumes the model/ directory already exists — confirm.
torch.save(model.state_dict(), "model/seq2seq_model.pkl")
torch.save(optimizer.state_dict(), 'model/seq2seq_optimizer.pkl')
if __name__ == '__main__':
    # Script entry point: train for five epochs, numbered 0 through 4.
    for epoch_number in range(5):
        train(epoch_number)
Seq2Seq优点:能处理输入和输出长度不固定的序列转换任务,灵活性高
Seq2Seq缺点:使用固定长度的上下文向量(输入序列较长时信息容易丢失);训练和推理通常需要逐步处理输入和输出序列;参数量较少,面对复杂场景可能受限。