Transformer is a deep learning model used mainly for natural language processing (NLP) tasks. It was proposed by Vaswani et al. in the 2017 paper "Attention is All You Need". It abandons the traditional recurrent neural network (RNN) and convolutional neural network (CNN) and relies entirely on attention mechanisms to capture relationships between the elements of a sequence. The following covers the basic concepts of the Transformer model and how it works.
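At the heart of the model is scaled dot-product attention: every position produces a query that is compared with the keys of all positions, and the resulting weights mix the corresponding values. The snippet below is only a minimal illustrative sketch of that formula in PyTorch; the function name, tensor names and shapes are chosen for illustration and are not part of the article's own code.

import math
import torch

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, heads, seq_len, d_k); mask broadcastable to (batch, heads, seq_len, seq_len)
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)   # similarity of queries and keys
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))        # positions with mask == 0 are ignored
    weights = torch.softmax(scores, dim=-1)                          # attention weights, each row sums to 1
    return torch.matmul(weights, v), weights                         # weighted sum of the values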
When the Transformer receives a batch of different sentences as input, its processing flow is roughly as follows:
1. Each sentence is tokenized and its tokens are mapped to integer ids; because the sentences have different lengths, they are padded to a common length (see the padding sketch after this list).
2. Each token id is looked up in an embedding table, and the embedding is scaled by sqrt(d_model).
3. Positional encodings are added, because attention by itself has no notion of word order.
4. The result passes through a stack of encoder layers, each consisting of multi-head self-attention and a position-wise feed-forward network with residual connections and layer normalization.
5. The encoder output is projected to the task-specific output space, for example vocabulary logits for language modeling.
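A minimal sketch of the padding step, assuming a toy whitespace tokenizer and a hypothetical token-to-id dictionary named vocab (a real tokenizer would normally be used instead). The boolean mask produced here is the kind of mask nn.TransformerEncoder accepts as src_key_padding_mask, and the id tensor would be transposed to the sequence-first layout expected by the model defined further below.

import torch

PAD_ID = 0  # assumed id reserved for padding

def batch_encode(sentences, vocab):
    # sentences: list of strings; vocab: dict mapping token -> id (hypothetical)
    ids = [[vocab[tok] for tok in s.split()] for s in sentences]
    max_len = max(len(seq) for seq in ids)
    batch = torch.full((len(ids), max_len), PAD_ID, dtype=torch.long)
    for i, seq in enumerate(ids):
        batch[i, :len(seq)] = torch.tensor(seq)
    pad_mask = batch.eq(PAD_ID)   # True where the position is padding
    return batch, pad_mask        # shapes: (batch, seq_len) and (batch, seq_len)

The code below implements the embedding, positional-encoding and encoder steps of this flow.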
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to the token embeddings."""

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Precompute the table: pe[pos, 2i] = sin(pos / 10000^(2i/d_model)), pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model)).
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Shape (max_len, 1, d_model) so it broadcasts over the batch dimension of (seq_len, batch, d_model) inputs.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer is saved with the model but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the encodings of the first seq_len positions.
        x = x + self.pe[:x.size(0), :]
        return x
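As a quick sanity check of the shape convention (an illustrative snippet only; the dimensions are arbitrary), PositionalEncoding expects sequence-first input of shape (seq_len, batch, d_model) and returns a tensor of the same shape:

pos_enc = PositionalEncoding(d_model=16)
x = torch.zeros(10, 4, 16)   # (seq_len=10, batch=4, d_model=16) dummy embeddings
out = pos_enc(x)
print(out.shape)             # torch.Size([10, 4, 16])
print(out[0, 0, :4])         # the encoding of position 0 itself, since x was all zeros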
class TransformerModel(nn.Module):
    """Encoder-only Transformer: embedding -> positional encoding -> encoder stack -> linear output head."""

    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model)
        # Each encoder layer combines multi-head self-attention with a position-wise feed-forward network.
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)  # token id -> embedding vector
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)     # projects hidden states back to vocabulary logits
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        # src: (seq_len, batch) tensor of token ids.
        src = self.encoder(src) * math.sqrt(self.d_model)  # scale embeddings as in the original paper
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output                                      # (seq_len, batch, ntoken) logits
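A minimal usage sketch; the vocabulary size and hyperparameters below are arbitrary values chosen only for illustration, and the input uses the sequence-first layout the model expects:

model = TransformerModel(ntoken=1000, d_model=64, nhead=4, d_hid=128, nlayers=2, dropout=0.1)
src = torch.randint(0, 1000, (12, 8))   # (seq_len=12, batch=8) random token ids
logits = model(src)                     # (12, 8, 1000): per-position scores over the vocabulary
print(logits.shape)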
This model can be used for a variety of NLP tasks; only the final output layer and the decoding strategy need to be adapted to the specific task.
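As one example of such an adaptation, the sketch below (assuming sentence classification is the target task and that mean pooling over positions is an acceptable sentence summary; both are assumptions, not taken from the article) swaps the vocabulary-sized linear head for a small classification head:

class TransformerClassifier(nn.Module):
    """Illustrative variant: the same encoder stack, but with a classification head instead of vocabulary logits."""

    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, nclasses, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(ntoken, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.encoder = nn.TransformerEncoder(encoder_layer, nlayers)
        self.classifier = nn.Linear(d_model, nclasses)  # task-specific output layer
        self.d_model = d_model

    def forward(self, src):
        # src: (seq_len, batch) token ids.
        x = self.embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = x.mean(dim=0)             # mean-pool over sequence positions -> (batch, d_model)
        return self.classifier(x)     # (batch, nclasses) class logits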