自然语言处理(NLP)基于Transformer的中-英机器翻译
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了自然语言处理(NLP)基于Transformer的中-英机器翻译相关的知识,希望对你有一定的参考价值。
【自然语言处理(NLP)】基于Transformer的中-英机器翻译
(文章目录)
前言
(一)、任务描述
飞桨框架实现了Transformer的基本层,因此可以直接调用:TransformerEncoderLayer类定义了编码器端的一个层,包括多头注意力子层及逐位前馈网络子层;TransformerEncoder类堆叠TransformerEncoderLayer层,返回指定层数的编码器;TransformerDecoderLayer类定义了解码器端的一个层,包括多头自注意力子层、多头交叉注意力子层及逐位前馈网络子层;TransformerDecoder类堆叠TransformerDecoderLayer层,返回指定层数的解码器。
(二)、环境配置
本示例基于飞桨开源框架2.0版本。
import paddle
import numpy as np
import matplotlib.pyplot as plt
print(paddle.__version__)
输出结果如下图1所示:
一、数据准备
(一)、加载开发环境
import paddle
import paddle.nn as nn
import collections
# from paddle.nn.layer import Linear,Dropout,LayerNorm
import paddle
from .common import Linear, Dropout
from .norm import LayerNorm
from .. import functional as F
from ... import tensor
from ...fluid import layers 1
from ...fluid.dygraph import Layer, LayerList 1
from ...fluid.param_attr import ParamAttr 1
from paddle.nn.layer.common import Linear, Dropout
from paddle.nn.layer.norm import LayerNorm
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer,LayerList
from paddle.fluid.param_attr import ParamAttr
import paddle.nn.functional as F
import re
import copy
from paddle import tensor
import numpy as np
print(paddle.__version__)
# cpu/gpu环境选择,在 paddle.set_device() 输入对应运行设备。
# device = paddle.set_device(gpu)
(二)、数据集加载
统计数据集信息以确定句子长度:我们取能够覆盖约90%句子的长度值,作为统一的句子长度。
# Collect sentence-length statistics for the parallel corpus.
# Each usable line is expected to hold at least two tab-separated fields:
# the English sentence first, the Chinese sentence second. Lines with fewer
# than two fields are skipped.
with open("data/data158128/cmn.txt", "r", encoding="utf-8") as corpus_file:
    lines = corpus_file.readlines()
print(len(lines))

datas = []   # [english_tokens, chinese_characters] per sentence pair
dic_en = {}  # English length (space-separated tokens) -> number of sentences
dic_cn = {}  # Chinese length (characters) -> number of sentences
for line in lines:
    ll = line.strip().split("\t")
    if len(ll) < 2:
        continue
    # NOTE(review): the [1:-1] slice drops the first and last English token
    # of every sentence (the length statistics below still count them).
    # Kept as in the original article -- confirm this is intended.
    datas.append([ll[0].lower().split(" ")[1:-1], list(ll[1])])
    en_len = len(ll[0].split(" "))
    cn_len = len(ll[1])
    dic_en[en_len] = dic_en.get(en_len, 0) + 1
    dic_cn[cn_len] = dic_cn.get(cn_len, 0) + 1

# Cumulative counts per length; enable the prints to inspect which length
# covers the desired share of sentences.
keys_en = sorted(dic_en)
count = 0
# print("English length statistics:")
for k in keys_en:
    count += dic_en[k]
    # print(k, dic_en[k], count / len(lines))

keys_cn = sorted(dic_cn)
count = 0
# print("Chinese length statistics:")
for k in keys_cn:
    count += dic_cn[k]
    # print(k, dic_cn[k], count / len(lines))

# Fixed sentence lengths used for truncation/padding later on.
en_length = 10
cn_length = 10
输出结果如下图2所示:
(三)、构建词表
# Build the English and Chinese vocabularies (token -> integer id).
# Ids 0, 1 and 2 are reserved for the padding, begin-of-sentence and
# end-of-sentence markers; every other token gets the next free id in order
# of first appearance, so insertion order matches id order.
en_vocab = {}
cn_vocab = {}
en_vocab["<pad>"], en_vocab["<bos>"], en_vocab["<eos>"] = 0, 1, 2
cn_vocab["<pad>"], cn_vocab["<bos>"], cn_vocab["<eos>"] = 0, 1, 2
en_idx, cn_idx = 3, 3
for en, cn in datas:
    for w in en:
        if w not in en_vocab:
            en_vocab[w] = en_idx
            en_idx += 1
    for w in cn:
        if w not in cn_vocab:
            cn_vocab[w] = cn_idx
            cn_idx += 1
print(len(en_vocab))
print(len(cn_vocab))
# Sizes reported by the article: English vocabulary 6057, Chinese 3533.
输出结果如下图3所示:
(四)、创建数据集
接下来根据词表,我们将会创建一份实际的用于训练的用numpy array组织起来的数据集。
- 所有的句子都通过补充成为了长度相同的句子。
- 对于英文句子(源语言),我们将其反转了过来,这会带来更好的翻译的效果。
- 所创建的padded_cn_label_sents是训练过程中的预测目标,即用中文句子中的当前词去预测下一个词。
# Turn every sentence pair into fixed-length id sequences.
#   * English (source): truncated to en_length, "<eos>" appended, padded
#     with "<pad>", then reversed -> en_length + 1 ids.
#   * Chinese decoder input: "<bos>" + sentence + "<eos>" + padding
#     -> cn_length + 2 ids.
#   * Chinese label: sentence + "<eos>" + padding -> cn_length + 2 ids, i.e.
#     the decoder input shifted left by one position.
padded_en_sents = []
padded_cn_sents = []
padded_cn_label_sents = []
for en, cn in datas:
    # Slicing is a no-op for sentences that are already short enough.
    en = en[:en_length]
    cn = cn[:cn_length]
    padded_en_sent = en + ["<eos>"] + ["<pad>"] * (en_length - len(en))
    padded_en_sent.reverse()
    padded_cn_sent = ["<bos>"] + cn + ["<eos>"] + ["<pad>"] * (cn_length - len(cn))
    padded_cn_label_sent = cn + ["<eos>"] + ["<pad>"] * (cn_length - len(cn) + 1)
    padded_en_sents.append(np.array([en_vocab[w] for w in padded_en_sent]))
    padded_cn_sents.append(np.array([cn_vocab[w] for w in padded_cn_sent]))
    padded_cn_label_sents.append(np.array([cn_vocab[w] for w in padded_cn_label_sent]))

train_en_sents = np.array(padded_en_sents)
train_cn_sents = np.array(padded_cn_sents)
train_cn_label_sents = np.array(padded_cn_label_sents)
print(train_en_sents.shape)
print(train_cn_sents.shape)
print(train_cn_label_sents.shape)
输出结果如下图4所示:
二、定义encoder,decoder内部实现
def _convert_param_attr_to_list(param_attr, n):
if isinstance(param_attr, (list, tuple)):
assert len(param_attr) == n, (
"length of param_attr should be %d when it is a list/tuple" % n)
param_attrs = []
for attr in param_attr:
if isinstance(attr, bool):
if attr:
param_attrs.append(ParamAttr._to_attr(None))
else:
param_attrs.append(False)
else:
param_attrs.append(ParamAttr._to_attr(attr))
# param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
elif isinstance(param_attr, bool):
param_attrs = []
if param_attr:
param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
else:
param_attrs = [False] * n
else:
param_attrs = []
attr = ParamAttr._to_attr(param_attr)
for i in range(n):
attr_i = copy.deepcopy(attr)
if attr.name:
attr_i.name = attr_i.name + "_" + str(i)
param_attrs.append(attr_i)
return param_attrs
class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected to ``embed_dim``, split
    into ``num_heads`` heads of size ``embed_dim // num_heads``, attended per
    head, merged again and passed through a final output projection.

    Args:
        embed_dim: Size of the query input and of the output.
        num_heads: Number of heads; must divide ``embed_dim``.
        dropout: Dropout rate applied to the attention weights (0 disables).
        kdim: Feature size of ``key``; defaults to ``embed_dim``.
        vdim: Feature size of ``value``; defaults to ``embed_dim``.
        need_weights: When true, ``forward`` also returns the attention
            weights.
        weight_attr: Weight attribute passed to all four ``Linear`` layers.
        bias_attr: Bias attribute passed to all four ``Linear`` layers.
    """

    # Growing cache: keys/values of previous steps, extended on every call.
    Cache = collections.namedtuple("Cache", ["k", "v"])
    # Fixed cache: keys/values that are reused as-is and never recomputed.
    StaticCache = collections.namedtuple("StaticCache", ["k", "v"])

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False,
                 weight_attr=None,
                 bias_attr=None):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.q_proj = Linear(
            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
        self.k_proj = Linear(
            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
        self.v_proj = Linear(
            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
        self.out_proj = Linear(
            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)

    def _prepare_qkv(self, query, key, value, cache=None):
        """Project the inputs and split them into heads.

        Returns ``(q, k, v)`` when ``cache`` is ``None``, otherwise
        ``(q, k, v, cache)`` where ``cache`` is the updated ``Cache`` (or
        the unchanged ``StaticCache``).
        """
        q = self.q_proj(query)
        # Split the last axis into heads and move the head axis to position 1.
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        if isinstance(cache, self.StaticCache):
            # for encoder-decoder attention in inference and has cached
            k, v = cache.k, cache.v
        else:
            k, v = self.compute_kv(key, value)

        if isinstance(cache, self.Cache):
            # for decoder self-attention in inference
            k = tensor.concat([cache.k, k], axis=2)
            v = tensor.concat([cache.v, v], axis=2)
            cache = self.Cache(k, v)

        return (q, k, v) if cache is None else (q, k, v, cache)

    def compute_kv(self, key, value):
        """Project ``key``/``value`` and split them into heads."""
        k = self.k_proj(key)
        v = self.v_proj(value)
        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
        return k, v

    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
        """Apply attention of ``query`` over ``key``/``value``.

        Args:
            query: Query tensor (presumably [batch, seq_len, embed_dim] --
                the reshape in ``_prepare_qkv`` requires rank 3).
            key: Key tensor; defaults to ``query``.
            value: Value tensor; defaults to ``query``.
            attn_mask: Optional additive mask, added to the scaled scores
                before the softmax.
            cache: Optional ``Cache`` or ``StaticCache``.

        Returns:
            The attention output alone, or a tuple of the output followed by
            the attention weights (if ``need_weights``) and the cache (if
            one was passed).
        """
        key = query if key is None else key
        value = query if value is None else value
        # compute q, k, v
        if cache is None:
            q, k, v = self._prepare_qkv(query, key, value, cache)
        else:
            q, k, v, cache = self._prepare_qkv(query, key, value, cache)

        # Scaled dot-product attention. The 1/sqrt(head_dim) factor is
        # applied to q up front so that tensor.matmul can be used instead of
        # the legacy fluid matmul with its `alpha` argument.
        product = tensor.matmul(
            q * (self.head_dim ** -0.5), k, transpose_y=True)
        if attn_mask is not None:
            # TODO(guosheng): support bool mask
            product = product + attn_mask
        weights = F.softmax(product)
        if self.dropout:
            weights = F.dropout(
                weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        if cache is not None:
            outs.append(cache)
        return out if len(outs) == 1 else tuple(outs)
class TransformerEncoderLayer(Layer):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each sub-layer is wrapped in dropout plus a residual connection. Layer
    normalization is applied before the sub-layer when ``normalize_before``
    is true and after the residual addition otherwise.

    Args:
        d_model: Feature size of the layer input and output.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout rate applied to each sub-layer output.
        activation: Name of the function in ``paddle.nn.functional`` used
            inside the feed-forward network.
        attn_dropout: Dropout rate for attention weights; defaults to
            ``dropout``.
        act_dropout: Dropout rate after the activation; defaults to
            ``dropout``.
        normalize_before: Select pre-norm (true) or post-norm (false).
        weight_attr: Weight attribute(s), expanded to 2 entries
            (attention, feed-forward).
        bias_attr: Bias attribute(s), expanded to 2 entries.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False,
                 weight_attr=None,
                 bias_attr=None):
        # Snapshot the constructor arguments first (before any other local
        # exists) so TransformerEncoder can build identical sibling layers.
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerEncoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)

        self.self_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[0],
            bias_attr=bias_attrs[0])
        self.linear1 = Linear(
            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])
        self.dropout = Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = Linear(
            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)

    def forward(self, src, src_mask=None, cache=None):
        """Run the layer on ``src``.

        Returns the layer output, or ``(output, incremental_cache)`` when a
        ``cache`` was passed.
        """
        # Self-attention sub-layer.
        residual = src
        if self.normalize_before:
            src = self.norm1(src)
        if cache is None:
            src = self.self_attn(src, src, src, src_mask)
        else:
            src, incremental_cache = self.self_attn(src, src, src, src_mask,
                                                    cache)
        src = residual + self.dropout1(src)
        if not self.normalize_before:
            src = self.norm1(src)

        # Feed-forward sub-layer.
        residual = src
        if self.normalize_before:
            src = self.norm2(src)
        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = residual + self.dropout2(src)
        if not self.normalize_before:
            src = self.norm2(src)
        return src if cache is None else (src, incremental_cache)
class TransformerEncoder(Layer):
    """A stack of ``num_layers`` encoder layers.

    Args:
        encoder_layer: Layer instance used as the first layer. The remaining
            layers are new instances of the same type, built from the
            constructor arguments stored in ``encoder_layer._config``.
        num_layers: Number of layers in the stack.
        norm: Optional layer applied to the output of the last layer.
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = LayerList([(encoder_layer if i == 0 else
                                  type(encoder_layer)(**encoder_layer._config))
                                 for i in range(num_layers)])
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, cache=None):
        """Pass ``src`` through every layer in order.

        Returns the final output, or ``(output, new_caches)`` when ``cache``
        (one entry per layer) was passed.
        """
        output = src
        new_caches = []
        for i, mod in enumerate(self.layers):
            if cache is None:
                output = mod(output, src_mask=src_mask)
            else:
                output, new_cache = mod(output,
                                        src_mask=src_mask,
                                        cache=cache[i])
                new_caches.append(new_cache)

        if self.norm is not None:
            output = self.norm(output)

        return output if cache is None else (output, new_caches)
class TransformerDecoderLayer(Layer):
    """One decoder layer: self-attention, cross-attention and feed-forward.

    Each of the three sub-layers is wrapped in dropout plus a residual
    connection. Layer normalization is applied before the sub-layer when
    ``normalize_before`` is true and after the residual addition otherwise.

    Args:
        d_model: Feature size of the layer input and output.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout rate applied to each sub-layer output.
        activation: Name of the function in ``paddle.nn.functional`` used
            inside the feed-forward network.
        attn_dropout: Dropout rate for attention weights; defaults to
            ``dropout``.
        act_dropout: Dropout rate after the activation; defaults to
            ``dropout``.
        normalize_before: Select pre-norm (true) or post-norm (false).
        weight_attr: Weight attribute(s), expanded to 3 entries
            (self-attention, cross-attention, feed-forward).
        bias_attr: Bias attribute(s), expanded to 3 entries.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False,
                 weight_attr=None,
                 bias_attr=None):
        # Snapshot the constructor arguments first (before any other local
        # exists) so TransformerDecoder can build identical sibling layers.
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.self_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[0],
            bias_attr=bias_attrs[0])
        self.cross_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[1],
            bias_attr=bias_attrs[1])
        self.linear1 = Linear(
            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
        self.dropout = Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = Linear(
            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
        """Run the layer on ``tgt`` while attending over ``memory``.

        Args:
            tgt: Decoder-side input.
            memory: Tensor attended to by the cross-attention sub-layer.
            tgt_mask: Optional mask for the self-attention.
            memory_mask: Optional mask for the cross-attention.
            cache: Optional pair; ``cache[0]`` is passed to the
                self-attention and ``cache[1]`` to the cross-attention.

        Returns:
            The layer output, or ``(output, (incremental_cache,
            static_cache))`` when ``cache`` was passed.
        """
        # Self-attention sub-layer.
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        if cache is None:
            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None)
        else:
            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                    cache[0])
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        # Cross-attention sub-layer.
        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        if cache is None:
            tgt = self.cross_attn(tgt, memory, memory, memory_mask, None)
        else:
            tgt, static_cache = self.cross_attn(tgt, memory, memory,
                                                memory_mask, cache[1])
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)

        # Feed-forward sub-layer.
        residual = tgt
        if self.normalize_before:
            tgt = self.norm3(tgt)
        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = residual + self.dropout3(tgt)
        if not self.normalize_before:
            tgt = self.norm3(tgt)
        return tgt if cache is None else (tgt, (incremental_cache,
                                                static_cache))
class TransformerDecoder(Layer):
    """A stack of ``num_layers`` decoder layers.

    Args:
        decoder_layer: Layer instance used as the first layer. The remaining
            layers are new instances of the same type, built from the
            constructor arguments stored in ``decoder_layer._config``.
        num_layers: Number of layers in the stack.
        norm: Optional layer applied to the output of the last layer.
    """

    def __init__(self, decoder_layer, num_layers, norm=None):
        super(TransformerDecoder, self).__init__()
        self.layers = LayerList([(decoder_layer if i == 0 else
                                  type(decoder_layer)(**decoder_layer._config))
                                 for i in range(num_layers)])
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
        """Pass ``tgt`` through every layer in order, attending over ``memory``.

        Returns the final output, or ``(output, new_caches)`` when ``cache``
        (one entry per layer) was passed.
        """
        output = tgt
        new_caches = []
        for i, mod in enumerate(self.layers):
            if cache is None:
                output = mod(output,
                             memory,
                             tgt_mask=tgt_mask,
                             memory_mask=memory_mask,
                             cache=None)
            else:
                output, new_cache = mod(output,
                                        memory,
                                        tgt_mask=tgt_mask,
                                        memory_mask=memory_mask,
                                        cache=cache[i])
                new_caches.append(new_cache)

        if self.norm is not None:
            output = self.norm(output)

        return output if cache is None else (output, new_caches)
三、构建基于Transformer的机器翻译模型
定义网络结构后,需要配置优化器、损失函数、评价指标。
(一)、定义超参数
- 首先定义超参数,用于后续模型的设计与训练
# Hyperparameters shared by model construction and training.
embedding_size = 128  # embedding width, also used as the Transformer d_model
# NOTE(review): hidden_size and num_encoder_lstm_layers are not referenced
# by the code visible in this article; kept in case other code reads them.
hidden_size = 512
num_encoder_lstm_layers = 1
en_vocab_size = len(en_vocab)
cn_vocab_size = len(cn_vocab)
epochs = 20
batch_size = 16
(二)、定义Encoder
- 使用TransformerEncoder定义Encoder
# encoder: simply learn representation of source sentence
class Encoder(paddle.nn.Layer):
    """Embed source token ids and encode them with a Transformer encoder.

    Args:
        en_vocab_size: Number of entries in the source embedding table.
        embedding_size: Embedding width; also the encoder's ``d_model``.
        num_layers: Number of stacked encoder layers.
        head_number: Number of attention heads per layer.
        middle_units: Hidden size of each layer's feed-forward network.
    """

    def __init__(self, en_vocab_size, embedding_size, num_layers=2, head_number=2, middle_units=512):
        super(Encoder, self).__init__()
        self.emb = paddle.nn.Embedding(en_vocab_size, embedding_size)
        # Positional arguments of TransformerEncoderLayer:
        #   d_model         - feature size of the input and output
        #   nhead           - number of attention heads
        #   dim_feedforward - hidden size of the feed-forward network
        encoder_layer = TransformerEncoderLayer(embedding_size, head_number, middle_units)
        self.encoder = TransformerEncoder(encoder_layer, num_layers)

    def forward(self, x):
        """Return the encoder output for the token ids ``x``.

        NOTE(review): only the token embedding is fed to the encoder; no
        positional encoding is added here.
        """
        x = self.emb(x)
        en_out = self.encoder(x)
        return en_out
(三)、定义Decoder
- 使用TransformerDecoder定义Decoder
class Decoder(paddle.nn.Layer):
    """Embed target token ids, decode against the encoder output, emit logits.

    Args:
        cn_vocab_size: Number of entries in the target embedding table and
            number of output logits.
        embedding_size: Embedding width; also the decoder's ``d_model``.
        num_layers: Number of stacked decoder layers.
        head_number: Number of attention heads per layer.
        middle_units: Hidden size of each layer's feed-forward network.
    """

    def __init__(self, cn_vocab_size, embedding_size, num_layers=2, head_number=2, middle_units=512):
        super(Decoder, self).__init__()
        self.emb = paddle.nn.Embedding(cn_vocab_size, embedding_size)
        decoder_layer = TransformerDecoderLayer(embedding_size, head_number, middle_units)
        self.decoder = TransformerDecoder(decoder_layer, num_layers)
        # for computing output logits
        self.outlinear = paddle.nn.Linear(embedding_size, cn_vocab_size)

    def forward(self, x, encoder_outputs):
        """Return vocabulary logits for the target token ids ``x``.

        The decoder stack is called without attention masks and without a
        cache, so it only sees the tokens contained in ``x`` itself.
        """
        x = self.emb(x)
        de_out = self.decoder(x, encoder_outputs)
        output = self.outlinear(de_out)
        # Drop only the sequence axis (size 1 when decoding one token per
        # call). A bare squeeze() would also remove the batch axis whenever
        # the batch size is 1.
        output = paddle.squeeze(output, axis=1)
        return output
四、模型训练
# Build the model and train it with teacher forcing: at every target
# position the ground-truth token is fed to the decoder and the next token
# is predicted.
encoder = Encoder(en_vocab_size, embedding_size)
decoder = Decoder(cn_vocab_size, embedding_size)
opt = paddle.optimizer.Adam(learning_rate=0.0001,
                            parameters=encoder.parameters() + decoder.parameters())

for epoch in range(epochs):
    print("epoch:{}".format(epoch))

    # shuffle training data (same permutation for inputs and labels)
    perm = np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled = train_en_sents[perm]
    train_cn_sents_shuffled = train_cn_sents[perm]
    train_cn_label_sents_shuffled = train_cn_label_sents[perm]

    # Integer division: a trailing incomplete batch is skipped.
    for iteration in range(train_en_sents_shuffled.shape[0] // batch_size):
        start = batch_size * iteration
        stop = batch_size * (iteration + 1)

        x_data = train_en_sents_shuffled[start:stop]
        sent = paddle.to_tensor(x_data)
        en_repr = encoder(sent)

        x_cn_data = train_cn_sents_shuffled[start:stop]
        x_cn_label_data = train_cn_label_sents_shuffled[start:stop]

        # Accumulate the loss over all cn_length + 2 target positions.
        loss = paddle.zeros([1])
        for i in range(cn_length + 2):
            # i:i+1 keeps a length-1 sequence axis for the decoder input.
            cn_word = paddle.to_tensor(x_cn_data[:, i:i + 1])
            cn_word_label = paddle.to_tensor(x_cn_label_data[:, i])

            logits = decoder(cn_word, en_repr)
            step_loss = F.cross_entropy(logits, cn_word_label)
            loss += step_loss

        loss = loss / (cn_length + 2)
        if iteration % 50 == 0:
            print("iter {}, loss:{}".format(iteration, loss.numpy()))

        loss.backward()
        opt.step()
        opt.clear_grad()
输出结果如下图5所示:
五、模型预测
- 随机从训练集中抽取几句话来进行预测
# Greedy decoding on a few randomly chosen training sentences.
encoder.eval()
decoder.eval()

num_of_exampels_to_evaluate = 10
indices = np.random.choice(len(train_en_sents), num_of_exampels_to_evaluate, replace=False)
x_data = train_en_sents[indices]
sent = paddle.to_tensor(x_data)
en_repr = encoder(sent)

# Every sample starts from the "<bos>" token.
word = np.array(
    [[cn_vocab["<bos>"]]] * num_of_exampels_to_evaluate
)
word = paddle.to_tensor(word)

decoded_sent = []
for i in range(cn_length + 2):
    logits = decoder(word, en_repr)
    # Pick the most likely token and feed it back in as the next input.
    word = paddle.argmax(logits, axis=1)
    decoded_sent.append(word.numpy())
    word = paddle.unsqueeze(word, axis=-1)

results = np.stack(decoded_sent, axis=1)

# Ids were assigned in insertion order, so position k holds the token with
# id k. Built once instead of once per decoded token.
idx2word = list(cn_vocab)
for i in range(num_of_exampels_to_evaluate):
    print("---------------------")
    en_input = " ".join(datas[indices[i]][0])
    ground_truth_translate = "".join(datas[indices[i]][1])
    model_translate = ""
    for k in results[i]:
        w = idx2word[k]
        if w != "<pad>" and w != "<eos>":
            model_translate += w
    print(en_input)
    print("true: {}".format(ground_truth_translate))
    print("pred: {}".format(model_translate))
输出结果如下图6所示:
总结
本系列文章内容为根据清华社出版的《自然语言处理实践》所作的相关笔记和感悟,其中代码均为基于百度飞桨开发,若有任何侵权和不妥之处,请私信于我,定积极配合处理,看到必回!!!
最后,引用本次活动的一句话,来作为文章的结语~( ̄▽ ̄~)~:
【**学习的最大理由是想摆脱平庸,早一天就多一份人生的精彩;迟一天就多一天平庸的困扰。**】
以上是关于自然语言处理(NLP)基于Transformer的中-英机器翻译的主要内容,如果未能解决你的问题,请参考以下文章