自然语言处理（NLP）基于Transformer的中-英机器翻译

Posted 2022-10-18

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了自然语言处理（NLP）基于Transformer的中-英机器翻译相关的知识，希望对你有一定的参考价值。

【自然语言处理（NLP）】基于Transformer的中-英机器翻译

(文章目录)

前言

(一)、任务描述

飞桨框架实现了Transformer的基本层，因此可以直接调用：TransformerEncoderLayer类定义了编码器端的一个层，包括多头注意力子层及逐位前馈网络子层；TransformerEncoder类堆叠TransformerEncoderLayer层，返回指定层数的编码器；TransformerDecoderLayer类定义了解码器端的一个层，包括多头自注意力子层、多头交叉注意力子层及逐位前馈网络子层；TransformerDecoder类堆叠TransformerDecoderLayer层，返回指定层数的解码器。

(二)、环境配置

本示例基于飞桨开源框架2.0版本。

import paddle
import numpy as np
import matplotlib.pyplot as plt
print(paddle.__version__)

输出结果如下图1所示：

一、数据准备

(一)、加载开发环境

import paddle
import paddle.nn as nn
import collections

# from paddle.nn.layer import Linear,Dropout,LayerNorm


# Reference only (presumably copied from Paddle's internal transformer
# source): package-relative imports like these only resolve from inside a
# package, so they cannot run in this script. The absolute-import
# equivalents that the script actually uses follow right after this excerpt.
#
# import paddle
# from .common import Linear, Dropout
# from .norm import LayerNorm
# from .. import functional as F
# from ... import tensor
# from ...fluid import layers
# from ...fluid.dygraph import Layer, LayerList
# from ...fluid.param_attr import ParamAttr


from paddle.nn.layer.common import Linear, Dropout
from paddle.nn.layer.norm import LayerNorm
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer,LayerList
from paddle.fluid.param_attr import ParamAttr 
import paddle.nn.functional as F
import re
import copy
from paddle import tensor
import numpy as np

print(paddle.__version__)
# cpu/gpu环境选择，在 paddle.set_device() 输入对应运行设备。
# device = paddle.set_device(gpu)

(二)、数据集加载

统计数据集信息，确定句子长度：我们取能够覆盖约90%句子的长度值作为统一的句子长度（下文中英文均取10）。

# Collect sentence-length statistics for the parallel corpus.
# Each line is split on tabs; field 0 is treated as the English sentence and
# field 1 as the Chinese sentence. Lines with fewer than two fields are skipped.
with open('data/data158128/cmn.txt', 'r', encoding='utf-8') as corpus_file:
    lines = corpus_file.readlines()
print(len(lines))

datas = []   # [english_tokens, chinese_characters] pairs
dic_en = {}  # English length (space-separated tokens) -> sentence count
dic_cn = {}  # Chinese length (characters) -> sentence count
for line in lines:
    ll = line.strip().split('\t')
    if len(ll) < 2:
        continue
    # NOTE(review): [1:-1] drops the first and the last English token of every
    # sentence (a one-word sentence becomes empty). Kept as in the original;
    # confirm this is intended.
    datas.append([ll[0].lower().split(' ')[1:-1], list(ll[1])])

    en_len = len(ll[0].split(' '))
    cn_len = len(ll[1])
    dic_en[en_len] = dic_en.get(en_len, 0) + 1
    dic_cn[cn_len] = dic_cn.get(cn_len, 0) + 1

# Cumulative length distribution; uncomment the prints to inspect which
# length covers the desired share of the corpus.
keys_en = sorted(dic_en)
count = 0
# print('English length statistics:')
for k in keys_en:
    count += dic_en[k]
    # print(k, dic_en[k], count / len(lines))

keys_cn = sorted(dic_cn)
count = 0
# print('Chinese length statistics:')
for k in keys_cn:
    count += dic_cn[k]
    # print(k, dic_cn[k], count / len(lines))

# Fixed sentence lengths used for truncation/padding later on.
en_length = 10
cn_length = 10

输出结果如下图2所示：

(三)、构建词表

# Build the English and Chinese vocabularies (token -> integer id).
en_vocab = {}
cn_vocab = {}

# Ids 0-2 are reserved for the special tokens; regular tokens start at 3.
en_vocab['<pad>'], en_vocab['<bos>'], en_vocab['<eos>'] = 0, 1, 2
cn_vocab['<pad>'], cn_vocab['<bos>'], cn_vocab['<eos>'] = 0, 1, 2
en_idx, cn_idx = 3, 3
for en, cn in datas:
    # Ids are handed out in first-seen order, so a token's id equals its
    # insertion position in the dict.
    for w in en:
        if w not in en_vocab:
            en_vocab[w] = en_idx
            en_idx += 1
    for w in cn:
        if w not in cn_vocab:
            cn_vocab[w] = cn_idx
            cn_idx += 1

print(len(en_vocab))
print(len(cn_vocab))

# Sizes reported by the author for this corpus:
# English vocabulary size: 6057
# Chinese vocabulary size: 3533

输出结果如下图3所示：

(四)、创建数据集

接下来根据词表，我们将创建一份实际用于训练的数据集，并以numpy数组的形式组织。

所有句子都通过填充`<pad>`补齐为相同的长度（过长的句子会被截断）。
对于英文句子（源语言），我们将其顺序反转，这会带来更好的翻译效果。
padded_cn_label_sents是训练过程中的预测目标：它相当于中文输入序列向左错开一位，即用当前的中文词去预测下一个词。

# Turn every sentence pair into fixed-length id sequences.
padded_en_sents = []        # encoder inputs, en_length + 1 ids each
padded_cn_sents = []        # decoder inputs, cn_length + 2 ids each
padded_cn_label_sents = []  # decoder targets, cn_length + 2 ids each
for en, cn in datas:
    # Truncate over-long sentences (slicing is a no-op for shorter ones).
    en = en[:en_length]
    cn = cn[:cn_length]

    # Source side: tokens + <eos>, right-padded, then reversed as a whole,
    # so the padding ends up at the front.
    padded_en_sent = en + ['<eos>'] + ['<pad>'] * (en_length - len(en))
    padded_en_sent.reverse()

    # Target side: the label sequence is the input sequence shifted left by
    # one position (no <bos>, one extra <pad> at the end).
    padded_cn_sent = ['<bos>'] + cn + ['<eos>'] + ['<pad>'] * (cn_length - len(cn))
    padded_cn_label_sent = cn + ['<eos>'] + ['<pad>'] * (cn_length - len(cn) + 1)

    padded_en_sents.append(np.array([en_vocab[w] for w in padded_en_sent]))
    padded_cn_sents.append(np.array([cn_vocab[w] for w in padded_cn_sent]))
    padded_cn_label_sents.append(np.array([cn_vocab[w] for w in padded_cn_label_sent]))

train_en_sents = np.array(padded_en_sents)
train_cn_sents = np.array(padded_cn_sents)
train_cn_label_sents = np.array(padded_cn_label_sents)

print(train_en_sents.shape)
print(train_cn_sents.shape)
print(train_cn_label_sents.shape)

输出结果如下图4所示：

二、定义encoder,decoder内部实现


def _convert_param_attr_to_list(param_attr, n): 
    if isinstance(param_attr, (list, tuple)):
        assert len(param_attr) == n, (
            "length of param_attr should be %d when it is a list/tuple" % n)
        param_attrs = []
        for attr in param_attr:
            if isinstance(attr, bool):
                if attr:
                    param_attrs.append(ParamAttr._to_attr(None))
                else:
                    param_attrs.append(False)
            else:
                param_attrs.append(ParamAttr._to_attr(attr))
        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
    elif isinstance(param_attr, bool):
        param_attrs = []
        if param_attr:
            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
        else:
            param_attrs = [False] * n
    else:
        param_attrs = []
        attr = ParamAttr._to_attr(param_attr)
        for i in range(n):
            attr_i = copy.deepcopy(attr)
            if attr.name:
                attr_i.name = attr_i.name + "_" + str(i)
            param_attrs.append(attr_i)
    return param_attrs


class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention.

    Query, key and value are linearly projected, split into ``num_heads``
    heads of size ``embed_dim // num_heads``, attended per head, merged
    again and passed through an output projection.

    Args:
        embed_dim: Feature size of the query input and of the output.
        num_heads: Number of heads; must divide ``embed_dim`` exactly.
        dropout: Dropout probability for the attention weights. A falsy
            value (the default ``0.``) skips dropout entirely.
        kdim: Feature size of ``key``; defaults to ``embed_dim``.
        vdim: Feature size of ``value``; defaults to ``embed_dim``.
        need_weights: When true, ``forward`` also returns the attention
            weights.
        weight_attr: Passed as weight attribute to all four projections.
        bias_attr: Passed as ``bias_attr`` to all four projections.
    """

    # Cache: per-head keys/values that are extended on every call.
    # StaticCache: per-head keys/values that are reused unchanged.
    Cache = collections.namedtuple("Cache", ["k", "v"])
    StaticCache = collections.namedtuple("StaticCache", ["k", "v"])

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False,
                 weight_attr=None,
                 bias_attr=None):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = embed_dim if kdim is None else kdim
        self.vdim = embed_dim if vdim is None else vdim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        # Projections are created in q, k, v, out order.
        self.q_proj = Linear(
            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
        self.k_proj = Linear(
            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
        self.v_proj = Linear(
            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
        self.out_proj = Linear(
            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)

    def _split_heads(self, projected):
        """Split the last axis into (num_heads, head_dim) and swap axes 1 and 2."""
        projected = tensor.reshape(
            x=projected, shape=[0, 0, self.num_heads, self.head_dim])
        return tensor.transpose(x=projected, perm=[0, 2, 1, 3])

    def _prepare_qkv(self, query, key, value, cache=None):
        """Project the inputs and split them into heads.

        Returns ``(q, k, v)`` when ``cache`` is None, otherwise
        ``(q, k, v, cache)`` where a ``Cache`` has been replaced by one
        holding the concatenated keys/values.
        """
        q = self._split_heads(self.q_proj(query))

        if cache is None:
            k, v = self.compute_kv(key, value)
            return q, k, v

        if isinstance(cache, self.StaticCache):
            # Keys/values were computed earlier; reuse them as they are.
            k, v = cache.k, cache.v
        else:
            k, v = self.compute_kv(key, value)

        if isinstance(cache, self.Cache):
            # Append the new keys/values to the cached ones along axis 2
            # (the axis that followed the leading axis before the head split).
            k = tensor.concat([cache.k, k], axis=2)
            v = tensor.concat([cache.v, v], axis=2)
            cache = self.Cache(k, v)

        return q, k, v, cache

    def compute_kv(self, key, value):
        """Project ``key``/``value`` and split both into heads."""
        k = self._split_heads(self.k_proj(key))
        v = self._split_heads(self.v_proj(value))
        return k, v

    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
        """Run attention.

        Args:
            query: Query input.
            key: Key input; ``query`` is used when None.
            value: Value input; ``query`` is used when None.
            attn_mask: Optional tensor added to the scaled scores before
                the softmax.
            cache: Optional ``Cache`` or ``StaticCache``.

        Returns:
            The projected output alone, or a tuple of the output followed
            by the attention weights (if ``need_weights``) and the cache
            (if one was given), in that order.
        """
        if key is None:
            key = query
        if value is None:
            value = query

        prepared = self._prepare_qkv(query, key, value, cache)
        if cache is None:
            q, k, v = prepared
        else:
            q, k, v, cache = prepared

        # Scaled dot product: q @ k^T * head_dim ** -0.5.
        # TODO(guosheng): use tensor.matmul, however it doesnt support `alpha`
        scores = layers.matmul(
            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
        if attn_mask is not None:
            # TODO(guosheng): support bool mask
            scores = scores + attn_mask
        attn_weights = F.softmax(scores)
        if self.dropout:
            attn_weights = F.dropout(
                attn_weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")

        context = tensor.matmul(attn_weights, v)

        # Undo the head split: swap axes 1 and 2 back, then merge the last
        # two axes into one.
        context = tensor.transpose(context, perm=[0, 2, 1, 3])
        merged_dim = context.shape[2] * context.shape[3]
        context = tensor.reshape(x=context, shape=[0, 0, merged_dim])

        out = self.out_proj(context)

        extras = []
        if self.need_weights:
            extras.append(attn_weights)
        if cache is not None:
            extras.append(cache)
        if not extras:
            return out
        return tuple([out] + extras)


class TransformerEncoderLayer(Layer):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each sub-layer is wrapped in a residual connection with dropout. Layer
    normalization runs before the sub-layer when ``normalize_before`` is
    true and after the residual addition otherwise.

    Args:
        d_model: Feature size of the layer input and output.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability after each sub-layer; also the
            fallback for ``attn_dropout`` and ``act_dropout``.
        activation: Name of the function looked up on ``F`` and applied
            between the two feed-forward linears.
        attn_dropout: Dropout probability inside the attention.
        act_dropout: Dropout probability after the activation.
        normalize_before: Selects pre-norm (True) or post-norm (False).
        weight_attr: Expanded to 2 attributes (attention, feed-forward).
        bias_attr: Expanded to 2 attributes (attention, feed-forward).
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False,
                 weight_attr=None,
                 bias_attr=None):
        # Snapshot of the constructor arguments; must stay the first
        # statement so no other local leaks into it. TransformerEncoder
        # uses it to build further layers with the same configuration.
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerEncoderLayer, self).__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)

        self.self_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[0],
            bias_attr=bias_attrs[0])
        # Both feed-forward linears receive the same (index 1) attributes.
        self.linear1 = Linear(
            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])
        self.dropout = Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = Linear(
            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)

    def forward(self, src, src_mask=None, cache=None):
        """Apply the layer to ``src``.

        Args:
            src: Layer input.
            src_mask: Optional mask handed to the self-attention.
            cache: Optional cache handed to the self-attention.

        Returns:
            The layer output, or ``(output, incremental_cache)`` when a
            cache was supplied.
        """
        pre_norm = self.normalize_before

        # Self-attention sub-layer.
        attn_in = self.norm1(src) if pre_norm else src
        if cache is None:
            attn_out = self.self_attn(attn_in, attn_in, attn_in, src_mask)
            incremental_cache = None
        else:
            attn_out, incremental_cache = self.self_attn(
                attn_in, attn_in, attn_in, src_mask, cache)
        hidden = src + self.dropout1(attn_out)
        if not pre_norm:
            hidden = self.norm1(hidden)

        # Feed-forward sub-layer.
        ffn_in = self.norm2(hidden) if pre_norm else hidden
        ffn_out = self.linear2(
            self.dropout(self.activation(self.linear1(ffn_in))))
        output = hidden + self.dropout2(ffn_out)
        if not pre_norm:
            output = self.norm2(output)

        if cache is None:
            return output
        return output, incremental_cache

class TransformerEncoder(Layer):
    """A stack of ``num_layers`` encoder layers.

    Args:
        encoder_layer: Layer used as the first element of the stack. The
            remaining layers are new instances of the same type built from
            ``encoder_layer._config``.
        num_layers: Number of layers in the stack.
        norm: Optional layer applied to the output of the last layer.
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        stack = []
        for index in range(num_layers):
            if index == 0:
                stack.append(encoder_layer)
            else:
                stack.append(type(encoder_layer)(**encoder_layer._config))
        self.layers = LayerList(stack)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, cache=None):
        """Run ``src`` through every layer in order.

        Args:
            src: Input to the first layer.
            src_mask: Optional mask passed to every layer.
            cache: Optional sequence with one cache per layer.

        Returns:
            The final output, or ``(output, new_caches)`` when ``cache``
            was given, with one new cache per layer.
        """
        output = src
        new_caches = []
        if cache is None:
            for layer in self.layers:
                output = layer(output, src_mask=src_mask)
        else:
            for index, layer in enumerate(self.layers):
                output, layer_cache = layer(
                    output, src_mask=src_mask, cache=cache[index])
                new_caches.append(layer_cache)

        if self.norm is not None:
            output = self.norm(output)

        if cache is None:
            return output
        return output, new_caches



class TransformerDecoderLayer(Layer):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in a residual connection with
    dropout. Layer normalization runs before the sub-layer when
    ``normalize_before`` is true and after the residual addition otherwise.

    Args:
        d_model: Feature size of the layer input and output.
        nhead: Number of attention heads (both attentions).
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability after each sub-layer; also the
            fallback for ``attn_dropout`` and ``act_dropout``.
        activation: Name of the function looked up on ``F`` and applied
            between the two feed-forward linears.
        attn_dropout: Dropout probability inside both attentions.
        act_dropout: Dropout probability after the activation.
        normalize_before: Selects pre-norm (True) or post-norm (False).
        weight_attr: Expanded to 3 attributes (self-attention,
            cross-attention, feed-forward).
        bias_attr: Expanded to 3 attributes, same order.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False,
                 weight_attr=None,
                 bias_attr=None):
        # Snapshot of the constructor arguments; must stay the first
        # statement so no other local leaks into it. TransformerDecoder
        # uses it to build further layers with the same configuration.
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.self_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[0],
            bias_attr=bias_attrs[0])
        self.cross_attn = MultiHeadAttention(
            d_model,
            nhead,
            dropout=attn_dropout,
            weight_attr=weight_attrs[1],
            bias_attr=bias_attrs[1])
        # Both feed-forward linears receive the same (index 2) attributes.
        self.linear1 = Linear(
            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
        self.dropout = Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = Linear(
            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
        """Apply the layer to ``tgt`` while attending over ``memory``.

        Args:
            tgt: Layer input.
            memory: Key/value input of the cross-attention.
            tgt_mask: Optional mask handed to the self-attention.
            memory_mask: Optional mask handed to the cross-attention.
            cache: Optional pair; element 0 goes to the self-attention and
                element 1 to the cross-attention.

        Returns:
            The layer output, or ``(output, (incremental_cache,
            static_cache))`` when a cache was supplied.
        """
        pre_norm = self.normalize_before
        use_cache = cache is not None

        # Self-attention sub-layer.
        attn_in = self.norm1(tgt) if pre_norm else tgt
        if use_cache:
            attn_out, incremental_cache = self.self_attn(
                attn_in, attn_in, attn_in, tgt_mask, cache[0])
        else:
            attn_out = self.self_attn(attn_in, attn_in, attn_in, tgt_mask, None)
        hidden = tgt + self.dropout1(attn_out)
        if not pre_norm:
            hidden = self.norm1(hidden)

        # Cross-attention sub-layer: queries from the decoder state,
        # keys and values from ``memory``.
        cross_in = self.norm2(hidden) if pre_norm else hidden
        if use_cache:
            cross_out, static_cache = self.cross_attn(
                cross_in, memory, memory, memory_mask, cache[1])
        else:
            cross_out = self.cross_attn(
                cross_in, memory, memory, memory_mask, None)
        hidden = hidden + self.dropout2(cross_out)
        if not pre_norm:
            hidden = self.norm2(hidden)

        # Feed-forward sub-layer.
        ffn_in = self.norm3(hidden) if pre_norm else hidden
        ffn_out = self.linear2(
            self.dropout(self.activation(self.linear1(ffn_in))))
        output = hidden + self.dropout3(ffn_out)
        if not pre_norm:
            output = self.norm3(output)

        if not use_cache:
            return output
        return output, (incremental_cache, static_cache)

class TransformerDecoder(Layer):
    """A stack of ``num_layers`` decoder layers.

    Args:
        decoder_layer: Layer used as the first element of the stack. The
            remaining layers are new instances of the same type built from
            ``decoder_layer._config``.
        num_layers: Number of layers in the stack.
        norm: Optional layer applied to the output of the last layer.
    """

    def __init__(self, decoder_layer, num_layers, norm=None):
        super(TransformerDecoder, self).__init__()
        stack = []
        for index in range(num_layers):
            if index == 0:
                stack.append(decoder_layer)
            else:
                stack.append(type(decoder_layer)(**decoder_layer._config))
        self.layers = LayerList(stack)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
        """Run ``tgt`` through every layer in order.

        Args:
            tgt: Input to the first layer.
            memory: Passed unchanged to every layer.
            tgt_mask: Optional mask passed to every layer.
            memory_mask: Optional mask passed to every layer.
            cache: Optional sequence with one cache per layer.

        Returns:
            The final output, or ``(output, new_caches)`` when ``cache``
            was given, with one new cache per layer.
        """
        output = tgt
        new_caches = []
        if cache is None:
            for layer in self.layers:
                output = layer(
                    output,
                    memory,
                    tgt_mask=tgt_mask,
                    memory_mask=memory_mask,
                    cache=None)
        else:
            for index, layer in enumerate(self.layers):
                output, layer_cache = layer(
                    output,
                    memory,
                    tgt_mask=tgt_mask,
                    memory_mask=memory_mask,
                    cache=cache[index])
                new_caches.append(layer_cache)

        if self.norm is not None:
            output = self.norm(output)

        if cache is None:
            return output
        return output, new_caches

三、构建基于Transformer的机器翻译模型

定义网络结构后，需要配置优化器、损失函数、评价指标。

(一)、定义超参数

首先定义超参数，用于后续模型的设计与训练

# Hyperparameters for model construction and training.
embedding_size = 128  # embedding width, also used as the Transformer model width
# NOTE(review): hidden_size and num_encoder_lstm_layers are not referenced
# anywhere else in the visible code; they are kept so existing references
# elsewhere (if any) keep working.
hidden_size = 512
num_encoder_lstm_layers = 1
en_vocab_size = len(en_vocab)  # len() works on the dict directly
cn_vocab_size = len(cn_vocab)
epochs = 20
batch_size = 16

(二)、定义Encoder

使用TransformerEncoder定义Encoder

# encoder: simply learn representation of source sentence
class Encoder(paddle.nn.Layer):
    """Embeds source token ids and encodes them with a Transformer encoder.

    Args:
        en_vocab_size: Number of rows of the source embedding table.
        embedding_size: Embedding width; also the encoder's ``d_model``
            (input/output feature size).
        num_layers: Number of stacked encoder layers.
        head_number: ``nhead``, the number of attention heads.
        middle_units: ``dim_feedforward``, the hidden size of the
            feed-forward network.
    """

    def __init__(self, en_vocab_size, embedding_size, num_layers=2, head_number=2, middle_units=512):
        super(Encoder, self).__init__()
        self.emb = paddle.nn.Embedding(en_vocab_size, embedding_size)

        first_layer = TransformerEncoderLayer(
            embedding_size, head_number, middle_units)
        self.encoder = TransformerEncoder(first_layer, num_layers)

    def forward(self, x):
        """Return the encoder output for the token ids ``x``."""
        embedded = self.emb(x)
        return self.encoder(embedded)

(三)、定义Decoder

使用TransformerDecoder定义Decoder

class Decoder(paddle.nn.Layer):
    """Embeds target token ids, decodes them against the encoder output and
    projects the result to vocabulary logits.

    Args:
        cn_vocab_size: Number of rows of the target embedding table and
            size of the output logits.
        embedding_size: Embedding width; also the decoder's model width.
        num_layers: Number of stacked decoder layers.
        head_number: Number of attention heads.
        middle_units: Hidden size of the feed-forward network.
    """

    def __init__(self, cn_vocab_size, embedding_size, num_layers=2, head_number=2, middle_units=512):
        super(Decoder, self).__init__()
        self.emb = paddle.nn.Embedding(cn_vocab_size, embedding_size)

        decoder_layer = TransformerDecoderLayer(embedding_size, head_number, middle_units)
        self.decoder = TransformerDecoder(decoder_layer, num_layers)

        # for computing output logits
        self.outlinear = paddle.nn.Linear(embedding_size, cn_vocab_size)

    def forward(self, x, encoder_outputs):
        """Return vocabulary logits for the target token ids ``x``.

        Args:
            x: Target token ids. The callers in this file pass one token
                per example, i.e. a [batch, 1] tensor.
            encoder_outputs: Encoder output attended to by the decoder.

        Returns:
            The logits with axis 1 removed when it has size 1, otherwise
            the logits unchanged.
        """
        x = self.emb(x)
        # No attention masks are passed to the decoder.
        de_out = self.decoder(x, encoder_outputs)
        output = self.outlinear(de_out)
        # Squeeze only axis 1. A bare squeeze() would also drop the batch
        # axis when the batch holds a single example, which breaks the
        # per-row cross_entropy / argmax(axis=1) done by the callers.
        output = paddle.squeeze(output, axis=1)
        return output

四、模型训练

# Build the model and train encoder and decoder jointly with Adam.
encoder = Encoder(en_vocab_size, embedding_size)
decoder = Decoder(cn_vocab_size, embedding_size)

opt = paddle.optimizer.Adam(learning_rate=0.0001,
                            parameters=encoder.parameters() + decoder.parameters())

for epoch in range(epochs):
    print("epoch:{}".format(epoch))

    # Shuffle the three arrays with one shared permutation so that sources,
    # decoder inputs and labels stay aligned.
    perm = np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled = train_en_sents[perm]
    train_cn_sents_shuffled = train_cn_sents[perm]
    train_cn_label_sents_shuffled = train_cn_label_sents[perm]

    # Integer division: a trailing partial batch is skipped.
    for iteration in range(train_en_sents_shuffled.shape[0] // batch_size):
        batch_start = batch_size * iteration
        batch_end = batch_size * (iteration + 1)

        x_data = train_en_sents_shuffled[batch_start:batch_end]
        sent = paddle.to_tensor(x_data)
        en_repr = encoder(sent)

        x_cn_data = train_cn_sents_shuffled[batch_start:batch_end]
        x_cn_label_data = train_cn_label_sents_shuffled[batch_start:batch_end]

        # NOTE(review): the decoder is fed one target token per step
        # (column i only), so its self-attention never sees earlier target
        # tokens. Feeding the whole sequence with a causal mask would be the
        # usual Transformer setup - confirm whether this is intended.
        loss = paddle.zeros([1])
        for i in range(cn_length + 2):
            cn_word = paddle.to_tensor(x_cn_data[:, i:i + 1])
            cn_word_label = paddle.to_tensor(x_cn_label_data[:, i])

            logits = decoder(cn_word, en_repr)
            step_loss = F.cross_entropy(logits, cn_word_label)
            loss += step_loss

        # Average over the cn_length + 2 target positions.
        loss = loss / (cn_length + 2)
        if iteration % 50 == 0:
            print("iter {}, loss:{}".format(iteration, loss.numpy()))

        loss.backward()
        opt.step()
        opt.clear_grad()

输出结果如下图5所示：

五、模型预测

随机从训练集中抽取几句话来进行预测

# Greedy decoding on a few randomly chosen training sentences.
encoder.eval()
decoder.eval()

num_of_exampels_to_evaluate = 10

indices = np.random.choice(len(train_en_sents), num_of_exampels_to_evaluate, replace=False)
x_data = train_en_sents[indices]
sent = paddle.to_tensor(x_data)
en_repr = encoder(sent)

# Every example starts decoding from the <bos> token.
word = np.array(
    [[cn_vocab['<bos>']]] * num_of_exampels_to_evaluate
)
word = paddle.to_tensor(word)

decoded_sent = []
for i in range(cn_length + 2):
    logits = decoder(word, en_repr)
    # Pick the highest-scoring token and feed it back in as the next input.
    word = paddle.argmax(logits, axis=1)
    decoded_sent.append(word.numpy())
    word = paddle.unsqueeze(word, axis=-1)

results = np.stack(decoded_sent, axis=1)

# Position -> token lookup, built once. This relies on the vocabulary dict
# having been filled in id order, so a key's position equals its id.
cn_tokens = list(cn_vocab)
for i in range(num_of_exampels_to_evaluate):
    print('---------------------')
    en_input = " ".join(datas[indices[i]][0])
    ground_truth_translate = "".join(datas[indices[i]][1])
    # <pad> and <eos> are dropped from the output; decoding itself always
    # runs for the full cn_length + 2 steps.
    model_translate = ""
    for k in results[i]:
        w = cn_tokens[k]
        if w != '<pad>' and w != '<eos>':
            model_translate += w
    print(en_input)
    print("true: {}".format(ground_truth_translate))
    print("pred: {}".format(model_translate))

输出结果如下图6所示：

总结

本系列文章内容为根据清华社出版的《自然语言处理实践》所作的相关笔记和感悟，其中代码均为基于百度飞桨开发，若有任何侵权和不妥之处，请私信于我，定积极配合处理，看到必回！！！

最后，引用本次活动的一句话，来作为文章的结语～(￣▽￣～)~：

【**学习的最大理由是想摆脱平庸，早一天就多一份人生的精彩；迟一天就多一天平庸的困扰。**】

以上是关于自然语言处理（NLP）基于Transformer的中-英机器翻译的主要内容，如果未能解决你的问题，请参考以下文章

基于Transformer的NLP处理管线

深度学习之自然语言处理BERT

史上最细节的自然语言处理NLP/Transformer/BERT/Attention面试问题与答案

NLP特征处理器 Transformer和他的历史