6.12 Building a Transformer with TensorFlow

Posted by 炫云云

import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt

The Transformer model architecture

Positional encoding

Unlike an RNN, which processes its input sequentially and therefore has positional information for every token, the Transformer processes all tokens in parallel, so a way of representing token order has to be added: a positional encoding is added to the input embeddings.
This positional encoding encodes absolute rather than relative positions, so it does not directly express how close two tokens are in a sentence. The positional encoding formula is:

$$PE_{t,\,2i} = \sin\!\left(\frac{t}{10000^{2i/d}}\right), \qquad PE_{t,\,2i+1} = \cos\!\left(\frac{t}{10000^{2i/d}}\right)$$

where $t$ is the position of the current token in the sentence, $i$ is the index within the positional-encoding vector and ranges over $[0, \frac{d}{2}]$, and $d$ is the dimension of the positional embedding. Even dimensions use the sine encoding and odd dimensions use the cosine encoding.

def get_angles(t, i, d):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d))
    return t * angle_rates
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # apply sin to the even indices of the positional encoding; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # apply cos to the odd indices of the positional encoding; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

What np.newaxis does

np.newaxis inserts a new axis (a dimension of length 1).

a=np.array([1,2,3,4,5])
aa=a[:,np.newaxis]
b =a[np.newaxis,:]
print(aa.shape,b.shape)
print (aa)
print (b)
(5, 1) (1, 5)
[[1]
 [2]
 [3]
 [4]
 [5]]
[[1 2 3 4 5]]
n, d = 2048, 512
pos_encoding = positional_encoding(n, d) #(1, 2048, 512)

pos_encoding = pos_encoding[0] # (2048, 512)
# pos_encoding = tf.transpose(pos_encoding, (1, 0))
pos_encoding = tf.reshape(pos_encoding, (n, d//2, 2))#(2048, 256, 2)
pos_encoding = tf.transpose(pos_encoding, (2, 1, 0))#(2, 256, 2048)
pos_encoding = tf.reshape(pos_encoding, (d, n))#(512, 2048)

plt.pcolormesh(pos_encoding, cmap='RdBu')
plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()

[Figure: heatmap of the positional encoding (depth vs. position).]

Masking

Padding mask

Mask the pad tokens (value 0) in a sequence so that the model does not treat padding as input. Here the pad positions are set to 1 to mark them as masked; the mask is later multiplied by a large negative number, so attention will not attend to those positions.

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # add extra dimensions so the padding mask can be added to the attention logits
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(x)
<tf.Tensor: shape=(3, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],


       [[[0., 0., 0., 1., 1.]]],


       [[[1., 1., 1., 0., 0.]]]], dtype=float32)>

Sequence mask

Mask the future tokens in a sequence.

This means that to predict the third word, only the first and second words are used. Similarly, to predict the fourth word, only the first, second and third words are used, and so on.

[Figure: look-ahead mask; the dark (masked) cells take the value 1.]

def Sequence_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

The tf.linalg.band_part(input, num_lower, num_upper) function

Parameter description

  • input: the input tensor

  • num_lower: number of subdiagonals to keep; a negative value keeps the entire lower triangle

  • num_upper: number of superdiagonals to keep; a negative value keeps the entire upper triangle

x = tf.random.uniform((1, 3))
size= x.shape[1]
a = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
print(a)
temp = Sequence_mask(x.shape[1])
temp
tf.Tensor(
[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)





<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
       [0., 0., 1.],
       [0., 0., 0.]], dtype=float32)>

self-attention

Given a query q, attention computes which part of v to attend to, i.e. it selects from v the information relevant to a particular task.

The query q is compared against the keys k to compute an attention weight distribution, producing a different context vector for each query.
In self-attention, q = k = v.


$$\operatorname{Attention}(Q, K, V) = \operatorname{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_{k}}}\right)V$$

The mask is multiplied by -1e9 (close to negative infinity). This works because the mask is added to the scaled matrix product of Q and K and applied immediately before the softmax: large negative inputs to the softmax come out as (nearly) zero, so the masked positions effectively receive zero attention weight.

  • q, k: must have matching last dimensions, i.e. depth == depth
  • k, v: must have matching second-to-last dimensions, i.e. seq_len_k == seq_len_v
  • mask: its shape depends on its type (padding or look-ahead), but it must be broadcastable so it can be added
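A small illustration (not from the original post) of why this works: adding mask * -1e9 to a row of logits drives the softmax weights of the masked positions to essentially zero.

# Illustration only: one row of 4 attention logits; the last two key
# positions are padding (mask == 1).
logits = tf.constant([[2.0, 1.0, 0.5, 3.0]])
mask = tf.constant([[0.0, 0.0, 1.0, 1.0]])

masked_logits = logits + (mask * -1e9)        # masked logits become ~ -1e9
print(tf.nn.softmax(masked_logits, axis=-1))  # ~[[0.73, 0.27, 0.  , 0.  ]]

The toy example below computes keys, queries and values for three inputs and walks through the attention computation step by step.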
x = [
    [1, 0, 1, 0], # Input 1
    [0, 2, 0, 2], # Input 2
    [1, 1, 1, 1]  # Input 3
    ]
x = tf.constant(x, dtype=tf.float32)
w_key = [
    [0, 0, 1],
    [1, 1, 0],
    [0, 1, 0],
    [1, 1, 0]
  ]
w_query = [
    [1, 0, 1],
    [1, 0, 0],
    [0, 0, 1],
    [0, 1, 1]
  ]
w_value = [
    [0, 2, 0],
    [0, 3, 0],
    [1, 0, 3],
    [1, 1, 0]
  ]
w_key = tf.constant(w_key, dtype=tf.float32)
w_query = tf.constant(w_query, dtype=tf.float32)
w_value = tf.constant(w_value, dtype=tf.float32)
keys = x @ w_key
querys = x @ w_query
values = x @ w_value

print(keys)
print(querys)
print(values)
tf.Tensor(
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]], shape=(3, 3), dtype=float32)
attn_scores =  tf.matmul(querys, keys, transpose_b=True) 
print(attn_scores)
# 缩放 matmul_q
dk = tf.cast(tf.shape(keys)[-1], tf.float32)
scaled_attention =  attn_scores / tf.math.sqrt(dk)
print(scaled_attention)

attn_scores_softmax = tf.nn.softmax(scaled_attention, axis=-1)
print(attn_scores_softmax)

weighted_values = tf.matmul(attn_scores_softmax, values)  
print(weighted_values)
tf.Tensor(
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1.1547005 2.309401  2.309401 ]
 [2.309401  9.237604  6.9282036]
 [2.309401  6.9282036 5.773503 ]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1.3612580e-01 4.3193710e-01 4.3193710e-01]
 [8.9044747e-04 9.0884256e-01 9.0266943e-02]
 [7.4448902e-03 7.5470763e-01 2.3784748e-01]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1.8638742  6.319371   1.7041887 ]
 [1.9991095  7.814123   0.27347216]
 [1.9925551  7.4796357  0.73587716]], shape=(3, 3), dtype=float32)
def self_attention(q, k, v, mask):
    """
    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
        output, attention_weights
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits  = matmul_qk/tf.math.sqrt(dk)
    # add the mask to the scaled attention logits
    if mask is not None:
        scaled_attention_logits +=  (mask * -1e9)
    # normalize with softmax over the last axis (seq_len_k) so the scores sum to 1
    attention_weights  = tf.nn.softmax(scaled_attention_logits,axis = -1) # (..., seq_len_q, seq_len_k)
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
    return output, attention_weights

Because the softmax normalization is done over K, its values determine how much importance each key gets with respect to Q.

The output is the product of the attention weights and the V (value) vectors. This ensures that the tokens you want to focus on are kept as-is and the irrelevant tokens are drowned out.

def print_out(q, k, v):
    temp_out, temp_attn = self_attention(
      q, k, v, None)
    print('Attention weights are:')
    print(temp_attn)
    print('Output is:')
    print(temp_out)
    

np.set_printoptions(suppress=True)  # print floats without scientific notation

temp_k = tf.constant([[10, 0, 0],
                      [0, 10, 0],
                      [0, 0, 10],
                      [0, 0, 10]], dtype=tf.float32)  # (4, 3)

temp_v = tf.constant([[1, 0],
                      [10, 0],
                      [100, 5],
                      [1000, 6]], dtype=tf.float32)  # (4, 2)


temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)
# this `query` aligns with the second `key`, so the second `value` is returned.

temp_q = tf.constant([[0, 0, 10]], dtype=tf.float32)  # (1, 3)
# this `query` aligns with a repeated `key` (the third and fourth), so the returned value is the average of the corresponding `value`s.
print_out(temp_q, temp_k, temp_v)
Attention weights are:
tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)
Output is:
tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)
Attention weights are:
tf.Tensor([[0.  0.  0.5 0.5]], shape=(1, 4), dtype=float32)
Output is:
tf.Tensor([[550.    5.5]], shape=(1, 2), dtype=float32)

Pass all the queries together.

temp_q = tf.constant([[0, 0, 10],
                      [0, 10, 0],
                      [10, 10, 0]], dtype=tf.float32)  # (3, 3)
print_out(temp_q, temp_k, temp_v)
Attention weights are:
tf.Tensor(
[[0.  0.  0.5 0.5]
 [0.  1.  0.  0. ]
 [0.5 0.5 0.  0. ]], shape=(3, 4), dtype=float32)
Output is:
tf.Tensor(
[[550.    5.5]
 [ 10.    0. ]
 [  5.5   0. ]], shape=(3, 2), dtype=float32)
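
None of the examples above actually passes a mask. As an illustrative sketch (the mask values here are assumptions, not from the original post), a padding-style mask that hides the fourth key position:

# Illustration only: mask out the 4th key position. The mask must be
# broadcastable to (..., seq_len_q, seq_len_k) == (3, 4).
temp_mask = tf.constant([[0., 0., 0., 1.]])
temp_out, temp_attn = self_attention(temp_q, temp_k, temp_v, temp_mask)
print(temp_attn)  # last column ~0: the masked key/value pair is ignored
print(temp_out)   # the first query now attends only to the third key -> [100., 5.]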

Multi-head attention

Multi-head attention consists of four parts:

  • linear layers that split the input into heads
  • self attention
  • concatenation of all the heads
  • a final fully connected layer

Each multi-head attention block takes three inputs: Q (query), K (key) and V (value). These go through linear (Dense) layers and are split into multiple heads.

The self attention defined above is applied to every head (broadcasting is used for efficiency). An appropriate mask must be used in the attention step. The attention output of each head is then concatenated (using tf.transpose and tf.reshape) and passed through a final fully connected layer.

Instead of a single full-dimensional attention head, Q, K and V are split into multiple heads because this allows the model to jointly attend to information from different representation subspaces at different positions. After the split each head has a reduced dimensionality, so the total computational cost is the same as a single attention head with full dimensionality.
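
A minimal sketch (shapes assumed for illustration, not part of the original post) of the reshape-and-transpose that split_heads in the class below performs:

# Assumed shapes: batch_size=1, seq_len=60, d_model=512, num_heads=8, depth=64.
x = tf.random.uniform((1, 60, 512))
x = tf.reshape(x, (1, -1, 8, 64))        # (batch_size, seq_len, num_heads, depth)
x = tf.transpose(x, perm=[0, 2, 1, 3])   # (batch_size, num_heads, seq_len, depth)
print(x.shape)                           # (1, 8, 60, 64)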


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0 
        self.depth = d_model // self.num_heads  # dimensionality of each head
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        
        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        """
        将最后一个维度拆分为(num_heads, depth),即tf.reshape。
        使用转置,使形状为 (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size,-1,self.num_heads,  self.depth ))
        return tf.transpose(x, perm=[0, 2, 1, 3])
        
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        # split into multiple heads
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        self_attentions, attention_weights = self_attention(q, k, v, mask)
        # self_attentions.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        self_attentions = tf.transpose(self_attentions, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        concat_attention  = tf.reshape(self_attentions,(batch_size, -1, self.d_model))# (batch_size, seq_len_q, d_model)
        
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        return output, attention_weights

Create a MultiHeadAttention layer to try it out. At each location in the sequence, y, MultiHeadAttention runs all 8 attention heads over all the other locations in the sequence and returns a new vector of the same length at each location.

temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))  # (batch_size, seq_len, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape
(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

Feed-forward network

The FFN is computed as:
$$\operatorname{FFN}(x) = \max\!\left(0,\; xW_{1}+b_{1}\right)W_{2}+b_{2}$$

That is, it consists of two fully connected layers with a ReLU activation in between.

def feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

sample_ffn = feed_forward_network(512, 2048)
sample_ffn(tf.random.uniform((64, 50, 512))).shape
TensorShape([64, 50, 512])

Encoder and decoder

[Figure: the Transformer encoder-decoder architecture.]

The Transformer follows the same general pattern as a standard sequence-to-sequence model.

  • The input sentence is passed through N encoder layers, which generate an output for each word/token in the sequence.

  • The decoder's N layers attend to the encoder's output and to the decoder's own input (self-attention) to predict the next word.

Encoder layer

Each encoder layer consists of the following sublayers:

  • multi-head attention (with a padding mask)

  • a feed-forward network

Each sublayer has a residual connection around it, followed by layer normalization. Residual connections help avoid the vanishing-gradient problem in deep networks.

The output of each sublayer is LayerNorm(x + Sublayer(x)). Normalization is done over the d_model (last) axis.

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
        
        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
        
        return out2
sample_encoder_layer = EncoderLayer(512, 8, 2048)
sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((64, 43, 512)), False, None)
sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)
TensorShape([64, 43, 512])

Decoder layer

Each decoder layer consists of the following sublayers:

  • masked multi-head attention (with a look-ahead sequence mask and a padding mask)
  • multi-head attention (with a padding mask), where V (value) and K (key) come from the encoder output and Q (query) comes from the output of the masked multi-head attention sublayer
  • a feed-forward network

Since Q receives the output of the decoder's first attention block and K receives the encoder output, the attention weights represent how much importance is given to the decoder's input based on the encoder's output. In other words, the decoder predicts the next word by looking at the encoder output and self-attending to its own output.

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
        
    def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        
        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
        
        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
        
        return out3, attn_weights_block1, attn_weights_block2
sample_decoder_layer = DecoderLayer(512, 8, 2048)

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)
TensorShape([64, 50, 512])

Encoder

The encoder consists of:

  • input embedding
  • positional encoding
  • N encoder layers

The input embedding plus the positional encoding gives the encoder's input.
The output of the encoder is the input to the decoder.

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]  # add the positional encoding to every token
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
        
        return x  # (batch_size, input_seq_len, d_model)
        
sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
                         dff=2048, input_vocab_size=8500,
                         maximum_position_encoding=10000)
temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)

sample_encoder_output = sample_encoder(temp_input, training=False, mask=None)

print(sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)
(64, 62, 512)

Decoder

The decoder consists of:

  • output embedding
  • positional encoding
  • N decoder layers

The output of the decoder is the input to the final linear layer.

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights
        
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8,
                         dff=2048, target_vocab_size=8000,
                         maximum_position_encoding=5000)
temp_input = tf.random.uniform((64, 26), dtype=tf.int64, minval=0, maxval=200)

output, attn = sample_decoder(temp_input,
                              enc_output=sample_encoder_output,
                              training=False,
                              look_ahead_mask=None,
                              padding_mask=None)

output.shape, attn['decoder_layer2_block2'].shape
(TensorShape([64, 26, 512]), TensorShape([64, 8, 26, 62]))

Assembling the Transformer

The Transformer consists of the encoder, the decoder and a final linear layer.

class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                             input_vocab_size, pe_input, rate)
        
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                           target_vocab_size, pe_target, rate)
        
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    def call(self, inp, tar, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        
        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        return final_output, attention_weights
        
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048,
    input_vocab_size=8500, target_vocab_size=8000,
    pe_input=10000, pe_target=6000)

temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                               enc_padding_mask=None,
                               look_ahead_mask=None,
                               dec_padding_mask=None)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)
TensorShape([64, 36, 8000])

Machine translation

Use TFDS to load the Portuguese-English translation dataset from the TED Talks Open Translation Project.

The dataset contains approximately 50,000 training examples, 1,100 validation examples and 2,000 test examples.

examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

Create a custom subword tokenizer from the training dataset.

tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2**13)

tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)

sample_string = 'l like tensorflow,l like nlp.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string
Tokenized string is [246, 53, 1378, 2824, 2424, 7875, 246, 53, 6635, 7943, 7877]
The original string: l like tensorflow,l like nlp.

If a word is not in its vocabulary, the tokenizer encodes it by breaking the string into subwords.

for ts in tokenized_string:
    print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))
246 ----> l 
53 ----> like 
1378 ----> ten
2824 ----> sor
2424 ----> flow
7875 ----> ,
246 ----> l 
53 ----> like 
6635 ----> nl
7943 ----> p
7877 ----> .
BUFFER_SIZE = 20000
BATCH_SIZE = 64

Add a start token and an end token to the input and to the target.

def encode(lang1, lang2):
    lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
        lang1.numpy()) + [tokenizer_pt.vocab_size+1]
    lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
        lang2.numpy()) + [tokenizer_en.vocab_size+1]
    return lang1, lang2

You want to use Dataset.map to apply this function to every element of the dataset. Dataset.map runs in graph mode.

  • Graph tensors do not have a value.

  • In graph mode you can only use TensorFlow ops and functions.

So you cannot .map this function directly: it needs to be wrapped in tf.py_function. tf.py_function passes regular (eager) tensors, which have a value and a .numpy() method to access it, to the wrapped Python function.

def tf_encode(pt, en):
    result_pt, result_en = tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
    result_pt.set_shape([None])
    result_en.set_shape([None])

    return result_pt, result_en

In TensorFlow a tensor has both a static shape and a dynamic shape.

Static shape: the shape of the tensor as known when it is created or inferred from an operation, i.e. its initial shape.

tf.Tensor.get_shape: get the static shape
    tf.Tensor.set_shape(): update the static shape of a Tensor object, usually used when the shape cannot be inferred directly

Dynamic shape: the actual shape of the tensor during execution (it can change).

tf.reshape: creates a new tensor with a different dynamic shape

# The static shape, once fixed, cannot be set again and cannot change rank (1D->1D, 2D->2D).
# Changing the dynamic shape creates a new tensor; the rank may change (1D->2D, 1D->3D),
# but the number of elements must match.
# Note: tf.placeholder is TF1-style; under TF2 it is only available as
# tf.compat.v1.placeholder with eager execution disabled.

tensor = tf.compat.v1.placeholder(tf.int32)
print(tensor)

# set_shape() fixes the static shape (it can only be set once and cannot change the rank)
tensor.set_shape([3, 2])
print(tensor)

# reshape() changes the dynamic shape by creating a new tensor; the rank may change,
# but the number of elements must match
tensor_reshape = tf.reshape(tensor, [2, 3])  # [-1, 3]: -1 means the number of rows is inferred
print(tensor_reshape)

Note: to keep this example small and fast to train, drop examples longer than 40 tokens.

MAX_LENGTH = 40

def filter_max_length(x, y, max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)  # True only if both sequences fit

train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
# cache the dataset in memory to speed up reading
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)


val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(BATCH_SIZE)


pt_batch, en_batch = next(iter(val_dataset))
pt_batch, en_batch
(<tf.Tensor: shape=(64, 38), dtype=int64, numpy=
 array([[8214,  342, 3032, ...,    0,    0,    0],
        [8214,   95,  198, ...,    0,    0,    0],
        [8214, 4479, 7990, ...,    0,    0,    0],
        ...,
        [8214,  584,   12, ...,    0,    0,    0],
        [8214,   59, 1548, ...,    0,    0,    0],
        [8214,  118,   34, ...,    0,    0,    0]], dtype=int64)>,
 <tf.Tensor: shape=(64, 40), dtype=int64, numpy=
 array([[8087,   98,   25, ...,    0,    0,    0],
        [8087,   12,   20, ...,    0,    0,    0],
        [8087,   12, 5453, ...,    0,    0,    0],
        ...,
        [8087,   18, 2059, ...,    0,    0,    0],
        [8087,   16, 1436, ...,    0,    0,    0],
        [8087,   15,   57, ...,    0,    0,    0]], dtype=int64)>)

Set the hyperparameters

To keep this example small and relatively fast, the values of num_layers, d_model and dff have been reduced.

The values used in the base Transformer model are: num_layers=6, d_model=512, dff=2048.

Note: by changing the values below, you can obtain a model that achieves state of the art on many tasks.


num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = tokenizer_pt.vocab_size + 2
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

Optimizer

Use the Adam optimizer with a custom learning-rate schedule, according to the formula below.

$$lrate = d_{model}^{-0.5} \cdot \min\!\left(step^{-0.5},\; step \cdot warmup\_steps^{-1.5}\right)$$

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)  # reciprocal of the square root of step
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
temp_learning_rate_schedule = CustomSchedule(d_model)

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")
Text(0.5, 0, 'Train Step')

[Figure: the learning-rate schedule plotted over training steps.]

Loss and metrics

Since the target sequences are padded, it is important to apply a padding mask when computing the loss.

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # padded positions are False
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)  # padded positions become 0
    loss_ *= mask
    
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

def accuracy_function(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=2))
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)  # count accuracy only at non-padded positions
    
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

real= tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
mask = tf.math.logical_not(tf.math.equal(real, 0))
print(tf.math.equal(real, 0))
print(mask)
print(tf.cast(mask, dtype=tf.float32))
tf.Tensor(
[[False False  True  True False]
 [False False False  True  True]
 [ True  True  True False False]], shape=(3, 5), dtype=bool)
tf.Tensor(
[[ True  True False False  True]
 [ True  True  True False False]
 [False False False  True  True]], shape=(3, 5), dtype=bool)
tf.Tensor(
[[1. 1. 0. 0. 1.]
 [1. 1. 1. 0. 0.]
 [0. 0. 0. 1. 1.]], shape=(3, 5), dtype=float32)
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

Training and checkpointing

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)
def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)
    # Used in the second attention block of the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)
    
    # Used in the first attention block of the decoder.
    # It masks both the padding and the future tokens in the decoder input.
    sequence_mask = Sequence_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, sequence_mask)  # elementwise maximum combines the two masks
    
    return enc_padding_mask, combined_mask, dec_padding_mask
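
The original post breaks off at this point. As a hedged sketch only (modeled on the standard teacher-forcing setup that the pieces above suggest, not the author's exact code), a training step could look like the following:

# Sketch, not the original author's code. Assumes teacher forcing: the decoder
# input is the target shifted right and the loss is computed against the
# target shifted left.
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(accuracy_function(tar_real, predictions))

# Example usage (sketch): iterate over the batched dataset for a few epochs.
# for epoch in range(20):
#     for (inp, tar) in train_dataset:
#         train_step(inp, tar)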
