Theano，循环神经网络，误差为 nan

Posted 2023-02-16

技术标签:

【中文标题】Theano，循环神经网络，误差为 nan【英文标题】：Theano , recurrent neural network, error is nan 【发布时间】：2016-03-15 08:25:33 【问题描述】：

我正在尝试复制最近关于单一进化神经网络的工作。改编自作者发布的代码，我写了如下代码

import matplotlib.pyplot as plt
import numpy as np
import theano
import theano.tensor as T
import sys

#theano.config.exception_verbosity='high'
class RNN(object):
    def __init__(self,n_in,n_out,n_hid,learning_rate = 0.000001):
        self.dtype = theano.config.floatX
        self.learning_rate = learning_rate
        self.n_in = n_in
        self.n_hid = n_hid
        self.n_out = n_out
        self.generate_parameters()
        self.params = [self.V_re, self.V_im, self.U, self.hidden_bias, self.reflection, self.out_bias, self.theta, self.scale]
        inp = T.matrix(dtype = self.dtype)
        target = T.matrix(dtype = self.dtype)
        h_0_batch = self.h_0
        non_sequences = [self.theta, self.V_re, self.V_im, self.hidden_bias, self.scale, self.out_bias, self.U]
        sequences = inp
        self.index_permute = np.random.permutation(self.n_hid)
        h_t,_ = theano.scan(fn=self.one_step,sequences=sequences,non_sequences=non_sequences,outputs_info=h_0_batch)


        y_t = T.dot(h_t[self.h_t.shape[0]-1], self.U) + self.out_bias
        self.p_y_given_x = T.nnet.softmax(y_t)
        #cost = T.nnet.categorical_crossentropy(y, y_t).mean()
        self.y_t = T.argmax(self.p_y_given_x, axis = 1)
        self.lr = theano.shared(np.cast[self.dtype](self.learning_rate))
        self.cost = -T.sum(target*T.log(self.p_y_given_x) + (1.- target)*T.log(1. - self.p_y_given_x))
        self.learn_rnn_fn = self.get_train_graph(target, inp, self.cost)
        self.pred_rnn_fn = self.get_pred_graph(inp)
        print "Built model"



    def do_fft(self,input, n_hidden):
        fft_input = T.reshape(input, (input.shape[0], 2, n_hidden))
        fft_input = fft_input.dimshuffle(0,2,1)
        fft_output = cufft(fft_input) / T.sqrt(n_hidden)
        fft_output = fft_output.dimshuffle(0,2,1)
        output = T.reshape(fft_output, (input.shape[0], 2*n_hidden))
        return output

    def do_ifft(self,input, n_hidden):
        ifft_input = T.reshape(input, (input.shape[0], 2, n_hidden))
        ifft_input = ifft_input.dimshuffle(0,2,1)
        ifft_output = cuifft(ifft_input) / T.sqrt(n_hidden)
        ifft_output = ifft_output.dimshuffle(0,2,1)
        output = T.reshape(ifft_output, (input.shape[0], 2*n_hidden))
        return output


    def scale_diag(self,input, n_hidden, diag):
        input_re = input[:, :n_hidden]
        input_im = input[:, n_hidden:]
        Diag = T.nlinalg.AllocDiag()(diag)
        input_re_times_Diag = T.dot(input_re, Diag)
        input_im_times_Diag = T.dot(input_im, Diag)
        return T.concatenate([input_re_times_Diag, input_im_times_Diag], axis=1)

    def times_diag(self,input, n_hidden, diag):
        input_re = input[:, :n_hidden]
        input_im = input[:, n_hidden:]
        Re = T.nlinalg.AllocDiag()(T.cos(diag))
        Im = T.nlinalg.AllocDiag()(T.sin(diag))
        input_re_times_Re = T.dot(input_re, Re)
        input_re_times_Im = T.dot(input_re, Im)
        input_im_times_Re = T.dot(input_im, Re)
        input_im_times_Im = T.dot(input_im, Im)

        return T.concatenate([input_re_times_Re - input_im_times_Im,
                          input_re_times_Im + input_im_times_Re], axis=1)

    def vec_permutation(self,input, n_hidden, index_permute):
        re = input[:, :n_hidden]
        im = input[:, n_hidden:]
        re_permute = re[:, index_permute]
        im_permute = im[:, index_permute]

        return T.concatenate([re_permute, im_permute], axis=1)      

    def times_reflection(self,input, n_hidden, reflection):
        input_re = input[:, :n_hidden]
        input_im = input[:, n_hidden:]
        reflect_re = reflection[:n_hidden]
        reflect_im = reflection[n_hidden:]
        vstarv = (reflect_re**2 + reflect_im**2).sum()
        input_re_reflect = input_re - 2 / vstarv * (T.outer(T.dot(input_re, reflect_re), reflect_re) 
                                                    + T.outer(T.dot(input_re, reflect_im), reflect_im) 
                                                    - T.outer(T.dot(input_im, reflect_im), reflect_re) 
                                                    + T.outer(T.dot(input_im, reflect_re), reflect_im))
        input_im_reflect = input_im - 2 / vstarv * (T.outer(T.dot(input_im, reflect_re), reflect_re) 
                                                    + T.outer(T.dot(input_im, reflect_im), reflect_im) 
                                                    + T.outer(T.dot(input_re, reflect_im), reflect_re) 
                                                    - T.outer(T.dot(input_re, reflect_re), reflect_im))

        return T.concatenate([input_re_reflect, input_im_reflect], axis=1)      

    def sample_weights(self,SizeX, SizeY):
        values = np.ndarray([SizeX, SizeY], dtype = self.dtype)
        for dx in range(SizeX):
            row_val = np.random.normal(loc = 0.0, scale = 0.1, size=(SizeY,))
            values[dx,:] = row_val
        _,svs,_ = np.linalg.svd(values)
        values = values / svs[0]
        return values

    def generate_parameters(self):
        np.random.seed(1234)
        rng = np.random.RandomState(1234)
        self.V_re = theano.shared(self.sample_weights(self.n_in, self.n_hid))
        self.V_im = theano.shared(self.sample_weights(self.n_in, self.n_hid))
        self.U = theano.shared(self.sample_weights(2 * self.n_hid, self.n_out))
        self.hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01,
                                                       high=0.01,
                                                       size=(self.n_hid,)),
                                               dtype=self.dtype))

        self.reflection = theano.shared(self.sample_weights(2, 2*self.n_hid))
        self.out_bias = theano.shared(np.zeros((self.n_out,), dtype=self.dtype))
        self.theta = theano.shared(self.sample_weights(3, self.n_hid))
        bucket = np.sqrt(2.) * np.sqrt(3. / 2 / self.n_hid)
        self.h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket,
                                               high=bucket,
                                               size=(1, 2 * self.n_hid)), 
                                       dtype=self.dtype),
                        name='h_0')

        self.scale = theano.shared(np.zeros((self.n_hid,), dtype=self.dtype))

    def logistic_function(self,vec):
        return 1/(1 + T.exp(-vec))
    def activ_tan(self,vec):
        return T.tanh(vec)
    def one_step(self,x_t, h_prev, theta, V_re, V_im, hidden_bias, scale, out_bias, U):
                # Compute hidden linear transform
        step1 = self.times_diag(h_prev, self.n_hid, self.theta[0,:])
        step2 = step1
#        step2 = do_fft(step1, n_hidden)
        step3 = self.times_reflection(step2, self.n_hid, self.reflection[0,:])
        step4 = self.vec_permutation(step3, self.n_hid, self.index_permute)
        step5 = self.times_diag(step4, self.n_hid, theta[1,:])
        step6 = step5
#        step6 = do_ifft(step5, n_hidden)
        step7 = self.times_reflection(step6, self.n_hid, self.reflection[1,:])
        step8 = self.times_diag(step7, self.n_hid, self.theta[2,:])     
        step9 = self.scale_diag(step8, self.n_hid, self.scale)

        hidden_lin_output = step9
        # Compute data linear transform
        data_lin_output_re = T.dot(x_t, V_re)
        data_lin_output_im = T.dot(x_t, V_im)
        data_lin_output = T.concatenate([data_lin_output_re, data_lin_output_im], axis=0)

        # Total linear output        
        lin_output = hidden_lin_output + data_lin_output
        lin_output_re = lin_output[:, :self.n_hid]
        lin_output_im = lin_output[:, self.n_hid:] 


        # Apply non-linearity ----------------------------


        # scale RELU nonlinearity
        modulus = T.sqrt(lin_output_re ** 2 + lin_output_im ** 2)
        rescale = T.maximum(modulus + hidden_bias.dimshuffle('x',0), 0.) / (modulus + 1e-5)
        nonlin_output_re = lin_output_re * rescale
        nonlin_output_im = lin_output_im * rescale

        h_t = T.concatenate([nonlin_output_re, 
                             nonlin_output_im], axis=1)
        return h_t

        #cost = -T.sum(T.log(np.abs(p_y_given_x))[0][T.argmax(target)])
    def get_train_graph(self, target, inp, cost):
        grads = []
        for param in self.params:
            grads.append(T.grad(cost, param))
        update = []
        for param,grad in zip(self.params, grads):
            update.append((param, param - grad*self.lr))
        train_fn = theano.function(inputs = [inp,target], outputs = cost, updates = update)
        return train_fn

    def get_pred_graph(self,inp):
        predictions = theano.function(inputs = [inp], outputs = self.y_t)
        return predictions

def convert_string(file):
    f = open(file,'r')
    text = f.read()
    f.close()
    inp = np.zeros([len(text), 256],dtype=theano.config.floatX)
    out = np.zeros([len(text), 256],dtype=theano.config.floatX)
    counter = 0
    for char in text:
        if(counter > 0):
            inp[counter][ord(char)] = 1
            out[counter-1][ord(char)] = 1
        counter = counter + 1
    return [inp, out]

train_data = convert_string("log")
j = 0
model = RNN(256,256,1000)
n_epoch=10
dtype = dtype = theano.config.floatX
def train_rnn(train_data, n_epoch = 100):
    train_err = np.ndarray(n_epoch)
    for i in range(n_epoch):
        for j in range(len(train_data[0])):
            tempInp = np.zeros([1,256],dtype=dtype);
            tempInp[0] = train_data[0][j]
            tempOut = np.zeros([1,256],dtype=dtype);
            tempOut[0] = train_data[1][j]
            train_cost = model.learn_rnn_fn(tempInp, tempOut)
            sys.stdout.write((model.pred_rnn_fn(tempInp)))
            train_err[i]=train_err[i]+ train_cost
            train_err[i]= train_err[i]/len(train_data[0])
            print train_cost
        print "\n"
    return train_err

train_errors = train_rnn(train_data, n_epoch)
print train_errors
def plot_learning_curve(train_err):
    plt.plot(np.arange(n_epoch), train_errors, 'b-')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.show()
plot_learning_curve(train_errors)

但是在 6.23 的初始错误之后，我得到了所有的 nans。有人可以解释代码中是否存在错误。我发布了整个代码，这样我就不会错过错误的部分（而且我不知道它是哪一个）

谢谢！

【问题讨论】：

【参考方案1】：

虽然 Marcin Możejko 关于日志中 NaN 数值问题的来源可能是正确的，但 theano 的文档在 dealing with NaNs 上提供了很好的一般性建议。

特别是如何使用NaN guard - 只要张量变量包含NaN，就会抛出错误：

from theano.compile.nanguardmode import NanGuardMode

...
... = theano.function(..., mode=NanGuardMode(nan_is_error=True,
                                             inf_is_error=True,
                                             big_is_error=True)
                     )

【讨论】：

以上是关于Theano，循环神经网络，误差为 nan的主要内容，如果未能解决你的问题，请参考以下文章

theano学习指南--词向量的循环神经网络(翻译)

使用 Keras 和 Theano 进行模型选择需要很长时间

找到为什么是nan的问题，是因为误差没有变小，而是越来越大

RNN 循环神经网络-BF 求导过程

循环神经网络-极其详细的推导BPTT