Theano,循环神经网络,误差为 nan
Posted
技术标签:
【中文标题】Theano,循环神经网络,误差为 nan【英文标题】:Theano , recurrent neural network, error is nan 【发布时间】:2016-03-15 08:25:33 【问题描述】:我正在尝试复制最近关于单一进化神经网络的工作。改编自作者发布的代码,我写了如下代码
import matplotlib.pyplot as plt
import numpy as np
import theano
import theano.tensor as T
import sys
#theano.config.exception_verbosity='high'
class RNN(object):
def __init__(self,n_in,n_out,n_hid,learning_rate = 0.000001):
self.dtype = theano.config.floatX
self.learning_rate = learning_rate
self.n_in = n_in
self.n_hid = n_hid
self.n_out = n_out
self.generate_parameters()
self.params = [self.V_re, self.V_im, self.U, self.hidden_bias, self.reflection, self.out_bias, self.theta, self.scale]
inp = T.matrix(dtype = self.dtype)
target = T.matrix(dtype = self.dtype)
h_0_batch = self.h_0
non_sequences = [self.theta, self.V_re, self.V_im, self.hidden_bias, self.scale, self.out_bias, self.U]
sequences = inp
self.index_permute = np.random.permutation(self.n_hid)
h_t,_ = theano.scan(fn=self.one_step,sequences=sequences,non_sequences=non_sequences,outputs_info=h_0_batch)
y_t = T.dot(h_t[self.h_t.shape[0]-1], self.U) + self.out_bias
self.p_y_given_x = T.nnet.softmax(y_t)
#cost = T.nnet.categorical_crossentropy(y, y_t).mean()
self.y_t = T.argmax(self.p_y_given_x, axis = 1)
self.lr = theano.shared(np.cast[self.dtype](self.learning_rate))
self.cost = -T.sum(target*T.log(self.p_y_given_x) + (1.- target)*T.log(1. - self.p_y_given_x))
self.learn_rnn_fn = self.get_train_graph(target, inp, self.cost)
self.pred_rnn_fn = self.get_pred_graph(inp)
print "Built model"
def do_fft(self,input, n_hidden):
fft_input = T.reshape(input, (input.shape[0], 2, n_hidden))
fft_input = fft_input.dimshuffle(0,2,1)
fft_output = cufft(fft_input) / T.sqrt(n_hidden)
fft_output = fft_output.dimshuffle(0,2,1)
output = T.reshape(fft_output, (input.shape[0], 2*n_hidden))
return output
def do_ifft(self,input, n_hidden):
ifft_input = T.reshape(input, (input.shape[0], 2, n_hidden))
ifft_input = ifft_input.dimshuffle(0,2,1)
ifft_output = cuifft(ifft_input) / T.sqrt(n_hidden)
ifft_output = ifft_output.dimshuffle(0,2,1)
output = T.reshape(ifft_output, (input.shape[0], 2*n_hidden))
return output
def scale_diag(self,input, n_hidden, diag):
input_re = input[:, :n_hidden]
input_im = input[:, n_hidden:]
Diag = T.nlinalg.AllocDiag()(diag)
input_re_times_Diag = T.dot(input_re, Diag)
input_im_times_Diag = T.dot(input_im, Diag)
return T.concatenate([input_re_times_Diag, input_im_times_Diag], axis=1)
def times_diag(self,input, n_hidden, diag):
input_re = input[:, :n_hidden]
input_im = input[:, n_hidden:]
Re = T.nlinalg.AllocDiag()(T.cos(diag))
Im = T.nlinalg.AllocDiag()(T.sin(diag))
input_re_times_Re = T.dot(input_re, Re)
input_re_times_Im = T.dot(input_re, Im)
input_im_times_Re = T.dot(input_im, Re)
input_im_times_Im = T.dot(input_im, Im)
return T.concatenate([input_re_times_Re - input_im_times_Im,
input_re_times_Im + input_im_times_Re], axis=1)
def vec_permutation(self,input, n_hidden, index_permute):
re = input[:, :n_hidden]
im = input[:, n_hidden:]
re_permute = re[:, index_permute]
im_permute = im[:, index_permute]
return T.concatenate([re_permute, im_permute], axis=1)
def times_reflection(self,input, n_hidden, reflection):
input_re = input[:, :n_hidden]
input_im = input[:, n_hidden:]
reflect_re = reflection[:n_hidden]
reflect_im = reflection[n_hidden:]
vstarv = (reflect_re**2 + reflect_im**2).sum()
input_re_reflect = input_re - 2 / vstarv * (T.outer(T.dot(input_re, reflect_re), reflect_re)
+ T.outer(T.dot(input_re, reflect_im), reflect_im)
- T.outer(T.dot(input_im, reflect_im), reflect_re)
+ T.outer(T.dot(input_im, reflect_re), reflect_im))
input_im_reflect = input_im - 2 / vstarv * (T.outer(T.dot(input_im, reflect_re), reflect_re)
+ T.outer(T.dot(input_im, reflect_im), reflect_im)
+ T.outer(T.dot(input_re, reflect_im), reflect_re)
- T.outer(T.dot(input_re, reflect_re), reflect_im))
return T.concatenate([input_re_reflect, input_im_reflect], axis=1)
def sample_weights(self,SizeX, SizeY):
values = np.ndarray([SizeX, SizeY], dtype = self.dtype)
for dx in range(SizeX):
row_val = np.random.normal(loc = 0.0, scale = 0.1, size=(SizeY,))
values[dx,:] = row_val
_,svs,_ = np.linalg.svd(values)
values = values / svs[0]
return values
def generate_parameters(self):
np.random.seed(1234)
rng = np.random.RandomState(1234)
self.V_re = theano.shared(self.sample_weights(self.n_in, self.n_hid))
self.V_im = theano.shared(self.sample_weights(self.n_in, self.n_hid))
self.U = theano.shared(self.sample_weights(2 * self.n_hid, self.n_out))
self.hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01,
high=0.01,
size=(self.n_hid,)),
dtype=self.dtype))
self.reflection = theano.shared(self.sample_weights(2, 2*self.n_hid))
self.out_bias = theano.shared(np.zeros((self.n_out,), dtype=self.dtype))
self.theta = theano.shared(self.sample_weights(3, self.n_hid))
bucket = np.sqrt(2.) * np.sqrt(3. / 2 / self.n_hid)
self.h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket,
high=bucket,
size=(1, 2 * self.n_hid)),
dtype=self.dtype),
name='h_0')
self.scale = theano.shared(np.zeros((self.n_hid,), dtype=self.dtype))
def logistic_function(self,vec):
return 1/(1 + T.exp(-vec))
def activ_tan(self,vec):
return T.tanh(vec)
def one_step(self,x_t, h_prev, theta, V_re, V_im, hidden_bias, scale, out_bias, U):
# Compute hidden linear transform
step1 = self.times_diag(h_prev, self.n_hid, self.theta[0,:])
step2 = step1
# step2 = do_fft(step1, n_hidden)
step3 = self.times_reflection(step2, self.n_hid, self.reflection[0,:])
step4 = self.vec_permutation(step3, self.n_hid, self.index_permute)
step5 = self.times_diag(step4, self.n_hid, theta[1,:])
step6 = step5
# step6 = do_ifft(step5, n_hidden)
step7 = self.times_reflection(step6, self.n_hid, self.reflection[1,:])
step8 = self.times_diag(step7, self.n_hid, self.theta[2,:])
step9 = self.scale_diag(step8, self.n_hid, self.scale)
hidden_lin_output = step9
# Compute data linear transform
data_lin_output_re = T.dot(x_t, V_re)
data_lin_output_im = T.dot(x_t, V_im)
data_lin_output = T.concatenate([data_lin_output_re, data_lin_output_im], axis=0)
# Total linear output
lin_output = hidden_lin_output + data_lin_output
lin_output_re = lin_output[:, :self.n_hid]
lin_output_im = lin_output[:, self.n_hid:]
# Apply non-linearity ----------------------------
# scale RELU nonlinearity
modulus = T.sqrt(lin_output_re ** 2 + lin_output_im ** 2)
rescale = T.maximum(modulus + hidden_bias.dimshuffle('x',0), 0.) / (modulus + 1e-5)
nonlin_output_re = lin_output_re * rescale
nonlin_output_im = lin_output_im * rescale
h_t = T.concatenate([nonlin_output_re,
nonlin_output_im], axis=1)
return h_t
#cost = -T.sum(T.log(np.abs(p_y_given_x))[0][T.argmax(target)])
def get_train_graph(self, target, inp, cost):
grads = []
for param in self.params:
grads.append(T.grad(cost, param))
update = []
for param,grad in zip(self.params, grads):
update.append((param, param - grad*self.lr))
train_fn = theano.function(inputs = [inp,target], outputs = cost, updates = update)
return train_fn
def get_pred_graph(self,inp):
predictions = theano.function(inputs = [inp], outputs = self.y_t)
return predictions
def convert_string(file):
f = open(file,'r')
text = f.read()
f.close()
inp = np.zeros([len(text), 256],dtype=theano.config.floatX)
out = np.zeros([len(text), 256],dtype=theano.config.floatX)
counter = 0
for char in text:
if(counter > 0):
inp[counter][ord(char)] = 1
out[counter-1][ord(char)] = 1
counter = counter + 1
return [inp, out]
train_data = convert_string("log")
j = 0
model = RNN(256,256,1000)
n_epoch=10
dtype = dtype = theano.config.floatX
def train_rnn(train_data, n_epoch = 100):
train_err = np.ndarray(n_epoch)
for i in range(n_epoch):
for j in range(len(train_data[0])):
tempInp = np.zeros([1,256],dtype=dtype);
tempInp[0] = train_data[0][j]
tempOut = np.zeros([1,256],dtype=dtype);
tempOut[0] = train_data[1][j]
train_cost = model.learn_rnn_fn(tempInp, tempOut)
sys.stdout.write((model.pred_rnn_fn(tempInp)))
train_err[i]=train_err[i]+ train_cost
train_err[i]= train_err[i]/len(train_data[0])
print train_cost
print "\n"
return train_err
train_errors = train_rnn(train_data, n_epoch)
print train_errors
def plot_learning_curve(train_err):
plt.plot(np.arange(n_epoch), train_errors, 'b-')
plt.xlabel('epochs')
plt.ylabel('error')
plt.show()
plot_learning_curve(train_errors)
但是在 6.23 的初始错误之后,我得到了所有的 nans。有人可以解释代码中是否存在错误。我发布了整个代码,这样我就不会错过错误的部分(而且我不知道它是哪一个)
谢谢!
【问题讨论】:
【参考方案1】:虽然 Marcin Możejko 关于日志中 NaN 数值问题的来源可能是正确的,但 theano 的文档在 dealing with NaNs 上提供了很好的一般性建议。
特别是如何使用NaN guard - 只要张量变量包含NaN,就会抛出错误:
from theano.compile.nanguardmode import NanGuardMode
...
... = theano.function(..., mode=NanGuardMode(nan_is_error=True,
inf_is_error=True,
big_is_error=True)
)
【讨论】:
以上是关于Theano,循环神经网络,误差为 nan的主要内容,如果未能解决你的问题,请参考以下文章