The loss of my RNN is not decreasing at all
I have already tried changing the weight initialization, the learning rate and the batch size, and switching the activation function to ReLU, but the loss still does not decrease. Here is the code:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import numpy as np

no_time_steps = 28
input_size = 28
hidden_size = 30
output_size = 10
batch_size = 100
num_epochs = 2
learning_rate = 0.01
dtype = torch.DoubleTensor

# MNIST Dataset
train_dataset = dsets.MNIST(root='./data/',
                            train=True,
                            transform=transforms.ToTensor(),
                            download=True)
test_dataset = dsets.MNIST(root='./data/',
                           train=False,
                           transform=transforms.ToTensor())

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.wxh = Variable(torch.randn(input_size, hidden_size).type(dtype) * 0.1, requires_grad=True)
        self.whh = Variable(torch.randn(hidden_size, hidden_size).type(dtype) * 0.1, requires_grad=True)
        self.why = Variable(torch.randn(hidden_size, output_size).type(dtype) * 0.1, requires_grad=True)
        self.by = Variable(torch.Tensor(batch_size, output_size).type(dtype).zero_(), requires_grad=True)
        self.bh = Variable(torch.Tensor(batch_size, hidden_size).type(dtype).zero_(), requires_grad=True)
        self.mWxh = torch.zeros_like(self.wxh)
        self.mWhh = torch.zeros_like(self.whh)
        self.mWhy = torch.zeros_like(self.why)
        self.mbh = torch.zeros_like(self.bh)
        self.mby = torch.zeros_like(self.by)
        self.dwxh, self.dwhh, self.dwhy = torch.zeros_like(self.wxh), torch.zeros_like(self.whh), torch.zeros_like(self.why)
        self.dbh, self.dby = torch.zeros_like(self.bh), torch.zeros_like(self.by)

    def hidden_init(self, batch_size):
        self.hidden = {}
        self.hidden[0] = Variable(torch.Tensor(batch_size, hidden_size).type(dtype).zero_())

    def tanh(self, value):
        return (torch.exp(value) - torch.exp(-value)) / (torch.exp(value) + torch.exp(-value))

    def parameter(self):
        self.params = torch.nn.ParameterList([torch.nn.Parameter(self.wxh.data),
                                              torch.nn.Parameter(self.whh.data),
                                              torch.nn.Parameter(self.why.data),
                                              torch.nn.Parameter(self.bh.data),
                                              torch.nn.Parameter(self.by.data)])
        return self.params

    def grad_data(self):
        print(self.dwxh, self.dwhy)

    def softmax(self, value):
        return torch.exp(value) / torch.sum(torch.exp(value))

    def updatess(self, lr):
        for param, dparam, mem in zip([self.wxh, self.whh, self.why, self.bh, self.by],
                                      [self.dwxh, self.dwhh, self.dwhy, self.dbh, self.dby],
                                      [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem.data += dparam.data * dparam.data
            param.data += -learning_rate * dparam.data / torch.sqrt(mem.data + 1e-8)

    def forward(self, inputs, batch_size, no_time_steps, labels):
        self.hidden_init(batch_size)
        inputs = Variable(inputs.type(dtype))
        self.output = Variable(torch.Tensor(no_time_steps, batch_size, self.output_size).type(dtype))
        for t in xrange(no_time_steps):
            if t == 0:
                self.hidden[t] = torch.matmul(self.hidden[0], self.whh)
                self.hidden[t] += torch.matmul(inputs[:, t, :], self.wxh)
                self.hidden[t] = self.tanh(self.hidden[t] + self.bh)
            else:
                self.hidden[t] = torch.matmul(self.hidden[t - 1], self.whh)
                self.hidden[t] += torch.matmul(inputs[:, t, :], self.wxh)
                self.hidden[t] = self.tanh(self.hidden[t] + self.bh)
            self.output[t] = self.softmax(torch.matmul(self.hidden[t], self.why) + self.by)
        return self.output

    def backward(self, loss, label, inputs):
        inputs = Variable(inputs.type(dtype))
        self.dhnext = torch.zeros_like(self.hidden[0])
        self.dy = self.output[27].clone()
        self.dy[:, int(label[0])] = self.dy[:, int(label[0])] - 1
        self.dwhy += torch.matmul(self.hidden[27].t(), self.dy)
        self.dby += self.dy
        for t in reversed(xrange(no_time_steps)):
            self.dh = torch.matmul(self.dy, self.why.t()) + self.dhnext      # backprop into h
            self.dhraw = (1 - self.hidden[t] * self.hidden[t]) * self.dh     # backprop through tanh nonlinearity
            self.dbh += self.dhraw                                           # derivative of hidden bias
            self.dwxh += torch.matmul(inputs[:, t, :].t(), self.dhraw)       # derivative of input-to-hidden weight
            self.dwhh += torch.matmul(self.hidden[t - 1].t(), self.dhraw)    # derivative of hidden-to-hidden weight
            self.dhnext = torch.matmul(self.dhraw, self.whh.t())

rnn = RNN(input_size, hidden_size, output_size, batch_size)

def onehot(values, shape):
    temp = torch.Tensor(shape).zero_()
    for k, j in enumerate(labels):
        temp[k][int(j)] = 1
    return Variable(temp)

for epoch in range(5):
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, no_time_steps, input_size)
        outputs = rnn(images, batch_size, no_time_steps, labels)
        labels = Variable(labels.double())
        output = outputs[27, :, :]
        labelss = onehot(labels, output.shape)
        loss = -torch.mul(torch.log(output), labelss.double())
        loss = torch.sum(loss)
        rnn.backward(loss, labels, images)
        rnn.updatess(0.01)
        if i == 1110:
            break
        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))
OUTPUT:
Epoch [1/2], Step [100/600], Loss: 714.8081
Epoch [1/2], Step [200/600], Loss: 692.7232
Epoch [1/2], Step [300/600], Loss: 700.1103
Epoch [1/2], Step [400/600], Loss: 698.5468
Epoch [1/2], Step [500/600], Loss: 702.1227
Epoch [1/2], Step [600/600], Loss: 705.9571
It is hard to find a bug in code like this, so I would suggest simplifying things a little:
1) If you write self.wxh = Parameter(...) instead of self.wxh = Variable(...), PyTorch keeps track of the parameter automatically. So change all of your Variables to Parameters and delete your parameter function.
2) If your forward function is built only from operations that already have a defined backward, PyTorch computes the backward pass for you. So delete your backward function, in case it contains a bug.
3) Use loss = torch.mean(loss) instead of loss = torch.sum(loss), so that your learning rate does not depend on the batch size.
4) Using backward correctly is a bit tricky in PyTorch, so instead of updating the weights by hand, use an optimizer (a sketch that pulls all four points together follows after the snippet below):
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.03)
for epoch in range(5):
    ...
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
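For reference, here is a minimal sketch of what the model might look like after applying points 1) to 4), written for a current PyTorch version (no Variable wrapper needed). The class name SimpleRNN and the use of nn.CrossEntropyLoss, which replaces the hand-written softmax and log loss and already averages over the batch, are my own choices rather than something from the question; train_loader is assumed to be the one defined above.

import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        # nn.Parameter registers the tensors, so rnn.parameters() finds them
        self.wxh = nn.Parameter(torch.randn(input_size, hidden_size) * 0.1)
        self.whh = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.1)
        self.why = nn.Parameter(torch.randn(hidden_size, output_size) * 0.1)
        self.bh = nn.Parameter(torch.zeros(hidden_size))
        self.by = nn.Parameter(torch.zeros(output_size))

    def forward(self, inputs):                       # inputs: (batch, time, input_size)
        batch_size, time_steps, _ = inputs.shape
        h = torch.zeros(batch_size, self.hidden_size)
        for t in range(time_steps):
            # plain tanh RNN step; autograd records it, so no manual backward is needed
            h = torch.tanh(inputs[:, t, :] @ self.wxh + h @ self.whh + self.bh)
        return h @ self.why + self.by                # raw logits of the last time step

rnn = SimpleRNN(28, 30, 10)
criterion = nn.CrossEntropyLoss()                    # softmax + negative log-likelihood, averaged over the batch
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.03)

for images, labels in train_loader:                  # train_loader as defined in the question
    images = images.view(-1, 28, 28)
    loss = criterion(rnn(images), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()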
If it still does not learn after these changes, there may be a problem with your RNN itself. In that case, try PyTorch's predefined RNN to check whether your dataset can be learned by an RNN at all (see the sketch below).
If that fixes it, you can undo the changes above one by one to find out where the problem was.
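As a concrete version of that sanity check, here is a minimal sketch using PyTorch's built-in nn.RNN. The BuiltinRNN name, the nn.Linear classifier head, and the training settings are illustrative assumptions; the layer sizes follow the question and train_loader is again the one defined there.

import torch
import torch.nn as nn

class BuiltinRNN(nn.Module):
    def __init__(self, input_size=28, hidden_size=30, output_size=10):
        super(BuiltinRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):                    # x: (batch, time, input_size)
        out, _ = self.rnn(x)                 # out: (batch, time, hidden_size)
        return self.fc(out[:, -1, :])        # classify from the last time step

model = BuiltinRNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.03)

for epoch in range(2):
    for images, labels in train_loader:      # train_loader from the question
        images = images.view(-1, 28, 28)
        loss = criterion(model(images), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

If this baseline learns MNIST but the hand-rolled model does not, the issue is in the custom model or its update rule rather than in the data pipeline.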