The RNN's loss is not decreasing at all


I have already tried changing the weight initialization, the learning rate, the batch size, and the activation function to ReLU, but the loss still does not decrease. Here is the code:

import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import numpy as np


no_time_steps = 28
input_size = 28
hidden_size = 30
output_size = 10
batch_size = 100
num_epochs = 2
learning_rate = 0.01
dtype = torch.DoubleTensor


# MNIST Dataset
train_dataset = dsets.MNIST(root='./data/',
                        train=True, 
                        transform=transforms.ToTensor(),
                        download=True)

test_dataset = dsets.MNIST(root='./data/',
                       train=False, 
                       transform=transforms.ToTensor())

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                       batch_size=batch_size, 
                                       shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                      batch_size=batch_size, 
                                      shuffle=False)

class RNN(torch.nn.Module):
    def __init__(self,input_size,hidden_size,output_size,batch_size):
        super(RNN, self).__init__()
        self.input_size=input_size
        self.hidden_size=hidden_size
        self.output_size=output_size
        self.wxh=Variable(torch.randn(input_size,hidden_size).type(dtype)*0.1,requires_grad=True)
        self.whh=Variable(torch.randn(hidden_size,hidden_size).type(dtype)*0.1,requires_grad=True)
        self.why=Variable(torch.randn(hidden_size,output_size).type(dtype)*0.1,requires_grad=True)
        self.by=Variable(torch.Tensor(batch_size,output_size).type(dtype).zero_(),requires_grad=True)
        self.bh=Variable(torch.Tensor(batch_size,hidden_size).type(dtype).zero_(),requires_grad=True)

        self.mWxh= torch.zeros_like(self.wxh)
        self.mWhh= torch.zeros_like(self.whh)
        self.mWhy= torch.zeros_like(self.why)
        self.mbh= torch.zeros_like(self.bh)
        self.mby= torch.zeros_like(self.by)
        self.dwxh, self.dwhh, self.dwhy = torch.zeros_like(self.wxh), torch.zeros_like(self.whh), torch.zeros_like(self.why)
        self.dbh, self.dby = torch.zeros_like(self.bh), torch.zeros_like(self.by)

    def hidden_init(self,batch_size):
        self.hidden={}
        self.hidden[0]=Variable(torch.Tensor(batch_size,hidden_size).type(dtype).zero_())

    def tanh(self,value):
        return (torch.exp(value)-torch.exp(-value))/(torch.exp(value)+torch.exp(-value))

    def parameter(self):
        self.params = torch.nn.ParameterList([torch.nn.Parameter(self.wxh.data),torch.nn.Parameter(self.whh.data),torch.nn.Parameter(self.why.data),torch.nn.Parameter(self.bh.data),torch.nn.Parameter(self.by.data)])
        return self.params

    def grad_data(self):
        print(self.dwxh,self.dwhy)

    def softmax(self,value):
        return torch.exp(value) / torch.sum(torch.exp(value))

    def updatess(self,lr):
        for param, dparam, mem in zip([self.wxh, self.whh, self.why, self.bh, self.by],
                                      [self.dwxh,self.dwhh,self.dwhy,self.dbh,self.dby],
                                      [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem.data += dparam.data * dparam.data
            param.data += -learning_rate * dparam.data / torch.sqrt(mem.data + 1e-8)

    def forward(self,inputs,batch_size,no_time_steps,labels):
        self.hidden_init(batch_size)

        inputs=Variable(inputs.type(dtype))
        self.output=Variable(torch.Tensor(no_time_steps,batch_size,self.output_size).type(dtype))

        for t in xrange(no_time_steps):
            if t==0:
                self.hidden[t]=torch.matmul(self.hidden[0],self.whh)
                #print 'time  ',t#,"Inputs",inputs[:,t,:],"Weights",self.wxh
                #print "hidden MATRIX",inputs[:,t,:]
                self.hidden[t]+=torch.matmul(inputs[:,t,:],self.wxh)
                self.hidden[t]=self.tanh(self.hidden[t]+self.bh)
                #print 'time  ',t#,"Inputs",inputs[:,t,:],"Weights",self.wxh
                #print "HIDDEN MATRIX",self.hidden[t]
            else:
                self.hidden[t]=torch.matmul(self.hidden[t-1],self.whh)#+torch.matmul(self.hidden[t-1],self.whh)
                #print 'time  ',t#,"Inputs",inputs[:,t,:],"Weights",self.wxh
                self.hidden[t]+=torch.matmul(inputs[:,t,:],self.wxh)
                self.hidden[t]=self.tanh(self.hidden[t]+self.bh)
            #print 'time  ',t#,"Inputs",inputs[:,t,:],"Weights",self.wxh
            #print "############################################################################################"
            #print "hidden MATRIX",self.hidden[t]
            self.output[t]=self.softmax(torch.matmul(self.hidden[t],self.why)+self.by)
            #print "OUTPUT MATRIX",self.output[t]
        return self.output

    def backward(self,loss,label,inputs):
        inputs=Variable(inputs.type(dtype))
        self.dhnext = torch.zeros_like(self.hidden[0])
        self.dy=self.output[27].clone()
        #print(self.dy.shape)
        self.dy[:,int(label[0])]=self.dy[:,int(label[0])]-1
        #print(self.dy.shape)
        self.dwhy += torch.matmul( self.hidden[27].t(),self.dy)
        self.dby += self.dy
        for t in reversed(xrange(no_time_steps)):
            self.dh = torch.matmul(self.dy,self.why.t()) + self.dhnext # backprop into h
            self.dhraw = (1 - self.hidden[t] * self.hidden[t]) * self.dh # backprop through tanh nonlinearity
            self.dbh += self.dhraw #derivative of hidden bias
            self.dwxh += torch.matmul(inputs[:,t,:].t(),self.dhraw) #derivative of input to hidden layer weight
            self.dwhh += torch.matmul( self.hidden[t-1].t(),self.dhraw) #derivative of hidden layer to hidden layer weight
            self.dhnext = torch.matmul(self.dhraw,self.whh.t())

rnn=RNN(input_size,hidden_size,output_size,batch_size)
def onehot(values,shape):
    temp=torch.Tensor(shape).zero_()
    for k,j in enumerate(labels):
        temp[k][int(j)]=1
    return Variable(temp)

for epoch in range(5):
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, no_time_steps, input_size)
        outputs = rnn(images,batch_size,no_time_steps,labels)
        labels = Variable(labels.double())
        output=outputs[27,:,:]
        labelss=onehot(labels,output.shape)
        #print output
        loss=-torch.mul(torch.log(output),labelss.double())
        #print loss
        loss=torch.sum(loss)
        #print(labels)
        rnn.backward(loss,labels,images)
        rnn.updatess(0.01)
        if i==1110:
            break
        if (i+1) % 100 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))

OUTPUT:

Epoch [1/2], Step [100/600], Loss: 714.8081
Epoch [1/2], Step [200/600], Loss: 692.7232
Epoch [1/2], Step [300/600], Loss: 700.1103
Epoch [1/2], Step [400/600], Loss: 698.5468
Epoch [1/2], Step [500/600], Loss: 702.1227
Epoch [1/2], Step [600/600], Loss: 705.9571

Answer

It is hard to spot a bug in code like this. I suggest simplifying it a bit (a sketch that combines all four changes follows after point 4):

1) If you write self.wxh = Parameter instead of self.wxh = Variable, PyTorch registers the weights as module parameters automatically, so change all of your Variables to Parameters and delete your parameter() method.

2) If your forward function is built only from operations that already have a defined backward, PyTorch computes the backward pass automatically. So delete your hand-written backward function, in case it contains an error.

3) Use loss = torch.mean(loss) instead of loss = torch.sum(loss), so that your learning rate does not depend on the batch size.

4) Calling backward yourself is a bit tricky in PyTorch, so use an optimizer instead:

optimizer = torch.optim.SGD(rnn.parameters(), lr=0.03)

for epoch in range(5):
    ...
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
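
Putting points 1) to 4) together, a minimal sketch of what the module and training loop could look like is below. It is only an illustration, not the original poster's code: it keeps the hyperparameters from the question, assumes a PyTorch version where plain tensors carry gradients (so no Variable wrapper is needed), and swaps the hand-written softmax plus log loss for torch.nn.functional.cross_entropy, which takes raw scores and already averages over the batch.

import torch
import torch.nn.functional as F

class SimpleRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        # nn.Parameter registers each tensor with the module, so
        # rnn.parameters() finds them and autograd tracks their gradients
        self.wxh = torch.nn.Parameter(torch.randn(input_size, hidden_size) * 0.1)
        self.whh = torch.nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.1)
        self.why = torch.nn.Parameter(torch.randn(hidden_size, output_size) * 0.1)
        self.bh = torch.nn.Parameter(torch.zeros(hidden_size))
        self.by = torch.nn.Parameter(torch.zeros(output_size))

    def forward(self, inputs):
        # inputs: (batch, no_time_steps, input_size)
        h = torch.zeros(inputs.size(0), self.whh.size(0), dtype=inputs.dtype)
        for t in range(inputs.size(1)):
            h = torch.tanh(inputs[:, t, :] @ self.wxh + h @ self.whh + self.bh)
        # raw class scores from the last time step; no manual backward needed
        return h @ self.why + self.by

rnn = SimpleRNN(input_size, hidden_size, output_size)
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.03)

for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, no_time_steps, input_size)
        scores = rnn(images)
        loss = F.cross_entropy(scores, labels)  # already a mean over the batch
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()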

If it still does not learn after all this, there may be a problem with your RNN. In that case, try PyTorch's predefined RNN to see whether your dataset is learnable by an RNN at all.

If that solves the problem, you can undo the changes above one by one to find out where the issue was.
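
For that last sanity check, a sketch with the predefined torch.nn.RNN could look like the following. Again this is only an assumed baseline, not part of the original code: the nn.Linear read-out of the last hidden state and the CrossEntropyLoss criterion are choices made here for illustration.

import torch

class BuiltinRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BuiltinRNN, self).__init__()
        # batch_first=True so the input layout matches (batch, time, features)
        self.rnn = torch.nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)            # out: (batch, time, hidden)
        return self.fc(out[:, -1, :])   # scores from the last time step

model = BuiltinRNN(input_size, hidden_size, output_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.03)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for images, labels in train_loader:
        images = images.view(-1, no_time_steps, input_size)
        loss = criterion(model(images), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

If this baseline learns on the same DataLoader, the data pipeline is fine and the remaining differences point at the hand-written forward and backward passes.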
