The loss of my RNN is not decreasing at all
I have already tried changing the weight initialization, the learning rate and the batch size, and switching the activation function to ReLU, but the loss still does not decrease. Here is the code:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import numpy as np

no_time_steps = 28
input_size = 28
hidden_size = 30
output_size = 10
batch_size = 100
num_epochs = 2
learning_rate = 0.01
dtype = torch.DoubleTensor

# MNIST Dataset
train_dataset = dsets.MNIST(root='./data/',
                            train=True,
                            transform=transforms.ToTensor(),
                            download=True)
test_dataset = dsets.MNIST(root='./data/',
                           train=False,
                           transform=transforms.ToTensor())

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.wxh = Variable(torch.randn(input_size, hidden_size).type(dtype) * 0.1, requires_grad=True)
        self.whh = Variable(torch.randn(hidden_size, hidden_size).type(dtype) * 0.1, requires_grad=True)
        self.why = Variable(torch.randn(hidden_size, output_size).type(dtype) * 0.1, requires_grad=True)
        self.by = Variable(torch.Tensor(batch_size, output_size).type(dtype).zero_(), requires_grad=True)
        self.bh = Variable(torch.Tensor(batch_size, hidden_size).type(dtype).zero_(), requires_grad=True)
        self.mWxh = torch.zeros_like(self.wxh)
        self.mWhh = torch.zeros_like(self.whh)
        self.mWhy = torch.zeros_like(self.why)
        self.mbh = torch.zeros_like(self.bh)
        self.mby = torch.zeros_like(self.by)
        self.dwxh, self.dwhh, self.dwhy = torch.zeros_like(self.wxh), torch.zeros_like(self.whh), torch.zeros_like(self.why)
        self.dbh, self.dby = torch.zeros_like(self.bh), torch.zeros_like(self.by)

    def hidden_init(self, batch_size):
        self.hidden = {}
        self.hidden[0] = Variable(torch.Tensor(batch_size, hidden_size).type(dtype).zero_())

    def tanh(self, value):
        return (torch.exp(value) - torch.exp(-value)) / (torch.exp(value) + torch.exp(-value))

    def parameter(self):
        self.params = torch.nn.ParameterList([torch.nn.Parameter(self.wxh.data),
                                              torch.nn.Parameter(self.whh.data),
                                              torch.nn.Parameter(self.why.data),
                                              torch.nn.Parameter(self.bh.data),
                                              torch.nn.Parameter(self.by.data)])
        return self.params

    def grad_data(self):
        print(self.dwxh, self.dwhy)

    def softmax(self, value):
        return torch.exp(value) / torch.sum(torch.exp(value))

    def updatess(self, lr):
        for param, dparam, mem in zip([self.wxh, self.whh, self.why, self.bh, self.by],
                                      [self.dwxh, self.dwhh, self.dwhy, self.dbh, self.dby],
                                      [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem.data += dparam.data * dparam.data
            param.data += -learning_rate * dparam.data / torch.sqrt(mem.data + 1e-8)

    def forward(self, inputs, batch_size, no_time_steps, labels):
        self.hidden_init(batch_size)
        inputs = Variable(inputs.type(dtype))
        self.output = Variable(torch.Tensor(no_time_steps, batch_size, self.output_size).type(dtype))
        for t in xrange(no_time_steps):
            if t == 0:
                self.hidden[t] = torch.matmul(self.hidden[0], self.whh)
                self.hidden[t] += torch.matmul(inputs[:, t, :], self.wxh)
                self.hidden[t] = self.tanh(self.hidden[t] + self.bh)
            else:
                self.hidden[t] = torch.matmul(self.hidden[t - 1], self.whh)
                self.hidden[t] += torch.matmul(inputs[:, t, :], self.wxh)
                self.hidden[t] = self.tanh(self.hidden[t] + self.bh)
            self.output[t] = self.softmax(torch.matmul(self.hidden[t], self.why) + self.by)
        return self.output

    def backward(self, loss, label, inputs):
        inputs = Variable(inputs.type(dtype))
        self.dhnext = torch.zeros_like(self.hidden[0])
        self.dy = self.output[27].clone()
        self.dy[:, int(label[0])] = self.dy[:, int(label[0])] - 1
        self.dwhy += torch.matmul(self.hidden[27].t(), self.dy)
        self.dby += self.dy
        for t in reversed(xrange(no_time_steps)):
            self.dh = torch.matmul(self.dy, self.why.t()) + self.dhnext      # backprop into h
            self.dhraw = (1 - self.hidden[t] * self.hidden[t]) * self.dh     # backprop through tanh nonlinearity
            self.dbh += self.dhraw                                           # derivative of hidden bias
            self.dwxh += torch.matmul(inputs[:, t, :].t(), self.dhraw)       # derivative of input-to-hidden weight
            self.dwhh += torch.matmul(self.hidden[t - 1].t(), self.dhraw)    # derivative of hidden-to-hidden weight
            self.dhnext = torch.matmul(self.dhraw, self.whh.t())

rnn = RNN(input_size, hidden_size, output_size, batch_size)

def onehot(values, shape):
    temp = torch.Tensor(shape).zero_()
    for k, j in enumerate(labels):
        temp[k][int(j)] = 1
    return Variable(temp)

for epoch in range(5):
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, no_time_steps, input_size)
        outputs = rnn(images, batch_size, no_time_steps, labels)
        labels = Variable(labels.double())
        output = outputs[27, :, :]
        labelss = onehot(labels, output.shape)
        loss = -torch.mul(torch.log(output), labelss.double())
        loss = torch.sum(loss)
        rnn.backward(loss, labels, images)
        rnn.updatess(0.01)
        if i == 1110:
            break
        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))
OUTPUT:
Epoch [1/2], Step [100/600], Loss: 714.8081
Epoch [1/2], Step [200/600], Loss: 692.7232
Epoch [1/2], Step [300/600], Loss: 700.1103
Epoch [1/2], Step [400/600], Loss: 698.5468
Epoch [1/2], Step [500/600], Loss: 702.1227
Epoch [1/2], Step [600/600], Loss: 705.9571
It is hard to find a bug in code like this, so I would suggest simplifying things a little:
1) If you write self.wxh = Parameter(...) instead of self.wxh = Variable(...), PyTorch keeps track of the parameter automatically. So change all of your Variables to Parameters and delete your parameter function.
2) If your forward function is built only from operations that already have a defined backward, PyTorch computes the backward pass for you. So delete your backward function, in case it contains a bug.
3) Use loss = torch.mean(loss) instead of loss = torch.sum(loss), so that your learning rate does not depend on the batch size.
4) Using backward correctly is a bit tricky in PyTorch, so instead of updating the weights by hand, use an optimizer (a sketch that pulls all four points together follows after the snippet below):
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.03)
for epoch in range(5):
    ...
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
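For reference, here is a minimal sketch of what the model might look like after applying points 1) to 4), written for a current PyTorch version (no Variable wrapper needed). The class name SimpleRNN and the use of nn.CrossEntropyLoss, which replaces the hand-written softmax and log loss and already averages over the batch, are my own choices rather than something from the question; train_loader is assumed to be the one defined above.

import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        # nn.Parameter registers the tensors, so rnn.parameters() finds them
        self.wxh = nn.Parameter(torch.randn(input_size, hidden_size) * 0.1)
        self.whh = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.1)
        self.why = nn.Parameter(torch.randn(hidden_size, output_size) * 0.1)
        self.bh = nn.Parameter(torch.zeros(hidden_size))
        self.by = nn.Parameter(torch.zeros(output_size))

    def forward(self, inputs):                       # inputs: (batch, time, input_size)
        batch_size, time_steps, _ = inputs.shape
        h = torch.zeros(batch_size, self.hidden_size)
        for t in range(time_steps):
            # plain tanh RNN step; autograd records it, so no manual backward is needed
            h = torch.tanh(inputs[:, t, :] @ self.wxh + h @ self.whh + self.bh)
        return h @ self.why + self.by                # raw logits of the last time step

rnn = SimpleRNN(28, 30, 10)
criterion = nn.CrossEntropyLoss()                    # softmax + negative log-likelihood, averaged over the batch
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.03)

for images, labels in train_loader:                  # train_loader as defined in the question
    images = images.view(-1, 28, 28)
    loss = criterion(rnn(images), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()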
If it still does not learn after these changes, there may be a problem with your RNN itself. In that case, try PyTorch's predefined RNN to check whether your dataset can be learned by an RNN at all (see the sketch below).
If that fixes it, you can undo the changes above one by one to find out where the problem was.
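As a concrete version of that sanity check, here is a minimal sketch using PyTorch's built-in nn.RNN. The BuiltinRNN name, the nn.Linear classifier head, and the training settings are illustrative assumptions; the layer sizes follow the question and train_loader is again the one defined there.

import torch
import torch.nn as nn

class BuiltinRNN(nn.Module):
    def __init__(self, input_size=28, hidden_size=30, output_size=10):
        super(BuiltinRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):                    # x: (batch, time, input_size)
        out, _ = self.rnn(x)                 # out: (batch, time, hidden_size)
        return self.fc(out[:, -1, :])        # classify from the last time step

model = BuiltinRNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.03)

for epoch in range(2):
    for images, labels in train_loader:      # train_loader from the question
        images = images.view(-1, 28, 28)
        loss = criterion(model(images), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

If this baseline learns MNIST but the hand-rolled model does not, the issue is in the custom model or its update rule rather than in the data pipeline.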