Keras vs PyTorch LSTM different results

Posted: 2019-11-16 19:50:00

Question:

I am trying to get similar results with Keras and PyTorch on the same dataset.

Data

from numpy import array
from numpy import hstack

from sklearn.model_selection import train_test_split  
 

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)
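A quick worked example of what split_sequences produces (hypothetical demo data, using the imports above):

# hypothetical demo: 5 rows of [in1, in2, target], window of 3 timesteps
demo = array([[0, 5, 5], [10, 15, 25], [20, 25, 45], [30, 35, 65], [40, 45, 85]])
X_demo, y_demo = split_sequences(demo, 3)
print(X_demo.shape)  # (3, 3, 2) -> 3 windows, 3 timesteps, 2 features
print(y_demo)        # [45 65 85] -> target taken at each window's last row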

def get_data():
    # define input sequence
    in_seq1 = array([x for x in range(0,500,10)])/1
    in_seq2 = array([x for x in range(5,505,10)])/1
    out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])
    # convert to [rows, columns] structure
    in_seq1 = in_seq1.reshape((len(in_seq1), 1))
    in_seq2 = in_seq2.reshape((len(in_seq2), 1))
    out_seq = out_seq.reshape((len(out_seq), 1))
    # horizontally stack columns
    dataset = hstack((in_seq1, in_seq2, out_seq))
    
    n_features = 2 # this is number of parallel inputs
    n_timesteps = 3 # this is number of timesteps
    
    # convert into input/output
    X, y = split_sequences(dataset, n_timesteps)
    print(X.shape, y.shape)
    X_train,x_test,Y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle=False)
    
    return X_train,x_test,Y_train, y_test
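For reference, with 50 input rows and a window of 3 timesteps this yields 48 samples, split 38/10; a minimal sanity check:

X_train, x_test, Y_train, y_test = get_data()
# get_data() itself prints: (48, 3, 2) (48,)
print(X_train.shape, Y_train.shape)  # (38, 3, 2) (38,)
print(x_test.shape, y_test.shape)    # (10, 3, 2) (10,)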

Keras

import keras  # needed below for keras.optimizers / keras.backend
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

from sklearn.metrics import mean_squared_error

import testing.TimeSeries.datacreator as dc # !!!!change this!!!!
X_train,x_test,Y_train, y_test = dc.get_data()

n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps

# define model
model = Sequential()
model.add(LSTM(1024, activation='relu', 
               input_shape=(n_timesteps, n_features),
               kernel_initializer='uniform',
               recurrent_initializer='uniform'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1))
opt = keras.optimizers.Adam(lr=0.001,
                      beta_1=0.9,
                      beta_2=0.999,
                      epsilon=keras.backend.epsilon(),  # default fuzz factor
                      decay=0.0,
                      amsgrad=False)
model.compile(optimizer=opt, loss='mse')
# fit model
model.fit(X_train, Y_train, epochs=200, verbose=1,validation_data=(x_test,y_test))    
    
yhat = model.predict(x_test, verbose=0)    
    
mean_squared_error(y_test, yhat)  
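For later comparison it is worth eyeballing the layer shapes; a sketch of what model.summary() reports here (layer names are whatever Keras auto-generates):

model.summary()
# lstm    (LSTM)  -> (None, 1024)  # return_sequences=False by default: last timestep only
# dense   (Dense) -> (None, 512)
# dense_1 (Dense) -> (None, 1)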

PyTorch - module class

import numpy as np
import torch
import torch.nn.functional as F
 
from sklearn.metrics import mean_squared_error
 
import testing.TimeSeries.datacreator as dc # !!!! change this !!!!   
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps    

class MV_LSTM(torch.nn.Module):
    def __init__(self,n_features,seq_length):
        super(MV_LSTM, self).__init__()
        self.n_features = n_features # number of parallel inputs
        self.seq_len = seq_length # number of timesteps
        self.n_hidden = 1024 # number of hidden states
        self.n_layers = 1 # number of LSTM layers (stacked)
    
        self.l_lstm = torch.nn.LSTM(input_size = n_features, 
                                 hidden_size = self.n_hidden,
                                 num_layers = self.n_layers, 
                                 batch_first = True)
        # according to pytorch docs LSTM output is 
        # (batch_size,seq_len, num_directions * hidden_size)
        # when considering batch_first = True
        self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
#        self.l_linear1 = torch.nn.Linear(512, 512)
        self.l_linear2 = torch.nn.Linear(512, 1)
        
    
    def init_hidden(self, batch_size):
        # hidden/cell state shapes stay (n_layers, batch, n_hidden) even with batch_first = True
        hidden_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
        cell_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
        self.hidden = (hidden_state, cell_state)
    
    
    def forward(self, x):        
        batch_size, seq_len, _ = x.size()
        
        lstm_out, self.hidden = self.l_lstm(x,self.hidden)
        # lstm_out(with batch_first = True) is 
        # (batch_size,seq_len,num_directions * hidden_size)
        # for following linear layer we want to keep batch_size dimension and merge rest       
        # .contiguous() -> solves tensor compatibility error
        x = lstm_out.contiguous().view(batch_size,-1)
        x = F.relu(x)
        x = F.relu(self.l_linear(x))
#        x = F.relu(self.l_linear1(x))
        x = self.l_linear2(x)
        return x
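A quick shape check of this module (a minimal sketch with random input):

net = MV_LSTM(n_features, n_timesteps)
net.init_hidden(batch_size=4)
out = net(torch.randn(4, n_timesteps, n_features))
print(out.shape)  # torch.Size([4, 1]) -- the LSTM output is flattened to (4, 3*1024) first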

PyTorch - initialization and training

# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss()
import keras # for epsilon constant
optimizer = torch.optim.Adam(mv_net.parameters(), 
                             lr=1e-3,
                             betas=[0.9,0.999],
                             eps=keras.backend.epsilon(),
                             weight_decay=0,
                             amsgrad=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mv_net.to(device)


train_episodes = 200
batch_size = 32
eval_batch_size = 32

for t in range(train_episodes):
    # TRAIN
    mv_net.train()
    for b in range(0,len(X_train),batch_size):
        inpt = X_train[b:b+batch_size,:,:]
        target = Y_train[b:b+batch_size]    
        
        x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)    
        y_batch = torch.tensor(target,dtype=torch.float32).to(device) 
    
        mv_net.init_hidden(x_batch.size(0))
        
        output = mv_net(x_batch) 
        loss = criterion(output.view(-1), y_batch)  
        
        optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()
        optimizer.step()
    
    # EVAL ('acc' below accumulates per-batch MSE, not classification accuracy)
    mv_net.eval()
    acc = 0
    for b in range(0,len(x_test),eval_batch_size):
        inpt = x_test[b:b+eval_batch_size,:,:]
        target = y_test[b:b+eval_batch_size]    
        
        x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)    
        y_batch = torch.tensor(target,dtype=torch.float32).to(device) 
        mv_net.init_hidden(x_batch.size(0))
        
        output = mv_net(x_batch)
        acc += mean_squared_error(y_batch.cpu().detach().numpy(), output.view(-1).cpu().detach().numpy()) 
    print('step:' , t , 'train loss:' , round(loss.item(),3),'eval acc:',round(acc/len(x_test),3))


mv_net.init_hidden(len(x_test))
val = torch.tensor(x_test,dtype=torch.float32).to(device) 
otp = mv_net(val) 
print(mean_squared_error(y_test, otp.view(-1).cpu().detach().numpy()))
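As an aside, the evaluation passes above build an autograd graph for nothing; wrapping them in torch.no_grad() is the usual pattern (a minimal sketch of the final test pass):

mv_net.eval()
with torch.no_grad():  # no graph is built, saves memory during inference
    mv_net.init_hidden(len(x_test))
    val = torch.tensor(x_test, dtype=torch.float32).to(device)
    otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().numpy()))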

Results

Keras produces a test MSE of almost 0, while PyTorch lands around 6000, which is far too large a difference.

I have tried a number of tweaks to the PyTorch code, but none of them got me anywhere near the Keras results, even with identical optimization parameters.

I cannot see what is wrong with the (fairly tutorial-like) PyTorch code.

Comments:

- Could you please check whether the difference arises in the LSTM forward pass or in the optimization?
- I ran into a similar issue; the Keras version of my code always seems to perform better. I think it has to do with weight initialization, since the two frameworks initialize weights differently.
- I have the same problem: keras > pytorch for LSTM.
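(To illustrate the initialization point above: Keras' 'uniform' initializer draws from U(-0.05, 0.05), while PyTorch's LSTM defaults to U(-1/√hidden_size, 1/√hidden_size). A sketch of forcing the Keras scheme onto the PyTorch model, assuming the mv_net instance from the question:)

for name, param in mv_net.l_lstm.named_parameters():
    if 'weight' in name:
        torch.nn.init.uniform_(param, -0.05, 0.05)  # Keras 'uniform' default range
    elif 'bias' in name:
        torch.nn.init.zeros_(param)  # Keras bias_initializer default is 'zeros'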

Answer 1:

I know it is almost a year late, but I ran into the same problem, and I think the issue is the following. The Keras documentation says:

return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.

Since return_sequences defaults to False, the Keras LSTM hands only the last timestep's output (shape (batch_size, 1024)) to the Dense layer, whereas your PyTorch forward flattens all timesteps (shape (batch_size, 3*1024)). This basically means that your self.l_linear needs to be torch.nn.Linear(1024, 512) instead of self.n_hidden*self.seq_len, 512.
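A sketch of the corresponding change in __init__ (only the first linear layer changes):

# in MV_LSTM.__init__: the linear layer now sees only the last hidden state
self.l_linear = torch.nn.Linear(self.n_hidden, 512)   # 1024 -> 512
self.l_linear2 = torch.nn.Linear(512, 1)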

Now you also need to do the same as Keras and use only the last output in your forward pass:

    def forward(self, x):        
        batch_size, seq_len, _ = x.size()

        lstm_out, self.hidden = self.l_lstm(x,self.hidden)

        x = lstm_out[:,-1]
        x = torch.nn.functional.relu(x)
        x = torch.nn.functional.relu(self.l_linear(x))
        x = self.l_linear2(x)
        return x

When I ran your example (I needed to adjust it a bit to get it running), I got very similar training losses.

Keras:

38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259

PyTorch:

step: 199 train loss: 41.043 eval acc: 1142.688

I hope this helps other people who run into a similar problem.

Also note that Keras resets the hidden state between batches by default (stateful=False).
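(A sketch of what that flag looks like on the Keras side; the PyTorch code above already mimics it by calling init_hidden for every batch:)

model.add(LSTM(1024, activation='relu',
               input_shape=(n_timesteps, n_features),
               stateful=False))  # default: hidden state is reset after each batch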

