Not using recurrent_dropout in colab crashing model?

Posted: 2020-09-03 16:58:35

Question:

I am trying to train a simple TensorFlow model to detect the sentiment of tweets. The data types and sizes of the arrays are consistent, and the model trains fine when recurrent_dropout is set to some float value. However, that disables cuDNN, and I would really like to speed things up (wouldn't we all), but whenever I remove the recurrent_dropout argument, model training crashes before the end of the first epoch.

Below is the relevant code; I have omitted the imports and the loading of the csv files. After the relevant code come the final input dimensions and the error output. Also, I have figured out why Colab seemed to be cutting down the training data: Colab displays the count after splitting into batches, so with the default batch size of 32 we get 859 (batches, not individual sequences). The crash when not using recurrent_dropout is still an issue. Side note: this code is a very rough draft, with the data cleaning all done in the same notebook, hence the lack of typical formatting.

def remove_case(X):
    removed_case = []
    X = X.copy()
    for text in X:
        text = str(text).lower()
        removed_case.append(text)
    X = removed_case
    return X


def remove_hyperlinks(X):
    removed_hyperlinks = []
    X = X.copy()
    for text in X:
        text = str(text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'https\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        removed_hyperlinks.append(text)
    X = removed_hyperlinks
    return X


def remove_punctuation(X):
    removed_punc = []
    X = X.copy()
    for text in X:
        text = str(text)
        text = "".join([char for char in text if char not in punctuation])
        removed_punc.append(text)
    X = removed_punc
    return X


def split_text(X):
    split_tweets = []
    X = X.copy()
    for text in X:
        text = str(text).split()
        split_tweets.append(text)
    X = split_tweets
    return X


def map_sentiment(X, l, m, n):
    keys = ['negative', 'neutral', 'positive']
    values = [l, m, n]
    dictionary = dict(zip(keys, values))
    X = X.copy()
    X = X.map(dictionary)
    return X


# # def sentiment_to_onehot(X):
#     sentiment_foofs = []
#     X = X.copy()
#     for integer in X:
#         if integer == "negative":  # Negative
#             integer = [1, 0, 0]
#         elif integer == "neutral":  # Neutral
#             integer = [0, 1, 0]
#         elif integer == "positive":  # Positive
#             integer = [0, 0, 1]
#         else:
#             break
#         sentiment_foofs.append(integer)
#     X = sentiment_foofs
#     return X


train_no_punc_lowercase = train.copy()
train_no_punc_lowercase['text'] = remove_case(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_hyperlinks(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_punctuation(train_no_punc_lowercase['text'])
train_no_punc_lowercase['sentiment'] = map_sentiment(train_no_punc_lowercase['sentiment'], 0, 1, 2)
train_no_punc_lowercase.head()

test_no_punc_lowercase = test.copy()
test_no_punc_lowercase['text'] = remove_case(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_hyperlinks(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_punctuation(test_no_punc_lowercase['text'])
test_no_punc_lowercase['sentiment'] = map_sentiment(test_no_punc_lowercase['sentiment'], 0, 1, 2)

features = train.columns.tolist()
features.remove('textID')  # all unique, high cardinality feature
features.remove('selected_text')  # target
target = 'selected_text'

X_train_no_punc_lowercase = train_no_punc_lowercase[features]
y_train_no_punc_lowercase = train_no_punc_lowercase[target]
X_test_no_punc_lowercase = test_no_punc_lowercase[features]


def stemming_column(df_column):
    ps = PorterStemmer()
    stemmed_word_list = []
    for i, string in enumerate(df_column):
        tokens = word_tokenize(string)
        new_string = ""
        for j, words in enumerate(tokens):
            new_string = new_string + ps.stem(words) + " "
        stemmed_word_list.append(new_string)
    return stemmed_word_list


def create_lookup_table(list1, list2):
    main_list = []
    lookup_dict = {}
    i = 1  # used to create a value in the dictionary
    main_list.append(list1)
    main_list.append(list2)
    for list in main_list:
        for string in list:
            for word in string.split():
                if word not in lookup_dict:
                    lookup_dict[word] = i
                    i += 1
    return lookup_dict


def encode(input_list, input_dict):
    encoded_list = []
    for string in input_list:
        sentence_list = []
        for word in string.split():
            sentence_list.append(input_dict[word])  # value lookup from dictionary.. int
        encoded_list.append(sentence_list)
    return encoded_list


def pad_data(list_of_lists):
    padded_data = tf.keras.preprocessing.sequence.pad_sequences(list_of_lists, padding='post')
    return padded_data


def create_array_sentiment_integers(list):
    sent_int_list = []
    for sentiment in list:
        sent_int_list.append(sentiment)
    return np.asarray(sent_int_list, dtype=np.int32)


X_train_stemmed_list = stemming_column(X_train_no_punc_lowercase['text'])
X_test_stemmed_list = stemming_column(X_test_no_punc_lowercase['text'])
lookup_table = create_lookup_table(X_train_stemmed_list, X_test_stemmed_list)

X_train_encoded_list = encode(X_train_stemmed_list, lookup_table)
X_train_padded_data = pad_data(X_train_encoded_list)

Y_train = create_array_sentiment_integers(train_no_punc_lowercase['sentiment'])
max_features = 3  # 3 choices 0, 1, 2

Y_train_final = np.zeros((Y_train.shape[0], max_features), dtype=np.float32)
Y_train_final[np.arange(Y_train.shape[0]), Y_train] = 1.0

input_dimension = len(lookup_table) + 1
output_dimension = 64
input_length = 33

model = Sequential()
model.add(tf.keras.layers.Embedding(input_dim=input_dimension,
                                    output_dim=output_dimension,
                                    input_length=input_length,
                                    mask_zero=True))

model.add(tf.keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))

model.add(tf.keras.layers.Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train_padded_data, Y_train_final, validation_split=0.20, epochs=10)

model.save('Tweet_sentiment.model')

Additionally, here are the shapes of the dataset:

x train shape:  (27481, 33, 1)
x train type:   <class 'numpy.ndarray'>
y train shape:  (27481, 3)

Error output:

Epoch 1/3
363/859 [===========>..................] - ETA: 9s - loss: 0.5449 - accuracy: 0.5674
---------------------------------------------------------------------------
UnknownError                              Traceback (most recent call last)
<ipython-input-103-1d4af3962607> in <module>()
----> 1 model.fit(X_train_padded_data, Y_train_final, epochs=3,)

8 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     58     ctx.ensure_initialized()
     59     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60                                         inputs, attrs, num_outputs)
     61   except core._NotOkStatusException as e:
     62     if name is not None:

UnknownError:  [_Derived_]  CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1496): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)'
     [[node cond_38/then/_0/CudnnRNNV3]]
     [[sequential_5/lstm_4/StatefulPartitionedCall]] [Op:__inference_train_function_36098]

Function call stack:
train_function -> train_function -> train_function

Comments:

Can you show the shape of your dataset?
@ZabirAlNazi Sure, just added it to the original post right before the error output.
@NickP, it looks like the Number of Time Steps is very high, so when you apply recurrent_dropout it does not crash because some of the Time Steps are dropped. What is the value of input_dimension? Also, please share the complete code so that we can help you. Thanks!
@TensorflowWarriors Hi, thanks for the reply; I have updated the post to include the actual code. I would say there may be some gaps in my understanding of the input parameters and of how to handle the time steps correctly. Thanks for reaching out.

Answer 1:

I have found some issues in your code. They are mentioned below:

You are using input_dimension = len(lookup_table) + 1. len(lookup_table) is nothing but the Number of Time Steps, and its value will be very high, at least more than 30,000. It is recommended to use only a subset of those values. So, setting input_dimension = 10000 or input_dimension = 15000 (you can experiment with this value) should solve the problem. Having said that, it will not affect the accuracy of the model.
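As a rough illustration of that capping idea (this sketch is not from the original post; the cap value, the OOV index, and the helper names are my assumptions), the lookup table built in the question could be limited to a fixed vocabulary, with every word outside it mapped to one shared out-of-vocabulary id:

VOCAB_CAP = 10000                  # assumed cap on the vocabulary size
OOV_INDEX = VOCAB_CAP              # shared id for every word outside the cap

def cap_lookup_table(lookup_dict, cap=VOCAB_CAP):
    # keep only entries whose id is below the cap (ids start at 1; 0 is reserved for padding)
    return {word: idx for word, idx in lookup_dict.items() if idx < cap}

def encode_with_oov(strings, capped_dict, oov=OOV_INDEX):
    # like encode() in the question, but unknown words fall back to the OOV id
    return [[capped_dict.get(word, oov) for word in string.split()] for string in strings]

capped_table = cap_lookup_table(lookup_table)
X_train_encoded_list = encode_with_oov(X_train_stemmed_list, capped_table)
input_dimension = VOCAB_CAP + 1    # +1 so the OOV id (10000) is still a valid embedding index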

Why setting Recurrent Dropout to a float value works ==> when we set Recurrent Dropout, it actually drops some of the Number of Time Steps (input_dimension in your case), and hence it does not crash.

You should use return_sequences=True only if there is another LSTM Layer after an LSTM Layer. Since you have only one LSTM Layer, return_sequences should be set to False.
Since you have 3 classes, you should not use binary_crossentropy. You should use sparse_categorical_crossentropy if you are not One-Hot-Encoding your Target, or categorical_crossentropy if you are One-Hot-Encoding your Target.
Are you sure you want to use Masking in the Embedding Layer?
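Putting those points together, here is a minimal sketch of how the model from the question might look with the changes above applied (kept deliberately close to the question's code: the one-hot encoded Y_train_final stays, hence categorical_crossentropy, and the layer sizes are carried over rather than recommended):

model = Sequential()
model.add(tf.keras.layers.Embedding(input_dim=input_dimension,
                                    output_dim=output_dimension,
                                    input_length=input_length))   # mask_zero dropped, per the Masking question above

# only one LSTM layer, so return_sequences is left at its default of False;
# no recurrent_dropout, so the cuDNN kernel stays available
model.add(tf.keras.layers.LSTM(512, dropout=0.2))
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',   # one-hot targets
              optimizer='adam',
              metrics=['accuracy'])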

Also, I see that you are using many functions and many lines of code for Data-Preprocessing, such as removing Hyperlinks, removing Punctuation, Tokenizing, etc.

So, I thought I would provide an End-To-End Tutorial for Text Classification, which will help you as well as the Stack Overflow community. The code for the same is shown below:

#!pip install tensorflow==2.1
#!pip install nltk
#!pip install tika
#!pip install textblob
#!pip3 install --upgrade numpy
#!pip install scikit-learn

# To handle Paths
import os

# To remove Hyperlinks and Dates
import re

# To remove Punctuations
import string

# This helps to remove the unnecessary words from our Text Data
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# To Parse the Input Data Files
from tika import parser

from textblob import TextBlob

# In order to use the Libraries of Tensorflow
import tensorflow as tf

# For Preprocessing the Text => To Tokenize the Text
from tensorflow.keras.preprocessing.text import Tokenizer
# If the Two Articles are of different length, pad_sequences will make the length equal
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Package for performing Numerical Operations
import numpy as np

# MatplotLib for Plotting Graphs
import matplotlib.pyplot as plt

# To shuffle the Data
from random import shuffle

# To Partition the Data into Train Data and Test Data
from sklearn.model_selection import train_test_split

# To add Regularizer in order to reduce Overfitting
from tensorflow.keras.regularizers import l2

# Give the Path of our Data
Path_Of_Data = 'Data'

# Extract the Labels from the Folders inside the Path mentioned above
Unique_Labels_List = ['negative', 'neutral', 'positive']

def GetNumericLabel(EachLabel):
    if EachLabel=='negative':
        return 0
    elif EachLabel=='neutral':
        return 1
    elif EachLabel=='positive':
        return 2

def Pre_Process_Data_And_Create_BOW(folder_path):
  #creating empty lists in order to Create Resume Text and the respective Label
  Resumes_List = []
  Labels_List = []
  for EachLabel in Unique_Labels_List:      
      for root, dirs, files in os.walk(os.path.join(folder_path, EachLabel),topdown=False):
        for file in files:
          i = 0
          if file.endswith('.pdf'):
            #Access individual file
            Full_Resume_Path = os.path.join(root, file)
            # Parse the Data inside the file
            file_data = parser.from_file(Full_Resume_Path)
            # Extract the Content of the File
            Resume_Text = file_data['content']

            # Below Code removes the Hyperlinks in the Resume, like LinkedIn Profile, Certifications, etc..
            HyperLink_Regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
            Text_Without_HL = re.sub(HyperLink_Regex, ' ', Resume_Text, flags=re.MULTILINE)

            # Below Code removes the Date from the Resume
            Date_regEx = r'(?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+'
            CleanedText = re.sub(Date_regEx,' ',Text_Without_HL)

            List_Of_All_Punctuations = list(string.punctuation)
            Important_Punctuations = ['#', '.', '+' , '-'] #Add more, if any other Punctuation is observed as Important

            NewLineChar = '\n'

            # Below Set Comprises all the Punctuations, which can be Removed from the Text of Resume
            Total_Punct = len(List_Of_All_Punctuations)

            for EachImpPunct in Important_Punctuations:
                for CountOfPunct in range(Total_Punct):
                    if CountOfPunct == Total_Punct:
                        break
                    elif EachImpPunct == List_Of_All_Punctuations[CountOfPunct]:
                        del List_Of_All_Punctuations[CountOfPunct]
                        Total_Punct = Total_Punct - 1

            List_Of_All_Punctuations.append(NewLineChar)

            for EachPunct in List_Of_All_Punctuations:
                CleanedText = CleanedText.replace(EachPunct, " ")

            # Below Code converts all the Words in the Resume to Lowercase ======> Check if it has to come after Tokenization, if the Splitting Code is deleted instead of integrated
            #Final_Cleaned_Resume_Text = Text_Without_Punct.lower()
            Final_Cleaned_Resume_Text = CleanedText.lower()

            #Code to remove Stopwords from each Resume
            # Start from the cleaned text and strip stopwords cumulatively;
            # replacing on Final_Cleaned_Resume_Text each time would keep only the last replacement
            Resume_Text = Final_Cleaned_Resume_Text
            for word in STOPWORDS:
                #stop_token = ' ' + word + ' '
                stop_token = word
                Resume_Text = Resume_Text.replace(stop_token, ' ')
                #Resume_Text = Resume_Text.replace(' ', ' ')
            Resumes_List.append(Resume_Text)
            Numeric_Label = GetNumericLabel(EachLabel)
            Labels_List.append(Numeric_Label)
      #print('Successfully executed for the Folder, ', EachLabel)
  #Return Final Lists
  return Resumes_List, Labels_List

#calling the function and passing the path
Resumes_List,  Labels_List = Pre_Process_Data_And_Create_BOW(Path_Of_Data)

vocab_size = 10000 # This is very important for you
# We want the Output of the Embedding Layer to be 64
embedding_dim = 64
max_length = 800
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
# Taking 80% of the Data as Training Data and remaining 20% will be for Test Data
training_portion = .8

# Size of Train Data is 80% of the Entire Dataset => 0.8 * 2225

Train_Resume_Size = int(len(Resumes_List) * training_portion)

Labels_List = np.asarray(Labels_List)

Train_Resume_Data, Validation_Resume_Data, Train_Labels, Validation_Labels = \
                    train_test_split(Resumes_List, Labels_List, train_size = training_portion, 
                                     shuffle = True
                                     , stratify= Labels_List)

from statistics import mean

print('Average Number of Words in Each Training Resume is {}'.format(mean([len(i) for i in Train_Resume_Data])))

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(Train_Resume_Data)
word_index = tokenizer.word_index

# Convert the Word Tokens into Integer equivalents, before passing it to keras embedding layer
train_sequences = tokenizer.texts_to_sequences(Train_Resume_Data)

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(Validation_Resume_Data)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Check your Data
def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('-------------------------------------------------------------------------')
print(Train_Resume_Data[10])

Regularizer = l2(0.001)

model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size vocab_size (10000), and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              embeddings_regularizer = Regularizer),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 3 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(3, activation='softmax')
])
model.summary()

#Using Early Stopping in order to handle Overfitting
ES_Callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])

num_epochs = 100

history = model.fit(x = train_padded, y = Train_Labels, epochs=num_epochs, 
                    callbacks=[ES_Callback],
                    validation_data=(validation_padded, Validation_Labels),
                    batch_size = 32, shuffle=True, verbose=1)

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

version = 1
MODEL_DIR = 'Resume_Classification_Model'
export_path = os.path.join(MODEL_DIR, str(version))

tf.keras.models.save_model(model = model, filepath = export_path)

!ls -l {export_path}

!saved_model_cli show --dir {export_path} --all
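As a small usage sketch (not part of the original tutorial; the sample sentence is made up), the exported model can be loaded back and used for a prediction along these lines:

loaded_model = tf.keras.models.load_model(export_path)

sample_text = ["the support team was friendly and the issue was fixed quickly"]
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_seq, maxlen=max_length,
                              padding=padding_type, truncating=trunc_type)

probabilities = loaded_model.predict(sample_padded)
predicted_index = int(np.argmax(probabilities, axis=-1)[0])
print(Unique_Labels_List[predicted_index], probabilities[0])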

For more information, please refer to this Beautiful Article.

Hope this resolves your issue. Happy Learning!

