NLP — Hit Me If You Can't Learn It! Learn the Basics in Half an Hour, Part 15: Bert Binary Classification in 100 Lines (Code Included)
Posted by 我是小白呀
Overview
Starting today we begin a journey into natural language processing (NLP). NLP lets computers process, understand, and use human language, building a bridge of communication between machine language and human language.
Bert
Bert (Bidirectional Encoder Representations from Transformers) is a pre-trained language representation model. Bert is built mainly on the Transformer's encoder stack, which we won't go over again here.
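To make the "pre-trained representation" part concrete, here is a minimal sketch (not part of the original article's code; it assumes the same Langboat/mengzi-bert-base checkpoint used below and network access to the Hugging Face hub) that encodes one sentence and inspects the encoder outputs:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained("Langboat/mengzi-bert-base")
bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)

# Encode one sentence; return_tensors="tf" gives ready-to-use tensors
enc = tokenizer("今天天气真好", return_tensors="tf")
out = bert(enc)

print(out.last_hidden_state.shape)  # (1, seq_len, 768): one vector per token
print(out.pooler_output.shape)      # (1, 768): pooled [CLS] representation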
A quick word
With everyone's encouragement, we took 4 first places and 1 second place in this year's CCF competitions, and 4th place on Tianchi.
Bert in 100 Lines
The code below is one of the simplest Bert implementations you'll find; part of it comes from my competition source code.
Network architecture
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 780)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 780)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) TFBaseModelOutputWit 102267648 input_1[0][0]
input_2[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 2) 7690 tf_bert_model[0][1]
==================================================================================================
Total params: 102,275,338
Trainable params: 102,275,338
Non-trainable params: 0
__________________________________________________________________________________________________
Hyperparameters
# Hyperparameters
EPOCHS = 50                  # number of training epochs
BATCH_SIZE = 8               # samples per training batch
learning_rate = 0.00003      # learning rate
INPUT_DIM = 36782 + 1        # not used in the model below
MAX_LENGTH = 780             # maximum sequence length
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # optimizer
loss = tf.keras.losses.CategoricalCrossentropy()  # loss (expects one-hot labels)
bert_tokenizer = BertTokenizer.from_pretrained('Langboat/mengzi-bert-base')  # Bert tokenizer
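A note on the loss choice: CategoricalCrossentropy expects one-hot target vectors (unlike SparseCategoricalCrossentropy, which takes integer class ids), which is why the labels are turned into two-element vectors in get_data() below. A quick sanity check, not from the original post:
import tensorflow as tf

cce = tf.keras.losses.CategoricalCrossentropy()
print(float(cce([[0., 1.]], [[0.1, 0.9]])))  # small loss: prediction agrees with the one-hot label
print(float(cce([[0., 1.]], [[0.9, 0.1]])))  # large loss: prediction disagrees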
get_data
def get_data():
    """
    Read the data.
    :return: tokenized inputs, attention masks and labels for the train and validation sets
    """
    # Read the data
    data_train = pd.read_csv("../data/train.csv")
    print(data_train.head(), data_train.shape)
    data_val = pd.read_csv("../data/val.csv")
    print(data_val.head(), data_val.shape)
    # Preprocessing: split the comma-separated label string into a list, e.g. "1,0" -> ["1", "0"]
    data_train["label"] = data_train["label"].apply(lambda x: x.split(","))
    print(data_train.head())
    data_val["label"] = data_val["label"].apply(lambda x: x.split(","))
    print(data_val.head())
    # Extract X / y
    X_train = data_train["text"].values.tolist()
    y_train = data_train["label"].values.tolist()
    y_train = np.asarray(y_train, dtype=np.float32)
    X_val = data_val["text"].values.tolist()
    y_val = data_val["label"].values.tolist()
    y_val = np.asarray(y_val, dtype=np.float32)
    # Tokenize (padding=True pads to the longest sequence in the batch; longer ones are truncated to MAX_LENGTH)
    X_train = bert_tokenizer(X_train, padding=True, truncation=True, max_length=MAX_LENGTH)
    X_val = bert_tokenizer(X_val, padding=True, truncation=True, max_length=MAX_LENGTH)
    print("=" * 20, "vocab size:", bert_tokenizer.vocab_size, "=" * 20)
    # Input ids / attention masks
    train_input = X_train["input_ids"]
    train_mask = X_train["attention_mask"]
    train_input = np.asarray(train_input)
    train_mask = np.asarray(train_mask)
    val_input = X_val["input_ids"]
    val_mask = X_val["attention_mask"]
    val_input = np.asarray(val_input)
    val_mask = np.asarray(val_mask)
    return train_input, val_input, train_mask, val_mask, y_train, y_val
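The post never shows what train.csv / val.csv look like. From the code above, the label column is assumed to already hold a comma-separated one-hot string such as "1,0" or "0,1", so that splitting and float conversion produce a two-element vector matching the 2-unit softmax. A hypothetical example of that layout (the file contents and class meanings are my assumption, not from the original):
import pandas as pd

df = pd.DataFrame({
    "text": ["这部电影太好看了", "剧情拖沓, 不推荐"],
    "label": ["1,0", "0,1"],  # assumed one-hot label strings: class 0 vs class 1
})
df.to_csv("../data/train.csv", index=False)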
main
def main():
    # Load the data
    X_train_input, X_test_input, X_train_mask, X_test_mask, y_train, y_test = get_data()
    # Debug output
    print(X_train_input[:5], X_train_input.shape)
    print(X_test_input[:5], X_test_input.shape)
    print(X_train_mask[:5], X_train_mask.shape)
    print(X_test_mask[:5], X_test_mask.shape)
    print(y_train[:5], y_train.shape)
    print(y_test[:5], y_test.shape)
    # Bert backbone
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    bert = bert([input_ids, masks])
    bert = bert[1]  # pooler_output: the pooled [CLS] representation
    classifier = Dense(2, activation="softmax")(bert)
    # Model
    model = Model(inputs=[input_ids, masks], outputs=classifier)
    print(model.summary())
    # Compile
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    # Checkpoint: keep only the weights with the lowest validation loss
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "../model/bert_mengzi/bert_mengzi.ckpt", monitor='val_loss',
        verbose=1, save_best_only=True, mode='min',
        save_weights_only=True
    )
    # Train
    model.fit([X_train_input, X_train_mask], y_train, validation_data=([X_test_input, X_test_mask], y_test),
              epochs=EPOCHS, batch_size=BATCH_SIZE,
              callbacks=[checkpoint])
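The post stops at training. As a hedged sketch (not from the original), the saved checkpoint can be restored into the same model and used to classify new text; padding="max_length" is used here so inputs always match the (None, 780) input layers. bert_tokenizer, MAX_LENGTH, and model refer to the objects defined above.
import numpy as np

def predict(texts, model):
    # Tokenize with fixed-length padding so shapes match the model's Input layers
    enc = bert_tokenizer(texts, padding="max_length", truncation=True, max_length=MAX_LENGTH)
    ids = np.asarray(enc["input_ids"])
    mask = np.asarray(enc["attention_mask"])
    probs = model.predict([ids, mask])  # (batch, 2) softmax probabilities
    return probs.argmax(axis=-1)        # predicted class index per text

# model.load_weights("../model/bert_mengzi/bert_mengzi.ckpt")
# print(predict(["一条新的待分类文本"], model))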
Complete code
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from transformers import BertTokenizer, TFBertModel

# Hyperparameters
EPOCHS = 50                  # number of training epochs
BATCH_SIZE = 8               # samples per training batch
learning_rate = 0.00003      # learning rate
INPUT_DIM = 36782 + 1        # not used in the model below
MAX_LENGTH = 780             # maximum sequence length
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # optimizer
loss = tf.keras.losses.CategoricalCrossentropy()  # loss (expects one-hot labels)
bert_tokenizer = BertTokenizer.from_pretrained('Langboat/mengzi-bert-base')  # Bert tokenizer
def get_data():
    """
    Read the data.
    :return: tokenized inputs, attention masks and labels for the train and validation sets
    """
    # Read the data
    data_train = pd.read_csv("../data/train.csv")
    print(data_train.head(), data_train.shape)
    data_val = pd.read_csv("../data/val.csv")
    print(data_val.head(), data_val.shape)
    # Preprocessing: split the comma-separated label string into a list, e.g. "1,0" -> ["1", "0"]
    data_train["label"] = data_train["label"].apply(lambda x: x.split(","))
    print(data_train.head())
    data_val["label"] = data_val["label"].apply(lambda x: x.split(","))
    print(data_val.head())
    # Extract X / y
    X_train = data_train["text"].values.tolist()
    y_train = data_train["label"].values.tolist()
    y_train = np.asarray(y_train, dtype=np.float32)
    X_val = data_val["text"].values.tolist()
    y_val = data_val["label"].values.tolist()
    y_val = np.asarray(y_val, dtype=np.float32)
    # Tokenize (padding=True pads to the longest sequence in the batch; longer ones are truncated to MAX_LENGTH)
    X_train = bert_tokenizer(X_train, padding=True, truncation=True, max_length=MAX_LENGTH)
    X_val = bert_tokenizer(X_val, padding=True, truncation=True, max_length=MAX_LENGTH)
    print("=" * 20, "vocab size:", bert_tokenizer.vocab_size, "=" * 20)
    # Input ids / attention masks
    train_input = X_train["input_ids"]
    train_mask = X_train["attention_mask"]
    train_input = np.asarray(train_input)
    train_mask = np.asarray(train_mask)
    val_input = X_val["input_ids"]
    val_mask = X_val["attention_mask"]
    val_input = np.asarray(val_input)
    val_mask = np.asarray(val_mask)
    return train_input, val_input, train_mask, val_mask, y_train, y_val
def main():
    # Load the data
    X_train_input, X_test_input, X_train_mask, X_test_mask, y_train, y_test = get_data()
    # Debug output
    print(X_train_input[:5], X_train_input.shape)
    print(X_test_input[:5], X_test_input.shape)
    print(X_train_mask[:5], X_train_mask.shape)
    print(X_test_mask[:5], X_test_mask.shape)
    print(y_train[:5], y_train.shape)
    print(y_test[:5], y_test.shape)
    # Bert backbone
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    bert = bert([input_ids, masks])
    bert = bert[1]  # pooler_output: the pooled [CLS] representation
    classifier = Dense(2, activation="softmax")(bert)
    # Model
    model = Model(inputs=[input_ids, masks], outputs=classifier)
    print(model.summary())
    # Compile
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    # Checkpoint: keep only the weights with the lowest validation loss
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "../model/bert_mengzi/bert_mengzi.ckpt", monitor='val_loss',
        verbose=1, save_best_only=True, mode='min',
        save_weights_only=True
    )
    # Train
    model.fit([X_train_input, X_train_mask], y_train, validation_data=([X_test_input, X_test_mask], y_test),
              epochs=EPOCHS, batch_size=BATCH_SIZE,
              callbacks=[checkpoint])
if __name__ == '__main__':
    main()