[NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 3: TextCNN & FastText
Posted by Better Bench
Contents
1 Related Posts
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 1: Post-Competition Summary and Analysis
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 2: Data Analysis
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 3: TextCNN & FastText
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 4: Machine Learning with LightGBM
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 5: BERT
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 6: Score-Boosting Tricks
2 Introduction
(1) After getting the data, the first approach was to run a FastText baseline: FastText trains extremely fast, so a baseline can be built in very little time. The implementation below builds the baseline on pretrained word vectors; the alternative, learning embeddings from scratch without pretrained vectors, can be implemented with torchtext's data.Field and is not covered here.
Processing steps for the pretrained word-vector approach:
- Data preprocessing
  - Remove line breaks
  - Remove special characters
  - Remove single characters
  - Collapse multiple spaces
  - Generalize numbers
  - Lowercase
  - Lemmatize
  - Remove stop words
- Choose a tokenizer
  - from nltk import WordPunctTokenizer
- Choose a padding tool
  - tf.keras.preprocessing.sequence.pad_sequences(sequence_train, maxlen=args.max_len)
- Choose a word-to-index encoder
  - tf.keras.preprocessing.text.Tokenizer(lower=True)
- Train word2vec, fastText, and GloVe vector models, then build the embedding matrix from the word indices (see the sketch after this list)
- Choose a network architecture
  - TextCNN
  - FastText
  - DPCNN
  - TextRNN
- Training
  - Initialize network weights
  - Cross-validation
  - Adversarial training
    - FGM
    - PGD
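To make the encoding and padding steps concrete, here is a minimal sketch, assuming preprocessed documents in a list texts, a trained word2vec model on disk, and the Config object args defined in section 3.2; the variable names are illustrative, not the exact competition code.
import numpy as np
import tensorflow as tf
import torch
from gensim.models import KeyedVectors
# texts: list of preprocessed documents (space-separated tokens)
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Pad/truncate every sequence to max_len
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=args.max_len)
# Embedding matrix: row i holds the vector of the word with index i
w2v = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True)
vocab_size = len(tokenizer.word_index) + 1  # index 0 is reserved for padding
embedding_matrix = np.zeros((vocab_size, args.embedding_size), dtype=np.float32)
for word, idx in tokenizer.word_index.items():
    if word in w2v:
        embedding_matrix[idx] = w2v[word]
# The models below receive this matrix (as a tensor) via `pretrained_path`
pretrained = torch.tensor(embedding_matrix)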
(2) I focused on tuning TextCNN and FastText. With the preprocessing pipeline and cross-validation they reached accuracies of 0.78+ and 0.77+ respectively, and with high-quality pseudo-labeled data added, 0.8076 and 0.8070. This shows that data quality determines most of the final accuracy; the choice of architecture and hyperparameters only brings small gains. Due to limited time, TextRNN and DPCNN were not tuned; their online scores were low, so neither was used in the final submission.
(3) For the embedding matrix I compared word2vec, fastText, GloVe, and a concatenation of all three (128 dimensions each). A single 128-dimensional word2vec worked best. Training parameters:
iter = 20
min_count = 3
sample = 1e-5
window = 5
(4) I also tried adversarial training, with the FGM and PGD methods. PGD performed slightly better, but it needs a large epoch count (20+), and with adversarial training enabled both training and convergence become much slower. A minimal FGM sketch is shown below.
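For reference, here is a minimal sketch of the standard FGM formulation, perturbing the embedding weights along the gradient direction; the epsilon value and the emb_name matching are illustrative, not the exact competition code.
import torch

class FGM:
    """Fast Gradient Method: perturb the embedding weights along the
    gradient, run a second backward pass on the adversarial example,
    then restore the original weights."""
    def __init__(self, model, epsilon=1.0):
        self.model = model
        self.epsilon = epsilon
        self.backup = {}

    def attack(self, emb_name='embedding'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self, emb_name='embedding'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                param.data = self.backup[name]
        self.backup = {}

# Usage inside the training loop (sketch):
#   loss.backward()                    # gradients on the clean batch
#   fgm.attack()                       # perturb the embeddings
#   criterion(model(x), y).backward()  # accumulate adversarial gradients
#   fgm.restore()                      # remove the perturbation
#   optimizer.step(); optimizer.zero_grad()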
3 Solution
3.1 Packages
Install packages:
!pip install nltk
!pip install gensim
!pip install python-Levenshtein
!pip install tensorflow-gpu
!pip install glove-python-binary
Import packages:
from nltk import WordPunctTokenizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# import tensorflow as tf
from tqdm import tqdm
from torch.utils import data
import nltk
import re
from nltk.stem import WordNetLemmatizer
import pickle
import torch
import os
import numpy as np
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from sklearn import metrics
from torch.utils.data import DataLoader
cache_dir = 'cache'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
3.2 Configuration
class Config(object):
    def __init__(self):
        self.train_all = "data/train.csv"
        # Cached preprocessed data, to speed up subsequent runs
        self.train_path = "predata/train_x.csv"
        self.valid_path = "predata/valid_x.csv"
        self.testall_path = "data/test.csv"
        self.test_path = "predata/test_x.csv"
        # Mapping from label id to category
        self.label_path = "predata/label_id2cate.pkl"
        # Cached preprocessed data, to speed up subsequent runs
        self.process_trainset_path = "predata/train_set.npy"
        self.process_trainlabel_path = "predata/train_label.npy"
        self.process_testset_path = "predata/test_set.npy"
        # Word vector model paths
        self.fastText_path = "model/fasttext.bin"
        self.word2vec_path = "model/word2vec.bin"
        # Embedding dimension
        self.embedding_size = 128
        # Maximum vocabulary size
        self.max_vocab_size = 50000
        # Maximum sequence length
        self.max_len = 128
        # Number of classes
        self.num_class = 39
        # Model save path
        self.save_path = "saved/"
        self.batch_size = 1000
        self.lr = 0.001
        self.num_epochs = 8  # 50
        # Options: "FastText" or "TextCNN"
        self.model = "TextCNN"

args = Config()
3.3 TextCNN Architecture
class TextCNN(nn.Module):
    def __init__(self, args, pretrained_path):
        super(TextCNN, self).__init__()
        self.dim_embed = args.embedding_size
        # A dropout of 0.6 gave the best accuracy in our text classification experiments
        self.dropout = 0.6
        self.num_filters = 256
        self.kernel_size = (4, 5, 3)
        self.max_len = args.max_len
        self.n_vocab = pretrained_path.shape[0]  # vocab size (used when not loading pretrained vectors)
        self.num_classes = args.num_class  # number of classes
        self.pretrained = True
        self.pretrained_path = pretrained_path
        if self.pretrained:
            self.embedding = nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
        else:
            self.embedding = nn.Embedding(self.n_vocab, self.dim_embed, padding_idx=self.n_vocab - 1)
        self.conv1 = nn.Conv2d(1, self.num_filters, (self.kernel_size[0], self.dim_embed))
        self.conv2 = nn.Conv2d(1, self.num_filters, (self.kernel_size[1], self.dim_embed))
        self.conv3 = nn.Conv2d(1, self.num_filters, (self.kernel_size[2], self.dim_embed))
        self.max_pool1 = nn.MaxPool2d((self.max_len - self.kernel_size[0] + 1, 1))
        self.max_pool2 = nn.MaxPool2d((self.max_len - self.kernel_size[1] + 1, 1))
        self.max_pool3 = nn.MaxPool2d((self.max_len - self.kernel_size[2] + 1, 1))
        self.dropout = nn.Dropout(self.dropout)
        self.fc = nn.Linear(self.num_filters * 3, self.num_classes)

    def forward(self, x):
        batch_size = x.shape[0]
        x = self.embedding(x)       # [batch_size, max_len, dim_embed]
        x = x.unsqueeze(1)          # [batch_size, 1, max_len, dim_embed]
        x1 = F.relu(self.conv1(x))  # [batch_size, num_filters, max_len-kernel_size[0]+1, 1]
        x2 = F.relu(self.conv2(x))  # [batch_size, num_filters, max_len-kernel_size[1]+1, 1]
        x3 = F.relu(self.conv3(x))  # [batch_size, num_filters, max_len-kernel_size[2]+1, 1]
        x1 = self.max_pool1(x1)     # [batch_size, num_filters, 1, 1]
        x2 = self.max_pool2(x2)     # [batch_size, num_filters, 1, 1]
        x3 = self.max_pool3(x3)     # [batch_size, num_filters, 1, 1]
        x = torch.cat((x1, x2, x3), -1)   # [batch_size, num_filters, 1, 3]
        x = x.view(batch_size, 1, -1)     # [batch_size, 1, num_filters*3]
        x = self.dropout(x)
        x = self.fc(x)                    # [batch_size, 1, num_classes]
        x = x.view(-1, self.num_classes)  # [batch_size, num_classes]
        return x
3.4 FastText Architecture
class FastText(nn.Module):
    def __init__(self, args, pretrained_path):
        super(FastText, self).__init__()
        self.dim_embed = args.embedding_size
        self.hidden_size = 256
        self.n_vocab = pretrained_path.shape[0]
        self.num_classes = args.num_class
        self.pretrained = True
        self.pretrained_path = pretrained_path
        # A dropout of 0.6 gave the best accuracy in our text classification experiments
        self.dropout = 0.6
        if self.pretrained:
            self.embedding = nn.Embedding.from_pretrained(
                self.pretrained_path, freeze=False)
        else:
            self.embedding = nn.Embedding(self.n_vocab, self.dim_embed)
        self.dropout = nn.Dropout(self.dropout)
        self.fc1 = nn.Linear(self.dim_embed, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.num_classes)

    def forward(self, x):
        x = self.embedding(x)  # [batch_size, max_len, dim_embed]
        x = x.mean(dim=1)      # average over the sequence dimension
        x = self.dropout(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x
3.5 TextRNN Architecture
class TextRNN(nn.Module):
    def __init__(self, args, pretrained_path):
        super(TextRNN, self).__init__()
        self.pretrained = True
        self.pretrained_path = pretrained_path
        self.n_vocab = pretrained_path.shape[0]
        self.dim_embed = args.embedding_size
        self.hidden_size = 64
        self.num_layers = 2
        self.dropout = 0.4
        self.num_classes = args.num_class
        if self.pretrained:
            self.embedding = nn.Embedding.from_pretrained(
                self.pretrained_path, freeze=False)
        else:
            self.embedding = nn.Embedding(
                self.n_vocab, self.dim_embed, padding_idx=self.n_vocab - 1)
        self.lstm = nn.LSTM(self.dim_embed, self.hidden_size, self.num_layers,
                            bidirectional=True, batch_first=True, dropout=self.dropout)
        self.fc = nn.Linear(self.hidden_size * 2, self.num_classes)

    def forward(self, x):
        x = self.embedding(x)     # [batch_size, max_len, dim_embed]
        x, _ = self.lstm(x)       # [batch_size, max_len, hidden_size*2]
        x = self.fc(x[:, -1, :])  # classify on the last timestep's hidden state
        return x
3.6 DPCNN Architecture
class DPCNN(nn.Module):
    def __init__(self, args, pretrained_path):
        super(DPCNN, self).__init__()
        self.dim_embed = args.embedding_size
        self.num_filters = 256
        self.kernel_size = 3
        self.n_vocab = pretrained_path.shape[0]
        self.num_classes = args.num_class
        self.pretrained = True
        self.pretrained_path = pretrained_path
        if self.pretrained:
            self.embedding = nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
        else:
            self.embedding = nn.Embedding(self.n_vocab, self.dim_embed)
        self.conv_region = nn.Conv2d(1, self.num_filters, (self.kernel_size, self.dim_embed), stride=1)
        self.conv = nn.Conv2d(self.num_filters, self.num_filters, (self.kernel_size, 1), stride=1)
        self.max_pool = nn.MaxPool2d(kernel_size=(self.kernel_size, 1), stride=2)
        self.padding1 = nn.ZeroPad2d((0, 0, 1, 1))  # pad top and bottom
        self.padding2 = nn.ZeroPad2d((0, 0, 0, 1))  # pad bottom only
        self.relu = nn.ReLU()
        self.fc = nn.Linear(self.num_filters, self.num_classes)

    def forward(self, x):
        x = self.embedding(x)    # [batch_size, max_len, dim_embed]
        x = x.unsqueeze(1)       # [batch_size, 1, max_len, dim_embed]
        x = self.conv_region(x)  # [batch_size, num_filters, max_len-kernel_size+1, 1]
        x = self.padding1(x)     # [batch_size, num_filters, max_len, 1]
        x = self.relu(x)
        x = self.conv(x)         # [batch_size, num_filters, max_len-kernel_size+1, 1]
        x = self.padding1(x)     # [batch_size, num_filters, max_len, 1]
        x = self.relu(x)
        x = self.conv(x)         # [batch_size, num_filters, max_len-kernel_size+1, 1]
        while x.size()[2] > 2:
            x = self._block(x)   # halve the sequence dimension each iteration
        x = x.squeeze()          # [batch_size, num_filters]
        x = self.fc(x)           # [batch_size, num_classes]
        return x

    def _block(self, x):
        x = self.padding2(x)
        px = self.max_pool(x)
        x = self.padding1(px)
        x = self.relu(x)
        x = self.conv(x)
        x = self.padding1(x)
        x = self.relu(x)
        x = self.conv(x)
        # Shortcut connection
        x = x + px
        return x
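As a quick sanity check on the four architectures, the following sketch feeds a batch of random token indices through each network, with a random matrix standing in for the pretrained embeddings (illustrative only; the real matrix comes from the word-vector step below):
fake_pretrained = torch.randn(1000, args.embedding_size)  # stand-in embedding matrix [vocab, dim]
dummy_batch = torch.randint(0, 1000, (4, args.max_len))   # 4 sequences of token indices
for Net in (TextCNN, FastText, TextRNN, DPCNN):
    out = Net(args, fake_pretrained)(dummy_batch)
    print(Net.__name__, tuple(out.shape))  # expected: (4, 39) for every model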
3.7 Data Preprocessing
from nltk.stem import WordNetLemmatizer
import re
import nltk
import numpy as np

en_stop = set(nltk.corpus.stopwords.words('english'))
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
    'al.', 'elsevier', 'pmc', 'czi', 'www'
]
for word in custom_stop_words:
    en_stop.add(word)

def preprocess_text(document):
    stemmer = WordNetLemmatizer()
    document = str(document)
    # Replace line breaks
    document = document.replace("\n", ' ')
    document = document.replace("/'", '')
    # Remove special characters
    document = re.sub(r'\W', ' ', document)
    # Remove isolated single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove a single character at the start
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Collapse multiple spaces into one
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Number generalization: runs of digits are replaced by hashes,
    # e.g. 123 becomes ### and 15.80€ becomes ##.##€
    document = re.sub('[0-9]{5,}', '#####', document)
    document = re.sub('[0-9]{4}', '####', document)
    document = re.sub('[0-9]{3}', '###', document)
    document = re.sub('[0-9]{2}', '##', document)
    # Lowercase
    document = document.lower()
    # Lemmatization
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    # Remove stop words
    tokens = [word for word in tokens if word not in en_stop]
    # Remove very short tokens
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
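Applied to the raw data (assuming, as in the word-vector step below, that train.csv has title and abstract columns):
train = pd.read_csv(args.train_all)
train['title'] = train['title'].apply(preprocess_text)
train['abstract'] = train['abstract'].apply(preprocess_text)
print(train['abstract'].iloc[0][:100])  # first 100 characters of a cleaned abstract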
3.8 Training Word Vectors
import nltk
from torch.utils import data
from tqdm import tqdm
# import tensorflow as tf
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models import FastText
import torch
from glove import Glove
from glove import Corpus
from nltk import WordPunctTokenizer

def build_word2vec(args, train):
    trainall_title = list(train['title'])
    trainall_abstract = list(train['abstract'])
    # Join title and abstract with a separator token; a plain list avoids
    # numpy's fixed-width string truncation on the concatenated strings
    trainall_combine = [title + ' <sep> ' + abstract
                        for title, abstract in zip(trainall_title, trainall_abstract)]
    # Prepare FastText
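    # NOTE: the original post is truncated at this point (the surviving
    # "# Prepare FastText" comment suggests fastText vectors were trained
    # analogously). What follows is a minimal sketch, not the author's
    # verified code, of training the 128-dimensional word2vec vectors on
    # trainall_combine with the parameters from the introduction (window=5,
    # min_count=3, sample=1e-5, iter=20); the WordPunctTokenizer usage and
    # the save path from Config are assumptions based on the imports and
    # settings above.
    tokenizer = WordPunctTokenizer()
    sentences = [tokenizer.tokenize(doc) for doc in tqdm(trainall_combine)]
    # gensim < 4.0 API (matching the iter= parameter above); on gensim >= 4.0
    # use vector_size= and epochs= instead of size= and iter=
    model = Word2Vec(sentences, size=args.embedding_size, window=5,
                     min_count=3, sample=1e-5, iter=20, workers=4)
    model.wv.save_word2vec_format(args.word2vec_path, binary=True)
    return model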