Chinese Text Classification

Posted by lokvahkoor


Code Breakdown

The code consists of four parts:

  • Utilities: utils
  • Training and evaluation code: train_eval
  • The model: models.TextCNN
  • The entry point: main

Run the first three parts in order in a notebook, then execute main to start training.

Colab link: https://colab.research.google.com/drive/1vUnHAjmA3OTt5o47HQkQLCXA8-rtsZEs

The code, with commentary, follows:

utils

"""
utils
"""
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta


MAX_VOCAB_SIZE = 10000  # cap on the vocabulary size
UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols


def build_vocab(file_path, tokenizer, max_size, min_freq):
  """
  构建一个词表:
  首先对数据集中的每一行句子按字/空格进行分割,然后统计所有元素的出现频率
  接下来按照频率从高到低的顺序对所有频率大于min_freq的元素进行排序,取前max_size个元素
  最后按照频率降序构建字典vocab_dic:{元素:序号},vocab_dic的最后两个元素是'<UNK>'和'<PAD>'
  """
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):  # 处理每一行
            lin = line.strip()  # 移除头尾空格或换行符
            if not lin:  # 跳过空行
                continue
            content = lin.split('	')[0]  # 句子和标签通过tab分割,前面的是句子内容,后面的是标签
            for word in tokenizer(content):  # 按空格分割或者按字分割
                vocab_dic[word] = vocab_dic.get(word, 0) + 1  # 统计词频或字频
        # 遍历词典,筛选出词频大于min_freq的词,然后按照词频从高到低排序,取前max_size个词,组成新的列表vocab_list,vocab_list中的元素为元组(word, freq)
        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
        # 构建字典vocab_dic,key为词,value为索引(按词频升序)
        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
        # 在vocab_dic的最后增加两个元素:{'<UNK>':len(vocab_dic)}和{'<PAD>':len(vocab_dic)+1}
        vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
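
# A quick illustration added here (not part of the original code): with the char-level tokenizer,
# a training file whose only line is "今天天气很好\t1" would yield roughly
#     {'天': 0, '今': 1, '气': 2, '很': 3, '好': 4, '<UNK>': 5, '<PAD>': 6}
# ('天' first because it occurs twice; ties keep their original order since Python's sort is stable).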


def build_dataset(config, ues_word):  # build the datasets
    """
    Load the datasets:
    For every line of a dataset file, first separate the content from the label.
    Then split the content as specified (on spaces or into characters) and pad or truncate it to pad_size.
    Next, map the resulting tokens to a sequence of indices, words_line, via the vocabulary.
    Finally collect all processed sentences into one big list whose elements are [(words_line, int(label), seq_len), ...]
    """
    if ues_word:
        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
    else:
        tokenizer = lambda x: [y for y in x]  # char-level
    if os.path.exists(config.vocab_path):  # if a vocabulary file exists, load it
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:  # otherwise build one with build_vocab()
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))  # save the newly built vocabulary as a pickle
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):  # read the data file line by line
                lin = line.strip()  # strip leading/trailing whitespace and the newline
                if not lin:
                    continue  # skip empty lines
                content, label = lin.split('\t')  # sentence and label are tab-separated: content first, label second
                token = tokenizer(content)  # split the sentence on spaces or into characters
                seq_len = len(token)  # number of tokens after splitting
                if pad_size:  # if a padding length is specified
                    if len(token) < pad_size:  # fewer tokens than the padding length
                        token.extend([PAD] * (pad_size - len(token)))  # pad with the PAD symbol
                    else:  # more tokens than the padding length
                        token = token[:pad_size]  # truncate to the padding length
                        seq_len = pad_size  # update the token count
                # word to id
                words_line = []  # words_line is the numeric representation of the sentence via the vocabulary
                for word in token:  # for every token in the sentence
                    # look up the token's index in the vocabulary and append it to words_line;
                    # tokens not in the vocabulary fall back to the index of '<UNK>'
                    words_line.append(vocab.get(word, vocab.get(UNK)))
                contents.append((words_line, int(label), seq_len))  # store a tuple (words_line, integer label, token count) in contents
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return vocab, train, dev, test
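
# Illustration added here (not part of the original code): with pad_size=32 and char-level
# tokenization, a line such as "股市今天大涨\t2" becomes a tuple of the form
#     ([id('股'), id('市'), id('今'), id('天'), id('大'), id('涨'), pad_id, ..., pad_id], 2, 6)
# i.e. 32 token ids (6 real ones followed by 26 '<PAD>' ids), the integer label 2,
# and the pre-padding length 6.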


class DatasetIterater(object):
  """
  根据数据集产生batch
  这里需要注意的是,在_to_tensor()中,代码把batch中的数据处理成了`(x, seq_len), y`的形式
  其中x是words_line,seq_len是pad前的长度(超过pad_size的设为pad_size),y是数据标签
  """
    def __init__(self, batches, batch_size, device):  # 这里的batches就是经过build_dataset()中的load_dataset()处理后得到的contents
        self.batch_size = batch_size  # batch的容量(一次进多少个句子)
        self.batches = batches  # 数据集
        self.n_batches = len(batches) // batch_size  # 数据集大小整除batch容量
        self.residue = False  # 记录batch能否覆盖整个数据集,false代表可以,true代表不可以。residuere是‘剩余物,残渣'的意思
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0  # 迭代用的索引
        self.device = device  # 训练设备

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)  # the sentences (words_line)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)  # the labels

        # pre-padding lengths (capped at pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:  # if some sentences are left over after the full batches and we have reached the last batch
            batches = self.batches[self.index * self.batch_size: len(self.batches)]  # take all of the remaining data
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:  # entry point of the iteration: self.index starts at 0, which is certainly less than self.n_batches
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]  # take one regular batch of data
            self.index += 1
            batches = self._to_tensor(batches)  # convert to tensors
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):  # dataset here is the data returned by build_dataset()
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter
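
# Usage sketch added here (illustrative, assuming config.batch_size = 128 and pad_size = 32):
#     train_iter = build_iterator(train_data, config)
#     for (x, seq_len), y in train_iter:
#         # x: LongTensor of shape [128, 32]; seq_len and y: LongTensor of shape [128]
#         ...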


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

train_eval

"""
train_eval
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from tensorboardX import SummaryWriter


# Weight initialization, Xavier by default (without it, the default random weights can be very large, which hurts training)
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():  # iterate over all trainable parameters of the network
        if exclude not in name:  # skip parameters whose name contains the exclude keyword (default 'embedding')
            if 'weight' in name:  # initialize weights
                if method == 'xavier':
                    nn.init.xavier_normal_(w)  # call the chosen initialization method
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:  # initialize biases
                nn.init.constant_(w, 0)
            else:  # skip everything that is neither a weight nor a bias
                pass
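
# Example added here (illustrative): for the TextCNN model defined below, init_network(model)
# applies Xavier-normal initialization to convs.<i>.weight and fc.weight, sets convs.<i>.bias
# and fc.bias to zero, and leaves embedding.weight untouched because its name contains 'embedding'.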


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()  # model.train() enables BatchNormalization and Dropout; model.eval() disables them
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)  # choose the optimizer

    # Exponential learning-rate decay: at each epoch, learning rate = gamma * learning rate
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    total_batch = 0  # how many batches have been processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last improvement of the dev loss
    flag = False  # whether there has been no improvement for a long time (used to break out of the nested loops)
    writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # scheduler.step()  # learning-rate decay
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # every 100 batches, report performance on the training and dev sets
                true = labels.data.cpu()  # move the labels off the GPU
                predic = torch.max(outputs.data, 1)[1].cpu()  # column index of the maximum in each row, i.e. the predicted class
                train_acc = metrics.accuracy_score(true, predic)  # classification accuracy on this batch
                dev_acc, dev_loss = evaluate(config, model, dev_iter)  # accuracy and loss on the dev set
                if dev_loss < dev_best_loss:  # use the dev set to decide whether the model has improved
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                writer.add_scalar("loss/train", loss.item(), total_batch)
                writer.add_scalar("loss/dev", dev_loss, total_batch)
                writer.add_scalar("acc/train", train_acc, total_batch)
                writer.add_scalar("acc/dev", dev_acc, total_batch)
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # if the dev loss has not decreased for more than require_improvement batches, stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    writer.close()
    test(config, model, test_iter)


def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()  # disable BatchNormalization and Dropout
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():  # do not track gradients
        for texts, labels in data_iter:  # for every batch in the dataset
            outputs = model(texts)  # run the model
            loss = F.cross_entropy(outputs, labels)  # compute the loss
            loss_total += loss  # accumulate the loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)  # record the labels
            predict_all = np.append(predict_all, predic)  # record the predictions

    acc = metrics.accuracy_score(labels_all, predict_all)  # compute the classification accuracy
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)  # return the accuracy and the average loss per batch

models.TextCNN

"""
models.TextCNN
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class Config(object):

    """配置参数"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/data/train.txt'                                # 训练集
        self.dev_path = dataset + '/data/dev.txt'                                    # 验证集
        self.test_path = dataset + '/data/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]              # 类别名单
        self.vocab_path = dataset + '/data/vocab.pkl'                                # 词表
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'        # 模型训练结果
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))            if embedding != 'random' else None                                       # 预训练词向量
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备

        self.dropout = 0.5                                              # 随机失活
        self.require_improvement = 1000                                 # 若超过1000batch效果还没提升,则提前结束训练
        self.num_classes = len(self.class_list)                         # 类别数
        self.n_vocab = 0                                                # 词表大小,在运行时赋值
        self.num_epochs = 20                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 32                                              # 每句话处理成的长度(短填长切)
        self.learning_rate = 1e-3                                       # 学习率
        self.embed = self.embedding_pretrained.size(1)            if self.embedding_pretrained is not None else 300           # 字向量维度
        self.filter_sizes = (2, 3, 4)                                   # 卷积核尺寸
        self.num_filters = 256                                          # 卷积核数量(channels数)


'''Convolutional Neural Networks for Sentence Classification'''


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # convolution + ReLU, then drop the width dimension (it is 1)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # max-pool over the whole sequence, then drop that dimension
        return x

    def forward(self, x):
        out = self.embedding(x[0])  # x is (words_line, seq_len); only the token ids are used
        out = out.unsqueeze(1)  # add a channel dimension for Conv2d
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)  # one pooled feature vector per kernel size
        out = self.dropout(out)
        out = self.fc(out)  # linear classifier over the concatenated features
        return out
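
To make the tensor flow concrete, here is a small shape trace added by me (not part of the original code); it assumes the Config above, i.e. pad_size=32, embed=300, num_filters=256, filter_sizes=(2, 3, 4) and 10 classes:

# x[0]: [batch, 32] token ids  -> embedding:   [batch, 32, 300]
# unsqueeze(1):                                 [batch, 1, 32, 300]
# Conv2d(1, 256, (k, 300)) + ReLU:              [batch, 256, 32 - k + 1, 1] -> squeeze(3) -> [batch, 256, 32 - k + 1]
# max_pool1d over the last dim + squeeze(2):    [batch, 256]
# cat over the three kernel sizes:              [batch, 768]
# fc:                                           [batch, 10]
dummy_x = torch.zeros(4, 32, dtype=torch.long)      # a fake batch of 4 padded sentences
dummy_len = torch.full((4,), 32, dtype=torch.long)  # their (fake) pre-padding lengths
# Model(config)((dummy_x, dummy_len)).shape  ->  torch.Size([4, 10])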

main

"""
main
"""
import time
import torch
import numpy as np
# from train_eval import train, init_network
# from importlib import import_module
import argparse  # command-line argument parsing (unused in this notebook version)

dataset = '/content/drive/My Drive/ChineseTextClassification/Chinese-Text-Classification-Pytorch-master/THUCNews'  # dataset directory

embedding = 'embedding_SougouNews.npz'  # Sogou News: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random initialization: random
model_name = 'TextCNN'  # choose the model: TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
# from utils import build_dataset, build_iterator, get_time_dif

config = Config(dataset, embedding)  # load the model's configuration parameters
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # fix the random seeds so that results are reproducible

start_time = time.time()
print("Loading data...")
vocab, train_data, dev_data, test_data = build_dataset(config, False)  # load the datasets with build_dataset() (False = char-level tokenization)
train_iter = build_iterator(train_data, config)  # build the corresponding iterators with build_iterator()
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
config.n_vocab = len(vocab)
model = Model(config).to(config.device)  # move the model to the chosen training device
if model_name != 'Transformer':
    init_network(model)
print(model.parameters)
train(config, model, train_iter, dev_iter, test_iter)
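
The notebook stops after training and testing; it does not include a single-sentence inference step. A minimal sketch of one (my own addition, reusing vocab, config and model from above; the predict helper and the example sentence are purely illustrative) could look like this:

def predict(text, model, vocab, config):
    # char-level tokenization, truncate/pad to pad_size, then map tokens to ids via the vocabulary
    token = [c for c in text][:config.pad_size]
    seq_len = len(token)
    token += [PAD] * (config.pad_size - len(token))
    ids = [vocab.get(w, vocab.get(UNK)) for w in token]
    x = torch.LongTensor([ids]).to(config.device)
    lengths = torch.LongTensor([seq_len]).to(config.device)
    model.eval()
    with torch.no_grad():
        out = model((x, lengths))
    return config.class_list[int(torch.argmax(out, dim=1))]

# print(predict('昨晚的比赛国足获胜', model, vocab, config))  # would hopefully print something like 'sports'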

Output when running on a GPU

Loading data...
Vocab size: 4762
180000it [00:04, 40629.57it/s]
10000it [00:00, 10535.39it/s]
10000it [00:01, 7211.66it/s]
Time usage: 0:00:07
<bound method Module.parameters of Model(
  (embedding): Embedding(4762, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=768, out_features=10, bias=True)
)>
Epoch [1/20]
Iter:      0,  Train Loss:   2.3,  Train Acc:  7.03%,  Val Loss:   2.7,  Val Acc: 12.33%,  Time: 0:00:03 *
Iter:    100,  Train Loss:  0.74,  Train Acc: 73.44%,  Val Loss:   0.7,  Val Acc: 78.33%,  Time: 0:00:06 *
Iter:    200,  Train Loss:  0.71,  Train Acc: 77.34%,  Val Loss:  0.55,  Val Acc: 83.33%,  Time: 0:00:09 *
Iter:    300,  Train Loss:  0.46,  Train Acc: 85.94%,  Val Loss:  0.49,  Val Acc: 84.88%,  Time: 0:00:13 *
Iter:    400,  Train Loss:  0.68,  Train Acc: 81.25%,  Val Loss:  0.47,  Val Acc: 85.53%,  Time: 0:00:16 *
Iter:    500,  Train Loss:  0.36,  Train Acc: 89.06%,  Val Loss:  0.43,  Val Acc: 86.47%,  Time: 0:00:19 *
Iter:    600,  Train Loss:  0.52,  Train Acc: 84.38%,  Val Loss:  0.43,  Val Acc: 86.51%,  Time: 0:00:22 *
Iter:    700,  Train Loss:  0.46,  Train Acc: 82.81%,  Val Loss:  0.41,  Val Acc: 87.19%,  Time: 0:00:26 *
Iter:    800,  Train Loss:  0.47,  Train Acc: 85.94%,  Val Loss:  0.39,  Val Acc: 87.70%,  Time: 0:00:29 *
Iter:    900,  Train Loss:  0.47,  Train Acc: 85.16%,  Val Loss:  0.39,  Val Acc: 87.99%,  Time: 0:00:32 *
Iter:   1000,  Train Loss:  0.35,  Train Acc: 86.72%,  Val Loss:  0.39,  Val Acc: 88.03%,  Time: 0:00:35 *
Iter:   1100,  Train Loss:  0.42,  Train Acc: 86.72%,  Val Loss:  0.38,  Val Acc: 88.35%,  Time: 0:00:39 *
Iter:   1200,  Train Loss:  0.39,  Train Acc: 85.16%,  Val Loss:  0.37,  Val Acc: 88.65%,  Time: 0:00:42 *
Iter:   1300,  Train Loss:  0.44,  Train Acc: 85.16%,  Val Loss:  0.36,  Val Acc: 88.44%,  Time: 0:00:45 *
Iter:   1400,  Train Loss:  0.53,  Train Acc: 82.81%,  Val Loss:  0.36,  Val Acc: 88.82%,  Time: 0:00:49 *
Epoch [2/20]
Iter:   1500,  Train Loss:  0.47,  Train Acc: 85.94%,  Val Loss:  0.35,  Val Acc: 88.77%,  Time: 0:00:52 *
Iter:   1600,  Train Loss:  0.36,  Train Acc: 85.94%,  Val Loss:  0.35,  Val Acc: 89.14%,  Time: 0:00:56 *
Iter:   1700,  Train Loss:  0.37,  Train Acc: 87.50%,  Val Loss:  0.34,  Val Acc: 89.36%,  Time: 0:00:59 *
Iter:   1800,  Train Loss:  0.32,  Train Acc: 87.50%,  Val Loss:  0.36,  Val Acc: 88.69%,  Time: 0:01:02 
Iter:   1900,  Train Loss:  0.34,  Train Acc: 89.84%,  Val Loss:  0.35,  Val Acc: 89.13%,  Time: 0:01:05 
Iter:   2000,  Train Loss:  0.37,  Train Acc: 88.28%,  Val Loss:  0.34,  Val Acc: 89.19%,  Time: 0:01:09 *
Iter:   2100,  Train Loss:  0.42,  Train Acc: 85.94%,  Val Loss:  0.34,  Val Acc: 89.44%,  Time: 0:01:12 *
Iter:   2200,  Train Loss:  0.28,  Train Acc: 90.62%,  Val Loss:  0.34,  Val Acc: 89.33%,  Time: 0:01:15 *
Iter:   2300,  Train Loss:  0.36,  Train Acc: 92.97%,  Val Loss:  0.34,  Val Acc: 89.45%,  Time: 0:01:19 
Iter:   2400,  Train Loss:  0.33,  Train Acc: 89.84%,  Val Loss:  0.34,  Val Acc: 89.57%,  Time: 0:01:22 
Iter:   2500,  Train Loss:  0.17,  Train Acc: 94.53%,  Val Loss:  0.33,  Val Acc: 89.85%,  Time: 0:01:25 *
Iter:   2600,  Train Loss:   0.3,  Train Acc: 89.84%,  Val Loss:  0.33,  Val Acc: 89.76%,  Time: 0:01:28 
Iter:   2700,  Train Loss:  0.26,  Train Acc: 91.41%,  Val Loss:  0.33,  Val Acc: 89.84%,  Time: 0:01:32 
Iter:   2800,  Train Loss:   0.4,  Train Acc: 85.16%,  Val Loss:  0.33,  Val Acc: 89.62%,  Time: 0:01:35 
Epoch [3/20]
Iter:   2900,  Train Loss:  0.32,  Train Acc: 89.84%,  Val Loss:  0.33,  Val Acc: 89.77%,  Time: 0:01:38 
Iter:   3000,  Train Loss:  0.22,  Train Acc: 91.41%,  Val Loss:  0.33,  Val Acc: 89.80%,  Time: 0:01:41 
Iter:   3100,  Train Loss:  0.27,  Train Acc: 92.97%,  Val Loss:  0.34,  Val Acc: 89.58%,  Time: 0:01:44 
Iter:   3200,  Train Loss:  0.31,  Train Acc: 90.62%,  Val Loss:  0.33,  Val Acc: 89.78%,  Time: 0:01:48 
Iter:   3300,  Train Loss:  0.38,  Train Acc: 88.28%,  Val Loss:  0.33,  Val Acc: 89.71%,  Time: 0:01:51 
Iter:   3400,  Train Loss:  0.33,  Train Acc: 85.94%,  Val Loss:  0.34,  Val Acc: 89.88%,  Time: 0:01:54 
Iter:   3500,  Train Loss:  0.19,  Train Acc: 92.19%,  Val Loss:  0.33,  Val Acc: 89.76%,  Time: 0:01:58 
No optimization for a long time, auto-stopping...
Test Loss:  0.31,  Test Acc: 90.77%
Precision, Recall and F1-Score...
               precision    recall  f1-score   support

      finance     0.9095    0.9050    0.9073      1000
       realty     0.9147    0.9330    0.9238      1000
       stocks     0.8770    0.8340    0.8549      1000
    education     0.9393    0.9590    0.9490      1000
      science     0.8529    0.8640    0.8584      1000
      society     0.9021    0.9120    0.9070      1000
     politics     0.9050    0.8760    0.8902      1000
       sports     0.9466    0.9570    0.9518      1000
         game     0.9336    0.9000    0.9165      1000
entertainment     0.8958    0.9370    0.9159      1000

     accuracy                         0.9077     10000
    macro avg     0.9076    0.9077    0.9075     10000
 weighted avg     0.9076    0.9077    0.9075     10000

Confusion Matrix...
[[905  17  37   5   8   6  10   5   2   5]
 [  9 933  13   2   3  15   3   7   2  13]
 [ 55  28 834   3  36   2  31   3   5   3]
 [  1   2   1 959   4   9   5   5   1  13]
 [  5   9  28   5 864  17  14   5  34  19]
 [  4  16   2  19  13 912  21   2   4   7]
 [ 10   5  22  16  24  29 876   4   1  13]
 [  2   2   5   2   3   6   1 957   5  17]
 [  1   3   7   6  46   4   4  10 900  19]
 [  3   5   2   4  12  11   3  13  10 937]]
Time usage: 0:00:00
