Python: text classification with sklearn
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from Reader import reader
from bench import benchmark
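# Reader and bench are local helper modules: reader() loads and label-encodes the data
# file (its source is included at the end of this post), and benchmark() is assumed to
# fit the given classifier, log its metrics, and return a per-run result tuple.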
# set logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename="classifier.log")
# create logger
param_logger = logging.getLogger("paramLogger")
result_logger = logging.getLogger("resultLogger")
param_logger.setLevel(logging.INFO)
result_logger.setLevel(logging.INFO)
#create formatter
fmt = "%(asctime)-15s %(levelname)s :: %(message)s"
formatter = logging.Formatter(fmt=fmt)
# create handler
sh = logging.StreamHandler(stream=None)
sh.setLevel(logging.INFO)
fh_param = logging.FileHandler(filename="para.log",mode="a")
fh_param.setLevel(logging.INFO)
fh_resu = logging.FileHandler(filename="result.log",mode="a")
fh_resu.setLevel(logging.INFO)
# add handler and formatter to logger
fh_param.setFormatter(formatter)
fh_resu.setFormatter(formatter)
param_logger.addHandler(fh_param)
result_logger.addHandler(sh)
result_logger.addHandler(fh_resu)
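# Note: both loggers still propagate to the root logger configured by basicConfig above,
# so every record also ends up in classifier.log in addition to para.log, result.log,
# and the console.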
# parse commandline arguments
op = OptionParser()
op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")
(opts, args) = op.parse_args()
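# Example invocation (the script file name here is hypothetical):
#   python text_classification.py --report --confusion_matrix --chi2_select 1000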
# read the datafile
sens, labels, labels_encoder = reader(data_file="V_0.0.7_generated_question_intent_3000.txt")
target_names = list(labels_encoder.classes_)
param_logger.info("target_names are {}".format(target_names))
# split the corpus into train and test sets
X_raw_train, X_raw_test, y_train, y_test = train_test_split(sens, labels, test_size=0.33, random_state=42)
# extract features from the training data using a sparse vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(X_raw_train)
X_test = vectorizer.transform(X_raw_test)
feature_names = vectorizer.get_feature_names_out()
param_logger.info("feature_name's length is {}".format(len(feature_names)))
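# The --chi2_select option and the SelectKBest/chi2 imports are not wired up in the
# listing above; a minimal sketch of how they could be applied (an assumption, not part
# of the original script):
if opts.select_chi2:
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    param_logger.info("after chi-squared selection, {} features remain".format(X_train.shape[1]))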
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    result_logger.info('=' * 80)
    result_logger.info(name)
    results.append(benchmark(clf, X_train, y_train, X_test, y_test, target_names, opts))
for penalty in ["l2", "l1"]:
    result_logger.info('=' * 80)
    result_logger.info("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3),
                             X_train, y_train, X_test, y_test, target_names, opts))
    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty),
                             X_train, y_train, X_test, y_test, target_names, opts))
# Train SGD with Elastic Net penalty
result_logger.info('=' * 80)
result_logger.info("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"),
                         X_train, y_train, X_test, y_test, target_names, opts))
# Train NearestCentroid without threshold
result_logger.info('=' * 80)
result_logger.info("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid(), X_train, y_train, X_test, y_test, target_names, opts))
# Train sparse Naive Bayes classifiers
result_logger.info('=' * 80)
result_logger.info("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01), X_train, y_train, X_test, y_test, target_names, opts))
results.append(benchmark(BernoulliNB(alpha=.01), X_train, y_train, X_test, y_test, target_names, opts))
result_logger.info('=' * 80)
result_logger.info("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
    ('classification', LinearSVC(penalty="l2"))
]), X_train, y_train, X_test, y_test, target_names, opts))
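# Every benchmark() call appends a result tuple to `results`; the commented-out plotting
# code below unpacks these as (clf_name, score, train_time, test_time).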
# # make some plots
#
# indices = np.arange(len(results))
#
# results = [[x[i] for x in results] for i in range(4)]
#
# clf_names, score, training_time, test_time = results
# training_time = np.array(training_time) / np.max(training_time)
# test_time = np.array(test_time) / np.max(test_time)
# plt.figure(figsize=(12, 8))
# plt.title("Score")
# plt.barh(indices, score, .2, label="score", color='navy')
# plt.barh(indices + .3, training_time, .2, label="training time",
# color='c')
# plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
# plt.yticks(())
# plt.legend(loc='best')
# plt.subplots_adjust(left=.25)
# plt.subplots_adjust(top=.95)
# plt.subplots_adjust(bottom=.05)
#
# for i, c in zip(indices, clf_names):
# plt.text(-.3, i, c)
#
# plt.show()
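# What follows is the reader() helper imported at the top via `from Reader import reader`
# (presumably the contents of the local Reader.py).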
from sklearn import preprocessing
def reader(data_file):
    """
    :param data_file: text file with three lines per record:
                      line 0 = raw category, line 1 = raw sentence, line 2 = blank
    :return: sens: list of raw sentences;
             labels: list of integer label ids;
             labels_encoder: the fitted sklearn LabelEncoder
    """
    with open(data_file) as f:
        lines = f.readlines()
    cats = []
    sens = []
    for num, line in enumerate(lines):
        if num % 3 == 0:
            cats.append(line.strip())
        elif num % 3 == 1:
            sens.append(line.strip())
    # encode the raw categories as integers, e.g. ["Male", "Female"] -> [1, 0]
    enc = preprocessing.LabelEncoder()
    labels_encoder = enc.fit(cats)
    # unique_cats = list(labels_encoder.classes_)
    labels = labels_encoder.transform(cats).tolist()  # numpy array -> plain list
    return sens, labels, labels_encoder
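# Example usage with the data file from the main script above (the record shown here is
# purely illustrative); each record spans three lines: category, sentence, blank line:
#
#   weather_query
#   what will the weather be like tomorrow
#
# sens, labels, labels_encoder = reader("V_0.0.7_generated_question_intent_3000.txt")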