Python: text classification with sklearn
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from Reader import reader
from bench import benchmark
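# Reader and bench are local helper modules: reader() loads and label-encodes the data
# file (its source is included at the end of this post), and benchmark() is assumed to
# fit the given classifier, log its metrics, and return a per-run result tuple.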
# set logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename="classifier.log")
# create logger
param_logger = logging.getLogger("paramLogger")
result_logger = logging.getLogger("resultLogger")
param_logger.setLevel(logging.INFO)
result_logger.setLevel(logging.INFO)
#create formatter
fmt = "%(asctime)-15s %(levelname)s :: %(message)s"
formatter = logging.Formatter(fmt=fmt)
# create handler
sh = logging.StreamHandler(stream=None)
sh.setLevel(logging.INFO)
fh_param = logging.FileHandler(filename="para.log",mode="a")
fh_param.setLevel(logging.INFO)
fh_resu = logging.FileHandler(filename="result.log",mode="a")
fh_resu.setLevel(logging.INFO)
# add handler and formatter to logger
fh_param.setFormatter(formatter)
fh_resu.setFormatter(formatter)
param_logger.addHandler(fh_param)
result_logger.addHandler(sh)
result_logger.addHandler(fh_resu)
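# Note: both loggers still propagate to the root logger configured by basicConfig above,
# so every record also ends up in classifier.log in addition to para.log, result.log,
# and the console.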
# parse commandline arguments
op = OptionParser()
op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")
(opts, args) = op.parse_args()
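# Example invocation (the script file name here is hypothetical):
#   python text_classification.py --report --confusion_matrix --chi2_select 1000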
# read the datafile
sens, labels, labels_encoder = reader(data_file="V_0.0.7_generated_question_intent_3000.txt")
target_names = list(labels_encoder.classes_)
param_logger.info("target_names are {}".format(target_names))
# split the corpus into train and test sets
X_raw_train, X_raw_test, y_train, y_test = train_test_split(sens, labels, test_size=0.33, random_state=42)
# extract features from the training data using a sparse vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(X_raw_train)
X_test = vectorizer.transform(X_raw_test)
feature_names = vectorizer.get_feature_names_out()
param_logger.info("feature_name's length is {}".format(len(feature_names)))
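# The --chi2_select option and the SelectKBest/chi2 imports are not wired up in the
# listing above; a minimal sketch of how they could be applied (an assumption, not part
# of the original script):
if opts.select_chi2:
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    param_logger.info("after chi-squared selection, {} features remain".format(X_train.shape[1]))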
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    result_logger.info('=' * 80)
    result_logger.info(name)
    results.append(benchmark(clf, X_train, y_train, X_test, y_test, target_names, opts))
for penalty in ["l2", "l1"]:
    result_logger.info('=' * 80)
    result_logger.info("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3),
                             X_train, y_train, X_test, y_test, target_names, opts))
    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty),
                             X_train, y_train, X_test, y_test, target_names, opts))
# Train SGD with Elastic Net penalty
result_logger.info('=' * 80)
result_logger.info("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"),
                         X_train, y_train, X_test, y_test, target_names, opts))
# Train NearestCentroid without threshold
result_logger.info('=' * 80)
result_logger.info("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid(), X_train, y_train, X_test, y_test, target_names, opts))
# Train sparse Naive Bayes classifiers
result_logger.info('=' * 80)
result_logger.info("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01), X_train, y_train, X_test, y_test, target_names, opts))
results.append(benchmark(BernoulliNB(alpha=.01), X_train, y_train, X_test, y_test, target_names, opts))
result_logger.info('=' * 80)
result_logger.info("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
    ('classification', LinearSVC(penalty="l2"))
]), X_train, y_train, X_test, y_test, target_names, opts))
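# Every benchmark() call appends a result tuple to `results`; the commented-out plotting
# code below unpacks these as (clf_name, score, train_time, test_time).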
# # make some plots
#
# indices = np.arange(len(results))
#
# results = [[x[i] for x in results] for i in range(4)]
#
# clf_names, score, training_time, test_time = results
# training_time = np.array(training_time) / np.max(training_time)
# test_time = np.array(test_time) / np.max(test_time)
# plt.figure(figsize=(12, 8))
# plt.title("Score")
# plt.barh(indices, score, .2, label="score", color='navy')
# plt.barh(indices + .3, training_time, .2, label="training time",
# color='c')
# plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
# plt.yticks(())
# plt.legend(loc='best')
# plt.subplots_adjust(left=.25)
# plt.subplots_adjust(top=.95)
# plt.subplots_adjust(bottom=.05)
#
# for i, c in zip(indices, clf_names):
# plt.text(-.3, i, c)
#
# plt.show()
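# What follows is the reader() helper imported at the top via `from Reader import reader`
# (presumably the contents of the local Reader.py).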
from sklearn import preprocessing
def reader(data_file):
    """
    :param data_file: text file with three lines per record:
                      line 0 = raw category, line 1 = raw sentence, line 2 = blank
    :return: sens: list of raw sentences;
             labels: list of integer label ids;
             labels_encoder: the fitted sklearn LabelEncoder
    """
    with open(data_file) as f:
        lines = f.readlines()
    cats = []
    sens = []
    for num, line in enumerate(lines):
        if num % 3 == 0:
            cats.append(line.strip())
        elif num % 3 == 1:
            sens.append(line.strip())
    # encode the raw categories as integers, e.g. ["Male", "Female"] -> [1, 0]
    enc = preprocessing.LabelEncoder()
    labels_encoder = enc.fit(cats)
    # unique_cats = list(labels_encoder.classes_)
    labels = labels_encoder.transform(cats).tolist()  # numpy array -> plain list
    return sens, labels, labels_encoder
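# Example usage with the data file from the main script above (the record shown here is
# purely illustrative); each record spans three lines: category, sentence, blank line:
#
#   weather_query
#   what will the weather be like tomorrow
#
# sens, labels, labels_encoder = reader("V_0.0.7_generated_question_intent_3000.txt")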