python 20newsgroups_benchmark.py
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 python 20newsgroups_benchmark.py 相关的知识,希望对你有一定的参考价值。
import pandas as pd
from ggplot import *
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import roc_curve
# vectorizer
from sklearn.feature_extraction.text import HashingVectorizer
# our classifiers
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# Restrict the benchmark to four of the twenty newsgroups.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Load the train and test subsets with a fixed shuffle seed so runs are
# repeatable.
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
# From here on use the label names in the order the loader reports them.
categories = data_train.target_names

# Hash each document into a fixed 1000-dimensional feature vector.
# The old ``non_negative=True`` flag was removed from HashingVectorizer;
# ``alternate_sign=False`` (scikit-learn >= 0.19) is its replacement and keeps
# every feature value >= 0, which MultinomialNB requires.
vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                               n_features=1000)
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

# Collapse the multi-class labels into a binary task: class index 0 versus
# everything else (presumably 'alt.atheism', if target_names is sorted
# alphabetically -- confirm against data_train.target_names).
y_train = data_train.target == 0
y_test = data_test.target == 0
# (display name, estimator) pairs to benchmark; every estimator must expose
# predict_proba, hence probability=True for the SVM.
clfs = [
    ("MultinomialNB", MultinomialNB()),
    ("BernoulliNB", BernoulliNB()),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("RandomForestClassifier", RandomForestClassifier()),
    ("SVM", SVC(probability=True)),
]

# Densify the sparse matrices once, outside the loop. ``toarray()`` yields a
# plain ndarray; ``todense()`` yields ``np.matrix``, which current
# scikit-learn estimators refuse.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Fit each classifier and collect its ROC curve as one long-format frame
# (columns: name, fpr, tpr).
roc_frames = []
for name, clf in clfs:
    clf.fit(X_train_dense, y_train)
    # Second predict_proba column: score of the larger class label, i.e. the
    # positive (True) class of the boolean target.
    probs = clf.predict_proba(X_test_dense)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_frames.append(pd.DataFrame({
        "name": name,
        "fpr": fpr,
        "tpr": tpr,
    }))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# The row index is renumbered because nothing downstream relies on it.
# all_results stays None when no classifier was evaluated, as before.
all_results = pd.concat(roc_frames, ignore_index=True) if roc_frames else None
# One ROC step curve per classifier, plus a black reference line
# (geom_abline with default slope/intercept -- presumably the chance
# diagonal; confirm against the ggplot port in use).
roc_plot = (
    ggplot(aes(x='fpr', y='tpr', color='name'), data=all_results)
    + geom_step()
    + geom_abline(color="black")
    + ggtitle("Text Classification Benchmark on 20 News Groups")
)
# A bare plot expression is only displayed by an interactive shell or
# notebook; when this file runs as a script the value would be discarded.
# Printing the object asks for its repr, which is what makes the ggplot port
# draw the figure (NOTE(review): confirm for the installed ggplot version).
print(roc_plot)
以上是关于 python 20newsgroups_benchmark.py 的主要内容,如果未能解决你的问题,请参考以下文章:
我的Python成长之路---第六天---Python基础(20)---2016年2月20日(晴)
python的20库
Python内置函数(20)——exec
Python 练习实例20
2019-03-18 Python time 将11月20日转换为2015-11-20
Python Day20