《Python机器学习及实践》----模型实用技巧

Posted wangshuang1631

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了《Python机器学习及实践》----模型实用技巧相关的知识,希望对你有一定的参考价值。

本片博客是根据《Python机器学习及实践》一书中的实例,所有代码均在本地编译通过。数据为从该书指定的百度网盘上下载的,或者是sklearn自带数据下载到本地使用的。
代码片段:

measurements = ['city': 'Dubai','temperature': 33,'city': 'London','temperature': 12.,'city': 'San Fransisco','temperature': 18.]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
print vec.fit_transform(measurements).toarray()
print vec.get_feature_names()

from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state = 33)
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)
from sklearn.naive_bayes import MultinomialNB
mnb_count = MultinomialNB()
mnb_count.fit(X_count_train,y_train)
print 'The accuracy of classifying 20newsgroups using Navie Bayes (CountVectorizer without filtering stopwords):',mnb_count.score(X_count_test,y_test)
y_count_predict = mnb_count.predict(X_count_test)
from sklearn.metrics import classification_report
print classification_report(y_test,y_count_predict,target_names=news.target_names)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train,y_train)
print 'The accuracy of classifying 20newsgroups using Navie Bayes (TfidfVectorizer without filtering stopwords):',mnb_tfidf.score(X_tfidf_test,y_test)
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
print classification_report(y_test,y_tfidf_predict,target_names=news.target_names)

count_filter_vec,tfidf_filter_vec = CountVectorizer(analyzer='word',stop_words='english'),TfidfVectorizer(analyzer='word',stop_words='english')
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)
X_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test = tfidf_filter_vec.transform(X_test)
mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train,y_train)
print 'The accuracy of classifying 20newsgroups using Navie Bayes (CountVectorizer by filtering stopwords):',mnb_count_filter.score(X_count_filter_test,y_test)
y_count_filter_predict = mnb_count_filter.predict(X_count_filter_test)
mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train,y_train)
print 'The accuracy of classifying 20newsgroups using Navie Bayes (TfidfVectorizer by filtering stopwords):',mnb_tfidf_filter.score(X_tfidf_filter_test,y_test)
y_tfidf_filter_predict = mnb_tfidf_filter.predict(X_tfidf_filter_test)
from sklearn.metrics import classification_report
print classification_report(y_test,y_count_filter_predict,target_names=news.target_names)
print classification_report(y_test,y_tfidf_filter_predict,target_names=news.target_names)

import pandas as pd
titanic = pd.read_csv('D:\\Source Code\\machinelearn\\\\titanic.txt')
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis = 1)
X['age'].fillna(X['age'].mean(), inplace=True)
X.fillna('UNKNOWN', inplace=True)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
X_test = vec.transform(X_test.to_dict(orient='record'))
print len(vec.feature_names_)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)
from sklearn.cross_validation import cross_val_score
import numpy as np
percentiles = range(1, 100, 2)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())
    print results
    opt = np.where(results == results.max())[0]
    print 'Optimal number of features %d' %percentiles[opt]
import pylab as pl
pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)

X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
import numpy as np
xx = np.linspace(0, 26, 100)
xx = xx.reshape(xx.shape[0], 1)
yy = regressor.predict(xx)
import matplotlib.pyplot as plt
plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label="Degree=1")
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles = [plt1])
plt.show()
print 'The R-squared value of Linear Regressor performing on the training data is', regressor.score(X_train, y_train)
from sklearn.preprocessing import PolynomialFeatures
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_train_poly2, y_train)
xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)
plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles = [plt1, plt2])
plt.show()
print 'The R-squared value of Polynominal Regressor (Degree=2) performing on the training data is', regressor_poly2.score(X_train_poly2, y_train)
from sklearn.preprocessing import PolynomialFeatures
poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)
regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)
xx_poly4 = poly4.transform(xx)
yy_poly4 = regressor_poly4.predict(xx_poly4)
plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
plt4, = plt.plot(xx, yy_poly4, label='Degree=4')
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles = [plt1, plt2, plt4])
plt.show()
print 'The R-squared value of Polynominal Regressor (Degree=4) performing on the training data is',regressor_poly4.score(X_train_poly4, y_train)
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]
regressor.score(X_test, y_test)
X_test_poly2 = poly2.transform(X_test)
regressor_poly2.score(X_test_poly2, y_test)
X_test_poly4 = poly4.transform(X_test)
regressor_poly4.score(X_test_poly4, y_test)

from sklearn.linear_model import Lasso
lasso_poly4 = Lasso()
lasso_poly4.fit(X_train_poly4, y_train)
print lasso_poly4.score(X_test_poly4, y_test)
print lasso_poly4.coef_
print regressor_poly4.score(X_test_poly4, y_test)
print regressor_poly4.coef_
print regressor_poly4.coef_
print np.sum(regressor_poly4.coef_ ** 2)
from sklearn.linear_model import Ridge
ridge_poly4 = Ridge()
ridge_poly4.fit(X_train_poly4, y_train)
print ridge_poly4.score(X_test_poly4, y_test)
print ridge_poly4.coef_
print np.sum(ridge_poly4.coef_ ** 2)

from sklearn.datasets import fetch_20newsgroups
import numpy as np
news = fetch_20newsgroups(subset='all')
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
parameters = 'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)
from sklearn.grid_search import GridSearchCV
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)
time_= gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
print gs.score(X_test, y_test)

from sklearn.datasets import fetch_20newsgroups
import numpy as np
news = fetch_20newsgroups(subset='all')
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
parameters = 'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)
from sklearn.grid_search import GridSearchCV
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)
time_= gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
print gs.score(X_test, y_test)

以上是关于《Python机器学习及实践》----模型实用技巧的主要内容,如果未能解决你的问题,请参考以下文章

PYTHON机器学习及实践_从零开始通往KAGGLE竞赛之路pdf

Python 机器学习及实践 Coding 无监督学习经典模型 数据聚类 and 特征降维

Python机器学习及实践——基础篇4(朴素贝叶斯)

Python机器学习及实践 课后小题

《Python机器学习及实践》----监督学习经典模型

《Python机器学习及实践》----监督学习经典模型