python: Testing ML algorithms with/without hyperparameter tuning.
import os
import numpy as np
import pandas as pd

# calculate training and cv predictions, with or without tuning the hyperparameters
def prediction(X_train:'array', y_train:'array', estimator:'sklearn estimator', dparam_grid:dict={}) -> tuple:
    """
    Calculate training and cv predictions, with or without tuning the hyperparameters.
    X_train, y_train -- arrays with features and target data.
    estimator -- sklearn estimator to be used.
    dparam_grid -- dictionary of hyperparameters and their values to be tested. If it is empty, the estimator is not tuned.
    return -- tuple(training prediction array, cv prediction array, fitted estimator)
    """
    ## prediction without tuning
    if len(dparam_grid) == 0:
        clf = estimator
    ## prediction with tuning
    else:
        print('[info] tuning the HYPERPARAMETERS...')
        # fit grid search
        from sklearn.model_selection import GridSearchCV
        grid = GridSearchCV(estimator=estimator, param_grid=dparam_grid, n_jobs=2, cv=10)
        grid.fit(X_train, y_train)
        # results
        print('[info] best score:', grid.best_score_)
        print('[info] best hyperparameters:')
        for k, v in grid.best_params_.items():
            print(k, v)
        ## keep the tuned estimator for forecasting
        clf = grid.best_estimator_
    # prediction
    print('[info] estimator:\n', clf)
    from sklearn.model_selection import cross_val_predict
    yhat_train = clf.fit(X_train, y_train).predict(X_train)
    yhat_cv = cross_val_predict(clf, X_train, y_train, cv=10, n_jobs=2, method='predict')
    # return
    return (yhat_train, yhat_cv, clf)
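
For reference, a minimal usage sketch of prediction. The synthetic data, the grid values and the choice of GradientBoostingRegressor are illustrative assumptions, not part of the original pipeline:

# usage sketch (assumptions: synthetic data, illustrative grid and estimator)
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
X_train, y_train = make_regression(n_samples=200, n_features=5, noise=10., random_state=0)
y_train = y_train - y_train.min()  # shift targets to be non-negative (needed later by the RMSLE score)
dparam_grid = {'n_estimators': [50, 100], 'max_depth': [2, 3]}
yhat_train, yhat_cv, clf = prediction(X_train, y_train, GradientBoostingRegressor(random_state=0), dparam_grid)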
## algorithm validation: scores, feature importance and learning curve
def validation(RESULT:'dataframe', folder_output:str, starget:str, lx:list, estimator:'sklearn estimator', X_train:'array', y_train:'array'):
    """
    Algorithm validation: scores, feature importance and learning curve.
    RESULT -- df with real target 'y', training prediction 'yhat_train' and cv prediction 'yhat_cv'.
    folder_output -- folder where the outputs are stored.
    starget -- restaurant id or name, used to label the output files.
    lx -- list of features for Feature Importance.
    estimator -- fitted sklearn estimator with a feature_importances_ attribute (e.g. GBR). It is also used for the Learning Curve.
    X_train, y_train -- arrays of features and target for the Learning Curve.
    """
    # ### Scores
    # validation: rmse
    from sklearn.metrics import mean_squared_error
    error_train = np.sqrt(mean_squared_error(RESULT.y.values, RESULT.yhat_train.values))
    error_cv = np.sqrt(mean_squared_error(RESULT.y.values, RESULT.yhat_cv.values))
    print('[info] scores:')
    print('RMSE: train = %.3f cv = %.3f' % (error_train, error_cv))
    # validation: rmsle (targets and predictions must be non-negative)
    def rmsle(y_true:'array', y_pred:'array'):
        assert len(y_true) == len(y_pred)
        return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5
    error_train_log = rmsle(RESULT.y.values, RESULT.yhat_train.values)
    error_cv_log = rmsle(RESULT.y.values, RESULT.yhat_cv.values)
    print('RMSLE: train = %.3f cv = %.3f\n' % (error_train_log, error_cv_log))
    ## store plots
    path_output_training = os.path.join(folder_output, 'plot-validation_training-%s.png' % starget)
    path_output_cv = os.path.join(folder_output, 'plot-validation_cv-%s.png' % starget)
    import matplotlib.pyplot as plt
    # training plot
    fig, ax = plt.subplots(figsize=(20, 4))
    RESULT[['y', 'yhat_train']].plot(ax=ax, title='Training Error: rmse = %.3f rmsle = %.3f' % (error_train, error_train_log))
    plt.savefig(path_output_training, bbox_inches='tight', transparent=False)
    # cv plot
    fig, ax = plt.subplots(figsize=(20, 4))
    RESULT[['y', 'yhat_cv']].plot(ax=ax, title='CV Error: rmse = %.3f rmsle = %.3f' % (error_cv, error_cv_log))
    plt.savefig(path_output_cv, bbox_inches='tight', transparent=False)
    # close plots
    plt.cla()    # clear axis
    plt.clf()    # clear figure
    plt.close()  # close the figure window
    # store results
    path_output_result = os.path.join(folder_output, 'data-results-training_cv-%s.csv' % starget)
    RESULT.to_csv(path_output_result, index=True)
    print('[info] saved the results file and the training/cv plots.')
    # ### Feature Importance
    try:
        # collect
        FI = pd.DataFrame({'feature': lx, 'importance': estimator.feature_importances_})
        FI.sort_values(['importance'], ascending=[False], inplace=True)
        # conversion to %
        FI['importance'] = FI['importance'].apply(lambda x: x * 100.)
        FI.rename(columns={'importance': 'importance(%)'}, inplace=True)
        # save
        path_output_importance = os.path.join(folder_output, 'table-features_importance-%s.csv' % starget)
        FI.to_csv(path_output_importance, index=False)
        print('[info] saved the Feature Importance table.\n')
        # clean
        del FI
    except AttributeError:
        print('[warning] this estimator does not expose feature importances.')
    # ### Learning Curve
    from sklearn.model_selection import learning_curve
    from sklearn.metrics import mean_squared_error, make_scorer
    # custom scorer: make_scorer negates the output when greater_is_better=False,
    # so learning_curve reports negative RMSE (closer to 0 is better)
    def frmse(y_true, y_pred):
        return np.sqrt(mean_squared_error(y_true, y_pred))
    srmse = make_scorer(frmse, greater_is_better=False)
    # build learning curve
    train_sizes, train_scores, valid_scores = learning_curve(estimator, X_train, y_train, train_sizes=np.array([0.1, 0.33, 0.55, 0.78, 1.]), cv=10, scoring=srmse)
    # store results in a pandas df to be plotted
    LC = pd.DataFrame({'sizes': train_sizes, 'train': np.mean(train_scores, axis=1), 'cv': np.mean(valid_scores, axis=1)}).set_index('sizes')
    # plot
    path_output_learning = os.path.join(folder_output, 'plot-learning_curve-%s.png' % starget)
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(8, 4))
    LC.plot(title='Learning Curve', ax=ax)
    plt.savefig(path_output_learning, bbox_inches='tight', transparent=False)
    # close plot
    plt.cla()    # clear axis
    plt.clf()    # clear figure
    plt.close()  # close the figure window
    print('\n[info] saved the Learning Curve plot.')
    # clean
    del LC
    # return
    return None
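
Continuing the sketch above, validation could then be called as follows. The RESULT layout follows the docstring; the output folder '.' and the 'demo' label are placeholders:

# usage sketch (assumptions: variables from the prediction sketch above; placeholder folder/label)
import pandas as pd
RESULT = pd.DataFrame({'y': y_train, 'yhat_train': yhat_train, 'yhat_cv': yhat_cv})
lx = ['x%i' % i for i in range(X_train.shape[1])]  # made-up feature names for the importance table
validation(RESULT, '.', 'demo', lx, clf, X_train, y_train)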
## data preparation to be used for prediction
def preparation(sample:'dataframe', lx:list, ly:list, npoly:int=0, iscaling:bool=True) -> tuple:
    """
    Data preparation to be used for prediction.
    sample -- dataframe with data: features + target.
    lx -- list of features to be used.
    ly -- list with the target column to be used.
    npoly -- degree of the polynomial features transformation. If npoly=0, the transformation is not applied.
    iscaling -- whether or not to scale the features.
    return -- tuple(X, y, lx, ly)
    """
    ## features
    X = sample[lx].to_numpy()
    # polynomial features generation
    if npoly > 0:
        from sklearn.preprocessing import PolynomialFeatures
        poly = PolynomialFeatures(degree=npoly, interaction_only=False, include_bias=True)
        X = poly.fit_transform(X)
        lx = list(poly.get_feature_names_out(input_features=lx))
    # scaling
    if iscaling:
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)
    ## target
    y = sample[ly[0]].values
    # return
    return (X, y, lx, ly)
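
Finally, a short sketch of preparation on a toy frame. The column names x1, x2, y are made up for the example:

# usage sketch (assumptions: toy dataframe with made-up columns)
import pandas as pd
sample = pd.DataFrame({'x1': [1., 2., 3., 4.], 'x2': [.5, .1, .9, .3], 'y': [10., 20., 15., 30.]})
X, y, lx, ly = preparation(sample, ['x1', 'x2'], ['y'], npoly=2, iscaling=True)
print(X.shape, lx)  # degree-2 expansion yields bias, x1, x2, x1^2, x1*x2, x2^2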