python: Testing ML algorithms with and without hyperparameter tuning.


import os
import numpy as np
import pandas as pd

# calculate training and cv predictions with or without tuning the hyperparameters
def prediction(X_train: 'array', y_train: 'array', estimator: 'sklearn estimator', dparam_grid: dict = {}) -> tuple:
  """
  Calculate training and cv predictions with or without tuning the hyperparameters.

  X_train, y_train -- arrays with features and target data.
  estimator -- sklearn estimator to be used.
  dparam_grid -- dictionary of hyperparameters and their values to be tested. If it is empty, the estimator is not tuned.
  return -- tuple(training prediction array, cv prediction array, fitted estimator)
  """

  ## prediction without tuning
  if len(dparam_grid) == 0: clf = estimator
  ## prediction with tuning
  else:
    # fit grid search
    print('[info] tuning the HYPERPARAMETERS...')
    # fit grid
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(estimator=estimator, param_grid=dparam_grid,n_jobs=2,cv=10)
    grid.fit(X_train, y_train)
    # results
    print('[info] best score',grid.best_score_)
    print('[info] best hyperparams:')
    for k, v in grid.best_params_.items():
        print(k,v)

    ## keep the best estimator found by the grid search
    clf = grid.best_estimator_

  # prediction
  print('[info] estimator:\n',clf)
  from sklearn.model_selection import cross_val_predict
  yhat_train = clf.fit(X_train,y_train).predict(X_train)
  yhat_cv = cross_val_predict(clf,X_train,y_train,cv=10, n_jobs=2,method='predict')

  # return
  return (yhat_train,yhat_cv,clf)
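
For reference, a minimal usage sketch of prediction() (not from the original post): it assumes a GradientBoostingRegressor and random demo data, and the grid values below are illustrative only, not recommendations.

# hypothetical usage: tune a GBR on random demo data
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

X_demo = np.random.rand(200, 5)   # 200 samples, 5 features
y_demo = np.random.rand(200)      # continuous target in [0, 1)
dgrid = {'n_estimators': [50, 100], 'max_depth': [2, 3]}   # illustrative grid
yhat_tr, yhat_cv, model = prediction(X_demo, y_demo, GradientBoostingRegressor(), dgrid)
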
## algorithm validation: scores, feature importance and learning curve
def validation(RESULT: 'dataframe', folder_output: str, starget: str, lx: list, estimator: 'sklearn estimator', X_train: 'array', y_train: 'array'):
	"""
	Algorithm validation: scores, feature importance and learning curve.

	RESULT -- df with real target 'y', training prediction 'yhat_train' and cv prediction 'yhat_cv'.
	folder_output -- folder where the outputs are stored.
	starget -- restaurant id or name.
	lx -- list of features for the feature importance table.
	estimator -- fitted sklearn estimator (ideally with a feature_importances_ attribute). It is also used for the learning curve.
	X_train, y_train -- arrays of features and target for the learning curve.
	"""

	# ### Scores

	# validation: rmse
	from sklearn.metrics import mean_squared_error
	error_train = np.sqrt(mean_squared_error(RESULT.y.values,RESULT.yhat_train.values))
	error_cv = np.sqrt(mean_squared_error(RESULT.y.values,RESULT.yhat_cv.values))
	print('[info] scores:')
	print('RMSE: train = %.3f   cv = %.3f'%(error_train, error_cv))

	# validation: rmsle
	def rmsle(y_true: 'array', y_pred: 'array'):
	  assert len(y_true) == len(y_pred)
	  return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5
	error_train_log = rmsle(RESULT.y.values,RESULT.yhat_train.values)
	error_cv_log = rmsle(RESULT.y.values,RESULT.yhat_cv.values)
	print('RMSLE: train = %.3f   cv = %.3f\n'%(error_train_log, error_cv_log))

	## store plots
	path_output_training = os.path.join(folder_output,'plot-validation_training-%s.png'%starget)
	path_output_cv = os.path.join(folder_output,'plot-validation_cv-%s.png'%starget)
	import matplotlib.pyplot as plt
	# training plot
	fig, ax = plt.subplots(figsize=(20,4))
	RESULT[['y','yhat_train']].plot(ax=ax,title='Training Error: rmse = %.3f   rmsle = %.3f'%(error_train, error_train_log))
	plt.savefig(path_output_training,bbox_inches='tight',transparent=False)
	plt.close(fig)   # close each figure after saving so it does not leak
	# cv plot
	fig, ax = plt.subplots(figsize=(20,4))
	RESULT[['y','yhat_cv']].plot(ax=ax,title='CV Error: rmse = %.3f   rmsle = %.3f'%(error_cv, error_cv_log))
	plt.savefig(path_output_cv,bbox_inches='tight',transparent=False)
	plt.close(fig)

	# store results
	path_output_result = os.path.join(folder_output,'data-results-training_cv-%s.csv'%starget)
	RESULT.to_csv(path_output_result,index=True)
	print('[info] saved the results file and the validation plots.')




	# ### Feature Importance
	try:
		# collect
		FI = pd.DataFrame({'feature':lx,'importance':estimator.feature_importances_})
		FI.sort_values(['importance'], ascending=[0], inplace=True)
		# conversion to %
		FI['importance'] = FI['importance'].apply(lambda x: x*100.)
		FI.rename(columns={'importance':'importance(%)'}, inplace=True)
		# save
		path_output_importance = os.path.join(folder_output,'table-features_importance-%s.csv'%starget)
		FI.to_csv(path_output_importance,index=False)
		print('[info] saved the feature importance table.\n')
		# clean
		del FI
	except AttributeError:
		print('[warning] this estimator does not have a feature_importances_ attribute.')



	# ### Learning Curve

	from sklearn.model_selection import learning_curve
	from sklearn.metrics import mean_squared_error, make_scorer
	# custom scorer: make_scorer negates the loss when greater_is_better=False,
	# so frmse must return the positive rmse (the curve then shows negative rmse, higher is better)
	def frmse(y_true, y_pred):
	  return np.sqrt(mean_squared_error(y_true, y_pred))
	srmse = make_scorer(frmse, greater_is_better=False)

	# build learning curve
	train_sizes, train_scores, valid_scores = learning_curve(estimator, X_train, y_train, train_sizes=np.array([ 0.1, 0.33, 0.55, 0.78, 1. ]), cv=10, scoring=srmse)
	# store results in a pandas df to be plotted
	LC = pd.DataFrame({'sizes':train_sizes, 'train':np.mean(train_scores,axis=1), 'cv':np.mean(valid_scores,axis=1)}).set_index('sizes')
	# plot
	path_output_learning = os.path.join(folder_output,'plot-learning_curve-%s.png'%starget)
	import matplotlib.pyplot as plt
	fig, ax = plt.subplots(figsize=(8,4))
	LC.plot(title='Learning Curve',ax=ax)
	plt.savefig(path_output_learning,bbox_inches='tight',transparent=False)
	# close plot
	plt.cla()   # Clear axis
	plt.clf()   # Clear figure
	plt.close() # Close a figure window
	print('\n[info] saved the learning curve plot.')
	# clean
	del LC


	# return 
	return None
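
Continuing the sketch above, a hedged example of wiring validation() to the outputs of prediction(); the output folder, id and feature names below are hypothetical.

# hypothetical usage: validate the tuned model from the previous sketch
RESULT_demo = pd.DataFrame({'y': y_demo, 'yhat_train': yhat_tr, 'yhat_cv': yhat_cv})
validation(RESULT_demo, folder_output='.', starget='demo',
           lx=['x%i' % i for i in range(X_demo.shape[1])],   # hypothetical feature names
           estimator=model, X_train=X_demo, y_train=y_demo)
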
## data preparation to be used for prediction
def preparation(sample: 'dataframe', lx: list, ly: list, npoly: int = 0, iscaling: bool = True) -> tuple:
	"""
	Data preparation to be used for prediction.

	sample -- dataframe with data: features + target.
	lx -- list of features to be used.
	ly -- list with the target column to be used (only the first element is read).
	npoly -- degree of the polynomial features transformation. If npoly=0, this transformation is not applied.
	iscaling -- whether or not to scale the features with MinMaxScaler.
	return -- tuple(X, y, lx, ly)
	"""


	## features
	X = sample[lx].as_matrix()

	# polynomial features generation
	if npoly>0:
		from sklearn.preprocessing import PolynomialFeatures
		poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
		X = poly.fit_transform(X)
		lx = poly.get_feature_names(input_features=lx)

	# scaling
	from sklearn.preprocessing import MinMaxScaler
	scaler = MinMaxScaler()
	X = scaler.fit_transform(X)


	## target
	y = sample[ly[0]].values

	# return
	return (X,y,lx,ly)
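
Finally, a hedged end-to-end sketch chaining the three helpers; the csv path, column names and grid values are hypothetical, not from the original post.

# hypothetical usage: preparation -> prediction -> validation
from sklearn.ensemble import GradientBoostingRegressor

df = pd.read_csv('data.csv')   # hypothetical file with features f1, f2 and target 'sales'
X, y, lx, ly = preparation(df, lx=['f1','f2'], ly=['sales'], npoly=2, iscaling=True)
yhat_tr, yhat_cv, model = prediction(X, y, GradientBoostingRegressor(), {'max_depth': [2, 3]})
RESULT = pd.DataFrame({'y': y, 'yhat_train': yhat_tr, 'yhat_cv': yhat_cv}, index=df.index)
validation(RESULT, folder_output='.', starget='demo', lx=lx, estimator=model, X_train=X, y_train=y)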
