查找和使用 XGBoost 回归管道中的前 10 个功能
Posted
技术标签:
【中文标题】查找和使用 XGBoost 回归管道中的前 10 个功能【英文标题】:Find and use top 10 features in XGBoost regression pipeline 【发布时间】:2022-01-21 20:49:31 【问题描述】:我想用XGBRegressor
获得前10 个功能ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)
我获得前10 个功能。但是我怎么能在我的管道中使用它呢?
我有这门课FeatureSelector_Only_Top_10
,我怎么能只使用前10个功能,然后打印出来?例如print(grid.feature_selection_top_10.top10features)
。
进口:
import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
XGB:
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)
xgb_reg_end = time.time()
print(f"Time taken to run: round((xgb_reg_end - xgb_reg_start)/60,1) minutes")
print("\nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', inplace=True)
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)
管道:
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
def __init__(self,n_components = 10):
self.n_components = n_components
def fit(self, X, y = None):
# Don't know
return self
def transform(self, X, y = None):
# Don't know
return X
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)
steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
#('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)
parameteres =
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print("score = %3.2f" %(grid.score(X_test,y_test)))
【问题讨论】:
【参考方案1】:您可以在管道中包含SelectFromModel
,以便根据重要性权重提取前 10 个特征,无需创建自定义转换器。如documentation中所述,如果要选择10个功能需要设置max_features=10
和threshold=-np.inf
。
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)
X = pd.DataFrame(data=X, columns=['x' + str(i) for i in range(X.shape[1])])
y = pd.Series(y, name='y')
pipeline = Pipeline([
('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
('regressor', LinearRegression())
])
pipeline.fit(X, y)
selected_features = pipeline['selector'].get_support()
print(selected_features.sum())
# 10
selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']
selected_features_importances = pipeline['selector'].estimator_.feature_importances_[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]
selected_features_coefficients = pipeline['regressor'].coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515 0.16829694 28.81967319 0.50277914 24.55006237 68.17120687]
【讨论】:
【参考方案2】:如果您想在Pipeline
中选择您的数据集的最佳特征N
,您应该定义一个自定义转换器。
此对象应在 transform()
方法期间从 xgboost 训练并选择 N
最佳特征。然后在transform()
方法中,这个转换器应该相应地过滤你的数据集。
我会这样做:
from sklearn.datasets import make_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso
import pandas as pd
import xgboost as xgb
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
def __init__(self,n_components = 10):
self.n_components = n_components
self.top_n_features = None
def fit(self, X, y = None):
X = pd.DataFrame(X)
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X, y)
self.top_n_features = (pd.DataFrame(
xgb_reg.feature_importances_,
columns=['weight'],
index=X.columns)
.sort_values(by='weight', ascending=False)
.head(10)
)
return self
def transform(self, X, y = None):
return pd.DataFrame(X).filter(self.top_n_features.index)
X, y = make_regression(n_features=50)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)
steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)
pipeline.fit(X, y)
print("score = %3.2f" %(pipeline.score(X_test,y_test)))
#retrieve the top N features and their weights
pipeline['feature_selection_top_10'].top_n_features
【讨论】:
是否可以选择获取前 10 个值的名称和分数? 我刚刚修改了答案以检索前 n 个特征及其权重。以上是关于查找和使用 XGBoost 回归管道中的前 10 个功能的主要内容,如果未能解决你的问题,请参考以下文章
XGBoost 与 GridSearchCV、缩放、PCA 和 sklearn 管道中的 Early-Stopping
R语言使用xgboost构建回归模型:vtreat包为xgboost回归模型进行数据预处理(缺失值填充缺失值标识离散变量独热onehot编码)构建出生体重的xgboost模型回归模型