查找和使用 XGBoost 回归管道中的前 10 个特征

Posted 2023-03-12

技术标签:

【中文标题】查找和使用 XGBoost 回归管道中的前 10 个特征【英文标题】：Find and use top 10 features in XGBoost regression pipeline 【发布时间】：2022-01-21 20:49:31 【问题描述】：

我想用 XGBRegressor 获得前 10 个特征。通过 ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10) 我可以得到前 10 个特征，但是怎样才能在我的管道中使用它们呢？

我有这个类 FeatureSelector_Only_Top_10，怎样才能只使用前 10 个特征，然后把它们打印出来？例如 print(grid.feature_selection_top_10.top10features)。

导入：

import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

XGB：

# Fit a baseline XGBoost regressor and time the fit + predict steps.
# X_train_nor, X_test_nor, y_train, y_test and X_train are not defined in this
# snippet; they must already exist when it runs.
xgb_reg_start = time.time()

xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)

xgb_reg_end = time.time()

# The expression has to sit inside {} to be interpolated; without the braces
# the f-string printed the literal text "round(...)".
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("\nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))

# One importance weight per feature, indexed by the column names of X_train.
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
# Leaves the frame itself sorted by ascending weight.
ft_weights_xgb_reg.sort_values('weight', inplace=True)
# The ten largest weights (shown as cell output in a notebook).
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)

管道：

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    """Placeholder transformer intended to keep only the top features.

    ``fit`` and ``transform`` are still stubs: nothing is learned and the
    input is passed through unchanged.
    """

    def __init__(self, n_components=10):
        # Stored, but not used by fit/transform yet.
        self.n_components = n_components

    def fit(self, X, y=None):
        """Return ``self`` without learning anything (not implemented)."""
        # Don't know
        return self

    def transform(self, X, y=None):
        """Return ``X`` unchanged (not implemented)."""
        # Don't know
        return X

# Local import: GridSearchCV is used below but is not among the imports above.
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the data for testing. X and y are not defined in this
# snippet; they must already exist when it runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         #('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)
# The right-hand side was missing (a syntax error). An empty grid makes the
# search evaluate the pipeline once with its current settings; add entries
# such as {'lasso__alpha': [...]} to actually tune something.
parameteres = {}

grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print("score = %3.2f" %(grid.score(X_test,y_test)))

【问题讨论】：

【参考方案1】：

您可以在管道中加入 SelectFromModel，根据重要性权重提取前 10 个特征，无需创建自定义转换器。如官方文档中所述，如果要选择 10 个特征，需要设置 max_features=10 和 threshold=-np.inf。

import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression

# Synthetic regression problem with 1000 samples and 100 features.
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)

# Wrap the arrays in pandas objects so every feature gets a name (x0, x1, ...).
feature_names = [f'x{i}' for i in range(X.shape[1])]
X = pd.DataFrame(data=X, columns=feature_names)
y = pd.Series(y, name='y')

# max_features=10 together with threshold=-np.inf makes the selector keep
# 10 features ranked by importance.
pipeline = Pipeline([
    ('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
    ('regressor', LinearRegression()),
])
pipeline.fit(X, y)

fitted_selector = pipeline.named_steps['selector']
fitted_regressor = pipeline.named_steps['regressor']

# Boolean mask over the input columns: True where a feature was kept.
selected_features = fitted_selector.get_support()
print(selected_features.sum())
# 10

selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']

# Importances of the kept features, taken from the fitted XGBoost estimator.
all_importances = fitted_selector.estimator_.feature_importances_
selected_features_importances = all_importances[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]

# Coefficients of the linear model fitted on the kept features.
selected_features_coefficients = fitted_regressor.coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515  0.16829694 28.81967319  0.50277914 24.55006237 68.17120687]

【讨论】：

【参考方案2】：

如果您想在 Pipeline 中选择数据集的前 N 个最佳特征，您应该定义一个自定义转换器。

此对象应在 fit() 方法中训练 xgboost 模型并选出 N 个最佳特征。然后在 transform() 方法中，这个转换器应该据此过滤你的数据集。

我会这样做：

from sklearn.datasets import make_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso

import pandas as pd
import xgboost as xgb

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    """Keep only the features an XGBoost regressor ranks as most important.

    Parameters
    ----------
    n_components : int, default=10
        Number of top-ranked features to keep.

    Attributes
    ----------
    top_n_features : pandas.DataFrame or None
        ``None`` until ``fit`` is called. Afterwards a one-column frame
        (``'weight'``) indexed by the selected feature names, sorted by
        descending importance.
    """

    def __init__(self, n_components=10):
        self.n_components = n_components
        self.top_n_features = None

    def fit(self, X, y=None):
        """Train an XGBoost regressor on ``X``/``y`` and rank the features.

        Returns ``self`` so the transformer can be used in a pipeline.
        """
        # Wrapping in a DataFrame gives array input column labels too.
        X = pd.DataFrame(X)
        xgb_reg = xgb.XGBRegressor()
        xgb_reg.fit(X, y)
        importances = pd.DataFrame(
            xgb_reg.feature_importances_,
            columns=['weight'],
            index=X.columns,
        )
        # Honour n_components instead of a hard-coded 10.
        self.top_n_features = (
            importances
            .sort_values(by='weight', ascending=False)
            .head(self.n_components)
        )
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame restricted to the selected columns."""
        return pd.DataFrame(X).filter(self.top_n_features.index)




# Local imports: both names are used below but this snippet does not import them.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

X, y = make_regression(n_features=50)

# Hold out 20% of the samples for scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)

# Fit on the training split only; fitting on all of X, y would leak the test
# samples into the model and inflate the score below.
pipeline.fit(X_train, y_train)
print("score = %3.2f" %(pipeline.score(X_test,y_test)))

# Retrieve the top N features and their weights. Printed explicitly so the
# result is also visible when run as a script, not only in a notebook.
print(pipeline['feature_selection_top_10'].top_n_features)

【讨论】：

是否可以获取前 10 个特征的名称和分数？——我刚刚修改了答案，以便检索前 n 个特征及其权重。

以上是关于查找和使用 XGBoost 回归管道中的前 10 个特征的主要内容，如果未能解决你的问题，请参考以下文章