如何更改 - 使用 for 循环调用多个函数 - 使用管道调用类？

Posted 2023-03-12

技术标签:

【中文标题】如何更改 - 使用 for 循环调用多个函数 - 使用管道调用类？【英文标题】：How do I change - using for loops to call multiple functions - into - using a pipeline to call a class? 【发布时间】：2019-08-23 09:38:45 【问题描述】：

所以基本要求是：我从用户那里得到一个模型字典，以及对应的超参数字典，然后给出一份报告。目前的目标是二分类，但以后可以扩展。

这就是我目前正在做的事情：

import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset into a DataFrame and attach the labels as 'target'.
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(target=cancer.target)
target = df['target']
# Stratified split with a fixed seed; 40% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target', axis=1),
    target,
    test_size=0.4,
    random_state=13,
    stratify=target,
)

def build_model(model_name, model_class, params=None):
    """
    Build an estimator for ``model_name``, optionally wrapped in a randomized search.

    Constructor arguments are chosen from substrings of ``model_name``:
    'Ridge' -> ``penalty='l2'``; 'Lasso' -> ``penalty='l1'`` with the liblinear
    solver; 'Ensemble' -> hard voting over a random forest and a GBM;
    anything else -> ``model_class()`` with no arguments.

    :param model_name: display name of the model; also the key looked up in ``params``
    :param model_class: estimator class (or any callable returning an estimator)
    :param params: optional dict mapping model names to parameter distributions
    :return: the bare estimator when ``params`` is None, otherwise an unfitted
             RandomizedSearchCV wrapping it
    :raises KeyError: if ``params`` is given but has no entry for ``model_name``
    """
    if 'Ridge' in model_name:
        model = model_class(penalty='l2')
    elif 'Lasso' in model_name:
        # An l1 penalty needs a solver that supports it; liblinear does.
        model = model_class(penalty='l1', solver='liblinear')
    elif 'Ensemble' in model_name:
        model = model_class(
            estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())],
            voting='hard',
        )
    else:
        model = model_class()

    if params is None:
        print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
        return model

    print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
    # Failed candidate fits are scored 0.0 instead of aborting the search.
    return RandomizedSearchCV(
        estimator=model,
        param_distributions=params[model_name],
        random_state=22,
        n_iter=10,
        cv=5,
        verbose=1,
        n_jobs=-1,
        scoring=make_scorer(f1_score),
        error_score=0.0,
    )

def fit_model(model_name, model_instance, xTrain, yTrain):
    """
    Fit ``model_instance`` on the training data and return whatever ``fit`` returns.

    When ``model_name`` is exactly 'SVM' the features are first standardised
    with a freshly fitted StandardScaler; that scaler is local to this call
    and is not returned.
    """
    needs_scaling = model_name == 'SVM'
    features = StandardScaler().fit_transform(xTrain) if needs_scaling else xTrain
    return model_instance.fit(features, yTrain)

def predict_vals(fitted_model, xTest, model_name=None):
    """
    Predict labels for ``xTest`` with an already fitted model.

    :param fitted_model: object exposing ``predict``
    :param xTest: test features
    :param model_name: name of the model; 'SVM' triggers standardisation of
                       ``xTest`` before predicting. When omitted, a module-level
                       variable called ``model_name`` is used if one exists
                       (this keeps older two-argument callers working).
    :return: whatever ``fitted_model.predict`` returns
    """
    if model_name is None:
        # Legacy behaviour: the name used to be read from module scope.
        model_name = globals().get('model_name')

    if model_name == 'SVM':
        # NOTE(review): this scaler is fitted on the test data itself, not on
        # the training data the model was fitted with.
        scaler = StandardScaler()
        return fitted_model.predict(scaler.fit_transform(xTest))

    return fitted_model.predict(xTest)

def get_metrics(yTest, y_prediction):
    """
    Score predictions against the true labels.

    :return: list ``[recall, precision, f1, roc_auc]``, in that order
    """
    scorers = (recall_score, precision_score, f1_score, roc_auc_score)
    return [scorer(yTest, y_prediction) for scorer in scorers]

def model_report(list_of_metrics):
    """
    Turn rows of ``[model name, recall, precision, f1, roc_auc]`` into a
    DataFrame with numeric values rounded to three decimals.
    """
    headers = ['Model', 'Recall', 'Precision', 'f1', 'roc_auc']
    return pd.DataFrame(list_of_metrics, columns=headers).round(3)

# Display name -> estimator class. The names matter: build_model() looks for
# the substrings 'Ridge', 'Lasso' and 'Ensemble' to pick constructor arguments.
models = {
    'Logistic Regression Ridge': LogisticRegression,
    'Logistic Regression Lasso': LogisticRegression,
    'Random Forest': RandomForestClassifier,
    'SVM': SVC,
    'GBM': GradientBoostingClassifier,
    'EnsembleRFGBM': VotingClassifier,
}


# Model display name -> parameter distributions handed to RandomizedSearchCV.
# Keys of the outer dict must match the keys of `models`.
model_parameters = {
    'SVM': {
        # 25 random draws between 1 and 50 (bounds given as low, high).
        'C': np.random.uniform(1, 50, size=25),  # alternative grid: [1, 10, 100, 1000]
        'class_weight': ['balanced'],
        'gamma': [0.0001, 0.001],
        'kernel': ['linear'],
    },
    'Random Forest': {
        'n_estimators': [5, 10, 50, 100, 200],
        'max_depth': [3, 5, 10, 20, 40],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        # A single value, drawn once when this dict is built.
        'min_samples_leaf': [np.random.randint(1, 10)],
    },
    'Logistic Regression Ridge': {
        'C': np.random.rand(25),
        'class_weight': ['balanced'],
    },
    'Logistic Regression Lasso': {
        'C': np.random.rand(25),
        'class_weight': ['balanced'],
    },
    'GBM': {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [np.random.randint(1, 10)],
    },
    # 'rf__' / 'gbm__' prefixes address the named sub-estimators of the voting ensemble.
    'EnsembleRFGBM': {
        'rf__n_estimators': [5, 10, 50, 100, 200],
        'rf__max_depth': [3, 5, 10, 20, 40],
        'rf__min_samples_leaf': [np.random.randint(1, 10)],
        'gbm__n_estimators': [10, 50, 100, 200, 500],
        'gbm__max_depth': [3, 5, 10, None],
        'gbm__min_samples_leaf': [np.random.randint(1, 10)],
    },
}

没有参数我得到以下报告。

# without parameters
# Build, fit and score every entry of `models` with no hyper-parameter search,
# collecting one row per model.
lst = []
# NOTE: keep this as a module-level for loop. predict_vals() is called without
# a model name and picks up the loop variable `model_name` from module scope.
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    # Row layout: model name followed by the metric values.
    lst.append([model_name] + metrics)

# Bare expression; presumably evaluated for notebook display of the DataFrame.
model_report(lst)

将参数作为输入

# with parameters
# Same flow as the run without parameters, but `model_parameters` is passed to
# build_model(), which then returns a RandomizedSearchCV instead of a bare model.
lst = []
# NOTE: keep this as a module-level for loop. predict_vals() is called without
# a model name and picks up the loop variable `model_name` from module scope.
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class, model_parameters)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    # Row layout: model name followed by the metric values.
    lst.append([model_name] + metrics)

# Bare expression; presumably evaluated for notebook display of the DataFrame.
model_report(lst)

现在交给我的任务如下。

从用户处获取模型字典及其参数。如果未提供参数，则使用模型的默认值。将报告作为输出提供（如图所示）

有人告诉我应该将函数更改为类。并尽可能避免使用 for 循环。

我的挑战：

如何将所有函数更改为类和方法？基本上我的前辈想要类似的东西

report.getReport # gives the dataFrame of the report

但在我看来，上面的内容可以在如下函数中完成（我不明白类为什么/如何有益）

customReport(whatever inputs I'd like to give) # gives df of report

for loops

类似的东西

customPipeline = Pipeline([ ('rf', RandomForestClassifier(with relevant params from params dict),
                             'SVC', SVC(with relevant params from params dict)) ] )

我在 here 找到了一个类似的解决方案，但我想避免使用 for 循环。

另一个相关的解决方案 here 正在使用一个可以在不同模型之间切换的类。但在这里我会要求用户能够选择是否要执行 Gridsearch/RandomizedSearch/CV/None。我的想法是我使用这个类，然后将它继承到另一个类，用户可以提供输入以选择 Gridsearch/RandomizedSearch/CV/None 等。我不确定我的想法是否正确。

注意一个完整的工作解决方案是可取的（会喜欢它）但不是强制性的。如果您的答案有一个可以给我指导如何进行的框架，那没关系。我可以探索并从中学习。

【问题讨论】：

这个问题太笼统了，不清楚。在目前的形式中，它看起来像是一个家庭作业问题。请改写此内容并添加您尝试实现的内容，以实现对您的要求。作为初学者，您可以查看estimator design of scikit-learn here。此外，您当前的代码将为 SVM 提供错误的结果，因为您在每次调用预测时都拟合了一个新的 StandardScaler。正如您保存模型一样，您还需要在拟合期间保存缩放器对象。这就是管道可以帮助您的地方。 @VivekKumar (1) 我不熟悉使用 python 类。在发布时，我没有尝试过任何东西。请在下面查看我的答案，了解我提出的工作解决方案。我认为我的解决方案更好地解释了我想要的。请随时编辑我的问题/详细信息，以获得更好的英文措辞。 (2) 我在StandardScaler 上理解并同意你的看法。这只是一个一次性的解决方案。这方面还有很多需要改进的地方。我是使用sklearn 的新手，所以我还在学习。 @VivekKumar In its current form its looking like a homework problem 我无法证明这一点，这是作为初级开发人员分配给我的任务，不管是否有作业，我的重点是学会解决问题。如果这是因为 解决我的作业而导致的，我深表歉意。那不是我的本意。 【参考方案1】：

你可以考虑使用map()，详情看这里：https://www.geeksforgeeks.org/python-map-function/

一些程序员有避免原始循环（raw loop）的习惯——“原始循环是指函数内部的任何循环，而该函数的用途大于该循环所实现的算法”。更多细节在这里：https://sean-parent.stlab.cc/presentations/2013-09-11-cpp-seasoning/cpp-seasoning.pdf

我认为这就是要求您删除 for 循环的原因。

【讨论】：

我添加了一个可行的解决方案。请看一下，如果您有任何建议，请告诉我。 :)【参考方案2】：

我已经实施了一个可行的解决方案。我应该更好地表达我的问题。我最初误解了GridsearchCV 或RandomizedSearchCV 在内部是如何工作的。 cv_results_ 提供所有可用的网格结果。我以为只有best estimator 可供我们使用。

使用这个，对于每种类型的模型，我取了最大值rank_test_score，并得到了构成模型的参数。在这个例子中，它是 4 个模型。现在我用我的测试数据运行每个模型，即每个模型的最佳参数组合，并预测所需的分数。我认为这个解决方案可以扩展到RandomizedSearchCV 和更多其他选项。

注意：这只是一个简单的解决方案。需要做很多修改，比如需要为特定模型缩放数据等。这个解决方案只是作为一个起点，可以根据用户的需要进行修改。

ClfSwitcher() 类出自 this answer，在此致谢。

以下是类的实现（欢迎提出改进建议）。

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')

# Breast-cancer features as a DataFrame, with the labels added as a 'target' column.
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(target=cancer.target)
target = df['target']
# Fixed-seed stratified split; test_size=0.4 holds out 40% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target', axis=1),
    target,
    test_size=0.4,
    random_state=13,
    stratify=target,
)

class ClfSwitcher(BaseEstimator):
    """
    A custom BaseEstimator that can switch between classifiers.

    The wrapped classifier is stored as the ``model`` parameter and every
    method simply delegates to it.
    """

    def __init__(self, model=RandomForestClassifier()):
        """
        :param model: sklearn object - The classifier to delegate to
        """
        # NOTE(review): the default instance is created once, when the class is
        # defined, and is shared by every ClfSwitcher built without a model.
        self.model = model

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped model on (X, y) and return self.

        Extra keyword arguments are accepted but not forwarded to the model.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped model's predictions for X (``y`` is unused)."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` output for X."""
        return self.model.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped model's ``score`` on (X, y)."""
        # The classifier lives in ``self.model``; there is no ``estimator`` attribute.
        return self.model.score(X, y)

class report(ClfSwitcher):
    """
    Run a grid search over a pipeline and summarise it as DataFrames.

    Typical use: ``griddy`` -> ``fit_grid`` -> ``make_grid_report`` and/or
    ``make_concise_report``; results land in ``full_report`` and
    ``concise_report``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ClfSwitcher and initialise empty results."""
        super().__init__(*args, **kwargs)
        self.grid = None  # GridSearchCV, set by griddy()
        self.full_report = None  # DataFrame of cv_results_, set by make_grid_report()
        self.concise_report = None  # one row per model class, set by make_concise_report()
        # Metric column name -> scoring function called as metric(y_true, y_pred).
        self.scoring_metrics = {
            'precision': precision_score,
            'recall': recall_score,
            'f1': f1_score,
            'roc_auc': roc_auc_score,
        }

    def griddy(self, pipeLine, parameters, **kwargs):
        """
        Create (but do not fit) the GridSearchCV stored in ``self.grid``.

        :param pipeLine: estimator / pipeline to search over
        :param parameters: parameter grid (dict or list of dicts)
        :param kwargs: extra GridSearchCV options; ``scoring`` defaults to
                       'accuracy' and ``n_jobs`` to -1 unless given here
        """
        search_options = {'scoring': 'accuracy', 'n_jobs': -1}
        # Caller-supplied options (e.g. scoring='f1') override the defaults.
        search_options.update(kwargs)
        self.grid = GridSearchCV(pipeLine, parameters, **search_options)

    def fit_grid(self, X_train, y_train=None, **kwargs):
        """Fit the search created by griddy(); extra keyword arguments are ignored."""
        self.grid.fit(X_train, y_train)

    def make_grid_report(self):
        """Store the complete ``cv_results_`` table in ``self.full_report``."""
        self.full_report = pd.DataFrame(self.grid.cv_results_)

    @staticmethod
    def get_names(col):
        """Return the class name of ``col`` (an estimator instance)."""
        return col.__class__.__name__

    @staticmethod
    def calc_score(col, metric):
        """
        Fit estimator ``col`` and score its predictions with ``metric``.

        Uses the module-level ``X_train``, ``y_train``, ``X_test`` and
        ``y_test``; the result is rounded to four decimals.
        """
        return round(metric(y_test, col.fit(X_train, y_train).predict(X_test)), 4)

    def make_concise_report(self):
        """
        Build ``self.concise_report``: for each model class, the best-ranked
        grid candidate plus its precision/recall/f1/roc_auc.

        Expects the grid to contain a ``cst__model`` parameter (column
        ``param_cst__model`` in ``cv_results_``).
        """
        results = pd.DataFrame(self.grid.cv_results_)
        results['model_names'] = results['param_cst__model'].apply(self.get_names)

        # rank_test_score == 1 is the best candidate, so sort ascending and
        # keep the first row of every model group.
        best_rows = (
            results.sort_values(['model_names', 'rank_test_score'], ascending=[True, True])
            .groupby(['model_names'])
            .head(1)[['param_cst__model', 'model_names']]
            .reset_index(drop=True)
        )

        # NOTE(review): calc_score refits the estimator object stored in
        # 'param_cst__model'; confirm that this object carries the tuned
        # hyper-parameters of the selected row for the sklearn version in use.
        for metric_name, metric_func in self.scoring_metrics.items():
            best_rows[metric_name] = best_rows['param_cst__model'].apply(self.calc_score, metric=metric_func)

        self.concise_report = best_rows[['model_names', 'precision', 'recall', 'f1', 'roc_auc', 'param_cst__model']]

# Single-step pipeline; the step is named 'cst' and holds a ClfSwitcher.
pipeline = Pipeline(steps=[('cst', ClfSwitcher())])

# One grid per classifier. 'cst__model' swaps the classifier inside the 'cst'
# pipeline step; 'cst__model__<name>' sets a hyper-parameter on that classifier.
parameters = [
    {
        'cst__model': [RandomForestClassifier()],
        'cst__model__n_estimators': [10, 20],
        'cst__model__max_depth': [5, 10],
        'cst__model__criterion': ['gini', 'entropy'],
    },
    {
        'cst__model': [SVC()],
        'cst__model__C': [10, 20],
        'cst__model__kernel': ['linear'],
        'cst__model__gamma': [0.0001, 0.001],
    },
    {
        'cst__model': [LogisticRegression()],
        'cst__model__C': [13, 17],
        'cst__model__penalty': ['l1', 'l2'],
        # liblinear supports both penalties, so the 'l1' candidates can be fitted.
        'cst__model__solver': ['liblinear'],
    },
    {
        'cst__model': [GradientBoostingClassifier()],
        'cst__model__n_estimators': [10, 50],
        'cst__model__max_depth': [3, 5],
        'cst__model__min_samples_leaf': [1, 2],
    },
]

# Create the report object, set up the grid search over `pipeline` and
# `parameters`, run it on the training split, then build the summary table.
my_report = report()
my_report.griddy(pipeline, parameters, scoring='f1')
my_report.fit_grid(X_train, y_train)
my_report.make_concise_report()
# Bare expression; presumably evaluated for notebook display of the DataFrame.
my_report.concise_report

根据需要输出报告。

【讨论】：

以上是关于如何更改 - 使用 for 循环调用多个函数 - 使用管道调用类？的主要内容，如果未能解决你的问题，请参考以下文章