Feature Correlation Analysis: Scorecard Binning

Posted by 龙鸣丿

Binning
(1) Equal-frequency binning
(2) Ensure every bin contains both 0s and 1s
(3) Define the WOE and IV functions
(4) Chi-square test, merge bins, and plot the IV curve
(5) Bin with the optimal number of bins and validate the result
(6) Wrap the optimal-bin-count selection into a function and apply it to every feature
(7) Compute each bin's WOE and map it onto the data
(8) Finally, map all the WOE values back onto the original data
Modeling and model validation
Building the scorecard

Dataset: card.csv, the data used to build the scorecard (available as a CSDN machine-learning resource download)

Binning steps:

# After computing IV, run chi-square tests on adjacent groups and merge those with large p-values, until the group count drops to the target N bins
# For one feature, try every bin count from 2 to 20 and watch how IV changes, to find the best number of bins
# After binning, compute each bin's WOE and bad% to inspect the binning quality
# Once that works, bin every feature and compare the features' IV values to select features
# Chi-square test, merge bins, plot the IV curve
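
For reference, the quantities these steps compute are defined as follows, with good_i and bad_i the counts of label-0 and label-1 samples in bin i:

WOE_i = ln( (good_i / total_good) / (bad_i / total_bad) )
IV = sum over bins of (good%_i - bad%_i) * WOE_i

A large chi-square p-value between two adjacent bins means their good/bad mixes are statistically indistinguishable, so merging them loses little information.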

Code:

# _*_ coding : utf-8 _*_
# @Time : 2022/7/2 16:37
# @Author 王拓
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.linear_model import LogisticRegression as LR


def fill_missing_rf(X, y, to_fill):
    """Fill missing values in one column with random forest regression.
    X: feature matrix; y: label; to_fill: name of the column to fill."""
    df = X.copy()
    fill = df.loc[:, to_fill]
    # The remaining features plus the label serve as predictors for the column to fill
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
    # Rows where the column is known form the training set; the missing rows get predicted
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]
    # Fit a random forest regressor to impute the missing values
    from sklearn.ensemble import RandomForestRegressor as rfr
    rfr = rfr(n_estimators=100).fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)
    return Ypredict
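
# Note: a regressor (rather than a classifier) is used because the filled column is
# continuous; the iloc lookups above also assume the data carries a clean 0..n-1
# RangeIndex, which the caller establishes by resetting the index beforehand.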


def graphforbestbin(DF, X, Y, n=5, q=20, graph=True):
    """
    Automatic optimal binning based on the chi-square test.
    :param DF: input data
    :param X: name of the column to bin
    :param Y: name of the label column for the binned data
    :param n: number of bins to keep
    :param q: initial number of bins
    :param graph: whether to plot the IV curve
    Intervals are open on the left and closed on the right.
    """
    DF = DF[[X, Y]].copy()
    DF["qcut"], bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")
    count_y0 = DF.loc[DF[Y] == 0].groupby(by="qcut").count()[Y]
    count_y1 = DF.loc[DF[Y] == 1].groupby(by="qcut").count()[Y]
    num_bins = [*zip(bins, bins[1:], count_y0, count_y1)]
    # Ensure every bin contains both classes: merge any bin lacking 0s or 1s into a neighbor
    for i in range(q):
        # If the first bin lacks one class, merge it into the bin below it
        if 0 in num_bins[0][2:]:
            num_bins[0:2] = [(
                num_bins[0][0],
                num_bins[1][1],
                num_bins[0][2] + num_bins[1][2],
                num_bins[0][3] + num_bins[1][3])]
            continue

        # Otherwise merge any later bin lacking a class into the bin above it
        for i in range(len(num_bins)):
            if 0 in num_bins[i][2:]:
                num_bins[i-1:i+1] = [(
                    num_bins[i-1][0],
                    num_bins[i][1],
                    num_bins[i-1][2] + num_bins[i][2],
                    num_bins[i-1][3] + num_bins[i][3])]
                break
        else:
            # No bin lacked a class, so the cleanup is complete
            break

    def get_woe(num_bins):
        columns = ["min", "max", "count_0", "count_1"]
        df = pd.DataFrame(num_bins, columns=columns)
        df["total"] = df.count_0 + df.count_1
        df["percentage"] = df.total / df.total.sum()
        df["bad_rate"] = df.count_1 / df.total
        df["good%"] = df.count_0 / df.count_0.sum()
        df["bad%"] = df.count_1 / df.count_0.sum()
        df["woe"] = np.log(df["good%"] / df["bad%"])
        return df

    def get_iv(df):
        rate = df["good%"] - df["bad%"]
        iv = np.sum(rate * df.woe)
        return iv

    IV = []
    axisx = []
    bins_df = get_woe(num_bins)  # defined up front in case no merging is needed
    while len(num_bins) > n:
        pvs = []
        # p-value of the chi-square test between each pair of adjacent bins
        for i in range(len(num_bins) - 1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i + 1][2:]
            # chi2_contingency returns (chi2, p, dof, expected); index 1 is the p-value
            # chi2 = scipy.stats.chi2_contingency([x1, x2])[0]
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)

        # Merge the pair with the largest p-value (that bin and the one below it)
        i = pvs.index(max(pvs))
        num_bins[i:i + 2] = [(
            num_bins[i][0],
            num_bins[i + 1][1],
            num_bins[i][2] + num_bins[i + 1][2],
            num_bins[i][3] + num_bins[i + 1][3])]
        bins_df = get_woe(num_bins)
        axisx.append(len(num_bins))
        IV.append(get_iv(bins_df))
    if graph:
        plt.figure()
        plt.plot(axisx, IV)
        plt.xticks(axisx)
        plt.xlabel("number of bins")
        plt.ylabel("IV")
        plt.show()
    return bins_df
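
# Quick sanity check of the merge criterion (toy counts, purely illustrative):
# chi2_contingency's second return value is the p-value; a large p-value means the
# two bins' good/bad mixes are similar, so merging them loses little information.
print(scipy.stats.chi2_contingency([[100, 20], [110, 25]])[1])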


# Merge bins down to n using the chi-square test
def get_bin(num_bins_, n):
    while len(num_bins_) > n:
        pvs = []
        # p-value of the chi-square test between each pair of adjacent bins
        for i in range(len(num_bins_) - 1):
            x1 = num_bins_[i][2:]
            x2 = num_bins_[i + 1][2:]
            # chi2_contingency returns (chi2, p, dof, expected); index 1 is the p-value
            # chi2 = scipy.stats.chi2_contingency([x1, x2])[0]
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)

        # Merge the pair with the largest p-value (that bin and the one below it)
        i = pvs.index(max(pvs))
        num_bins_[i:i + 2] = [(
            num_bins_[i][0],
            num_bins_[i + 1][1],
            num_bins_[i][2] + num_bins_[i + 1][2],
            num_bins_[i][3] + num_bins_[i + 1][3])]
    return num_bins_


def get_woe(num_bins):
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(num_bins, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percentage"] = df.total / df.total.sum()
    df["bad_rate"] = df.count_1 / df.total
    df["good%"] = df.count_0 / df.count_0.sum()
    df["bad%"] = df.count_1 / df.count_0.sum()
    df["woe"] = np.log(df["good%"] / df["bad%"])
    return df


def get_iv(df):
    rate = df["good%"] - df["bad%"]
    iv = np.sum(rate * df.woe)
    return iv
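
# Toy sanity check of get_woe / get_iv (counts invented purely for illustration):
# each bin is (min, max, count_0, count_1)
toy_df = get_woe([(0, 10, 800, 50), (10, 20, 200, 150)])
print(toy_df[["min", "max", "woe"]])
print(get_iv(toy_df))  # higher IV = the feature separates good from bad more strongly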


if __name__=='__main__':
    data = pd.read_csv(r"C:\\Users\\DL\\Downloads\\rankingcard.csv", index_col=0)
    # print(data.head())
    print(data.info())

    data.drop_duplicates(inplace=True)
    data.index = range(data.shape[0])
    data.info()
    # Count missing values per column
    data.isnull().sum()
    data.isnull().sum() / data.shape[0]
    data.isnull().mean()
    data["NumberOfDependents"].fillna(data["NumberOfDependents"].mean(), inplace=True)
    data.isnull().mean()

    X = data.iloc[:, 1:]
    y = data["SeriousDlqin2yrs"]  # y = data.iloc[:,0]
    # X.shape
    y_pred = fill_missing_rf(X, y, "MonthlyIncome")
    # y_pred.shape
    # Overwrite the missing entries with the predictions
    data.loc[data.loc[:, "MonthlyIncome"].isnull(), "MonthlyIncome"] = y_pred
    # data.loc[data.loc[:,"MonthlyIncome"].isnull(),"MonthlyIncome"].shape
    data.info()
    # Descriptive statistics
    print(data.describe([0.01, 0.1, 0.25, .5, .75, .9, .99]).T)
    (data["age"] == 0).sum()
    data = data[data["age"] != 0]
    # data.shape
    data[data.loc[:, "NumberOfTimes90DaysLate"] > 90].count()
    data.loc[:, "NumberOfTimes90DaysLate"].value_counts()
    data = data[data.loc[:, "NumberOfTimes90DaysLate"] < 90]
    # Restore a clean 0..n-1 index
    data.index = range(data.shape[0])
    data.info()

    x = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    y.value_counts()

    n_sample = x.shape[0]
    n_1_sample = y.value_counts()[1]
    n_0_sample = y.value_counts()[0]
    print('Samples: {}; share of 1s: {:.2%}; share of 0s: {:.2%}'.format(n_sample, n_1_sample / n_sample, n_0_sample / n_sample))
    from imblearn.over_sampling import SMOTE
    sm = SMOTE(random_state=42)  # instantiate the oversampler
    x, y = sm.fit_resample(x, y)  # upsample the minority class until the classes are balanced
    n_sample = x.shape[0]
    n_1_sample = y.value_counts()[1]
    n_0_sample = y.value_counts()[0]
    print('Samples: {}; share of 1s: {:.2%}; share of 0s: {:.2%}'.format(n_sample, n_1_sample / n_sample, n_0_sample / n_sample))
    print('-----------------')
    from sklearn.model_selection import train_test_split
    X = pd.DataFrame(x)
    y = pd.DataFrame(y)
    X_train, X_vali, Y_train, Y_vali = train_test_split(X, y, test_size=0.3, random_state=420)
    model_data = pd.concat([Y_train, X_train], axis=1)
    model_data.index = range(model_data.shape[0])
    model_data.columns = data.columns
    vali_data = pd.concat([Y_vali, X_vali], axis=1)
    vali_data.index = range(vali_data.shape[0])
    vali_data.columns = data.columns
    model_data.to_csv(r"C:\\Users\\DL\\Downloads\\model_data.csv")
    vali_data.to_csv(r"C:\\Users\\DL\\Downloads\\vali_data.csv")
    model_data["qcut"], updown = pd.qcut(model_data["age"], retbins=True, q=20)
    # model_data.head()
    # model_data["qcut"]
    model_data["qcut"].value_counts()
    count_y0 = model_data[model_data["SeriousDlqin2yrs"] == 0].groupby(by="qcut").count()["SeriousDlqin2yrs"]
    count_y1 = model_data[model_data["SeriousDlqin2yrs"] == 1].groupby(by="qcut").count()["SeriousDlqin2yrs"]
    # count_y0
    # each num_bins entry is (lower bound, upper bound, count of 0s, count of 1s)
    num_bins = [*zip(updown, updown[1:], count_y0, count_y1)]
    # num_bins

    # Binning: ensure every bin contains both 0s and 1s
    # for i in range(20):
    #     # If the first bin lacks positive or negative samples, merge it into the bin below
    #     if 0 in num_bins[0][2:]:
    #         num_bins[0:2] = [(
    #             num_bins[0][0],
    #             num_bins[1][1],
    #             num_bins[0][2]+num_bins[1][2],
    #             num_bins[0][3]+num_bins[1][3]
    #         )]
    #         continue
    #
    # After one merge, is the first bin guaranteed to contain both classes? Not necessarily:
    # if the original first and second bins both lacked positives, or both lacked negatives,
    # the merged bin would still be missing that class
    #
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(num_bins, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percentage"] = df.total / df.total.sum()
    df["bad_rate"] = df.count_1 / df.total
    df["good%"] = df.count_0 / df.count_0.sum()
    df["bad%"] = df.count_1 / df.count_0.sum()
    df["woe"] = np.log(df["good%"] / df["bad%"])
    rate = df["good%"] - df["bad%"]
    iv_age = np.sum(rate * df.woe)
    # After computing IV, run chi-square tests on adjacent groups and merge those with large p-values, until the group count drops to the target N bins
    # For one feature, try every bin count from 2 to 20 and watch how IV changes, to find the best number of bins
    # After binning, compute each bin's WOE and bad% to inspect the binning quality
    # Once that works, bin every feature and compare the features' IV values to select features
    # Chi-square test, merge bins, plot the IV curve

    num_bins_ = num_bins.copy()

    IV = []
    axisx = []
    while len(num_bins_) > 2:
        pvs = []
        # p-value of the chi-square test between each pair of adjacent bins
        for i in range(len(num_bins_) - 1):
            x1 = num_bins_[i][2:]
            x2 = num_bins_[i + 1][2:]
            # chi2_contingency returns (chi2, p, dof, expected); index 1 is the p-value
            # chi2 = scipy.stats.chi2_contingency([x1, x2])[0]
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)

        # Merge the pair with the largest p-value (that bin and the one below it)
        i = pvs.index(max(pvs))
        num_bins_[i:i + 2] = [(
            num_bins_[i][0],
            num_bins_[i + 1][1],
            num_bins_[i][2] + num_bins_[i + 1][2],
            num_bins_[i][3] + num_bins_[i + 1][3])]
        bins_df = get_woe(num_bins_)
        axisx.append(len(num_bins_))
        IV.append(get_iv(bins_df))

    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of bins")
    plt.ylabel("IV")
    plt.show()
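    # Read the IV curve like a scree plot: pick the bin count at the elbow, where
    # adding more bins stops increasing IV appreciably (6 bins is used below)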

    afterbins = get_bin(num_bins, 6)
    # afterbins
    bins_df = get_woe(num_bins)
    # bins_df
    # model_data.columns
    for i in model_data.columns[1:-1]:
        print(i)
        graphforbestbin(model_data,i,"SeriousDlqin2yrs",n=2,q=20,graph=True)

    auto_col_bins = "RevolvingUtilizationOfUnsecuredLines": 6,
                     "age": 5,
                     "DebtRatio": 4,
                     "MonthlyIncome": 3,
                     "NumberOfOpenCreditLinesAndLoans": 5

    # Variables that cannot be binned automatically; their bin edges are set by hand
    hand_bins = {"NumberOfTime30-59DaysPastDueNotWorse": [0, 1, 2, 13]
        , "NumberOfTimes90DaysLate": [0, 1, 2, 17]
        , "NumberRealEstateLoansOrLines": [0, 1, 2, 4, 54]
        , "NumberOfTime60-89DaysPastDueNotWorse": [0, 1, 2, 8]
        , "NumberOfDependents": [0, 1, 2, 3]}
    # To guarantee full coverage, replace each max with np.inf and each min with -np.inf
    hand_bins = {k: [-np.inf, *v[:-1], np.inf] for k, v in hand_bins.items()}
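    # e.g. "NumberOfDependents": [0, 1, 2, 3] becomes [-np.inf, 0, 1, 2, np.inf],
    # so pd.cut can later place any value, however extreme, into some interval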

    bins_of_col = {}
    # Generate the automatic bin edges and the post-binning IV value
    for col in auto_col_bins:
        bins_df = graphforbestbin(model_data, col
                                  , "SeriousDlqin2yrs"
                                  , n=auto_col_bins[col]
                                  # the dict supplies each feature's target bin count
                                  , q=20
                                  , graph=False)
        bins_list = sorted(set(bins_df["min"]).union(bins_df["max"]))
        # To guarantee coverage, replace the max with np.inf and the min with -np.inf
        bins_list[0], bins_list[-1] = -np.inf, np.inf
        bins_of_col[col] = bins_list
    # Merge in the hand-binned variables
    bins_of_col.update(hand_bins)
    data = model_data.copy()
    data = data[["age", "SeriousDlqin2yrs"]].copy()

    # pd.cut bins data against known interval edges: pd.cut(data, list_of_edges)
    data["cut"] = pd.cut(data["age"], [-np.inf, 48.49986200790144, 58.757170160044694, 64.0,
                                       74.0, np.inf])
    data.groupby("cut")["SeriousDlqin2yrs"].value_counts()
    # Unstack the label counts into columns 0 and 1, one row per bin
    bins_df = data.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
    # Compute each bin's WOE from the unstacked counts
    bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))


    # Redefines get_woe to work from raw data plus known bin edges
    # (shadows the tuple-based version defined earlier)
    def get_woe(df, col, y, bins):
        df = df[[col, y]].copy()
        df["cut"] = pd.cut(df[col], bins)
        bins_df = df.groupby("cut")[y].value_counts().unstack()
        # WOE per bin, indexed by interval, returned as a Series for .map
        woe = bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
        return woe


    # Store every feature's WOE in a dict
    woeall = {}
    for col in bins_of_col:
        woeall[col] = get_woe(model_data, col, "SeriousDlqin2yrs", bins_of_col[col])
    # woeall
    # To avoid overwriting the original data, create a new DataFrame with the same index as model_data
    model_woe = pd.DataFrame(index=model_data.index)
    # Bin the original data, then map each bin's WOE onto the rows with .map
    model_woe["age"] = pd.cut(model_data["age"], bins_of_col["age"]).map(woeall["age"])

    # model_woe["age"]
    # The same pattern works for every feature
    for col in bins_of_col:
        model_woe[col] = pd.cut(model_data[col], bins_of_col[col]).map(woeall[col])

    # Append the label to the data
    model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
    # Modeling and validation
    # Transform the validation set the same way
    vali_woe = pd.DataFrame(index=vali_data.index)
    for col in bins_of_col:
        vali_woe[col] = pd.cut(vali_data[col], bins_of_col[col]).map(woeall[col])

    vali_woe["SeriousDlqin2yrs"] = vali_data["SeriousDlqin2yrs"]

    vali_x = vali_woe.iloc[:, :-1]
    vali_y = vali_woe.iloc[:, -1]

    x = model_woe.iloc[:, :-1]
    y = model_woe.iloc[:, -1]

    lr = LR().fit(x, y)
    lr.score(vali_x, vali_y)
    # 0.774959318464631
    c_1 = np.linspace(0.01, 1, 20)
    c_2 = np.linspace(0.01, 0.2, 20)

    score = []
    for i in c_2:
        lr = LR(solver='liblinear', C=i).fit(x, y)
        score.append(lr.score(vali_x, vali_y))
    plt.figure()
    plt.plot(c_2, score)
    plt.show()
    lr.n_iter_  # number of iterations the solver actually used
    # array([5], dtype=int32): converged in 5 iterations
    score = []
    for i in [1, 2, 3, 4, 5, 6]:
        lr = LR(solver='liblinear', C=0.025, max_iter=i).fit(x, y)
        score.append(lr.score(vali_x, vali_y))
    plt.figure()
    plt.plot([1, 2, 3, 4, 5, 6], score)
    plt.show()

    import scikitplot as skplt

    vali_proba_df = pd.DataFrame(lr.predict_proba(vali_x))
    skplt.metrics.plot_roc(vali_y, vali_proba_df,
                           plot_micro=False, figsize=(6, 6),
                           plot_macro=False)
    # todo
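
    # --- Building the scorecard (sketch) ---
    # The scorecard step itself is still a todo above. A common mapping from the model's
    # log-odds to points is Score = A - B*ln(odds). The calibration constants below are
    # assumptions for illustration (base score 600 at odds of 1:60, 20 points to double
    # the odds), not values taken from this post.
    B = 20 / np.log(2)
    A = 600 + B * np.log(1 / 60)
    base_score = A - B * lr.intercept_  # the intercept contributes the base score
    # Per-bin scores for one feature, e.g. "age"; the coefficient order follows x's columns
    score_age = woeall["age"] * (-B * lr.coef_[0][x.columns.get_loc("age")])
    print(base_score, score_age, sep="\n")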
