Feature Correlation Analysis: Scorecard Binning
Posted by 龙鸣丿
Binning
(1) Equal-frequency binning
(2) Ensure every bin contains both 0s and 1s
(3) Define the WOE and IV functions
(4) Chi-square test, merge bins, plot the IV curve
(5) Re-bin with the best number of bins and verify the result
(6) Wrap the search for the best number of bins into a function and apply it to every feature
(7) Compute each bin's WOE and map it onto the data
(8) Finally, map all the WOE values onto the original data
Modeling and model validation
Building the scorecard
Dataset: card.csv, the data used to build the scorecard (available via CSDN downloads)
Binning steps:
- After computing IV, run a chi-square test on each pair of adjacent groups and merge the pair with the largest p-value, repeating until the number of groups falls to the chosen N.
- Let a feature be split into 2 to 20 bins, observe how IV changes with the number of bins, and pick the best count.
- Once binned, compute each bin's WOE and bad% to judge the binning quality.
- With that done, bin every feature and compare the features' IV values to select features.
- Chi-square test, merge bins, plot the IV curve.
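Before the code, it helps to pin down the two quantities being optimized. For each bin i, WOE_i = ln(good%_i / bad%_i), where good% and bad% are the bin's share of all good (label 0) and all bad (label 1) samples; the feature's IV is the sum over bins of (good%_i - bad%_i) * WOE_i. A toy computation with made-up counts (not from the dataset used below):

    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({"count_0": [800, 150, 50], "count_1": [20, 30, 50]})
    good_pct = toy["count_0"] / toy["count_0"].sum()  # share of goods per bin
    bad_pct = toy["count_1"] / toy["count_1"].sum()   # share of bads per bin
    woe = np.log(good_pct / bad_pct)                  # WOE per bin
    iv = ((good_pct - bad_pct) * woe).sum()           # IV of the feature
    print(woe.round(3).tolist(), round(iv, 3))        # [1.386, -0.693, -2.303] 1.972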
Code:
# _*_ coding : utf-8 _*_
# @Time : 2022/7/2 16:37
# @Author 王拓
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.linear_model import LogisticRegression as LR
def fill_missing_rf(X, y, to_fill):
    """Impute the missing values of column `to_fill` with a random forest regression."""
    df = X.copy()
    fill = df.loc[:, to_fill]
    # Use every other column, plus the label, as features for the imputation model
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]
    # Fit a random forest regressor on the non-missing rows and predict the rest
    from sklearn.ensemble import RandomForestRegressor as rfr
    reg = rfr(n_estimators=100).fit(Xtrain, Ytrain)
    Ypredict = reg.predict(Xtest)
    return Ypredict
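# Usage sketch (mirrors the call made in __main__ below): the predictions come
# back in the order of the missing rows, so they can be written straight back:
#   y_pred = fill_missing_rf(X, y, "MonthlyIncome")
#   data.loc[data["MonthlyIncome"].isnull(), "MonthlyIncome"] = y_pred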
def graphforbestbin(DF, X, Y, n=5, q=20, graph=True):
    """
    Automatic optimal binning, based on the chi-square test.
    :param DF: input data
    :param X: name of the column to bin
    :param Y: name of the label column
    :param n: number of bins to keep
    :param q: number of initial bins
    :param graph: whether to plot the IV curve
    Intervals are open on the left, closed on the right.
    """
    DF = DF[[X, Y]].copy()
    # Equal-frequency pre-binning; duplicate edges are dropped, so there may be fewer than q bins
    DF["qcut"], bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")
    count_y0 = DF.loc[DF[Y] == 0].groupby(by="qcut").count()[Y]
    count_y1 = DF.loc[DF[Y] == 1].groupby(by="qcut").count()[Y]
    # Each tuple: (lower bound, upper bound, count of 0s, count of 1s)
    num_bins = [*zip(bins, bins[1:], count_y0, count_y1)]

    # Make sure every bin contains both classes; merge any bin that does not
    for i in range(q):
        # If the first bin lacks a class, merge it with the next one
        if 0 in num_bins[0][2:]:
            num_bins[0:2] = [(
                num_bins[0][0],
                num_bins[1][1],
                num_bins[0][2] + num_bins[1][2],
                num_bins[0][3] + num_bins[1][3])]
            continue
        # Otherwise scan the rest and merge the first offender backwards
        for i in range(len(num_bins)):
            if 0 in num_bins[i][2:]:
                num_bins[i - 1:i + 1] = [(
                    num_bins[i - 1][0],
                    num_bins[i][1],
                    num_bins[i - 1][2] + num_bins[i][2],
                    num_bins[i - 1][3] + num_bins[i][3])]
                break
        # No bin needed merging: done
        else:
            break
    def get_woe(num_bins):
        columns = ["min", "max", "count_0", "count_1"]
        df = pd.DataFrame(num_bins, columns=columns)
        df["total"] = df.count_0 + df.count_1
        df["percentage"] = df.total / df.total.sum()
        df["bad_rate"] = df.count_1 / df.total
        df["good%"] = df.count_0 / df.count_0.sum()
        # Note: bad% must be normalized by the total count of 1s, not of 0s
        df["bad%"] = df.count_1 / df.count_1.sum()
        df["woe"] = np.log(df["good%"] / df["bad%"])
        return df

    def get_iv(df):
        rate = df["good%"] - df["bad%"]
        iv = np.sum(rate * df.woe)
        return iv
    IV = []
    axisx = []
    while len(num_bins) > n:
        pvs = []
        # p-value of the chi-square test between each pair of adjacent bins
        for i in range(len(num_bins) - 1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i + 1][2:]
            # chi2_contingency returns (chi2, p, dof, expected); [1] is the p-value
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        # Merge the pair with the largest p-value (the least distinguishable bins)
        i = pvs.index(max(pvs))
        num_bins[i:i + 2] = [(
            num_bins[i][0],
            num_bins[i + 1][1],
            num_bins[i][2] + num_bins[i + 1][2],
            num_bins[i][3] + num_bins[i + 1][3])]
        axisx.append(len(num_bins))
        IV.append(get_iv(get_woe(num_bins)))
    if graph:
        plt.figure()
        plt.plot(axisx, IV)
        plt.xticks(axisx)
        plt.xlabel("number of bins")
        plt.ylabel("IV")
        plt.show()
    bins_df = get_woe(num_bins)
    return bins_df
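# Usage sketch (hypothetical call, mirroring the ones made in __main__ below):
#   bins_df = graphforbestbin(model_data, "age", "SeriousDlqin2yrs", n=5, q=20, graph=True)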
# Automatic binning: merge bins down to n via chi-square tests
def get_bin(num_bins_, n):
    while len(num_bins_) > n:
        pvs = []
        # p-value of the chi-square test between each pair of adjacent bins
        for i in range(len(num_bins_) - 1):
            x1 = num_bins_[i][2:]
            x2 = num_bins_[i + 1][2:]
            # chi2_contingency returns (chi2, p, dof, expected); [1] is the p-value
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        # Merge the pair with the largest p-value
        i = pvs.index(max(pvs))
        num_bins_[i:i + 2] = [(
            num_bins_[i][0],
            num_bins_[i + 1][1],
            num_bins_[i][2] + num_bins_[i + 1][2],
            num_bins_[i][3] + num_bins_[i + 1][3])]
    return num_bins_
def get_woe(num_bins):
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(num_bins, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percentage"] = df.total / df.total.sum()
    df["bad_rate"] = df.count_1 / df.total
    df["good%"] = df.count_0 / df.count_0.sum()
    # Note: bad% must be normalized by the total count of 1s, not of 0s
    df["bad%"] = df.count_1 / df.count_1.sum()
    df["woe"] = np.log(df["good%"] / df["bad%"])
    return df

def get_iv(df):
    rate = df["good%"] - df["bad%"]
    iv = np.sum(rate * df.woe)
    return iv
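# Demo sketch of the (min, max, count_0, count_1) contract shared by get_bin,
# get_woe and get_iv, with hypothetical counts (run e.g. in an interactive session):
#   toy = [(0, 10, 500, 10), (10, 20, 300, 20), (20, 30, 200, 30), (30, 40, 100, 40)]
#   woe_table = get_woe(get_bin(toy, 2))  # merge down to 2 bins, build the WOE table
#   print(get_iv(woe_table))              # IV of the toy feature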
if __name__ == '__main__':
    data = pd.read_csv(r"C:\Users\DL\Downloads\rankingcard.csv", index_col=0)
    # print(data.head())
    print(data.info())
    # Drop duplicate rows and rebuild a contiguous index
    data.drop_duplicates(inplace=True)
    data.index = range(data.shape[0])
    data.info()
    # Count and proportion of missing values per column
    data.isnull().sum()
    data.isnull().mean()  # same as data.isnull().sum() / data.shape[0]
    # NumberOfDependents has few missing values: fill with the mean
    data["NumberOfDependents"].fillna(data["NumberOfDependents"].mean(), inplace=True)
    data.isnull().mean()
    X = data.iloc[:, 1:]
    y = data["SeriousDlqin2yrs"]  # equivalently data.iloc[:, 0]
    # X.shape
    # MonthlyIncome has many missing values: impute with random forest regression
    y_pred = fill_missing_rf(X, y, "MonthlyIncome")
    # y_pred.shape
    # Write the predictions back over the missing entries
    data.loc[data.loc[:, "MonthlyIncome"].isnull(), "MonthlyIncome"] = y_pred
    # data.loc[data.loc[:,"MonthlyIncome"].isnull(),"MonthlyIncome"].shape
    data.info()
    # Descriptive statistics with tail percentiles to spot outliers
    print(data.describe([0.01, 0.1, 0.25, .5, .75, .9, .99]).T)
(data["age"] == 0).sum()
data = data[data["age"] != 0]
# data.shape
data[data.loc[:, "NumberOfTimes90DaysLate"] > 90].count()
data.loc[:, "NumberOfTimes90DaysLate"].value_counts()
data = data[data.loc[:, "NumberOfTimes90DaysLate"] < 90]
# 恢复索引
data.index = range(data.shape[0])
data.info()
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
y.value_counts()
n_sample = x.shape[0]
n_1_sample = y.value_counts()[1]
n_0_sample = y.value_counts()[0]
print('样本个数:; 1占:.2%; 0占:.2%'.format(n_sample, n_1_sample / n_sample, n_0_sample / n_sample))
    # Upsample the minority class with SMOTE to balance the labels
    from imblearn.over_sampling import SMOTE
    sm = SMOTE(random_state=42)  # instantiate
    x, y = sm.fit_resample(x, y)
    n_sample = x.shape[0]
    n_1_sample = y.value_counts()[1]
    n_0_sample = y.value_counts()[0]
    print('Samples: {}; share of 1: {:.2%}; share of 0: {:.2%}'.format(
        n_sample, n_1_sample / n_sample, n_0_sample / n_sample))
    print('-----------------')
    from sklearn.model_selection import train_test_split
    X = pd.DataFrame(x)
    y = pd.DataFrame(y)
    X_train, X_vali, Y_train, Y_vali = train_test_split(x, y, test_size=0.3, random_state=420)
    # Reassemble train/validation frames with the label as the first column
    model_data = pd.concat([Y_train, X_train], axis=1)
    model_data.index = range(model_data.shape[0])
    model_data.columns = data.columns
    vali_data = pd.concat([Y_vali, X_vali], axis=1)
    vali_data.index = range(vali_data.shape[0])
    vali_data.columns = data.columns
    model_data.to_csv(r"C:\Users\DL\Downloads\model_data.csv")
    vali_data.to_csv(r"C:\Users\DL\Downloads\vali_data.csv")
model_data["qcut"], updown = pd.qcut(model_data["age"], retbins=True, q=20)
# model_data.head()
# model_data["qcut"]
model_data["qcut"].value_counts()
coount_y0 = model_data[model_data["SeriousDlqin2yrs"] == 0].groupby(by="qcut").count()["SeriousDlqin2yrs"]
coount_y1 = model_data[model_data["SeriousDlqin2yrs"] == 1].groupby(by="qcut").count()["SeriousDlqin2yrs"]
# coount_y0
# num_bins值分别为每个区间的上届,下界,0出现的次数,1出现的次数
num_bins = [*zip(updown, updown[1:], coount_y0, coount_y1)]
# num_bins
    # Binning: make sure every bin contains both 0s and 1s
    # for i in range(20):
    #     # If the first bin lacks a positive or negative sample, merge it forward
    #     if 0 in num_bins[0][2:]:
    #         num_bins[0:2] = [(
    #             num_bins[0][0],
    #             num_bins[1][1],
    #             num_bins[0][2] + num_bins[1][2],
    #             num_bins[0][3] + num_bins[1][3]
    #         )]
    #         continue
    #
    # After one merge, is the first bin guaranteed to contain both classes? No:
    # if the original first and second bins both lacked positives, or both lacked
    # negatives, the merged bin still lacks that class, hence the retry loop in
    # graphforbestbin above.
columns = ["min", "max", "count_0", "count_1"]
df = pd.DataFrame(num_bins, columns=columns)
df["total"] = df.count_0 + df.count_1
df["percentage"] = df.total / df.total.sum()
df["bad_rate"] = df.count_1 / df.total
df["good%"] = df.count_0 / df.count_0.sum()
df["bad%"] = df.count_1 / df.count_0.sum()
df["woe"] = np.log(df["good%"] / df["bad%"])
rate = df["good%"] - df["bad%"]
iv_age = np.sum(rate * df.woe)
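    # Rule of thumb for reading IV (an illustrative note, not from the original
    # post): below 0.02 the feature is unpredictive, 0.02-0.1 weak, 0.1-0.3
    # medium, above 0.3 strong.
    print("IV for age:", iv_age)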
    # Chi-square test, merge bins, plot the IV curve (the steps outlined at the
    # top of this post)
    num_bins_ = num_bins.copy()
    IV = []
    axisx = []
    while len(num_bins_) > 2:
        pvs = []
        # p-value of the chi-square test between each pair of adjacent bins
        for i in range(len(num_bins_) - 1):
            x1 = num_bins_[i][2:]
            x2 = num_bins_[i + 1][2:]
            # chi2_contingency returns (chi2, p, dof, expected); [1] is the p-value
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        # Merge the pair with the largest p-value (that bin and the one below it)
        i = pvs.index(max(pvs))
        num_bins_[i:i + 2] = [(
            num_bins_[i][0],
            num_bins_[i + 1][1],
            num_bins_[i][2] + num_bins_[i + 1][2],
            num_bins_[i][3] + num_bins_[i + 1][3])]
        bins_df = get_woe(num_bins_)
        axisx.append(len(num_bins_))
        IV.append(get_iv(bins_df))
    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of bins")
    plt.ylabel("IV")
    plt.show()
    # Merge down to 6 bins with the packaged function; get_bin merges in place,
    # so num_bins itself now holds the merged bins
    afterbins = get_bin(num_bins, 6)
    # afterbins
    bins_df = get_woe(num_bins)
    # bins_df
    # Plot the IV curve for every feature to choose its number of bins
    for i in model_data.columns[1:-1]:
        print(i)
        graphforbestbin(model_data, i, "SeriousDlqin2yrs", n=2, q=20, graph=True)
auto_col_bins = "RevolvingUtilizationOfUnsecuredLines": 6,
"age": 5,
"DebtRatio": 4,
"MonthlyIncome": 3,
"NumberOfOpenCreditLinesAndLoans": 5
# 不能使用自动分箱的变量
hand_bins = "NumberOfTime30-59DaysPastDueNotWorse": [0, 1, 2, 13]
, "NumberOfTimes90DaysLate": [0, 1, 2, 17]
, "NumberRealEstateLoansOrLines": [0, 1, 2, 4, 54]
, "NumberOfTime60-89DaysPastDueNotWorse": [0, 1, 2, 8]
, "NumberOfDependents": [0, 1, 2, 3]
# 保证区间覆盖使用 np.inf替换最大值,用-np.inf替换最小值
hand_bins = k: [-np.inf, *v[:-1], np.inf] for k, v in hand_bins.items()
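    # e.g. [0, 1, 2, 13] becomes [-inf, 0, 1, 2, inf]: widening the outer edges
    # ensures pd.cut never returns NaN for values outside the training range
    # (an illustrative note, not part of the original script)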
    bins_of_col = {}
    # Generate the automatic bin edges and the IV value of each binning
    for col in auto_col_bins:
        bins_df = graphforbestbin(model_data, col,
                                  "SeriousDlqin2yrs",
                                  n=auto_col_bins[col],  # bins to keep, looked up per feature
                                  q=20,
                                  graph=False)
        bins_list = sorted(set(bins_df["min"]).union(bins_df["max"]))
        # Guarantee coverage: replace the extremes with -np.inf and np.inf
        bins_list[0], bins_list[-1] = -np.inf, np.inf
        bins_of_col[col] = bins_list
    # Merge in the hand-binned features
    bins_of_col.update(hand_bins)
    # Worked example: cut "age" against known bin edges and inspect the class counts
    data = model_data.copy()
    data = data[["age", "SeriousDlqin2yrs"]].copy()
    # pd.cut bins data against a given list of interval edges: pd.cut(data, edges)
    data["cut"] = pd.cut(data["age"], [-np.inf, 48.49986200790144, 58.757170160044694,
                                       64.0, 74.0, np.inf])
    data.groupby("cut")["SeriousDlqin2yrs"].value_counts()
    # unstack() pivots the label values into columns 0 and 1, one row per bin
    bins_df = data.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
    # Compute the WOE of each bin from the pivoted counts
    bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
    def get_woe(df, col, y, bins):
        # Shadows the earlier get_woe(num_bins): this version cuts a column with
        # given bin edges and returns the per-bin WOE as a Series indexed by interval
        df = df[[col, y]].copy()
        df["cut"] = pd.cut(df[col], bins)
        bins_df = df.groupby("cut")[y].value_counts().unstack()
        woe = bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
        return woe

    # Store every feature's WOE in a dict keyed by column name
    woeall = {}
    for col in bins_of_col:
        woeall[col] = get_woe(model_data, col, "SeriousDlqin2yrs", bins_of_col[col])
    # woeall
    # Rather than overwrite the original data, build a new DataFrame with exactly
    # the same index as model_data
    model_woe = pd.DataFrame(index=model_data.index)
    # Bin the original data, then map each interval to its WOE value
    model_woe["age"] = pd.cut(model_data["age"], bins_of_col["age"]).map(woeall["age"])
    # model_woe["age"]
    # The same mapping works for every feature
    for col in bins_of_col:
        model_woe[col] = pd.cut(model_data[col], bins_of_col[col]).map(woeall[col])
    # Append the label
    model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
    # Modeling and validation
    # Apply the same WOE mapping to the validation set
    vali_woe = pd.DataFrame(index=vali_data.index)
    for col in bins_of_col:
        vali_woe[col] = pd.cut(vali_data[col], bins_of_col[col]).map(woeall[col])
    vali_woe["SeriousDlqin2yrs"] = vali_data["SeriousDlqin2yrs"]
    vali_x = vali_woe.iloc[:, :-1]
    vali_y = vali_woe.iloc[:, -1]
    x = model_woe.iloc[:, :-1]
    y = model_woe.iloc[:, -1]
    # Fit logistic regression on the WOE features (LR was imported at the top)
    lr = LR().fit(x, y)
    lr.score(vali_x, vali_y)  # mean accuracy on the validation set
    # 0.774959318464631
    # Tune the regularization strength C: a coarse range, then a finer one
    c_1 = np.linspace(0.01, 1, 20)
    c_2 = np.linspace(0.01, 0.2, 20)
    score = []
    for i in c_2:
        lr = LR(solver='liblinear', C=i).fit(x, y)
        score.append(lr.score(vali_x, vali_y))
    plt.figure()
    plt.plot(c_2, score)
    plt.show()
    lr.n_iter_  # number of iterations taken
    # array([5], dtype=int32): converged in 5 iterations
    # Check how accuracy evolves with max_iter capped at 1..6
    score = []
    for i in [1, 2, 3, 4, 5, 6]:
        lr = LR(solver='liblinear', C=0.025, max_iter=i).fit(x, y)
        score.append(lr.score(vali_x, vali_y))
    plt.figure()
    plt.plot([1, 2, 3, 4, 5, 6], score)
    plt.show()
    # ROC curve on the validation set via scikit-plot
    import scikitplot as skplt
    vali_proba_df = pd.DataFrame(lr.predict_proba(vali_x))
    skplt.metrics.plot_roc(vali_y, vali_proba_df,
                           plot_micro=False, figsize=(6, 6),
                           plot_macro=False)
    plt.show()
    # todo
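    # A minimal sketch of the usual score scaling for the "todo" above. The base
    # score of 600 at odds 1:60 and the 20 points-to-double-the-odds (PDO) are
    # illustrative assumptions, not values from the original post:
    #     Score = A - B * log(odds)
    B = 20 / np.log(2)
    A = 600 + B * np.log(1 / 60)
    base_score = A - B * lr.intercept_[0]  # score contribution of the intercept
    # One score per bin of "age": scale its WOE by the fitted coefficient
    score_age = woeall["age"] * (-B * lr.coef_[0][x.columns.get_loc("age")])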