以German信用数据为例的logistic regression算法在评分卡上的实践
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了以German信用数据为例的logistic regression算法在评分卡上的实践相关的知识,希望对你有一定的参考价值。
以德国信用数据为例,用logistic regression算法对信用评分卡做原理性实现,因此并未考虑特征选择(feature selection)。
第一步:导入必要的库
import numpy as np
import pandas as pd

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split
第二步:导入数据
# Load the German credit data set: space-separated values, no header row.
german = pd.read_csv('D:/CreditDatasets/german.data', sep=' ', header=None)
german.columns = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_account',
    'Present_employment_since',
    'Installment_rate',
    'Personal_status_and_sex',
    'Other_debtors',
    'Present_residence_since',
    'Property',
    'Age',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits',
    'Job',
    'Number_of_people',
    'Telephone',
    'foreign_worker',
    'default',
]

# Overall class totals; the script treats default == 1 as "good" and
# default == 2 as "bad".  The group sizes are computed once and reused.
Grp = german.groupby('default')
_class_sizes = Grp.size()
total_good = _class_sizes[1]
total_bad = _class_sizes[2]
第三步:分别计算名义变量和数值变量的WOE值。取值较少的数值变量直接按名义变量的方法逐个取值计算WOE,其余数值变量先按取值范围等宽分成5箱再计算。
def _woe_iv(rows):
    """Return ``(woe, iv)`` for the subset ``rows`` of ``german``.

    WOE is ``log(bad_ratio / good_ratio)`` and IV is
    ``(bad_ratio - good_ratio) * WOE``, where each ratio is the subset's
    count of that class divided by the module-level ``total_good`` /
    ``total_bad``.
    """
    counts = rows.groupby('default').size().reindex([1, 2], fill_value=0)
    # A subset may contain no goods or no bads.  Indexing the raw group sizes
    # would raise KeyError there, and a true zero would make the log
    # undefined, so an empty class is counted as 0.5 to keep WOE finite.
    good = float(counts[1]) or 0.5
    bad = float(counts[2]) or 0.5
    good_ratio = good / total_good
    bad_ratio = bad / total_bad
    woe = np.log(bad_ratio / good_ratio)
    return woe, (bad_ratio - good_ratio) * woe


def CalcWOE(VarName):
    """Build the WOE/IV table of a variable, one row per distinct value.

    Reads the module-level ``german`` frame.

    Args:
        VarName: column of ``german`` to analyse.

    Returns:
        DataFrame with columns ``variable``, ``class`` (the distinct value),
        ``woe`` and ``iv``.
    """
    rows = []
    for v in np.unique(german[VarName]):
        woe, iv = _woe_iv(german[german[VarName] == v])
        rows.append([VarName, v, woe, iv])
    # Rows are collected in a list and the frame is built once;
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])


# nominal variable woe
status_checking_account_woe = CalcWOE('Status_of_existing_checking_account')
Credit_history_woe = CalcWOE('Credit_history')
Purpose_woe = CalcWOE('Purpose')
Savings_account_woe = CalcWOE('Savings_account')
Present_employment_since_woe = CalcWOE('Present_employment_since')
Personal_status_and_sex_woe = CalcWOE('Personal_status_and_sex')
Other_debtors_woe = CalcWOE('Other_debtors')
Property_woe = CalcWOE('Property')
Other_installment_plans_woe = CalcWOE('Other_installment_plans')
Housing_woe = CalcWOE('Housing')
Job_woe = CalcWOE('Job')
Telephone_woe = CalcWOE('Telephone')
foreign_worker_woe = CalcWOE('foreign_worker')

# numeric variable woe, no binning
Installment_rate_woe = CalcWOE('Installment_rate')
Present_residence_since_woe = CalcWOE('Present_residence_since')
Number_of_existing_credits_woe = CalcWOE('Number_of_existing_credits')
Number_of_people_woe = CalcWOE('Number_of_people')


def CalcWOE_bin(VarName, N):
    """Build the WOE/IV table of a numeric variable over N equal-width bins.

    Reads the module-level ``german`` frame.  The bins span the column's
    minimum to its maximum.  The first bin is closed on both sides so the
    minimum is counted; every later bin is ``(lower, upper]``.

    Args:
        VarName: numeric column of ``german`` to analyse.
        N: number of equal-width bins.

    Returns:
        DataFrame with columns ``variable``, ``class+woe`` (a list
        ``[lower, upper, woe]``), ``woe`` and ``iv``, one row per bin.
    """
    values = german[VarName]
    # linspace returns the end point exactly, so the maximum always falls
    # inside the last bin regardless of floating-point rounding.
    edges = np.linspace(values.min(), values.max(), N + 1)
    rows = []
    for i in range(N):
        bin_L = float(edges[i])
        bin_U = float(edges[i + 1])
        if i == 0:
            # Only the first bin includes its lower edge.  Testing i == 1
            # instead would drop the minimum from the first bin and count
            # the first boundary value in two bins.
            in_bin = (values >= bin_L) & (values <= bin_U)
        else:
            in_bin = (values > bin_L) & (values <= bin_U)
        woe, iv = _woe_iv(german[in_bin])
        rows.append([VarName, [bin_L, bin_U, woe], woe, iv])
    return pd.DataFrame(rows, columns=['variable', 'class+woe', 'woe', 'iv'])


Duration_in_month_woe = CalcWOE_bin('Duration_in_month', 5)
Credit_amount_woe = CalcWOE_bin('Credit_amount', 5)
Age_woe = CalcWOE_bin('Age', 5)
第四步:用woe值替代原来的值
def ReplaceWOE(VarName, SourceDF, VarWOE):
    """Replace each value of a column by its WOE, in place.

    Args:
        VarName: column of ``SourceDF`` to overwrite.
        SourceDF: frame holding the raw values; it is modified in place.
        VarWOE: table with a ``class`` column (raw values) and a ``woe``
            column, as returned by ``CalcWOE``.

    Returns:
        ``SourceDF`` itself.  Values missing from ``VarWOE['class']``
        become NaN.
    """
    # Pair each class with the WOE on the same row.  This does not depend on
    # dict iteration order and stays aligned if a class appears twice.
    woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_class)
    return SourceDF


def _bin_woe(value, bins):
    """Return the WOE of the first bin whose upper edge is >= ``value``."""
    for _lower, upper, woe in bins:
        if value <= upper:
            return woe
    # Above every upper edge (for example through float rounding of the last
    # edge): use the last bin instead of leaving the value unmapped.
    return bins[-1][2]


def ReplaceWOE_bin(VarName, SourceDF, VarWOE):
    """Replace each value of a binned numeric column by its bin's WOE, in place.

    The bins are treated as contiguous intervals delimited by their upper
    edges.  Values at or below the first upper edge take the first bin's WOE
    and values above the last upper edge take the last bin's WOE, so no
    non-missing value is left unmapped.

    Args:
        VarName: column of ``SourceDF`` to overwrite.
        SourceDF: frame holding the raw values; it is modified in place.
        VarWOE: table whose ``class+woe`` column holds ``[lower, upper, woe]``
            lists in ascending order, as returned by ``CalcWOE_bin``.

    Returns:
        ``SourceDF`` itself.  Missing values stay missing.
    """
    bins = list(VarWOE['class+woe'])
    column = SourceDF[VarName]
    woe_by_value = {v: _bin_woe(v, bins) for v in column.dropna().unique()}
    SourceDF[VarName] = column.map(woe_by_value)
    return SourceDF


# Work on a copy so the raw ``german`` frame keeps its original values.
german_woe = german.copy()

for _var_name, _woe_table in [
    ('Status_of_existing_checking_account', status_checking_account_woe),
    ('Credit_history', Credit_history_woe),
    ('Purpose', Purpose_woe),
    ('Savings_account', Savings_account_woe),
    ('Present_employment_since', Present_employment_since_woe),
    ('Personal_status_and_sex', Personal_status_and_sex_woe),
    ('Other_debtors', Other_debtors_woe),
    ('Property', Property_woe),
    ('Other_installment_plans', Other_installment_plans_woe),
    ('Housing', Housing_woe),
    ('Job', Job_woe),
    ('Telephone', Telephone_woe),
    ('foreign_worker', foreign_worker_woe),
    ('Installment_rate', Installment_rate_woe),
    ('Present_residence_since', Present_residence_since_woe),
    ('Number_of_existing_credits', Number_of_existing_credits_woe),
    ('Number_of_people', Number_of_people_woe),
]:
    german_woe = ReplaceWOE(_var_name, german_woe, _woe_table)

for _var_name, _woe_table in [
    ('Duration_in_month', Duration_in_month_woe),
    ('Credit_amount', Credit_amount_woe),
    ('Age', Age_woe),
]:
    german_woe = ReplaceWOE_bin(_var_name, german_woe, _woe_table)

# Later steps read the encoded frame through ``temp1``; ``temp`` is kept as
# an alias of the same frame.
temp = temp1 = german_woe
第五步:将数据集拆分为训练集和测试集
# Features are every column except the last one ('default').
X = temp1[list(temp1.columns)[:-1]]
# Shift the target from {1, 2} to {0, 1}; the class the script counts as
# "bad" (2) becomes the positive class 1.
y = temp1['default'] - 1
# Hold out 10% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0)
第六步:在训练集上应用logistic regression算法
# Import from the public package path; the private module
# ``sklearn.linear_model.logistic`` is not available in current scikit-learn.
from sklearn.linear_model import LogisticRegression

# Fit on the WOE-encoded training rows, then predict the held-out rows.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
第七步:评估模型分类精度
from sklearn.metrics import accuracy_score

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import cross_val_score

# Uncomment to show the accuracy on the held-out test set.
# print('Accuracy:', accuracy_score(y_test, predictions))

# 5-fold cross-validation on the training set.
scores = cross_val_score(classifier, X_train, y_train, cv=5)

# Uncomment to show the mean and the per-fold cross-validation scores.
# print(np.mean(scores), scores)
第八步:创建评分卡
# Scorecard scaling: score = A - B*log(theta), where theta is the odds of the
# positive class of the fitted model.
# Calibration: P0 = A - B*log(theta0) and P0 - PDO = A - B*log(2*theta0),
# i.e. doubling the odds lowers the score by PDO points.
P0 = 600
PDO = 20
theta0 = 1.0/60
B = PDO/np.log(2)
A = P0 + B*np.log(theta0)

coef = classifier.coef_
beta0 = classifier.intercept_

# WOE tables listed in the column order of the training features, so that
# position k lines up with coef[0][k].
_woe_tables_in_feature_order = [
    status_checking_account_woe,
    Duration_in_month_woe,
    Credit_history_woe,
    Purpose_woe,
    Credit_amount_woe,
    Savings_account_woe,
    Present_employment_since_woe,
    Installment_rate_woe,
    Personal_status_and_sex_woe,
    Other_debtors_woe,
    Present_residence_since_woe,
    Property_woe,
    Age_woe,
    Other_installment_plans_woe,
    Housing_woe,
    Number_of_existing_credits_woe,
    Job_woe,
    Number_of_people_woe,
    Telephone_woe,
    foreign_worker_woe,
]

# The number of features comes from the fitted model rather than a literal
# 20, which only coincidentally equals PDO.
_n_features = coef.shape[1]
if _n_features != len(_woe_tables_in_feature_order):
    raise ValueError(
        'model has %d coefficients but %d WOE tables are listed'
        % (_n_features, len(_woe_tables_in_feature_order)))

# The intercept part of the score is spread evenly over all features, so the
# per-feature points add up to A - B*(beta0 + sum(coef * woe)).
_base_points = (A - B*beta0[0]) / _n_features
for _woe_table, _beta in zip(_woe_tables_in_feature_order, coef[0]):
    _woe_table['score'] = _base_points - B*_beta*_woe_table['woe']
初次用python实现,不当之处请不吝批评指正!
本文出自 “师兄的学习园地” 博客,请务必保留此出处http://4292565.blog.51cto.com/4282565/1861560
以上是关于以German信用数据为例的logistic regression算法在评分卡上的实践的主要内容,如果未能解决你的问题,请参考以下文章