Posted 公众号_python风控模型






Part 1. KS的计算方法

Part 2. IV的计算方法

Part 3. AUC的计算方法

Part 4. PSI的计算方法

Part 5. 总结



Part 1. KS的计算方法

在《求是汪在路上:风控模型—区分度评估指标(KS)深入理解应用》这篇中,我们介绍过KS相关知识。另外可作为延伸阅读的一篇论文是《A comparison study of computational methods of Kolmogorov Smirnov statistic in credit scoring》,其摘要如下:

图 1 - 论文摘要

该文章提出了KS的三种计算方法,分别是不分箱法、等频分箱法(equal-size binning)和等距分箱法(equal-width binning),同时从计算值、排序性、几何含义三个维度对比,得出:1)不分箱法能得到最大且唯一的KS值;2)只有等频分箱法能用于评估分数的排序性。

  1. 不分箱法。


  • 对连续变量(或特指模型分数)排序后,计算累积好人数和累积坏人数;
  • 得到累积好人捕捉率和累积坏人捕捉率;
  • 两者相减得到差值,并取绝对值(可用abs方法)。
  • 计算所有绝对值的最大值。

2. 分箱法。




-- Build db.bin_table: 20 (approximately) equal-frequency score bins plus a
-- separate bin for NULL scores, with per-bin total / bad / good counts.
-- Cut points are the 5%, 10%, ..., 95% approximate percentiles of score,
-- computed over rows with y in (0,1).
-- Fix: bucket labels are string literals and must be quoted; unquoted they
-- are parsed as column references and the statement fails.
-- NOTE(review): the outer query buckets ALL rows of db.score_y, while the
-- cut points only use rows with y in (0,1) -- confirm this is intended.
drop table if exists db.bin_table;
create table db.bin_table as
select
    bucket
    ,round(min(score), 6) as min_score
    ,round(max(score), 6) as max_score
    ,count(y) as tot               -- rows with a non-NULL label
    ,sum(y) as bad                 -- assumes y = 1 marks a bad sample
    ,count(y) - sum(y) as good
from (
    select distinct
        a.order_id
        ,a.score
        ,a.y
        -- percent[] is 0-indexed: percent[0] is the 5% cut point, percent[18] the 95% one.
        ,case
            when score is null then 'B00'
            when score < percent[0] then 'B01'
            when score >= percent[0] and score < percent[1] then 'B02'
            when score >= percent[1] and score < percent[2] then 'B03'
            when score >= percent[2] and score < percent[3] then 'B04'
            when score >= percent[3] and score < percent[4] then 'B05'
            when score >= percent[4] and score < percent[5] then 'B06'
            when score >= percent[5] and score < percent[6] then 'B07'
            when score >= percent[6] and score < percent[7] then 'B08'
            when score >= percent[7] and score < percent[8] then 'B09'
            when score >= percent[8] and score < percent[9] then 'B10'
            when score >= percent[9] and score < percent[10] then 'B11'
            when score >= percent[10] and score < percent[11] then 'B12'
            when score >= percent[11] and score < percent[12] then 'B13'
            when score >= percent[12] and score < percent[13] then 'B14'
            when score >= percent[13] and score < percent[14] then 'B15'
            when score >= percent[14] and score < percent[15] then 'B16'
            when score >= percent[15] and score < percent[16] then 'B17'
            when score >= percent[16] and score < percent[17] then 'B18'
            when score >= percent[17] and score < percent[18] then 'B19'
            when score >= percent[18] then 'B20'
        end as bucket
    from db.score_y a
    -- Single-row subquery; "on 1 = 1" attaches the cut-point array to every row.
    left join (
        select percentile_approx(score,
            array(0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
                 ,0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95), 9999) as percent
        from db.score_y
        where y in (0,1)
    ) b on 1 = 1
) t
group by bucket
order by bucket;


bucket  min_score max_score tot
B01 6.70E-04 0.005784 2503
B02 0.005784 0.007873 2499
B03 0.007874 0.009712 2493
B04 0.009712 0.011443 2502
B05 0.011444 0.013065 2504
B06 0.013065 0.014663 2496
B07 0.014664 0.016121 2499
B08 0.016122 0.017430 2499
B09 0.017431 0.018744 2502
B10 0.018744 0.020009 2501
B11 0.020009 0.021595 2500
B12 0.021596 0.023448 2499
B13 0.023449 0.025643 2500
B14 0.025644 0.028224 2499
B15 0.028226 0.031589 2499
B16 0.031590 0.036023 2499
B17 0.036025 0.042335 2500
B18 0.042337 0.052232 2500
B19 0.052234 0.072308 2500
B20 0.072309 0.838530 2506


-- Build db.bin_table with 20 equal-size bins based on row position:
-- rows are numbered by score descending, then percent_rank() over that
-- numbering maps each row into [0, 1]. B01 therefore holds the HIGHEST scores.
-- Fix: bucket labels are string literals and must be quoted; unquoted they
-- are parsed as column references and the statement fails.
-- NOTE(review): row_number() breaks score ties arbitrarily, so tied scores
-- may land in different bins between runs.
drop table if exists db.bin_table;
create table db.bin_table as
select
    bucket
    ,round(min(score), 6) as min_score
    ,round(max(score), 6) as max_score
    ,count(y) as tot               -- rows with a non-NULL label
    ,sum(y) as bad                 -- assumes y = 1 marks a bad sample
    ,count(y) - sum(y) as good
from (
    select
        a.order_id
        ,a.score
        ,a.y
        ,case
            when rn_percent >= 0.00 and rn_percent < 0.05 then 'B01'
            when rn_percent >= 0.05 and rn_percent < 0.10 then 'B02'
            when rn_percent >= 0.10 and rn_percent < 0.15 then 'B03'
            when rn_percent >= 0.15 and rn_percent < 0.20 then 'B04'
            when rn_percent >= 0.20 and rn_percent < 0.25 then 'B05'
            when rn_percent >= 0.25 and rn_percent < 0.30 then 'B06'
            when rn_percent >= 0.30 and rn_percent < 0.35 then 'B07'
            when rn_percent >= 0.35 and rn_percent < 0.40 then 'B08'
            when rn_percent >= 0.40 and rn_percent < 0.45 then 'B09'
            when rn_percent >= 0.45 and rn_percent < 0.50 then 'B10'
            when rn_percent >= 0.50 and rn_percent < 0.55 then 'B11'
            when rn_percent >= 0.55 and rn_percent < 0.60 then 'B12'
            when rn_percent >= 0.60 and rn_percent < 0.65 then 'B13'
            when rn_percent >= 0.65 and rn_percent < 0.70 then 'B14'
            when rn_percent >= 0.70 and rn_percent < 0.75 then 'B15'
            when rn_percent >= 0.75 and rn_percent < 0.80 then 'B16'
            when rn_percent >= 0.80 and rn_percent < 0.85 then 'B17'
            when rn_percent >= 0.85 and rn_percent < 0.90 then 'B18'
            when rn_percent >= 0.90 and rn_percent < 0.95 then 'B19'
            -- percent_rank() reaches exactly 1.0 on the last row, hence the 1.01 upper bound.
            when rn_percent >= 0.95 and rn_percent < 1.01 then 'B20'
        end as bucket
    from (
        select
            order_id
            ,score
            ,y
            ,percent_rank() over (order by rn asc) as rn_percent
        from (
            select
                order_id
                ,score
                ,y
                ,row_number() over (order by score desc) as rn
            from db.score_y
        ) a
    ) a
) t
group by bucket
order by bucket;


bucket  min_score max_score tot
B01 0.072342 0.838530 2500
B02 0.052284 0.072341 2500
B03 0.042361 0.052282 2500
B04 0.036033 0.042356 2500
B05 0.031597 0.036032 2500
B06 0.028234 0.031595 2500
B07 0.025647 0.028232 2500
B08 0.023452 0.025644 2500
B09 0.021597 0.023452 2500
B10 0.020010 0.021596 2500
B11 0.018746 0.020010 2500
B12 0.017434 0.018745 2500
B13 0.016125 0.017433 2500
B14 0.014666 0.016124 2500
B15 0.013065 0.014666 2500
B16 0.011445 0.013065 2500
B17 0.009718 0.011445 2500
B18 0.007872 0.009717 2500
B19 0.005780 0.007871 2500
B20 6.70E-04 0.005780 2500


-- KS on the binned table: the largest absolute gap, over all buckets, between
-- the cumulative share of bad samples and the cumulative share of good samples.
select max(ks_gap) as KS
from (
    select abs(bad_cum / bad_all - good_cum / good_all) as ks_gap
    from (
        select
            bucket
            ,sum(bad) over (order by bucket asc) as bad_cum    -- running bad count up to this bucket
            ,sum(bad) over () as bad_all                        -- total bad count
            ,sum(good) over (order by bucket asc) as good_cum  -- running good count up to this bucket
            ,sum(good) over () as good_all                      -- total good count
        from db.bin_table
    ) running
) gaps;


-- KS without binning: cumulative bad / good counts are taken row by row in
-- ascending score order, and KS is the largest absolute gap between the two
-- cumulative capture rates.
select max(ks_gap) as KS
from (
    select abs(bad_cum / bad_all - good_cum / good_all) as ks_gap
    from (
        select
            order_id
            ,score
            ,count(case when y = 1 then order_id end) over (order by score asc) as bad_cum
            ,count(case when y = 1 then order_id end) over () as bad_all
            ,count(case when y = 0 then order_id end) over (order by score asc) as good_cum
            ,count(case when y = 0 then order_id end) over () as good_all
        from db.score_y
        where y in (0,1)
    ) running
) gaps;



-- Stack several score columns into a long (name, score) table so that each
-- score can be evaluated by the same query, one group per name.
-- Fix: "name" must be a string label ('score1' / 'score2'); unquoted it just
-- duplicated the score value into both columns. Statement is now terminated.
-- NOTE(review): downstream KS/IV queries also need the label y -- confirm
-- whether y (and order_id) should be carried along here.
create table db.score_y_new as
select * from (
    select 'score1' as name, score1 as score from db.score_y
    union all
    select 'score2' as name, score2 as score from db.score_y
) t;

Part 2. IV的计算方法



-- Information Value from the binned table:
--   IV = sum over buckets of (bad_rate - good_rate) * ln(bad_rate / good_rate)
-- where bad_rate = bad / tot_bad and good_rate = good / tot_good.
-- Fix: the previous expression summed bad_rate*ln(bad_rate) - good_rate*ln(good_rate),
-- which is not IV; the difference must multiply the log of the ratio
-- (same shape as the PSI formula).
-- NOTE(review): a bucket with bad = 0 or good = 0 makes the log undefined;
-- presumably the engine yields NULL and sum() skips that bucket -- confirm,
-- or merge such buckets before computing IV.
select sum((bad / tot_bad - good / tot_good) * ln((bad / tot_bad) / (good / tot_good))) as IV
from (
    select
        bucket
        ,bad
        ,good
        ,sum(bad) over () as tot_bad
        ,sum(good) over () as tot_good
    from db.bin_table
) a;

Part 3. AUC的计算方法

ROC曲线下方的面积即为AUROC(Area Under ROC),简称AUC。理论上,对于连续分数分布,我们可将计算公式定义为:

$$\mathrm{AUC} = \int_{0}^{1} \mathrm{TPR}\; d(\mathrm{FPR})$$





图 2 - ROC与AUC


-- AUC by the trapezoid rule over the empirical ROC curve.
-- Steps:
--   1) per_score: bad / good counts for each distinct score;
--   2) roc: one ROC point (fpr, tpr) per distinct score, walking from the
--      highest score to the lowest;
--   3) sum the trapezoid areas between consecutive ROC points.
-- Fix: previously pre_tpr and pre_fpr came from two windows ordered by
-- different keys (tpr and fpr), both full of ties, so the "previous point"
-- was arbitrary and the two values could belong to different rows. Both are
-- now taken with lag() over the same score ordering, and the curve starts
-- from the origin (0, 0) instead of producing a NULL term.
select sum((tpr + pre_tpr) / 2 * (fpr - pre_fpr)) as AUC
from (
    select
        score
        ,tpr
        ,fpr
        ,lag(tpr, 1, cast(0 as double)) over (order by score desc) as pre_tpr
        ,lag(fpr, 1, cast(0 as double)) over (order by score desc) as pre_fpr
    from (
        select
            score
            ,sum(bad) over (order by score desc) / sum(bad) over () as tpr
            ,sum(good) over (order by score desc) / sum(good) over () as fpr
        from (
            select
                score
                ,count(if(y = 1, order_id, null)) as bad
                ,count(if(y = 0, order_id, null)) as good
            from db.score_y
            where y in (0,1)
            group by score
        ) per_score
    ) roc
) t;

Part 4. PSI的计算方法



-- Build db.psi_temp: row counts per (group, score bucket).
-- Cut points are the 5%, 10%, ..., 95% approximate percentiles of score in
-- the baseline group only, so both groups are binned on the same edges.
-- Fix: bucket labels and the group name '基准组' are string literals and must
-- be quoted; unquoted they are parsed as column references.
drop table if exists db.psi_temp;
create table db.psi_temp as
select
    gp
    ,bucket
    ,count(1) as cnt
from (
    select
        a.order_id
        ,a.score
        ,a.gp
        -- percent[] is 0-indexed: percent[0] is the 5% cut point, percent[18] the 95% one.
        ,case
            when score is null then 'B00'
            when score < percent[0] then 'B01'
            when score >= percent[0] and score < percent[1] then 'B02'
            when score >= percent[1] and score < percent[2] then 'B03'
            when score >= percent[2] and score < percent[3] then 'B04'
            when score >= percent[3] and score < percent[4] then 'B05'
            when score >= percent[4] and score < percent[5] then 'B06'
            when score >= percent[5] and score < percent[6] then 'B07'
            when score >= percent[6] and score < percent[7] then 'B08'
            when score >= percent[7] and score < percent[8] then 'B09'
            when score >= percent[8] and score < percent[9] then 'B10'
            when score >= percent[9] and score < percent[10] then 'B11'
            when score >= percent[10] and score < percent[11] then 'B12'
            when score >= percent[11] and score < percent[12] then 'B13'
            when score >= percent[12] and score < percent[13] then 'B14'
            when score >= percent[13] and score < percent[14] then 'B15'
            when score >= percent[14] and score < percent[15] then 'B16'
            when score >= percent[15] and score < percent[16] then 'B17'
            when score >= percent[16] and score < percent[17] then 'B18'
            when score >= percent[17] and score < percent[18] then 'B19'
            when score >= percent[18] then 'B20'
        end as bucket
    from db.score a
    -- Single-row subquery; "on 1 = 1" attaches the cut-point array to every row.
    left join (
        select percentile_approx(score,
            array(0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
                 ,0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95), 9999) as percent
        from db.score
        where gp = '基准组'
    ) b on 1 = 1
) t
group by gp, bucket
order by gp, bucket;


gp      bucket  cnt
基准组 B01 1000
基准组 B02 1000
基准组 B03 1004
基准组 B04 1000
基准组 B05 1001
基准组 B06 1002
基准组 B07 999
基准组 B08 1002
基准组 B09 1002
基准组 B10 999
基准组 B11 1002
基准组 B12 1002
基准组 B13 1000
基准组 B14 1002
基准组 B15 1000
基准组 B16 1002
基准组 B17 1001
基准组 B18 1001
基准组 B19 1001
基准组 B20 1004
对比组 B01 1557
对比组 B02 1563
对比组 B03 1404
对比组 B04 1545
对比组 B05 1464
对比组 B06 1489
对比组 B07 1533
对比组 B08 1407
对比组 B09 1590
对比组 B10 1460
对比组 B11 1530
对比组 B12 1490
对比组 B13 1417
对比组 B14 1524
对比组 B15 1455
对比组 B16 1468
对比组 B17 1506
对比组 B18 1521
对比组 B19 1540
对比组 B20 1513


-- PSI of every group against the baseline group:
--   PSI = sum over buckets of (rate - base_rate) * ln(rate / base_rate)
-- where rate is the bucket's share of rows within its group.
-- Fixes:
--   * gp exists in both a and b, so the select list and group by must use
--     a.gp; the bare name is ambiguous;
--   * the baseline group name '基准组' is a string literal and must be quoted.
-- A bucket missing from the baseline leaves b.rate NULL, so its term is NULL
-- and is skipped by sum().
select
    a.gp as gp
    ,sum((a.rate - b.rate) * ln(a.rate / b.rate)) as PSI
from (
    select
        gp
        ,bucket
        ,cnt / sum(cnt) over (partition by gp) as rate
    from db.psi_temp
) a
left join (
    select
        gp
        ,bucket
        ,cnt / sum(cnt) over (partition by gp) as rate
    from db.psi_temp
    where gp = '基准组'
) b on a.bucket = b.bucket
group by a.gp
order by gp;

gp PSI
基准组 0.0
对比组 0.0012083365466523768

Part 5. 总结







版权声明:文章来自公众号(python风控模型),未经许可,不得抄袭。遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。




深入理解风控中的 KS 原理


