Machine Learning Model Interpretability
Posted by 屁屁和铭仔的数据之路
References:
https://compstat-lmu.github.io/iml_methods_limitations/pdp.html
https://christophm.github.io/interpretable-ml-book/
https://pdpbox.readthedocs.io/en/latest/
https://github.com/AustinRochford/PyCEbox
Global explanation
How do the model's inputs affect its outputs overall?
Local explanation
For this particular input, why did the model produce this particular output?
Limitations of linear-model coefficients as a global explanation
Coefficients are only comparable after the features have been scaled to a common range (same units).
Correlated features make coefficient-based importance unreliable.
L1 regularization will arbitrarily select one feature out of a group of correlated features.
Any regularization biases the coefficients, so reading them directly as feature importances is no longer valid.
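A minimal sketch (not from the original post) of L1 picking one of two nearly duplicated features; all names and constants here are illustrative assumptions:
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
x = rng.normal(size=(500, 1))
# two almost identical (highly correlated) copies of the same signal
X_dup = np.hstack([x, x + rng.normal(scale=0.01, size=x.shape)])
y_dup = x.ravel() + rng.normal(scale=0.1, size=500)
# one of the two coefficients typically ends up at (or near) zero
print(Lasso(alpha=0.1).fit(X_dup, y_dup).coef_)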
Next, a few methods for measuring feature importance.
Drop Feature Importance
The idea: drop one feature, retrain, and see how much the cross-validated score changes. Implemented as follows:
import numpy as np
from sklearn.model_selection import cross_val_score

def drop_feature_importance(est, X, y):
    base_score = np.mean(cross_val_score(est, X, y))
    scores = []
    for feature in range(X.shape[1]):
        # mask out one feature at a time and retrain
        mask = np.ones(X.shape[1], 'bool')
        mask[feature] = False
        X_new = X[:, mask]
        this_score = np.mean(cross_val_score(est, X_new, y))
        # importance = how much the score drops without this feature
        scores.append(base_score - this_score)
    return np.array(scores)
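A quick usage sketch on synthetic data (the dataset and model here are illustrative assumptions, not from the original post):
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
Xd, yd = make_regression(n_samples=300, n_features=6, n_informative=3, random_state=0)
# near-zero entries mark features the model can do without
print(drop_feature_importance(Ridge(), Xd, yd).round(3))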
Drawbacks of this approach:
It does not actually explain the fitted model; it measures what the training procedure can do without each feature.
It does not solve the correlated-features problem (a dropped feature's information survives in its correlated partners).
It is slow, but it can still be used for feature selection.
Permutation importance
A method that has been discussed to death on Kaggle; a runnable version of the pseudocode:
def permutation_importance(est, X, y, n_repeat=100):
    baseline_score = est.score(X, y)
    scores = np.zeros((X.shape[1], n_repeat))
    for f_idx in range(X.shape[1]):
        for repeat in range(n_repeat):
            X_new = X.copy()
            # np.random.shuffle works in place and returns None,
            # so permute a copy of the column instead
            X_new[:, f_idx] = np.random.permutation(X[:, f_idx])
            feature_score = est.score(X_new, y)
            # importance = score lost when this feature is scrambled
            scores[f_idx, repeat] = baseline_score - feature_score
    return scores
This should be evaluated on a held-out test set.
It is still slow.
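A self-contained usage sketch on toy data (everything here is illustrative, not from the original):
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
Xd, yd = make_regression(n_samples=400, n_features=5, noise=1.0, random_state=0)
Xd_train, Xd_test, yd_train, yd_test = train_test_split(Xd, yd, random_state=0)
est = Ridge().fit(Xd_train, yd_train)
imp = permutation_importance(est, Xd_test, yd_test, n_repeat=10)  # on held-out data
print(imp.mean(axis=1))  # average score drop per feature across repeats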
Next comes a case study. First, generate some synthetic data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
# 35 13 50?
rng = np.random.RandomState(13)
n_samples = 100000
n_informative = 2
n_correlated_per_inf = 2
n_noise = 4
noise_std = .0001
noise_correlated_std = .51
noise_y = .3
X_original = rng.uniform(-1, 1, size=(n_samples, n_informative))
#coef = rng.normal(size=n_informative)
coef = np.array([-3.2, 1.4])
y = np.dot(X_original, coef) + rng.normal(scale=noise_y, size=n_samples)
correlated_transform = np.zeros((n_correlated_per_inf * n_informative, n_informative))
for i in range(n_informative):
    correlated_transform[i * n_correlated_per_inf: (i + 1) * n_correlated_per_inf, i] = rng.normal(size=n_correlated_per_inf)
X_original += rng.normal(scale=np.array([1, 1]) * noise_correlated_std, size=X_original.shape)
X_correlated = np.dot(X_original, correlated_transform.T)
X = np.hstack([X_correlated, np.zeros((n_samples, n_noise))])
X += rng.normal(scale=noise_std, size=X.shape)
X = scale(X)
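As a quick sanity check (not from the original post), the design yields 4 correlated informative columns followed by 4 near-pure-noise columns:
print(X.shape)  # (100000, 8): columns 0-3 correlated/informative, 4-7 noise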
Look at the feature correlations:
plt.imshow(np.cov(X, rowvar=False), cmap='bwr_r')
plt.title("Synthetic data Covariance")
plt.xlabel("feature index")
plt.ylabel("feature index")
plt.colorbar()
plt.savefig("images/covariance.png")
Visualize each feature against the target:
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
for i, ax in enumerate(axes.ravel()):
    ax.plot(X[::10, i], y[::10], '.', alpha=.1)
    ax.set_xlabel("feature {}".format(i))
    ax.set_ylabel("target y")
plt.savefig("images/toy_data_scatter.png")
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression
# fit a separate univariate linear model on each feature
for i in range(X.shape[1]):
    lr = LinearRegression().fit(X_train[:, [i]], y_train)
    print(lr.score(X_test[:, [i]], y_test))
>>>
0.4532506867871262
0.4532514261726276
0.09275569004419326
0.09275599504713594
-0.00020773972150145426
-0.00017276124646903313
-0.00018329268681771538
-0.00020494411292082404
lasso = LassoCV().fit(X_train, y_train)
lasso.score(X_test, y_test)
>>>
0.5453241219700229
ridge = RidgeCV().fit(X_train, y_train)
ridge.score(X_test, y_test)
>>>
0.5453306487434062
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)
>>>
0.5453299378456281
from sklearn.decomposition import PCA
pca = PCA(n_components=.99).fit(X_train)
X_train_pca = pca.transform(X_train)
lr_pca = LinearRegression().fit(X_train_pca, y_train)
inverse_lr_pca_coef = pca.inverse_transform(lr_pca.coef_)
lr_pca.score(pca.transform(X_test), y_test)
>>>
0.5453305950074661
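inverse_lr_pca_coef is computed above but never displayed; a small sketch (not in the original) to look at the back-projected coefficients:
plt.figure()
plt.bar(range(X.shape[1]), inverse_lr_pca_coef)
plt.xlabel("feature index")
plt.ylabel("back-projected coefficient")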
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
param_grid = {'max_leaf_nodes': range(5, 40, 5)}
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=10, n_jobs=3)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)
>>>
0.5452326112689287
Define a helper to visualize each model's feature importances:
def plot_importance(some_dict):
    # takes a dict of {model name: importance array} and draws grouped bars
    plt.figure(figsize=(10, 4))
    df = pd.DataFrame(some_dict)
    ax = plt.gca()
    df.plot.bar(ax=ax, width=.9)
    ax.set_ylim(-1.5, 1.5)
    ax.set_xlim(-.5, len(df) - .5)
    ax.set_xlabel("feature index")
    ax.set_ylabel("importance value")
    plt.vlines(np.arange(.5, len(df) - 1), -1.5, 1.5, linewidth=.5)
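The importance plot below also references rf, which is never defined in the original snippets; a default random forest fit on the training data is assumed:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(X_train, y_train)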
tree = grid.best_estimator_
plot_importance({'lasso': lasso.coef_, 'ridge': ridge.coef_, 'lr': lr.coef_, 'tree': tree.feature_importances_, 'rf':rf.feature_importances_})
plt.title("Coefficients and entropy improvement on large data")
plt.savefig("images/standard_importances.png")
Validating features with Drop Feature Importance
This reuses the drop_feature_importance function defined earlier.
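A sketch of how the drop-feature scores might be compared across the fitted models (this call is not shown in the original; the dict keys are assumptions):
plot_importance({
    'lasso': drop_feature_importance(lasso, X_train, y_train),
    'ridge': drop_feature_importance(ridge, X_train, y_train),
    'lr': drop_feature_importance(lr, X_train, y_train),
})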
from sklearn.inspection import permutation_importance
perm_ridge_test = permutation_importance(ridge, X_test, y_test)['importances_mean']
perm_lasso_test = permutation_importance(lasso, X_test, y_test)['importances_mean']
perm_tree_test = permutation_importance(tree, X_test, y_test)['importances_mean']
perm_rf_test = permutation_importance(rf, X_test, y_test)['importances_mean']
perm_lr_test = permutation_importance(lr, X_test, y_test)['importances_mean']
plot_importance({'lasso': perm_lasso_test, 'ridge': perm_ridge_test, 'lr': perm_lr_test,'tree': perm_tree_test, 'rf':perm_rf_test})
plt.title("Permutation importance on test set (large training data)")
plt.savefig("images/permutation_importance_big.png")
Partial Dependence
Sweep one feature over a grid of values while averaging the model's predictions over the other features; the resulting curve shows that feature's average marginal effect. Code:
from sklearn.datasets import load_boston  # deprecated in scikit-learn 1.0 and removed in 1.2; needs an older version
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
boston.data, boston.target, random_state=0)
gbrt = GradientBoostingRegressor().fit(X_train, y_train)
gbrt.score(X_test, y_test)
It turns out scikit-learn can plot this directly:
from sklearn.inspection import plot_partial_dependence  # moved here from sklearn.ensemble.partial_dependence in 0.22
axs = plot_partial_dependence(gbrt, X_train, np.argsort(gbrt.feature_importances_)[-6:],
feature_names=boston.feature_names,
n_jobs=3, grid_resolution=50)
plt.tight_layout()
Two-dimensional visualization (a pair of features passed as one tuple):
axs = plot_partial_dependence(gbrt, X_train, [tuple(np.argsort(gbrt.feature_importances_)[-2:])],
                              feature_names=boston.feature_names,
                              n_jobs=3, grid_resolution=50)
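Note that plot_partial_dependence itself was removed in scikit-learn 1.2; in current versions the equivalent call goes through a display class (a sketch):
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay.from_estimator(
    gbrt, X_train, np.argsort(gbrt.feature_importances_)[-6:],
    feature_names=boston.feature_names, n_jobs=3, grid_resolution=50)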
PDP has a drawback: because it averages over all samples, it cannot show heterogeneous effects where different groups behave differently. This is where ICE plots (e.g. PyCEbox) come in.
X = np.random.normal(size=(2000, 3))
w = np.array([0, .5, .1])
y = np.dot(X, w) + np.random.normal(scale=0.3, size=(2000,))
mask = X[:, 0] > 0
#X[mask, 1] -= 2
y[mask] = 1 - y[mask]
plt.plot(X[:, 0], y, 'o')
plt.scatter(X[:, 1], y, alpha=.5, s=4)
If you draw a PDP for a feature like this, the two opposing groups average each other out and the apparent effect is diluted toward the midline:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import plot_partial_dependence
fig, axes = plt.subplots(2, 1)
axes[0].hist(y, bins='auto')
gb = HistGradientBoostingRegressor().fit(X, y)
pdp = plot_partial_dependence(gb, X, [1], ax=axes[1])
plt.xlabel("feature")
An ICE (Individual Conditional Expectation) plot draws one curve per sample and brings out the behavior of the different groups nicely:
from pycebox.ice import ice, ice_plot
ice_df = ice(pd.DataFrame(X), 1, gb.predict, num_grid_points=100)
ice_plot(ice_df, frac_to_plot=1, plot_pdp=True,
c='k', alpha=0.1, linewidth=0.3)
plt.ylabel("partial dependence")
plt.xlabel("feature")
from pdpbox import pdp
feature_isolate = pdp.pdp_isolate(gb, pd.DataFrame(X), [0, 1, 2], 1, num_grid_points=100)
fig, axes = pdp.pdp_plot(feature_isolate, 0, plot_lines=True, frac_to_plot=1)
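pdpbox can also look at two-feature interactions; a sketch using its pdp_interact API (the grid defaults and feature choices here are assumptions, not from the original post):
interact = pdp.pdp_interact(gb, pd.DataFrame(X), [0, 1, 2], [0, 1])
fig, axes = pdp.pdp_interact_plot(interact, [0, 1])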