如何计算分层 K 折交叉验证的不平衡数据集的误报率?
Posted
技术标签:
【中文标题】如何计算分层 K 折交叉验证的不平衡数据集的误报率?【英文标题】:How to compute false positive rate of an imbalanced dataset for Stratified K fold cross validation? 【发布时间】:2021-12-26 00:59:39 【问题描述】:以下几行是我能够计算准确率、精度、召回率和 f1 分数的示例代码。如何计算分层 K 折交叉验证的误报率 (FPR)?
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier  # was missing in the original snippet

# Map scorer names to callables; cross_validate reports one score per
# entry as results['test_<name>'].  (The braces were lost in the paste.)
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}

# Stratified folds keep the class ratio of the full dataset in every fold,
# which is what you want for an imbalanced target.
skfold = StratifiedKFold(n_splits=10)
dt_clf = DecisionTreeClassifier()

# NOTE(review): data_train_X / target_train_Y are assumed to be defined
# earlier by the asker's own code — they are not shown in the post.
results = cross_validate(estimator=dt_clf,
                         X=data_train_X,
                         y=target_train_Y,
                         cv=skfold,
                         scoring=scoring)
print("Results", results)
【问题讨论】:
【参考方案1】:我使用逻辑回归编写了这段代码。您可以将其替换为您喜欢的任何其他二分类算法。
# Importing required libraries
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Load the demo dataset as a DataFrame.  (The original pasted code read
# from an undefined name `result`; it means the loaded `data` object.)
data = load_breast_cancer(as_frame=True)
df = data.frame
X = df.iloc[:, :-1]   # all feature columns
y = df.iloc[:, -1]    # target column

# Shuffle rows — but keep X and y ALIGNED.  Shuffling each independently
# (as the original did) silently destroys the feature/label pairing.
X = X.sample(frac=1, random_state=0)
y = y.loc[X.index]

# Implementing cross validation
kf = KFold(n_splits=10)
model = LogisticRegression(max_iter=1000000)

acc_score = []
res_tpr = []
res_fpr = []

for train_index, test_index in kf.split(X):
    # Use the fold indices; the original called train_test_split here,
    # which ignored the folds entirely (and was never imported).
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # labels=[0, 1] pins the matrix layout so ravel() order is fixed
    # even if a fold happens to miss one class.
    tn, fp, fn, tp = confusion_matrix(y_test, pred_values, labels=[0, 1]).ravel()
    print(f'True Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

    # TPR = TP/(TP+FN), FPR = FP/(FP+TN); guard the empty-class case
    # instead of dividing by zero.
    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0

    print('tpr=%.4f fpr=%.3f' % (tpr, fpr))
    res_tpr.append(tpr)
    res_fpr.append(fpr)
    print('---------------------')

    acc_score.append(accuracy_score(y_test, pred_values))

# Average the per-fold metrics over the actual number of folds.
avg_acc_score = sum(acc_score) / len(acc_score)
total_tpr = sum(res_tpr) / len(res_tpr)
total_fpr = sum(res_fpr) / len(res_fpr)

print('\n\n', ' total_tpr=%.4f total_fpr=%.3f' % (total_tpr, total_fpr))
print('\n\n', 'Avg accuracy : {}'.format(avg_acc_score))
【讨论】:
【参考方案2】:您可以按如下方式定义自定义记分器:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
def false_positive_rate(y_true, y_pred):
    """Return FP / (FP + TN) for binary labels in {0, 1}.

    Works on array-likes supporting elementwise comparison (numpy arrays,
    pandas Series).  Returns 0.0 when the fold contains no true negatives,
    instead of dividing by zero — relevant for imbalanced data where a
    fold may lack one class.
    """
    # false positive: predicted 1, actually 0
    fp = ((y_pred == 1) & (y_true == 0)).sum()
    # true negative: predicted 0, actually 0
    tn = ((y_pred == 0) & (y_true == 0)).sum()
    denominator = fp + tn
    if denominator == 0:
        # No actual negatives in this fold: FPR is undefined; report 0.
        return 0.0
    return fp / denominator
# Register the custom FPR scorer alongside the standard metrics; the dict
# braces were lost in the original paste, making this a syntax error.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
    'false_positive_rate': make_scorer(false_positive_rate),
}

skf = StratifiedKFold(n_splits=3)
clf = DecisionTreeClassifier(random_state=42)
X, y = make_classification(random_state=42)

# cross_validate exposes each scorer as results['test_<name>'].
results = cross_validate(estimator=clf, X=X, y=y, cv=skf, scoring=scoring)
print(results['test_false_positive_rate'])
# [0.11764706 0.11764706 0.0625]
【讨论】:
以上是关于如何计算分层 K 折交叉验证的不平衡数据集的误报率?的主要内容,如果未能解决你的问题,请参考以下文章