cross entropy loss not equivalent to binary log loss in lgbm

Posted: 2021-02-28 09:49:33

Question:

Problem I am trying to solve: compress the training instances by aggregating the labels (weighted mean) and summing the weights of rows that share the same feature, while keeping the binary log loss equal to the cross entropy loss. Below is an example; the short derivation after the table and the log_loss test cases further down show that the binary log loss is equivalent to the weighted log loss.

original data:                                compressed data:

feature, label, weight, prediction            feature, label,  weight, prediction 
    x1,   1,     1,        0.8                 x1,      1/3,     3,       0.8
    x1,   0,     2,        0.8        -->            
    x2,   1,     2,        0.1                 x2,      2/3,     3,       0.1
    x2,   0,     1,        0.1
    x3,   1,     1,        0.9                 x3,      1,       1,       0.9
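
The algebra behind this compression (assuming all rows in a group share the same prediction p, which holds when they share the same feature value): for a group with labels y_i and weights w_i,

    sum_i w_i * ( -y_i*log(p) - (1 - y_i)*log(1 - p) ) = W * ( -y_bar*log(p) - (1 - y_bar)*log(1 - p) )

where W = sum_i w_i and y_bar = sum_i w_i*y_i / W. Replacing each group with a single row whose label is y_bar and whose weight is W therefore leaves the total weighted log loss unchanged.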

Problem: the binary log loss is not always equivalent to the cross entropy loss in lgbm. The change in model performance (e.g. log loss, average precision and ROC_AUC) is mild, but the actual predictions and the prediction distributions differ quite significantly. Experiment 1 shows that the two are equivalent in the binary-label case, while experiment 2 shows that in some cases the binary log loss does not agree with the cross entropy (see the examples for more details).

First, verify with numpy that the binary log loss is the same as the cross entropy loss:

import numpy as np
import pandas as pd 
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import lightgbm


# use X of cancer data as training feature for both experiment 1 and 2 
X, _ = load_breast_cancer(return_X_y=True)


def logloss(y_true, y_pred, weight):
    """Weighted binary log loss, normalized by the total weight."""
    l = np.mean((-(y_true * np.log(y_pred))-((1-y_true)*np.log(1-y_pred)))*weight)
    # normalize by the total weight so that compressed and uncompressed data are comparable
    l = l*y_true.shape[0]/weight.sum()
    return l

"""
feature, label, weight, prediction            feature, label,  weight, prediction 
    x1,   1,     1/3,       0.7                
    x1,   1,     1/3,       0.7        -->       x1,    2/3,      1,       0.7      
    x1,   0,     1/3,       0.7
"""

l1 = logloss(np.array([1,1,0]), np.array([0.7,0.7,.7]), np.array([1/3,1/3,1/3]))
l2 = logloss(np.array([2/3]), np.array([0.7]), np.array([1]))

"""
feature, label, weight, prediction            feature, label,  weight, prediction 
    x1,   1,     1,        0.8                 x1,      1/3,     3,       0.8
    x1,   0,     2,        0.8        -->            
    x2,   1,     2,        0.1                 x2,      2/3,     3,       0.1
    x2,   0,     1,        0.1
    x3,   1,     1,        0.9                 x3,      1,       1,       0.9
"""
l3 = logloss(np.array([1,0,1,0,1]), 
             np.array([0.8,0.8,0.1,0.1,0.9]), 
             np.array([1,2,2,1,1]))
l4 = logloss(np.array([1/3,2/3,1]), np.array([0.8,0.1,0.9]), np.array([3,3,1]))

np.testing.assert_almost_equal(l1, l2, decimal=4)
np.testing.assert_almost_equal(l3, l4, decimal=4) 
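
Both assertions pass; the concrete values work out to roughly l1 = l2 ≈ 0.6391 and l3 = l4 ≈ 1.1797, so the numpy check confirms that the group-compressed loss matches the weighted per-instance loss.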

Experiment 1 (the binary log loss is equivalent to the cross entropy loss in the binary-label case):

######## data for experiment 1
np.random.seed(42)
n = X.shape[0]
y_binary = np.random.randint(0,2,size=(n))
eps = 1e-2
y_float = np.random.uniform(eps,1-eps,size=(n))

lgbm_params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree':1,
    'importance_type': 'split',
    'learning_rate': 0.06472914709339864,
    'max_depth': 46,
    'min_child_weight': 0.001, 
    'min_split_gain': 0.0,
    'n_estimators': 20,
    'n_jobs': 1,
    'num_leaves': 178,
    'random_state': 1574094090,
    'reg_alpha': 0.4894283599023894,
    'reg_lambda': 0.09743058458885945,
    'silent': True,
    'subsample':1,
#     'subsample_for_bin': 200000, # try larger values (10M+)
#     'subsample_freq': 252,
    'min_data_in_bin':1,
    'min_child_samples':1,
}

X_train_array, X_test_array, y_train_binary, y_test_binary, y_train_float, y_test_float = \
    train_test_split(X, y_binary, y_float, test_size=0.3, random_state=1)

##### binary-label case via the sklearn API: the 'binary' objective should be equivalent to the 'cross_entropy' objective
binary_model1 = LGBMClassifier(objective='binary')
binary_model1.set_params(**lgbm_params)

binary_model1.fit(
    X_train_array, 
    y_train_binary, 
    sample_weight=np.ones(X_train_array.shape[0])
)

binary_model2 = LGBMRegressor(objective='cross_entropy')
binary_model2.set_params(**lgbm_params)
binary_model2.fit(
    X_train_array, 
    y_train_binary, 
    sample_weight=np.ones(X_train_array.shape[0])
)

binary_pred_1 = binary_model1.predict_proba(X_test_array)[:,1]
binary_pred_2 = binary_model2.predict(X_test_array)
binary_y_pred_diff = binary_pred_1-binary_pred_2

# given binary labels, the binary log loss and cross_entropy objectives are equivalent (the predictions match)
np.testing.assert_almost_equal(binary_pred_1, binary_pred_2, decimal=4)
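
As an extra check (a sketch reusing the logloss helper defined above), the test-set losses should also come out (nearly) identical, since the predictions themselves match:

# sketch: compare the test-set log loss of the two models
print(logloss(y_test_binary, binary_pred_1, np.ones(len(y_test_binary))))
print(logloss(y_test_binary, binary_pred_2, np.ones(len(y_test_binary))))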

Experiment 2: the cross entropy loss can differ from the log loss (not sure why)

######## data for experiment 2 

def make_compressed_df(X, fixed_ratio=None): 
    """
    this function simulates compressed data: instances with the same feature are deduped,
    the label becomes the mean of those instances' labels, and the weight becomes the sum of their weights
    ex.
    
    args:
        fixed_ratio: int or None, if int, the ratio of pos_count/neg_count is fixed (the key of the experiment!)
    
    original_data:                  compressed_data: 
    
    feature, label, weight            feature, label, pos_count, neg_count, weight, 
        x1,   1,     1                   
        x1,   1,     1        -->       x1,    2/3,       2,         1,       3
        x1,   0,     1
        -------------------------------------------------
        x2,   0,     1                   
        x2,   1,     1        -->       x2,    1/2,       1,         1,       2
        -------------------------------------------------   
        x3,   1,     1                   
        x3,   1,     1        -->       x3,    2/2,       2,         0,       2
        
    """
    compressed_df = pd.DataFrame(X)
    pos_count = np.random.randint(1,3,size=(X.shape[0]))
    compressed_df['pos_count'] = pos_count 
    if fixed_ratio:
        compressed_df['neg_count'] = int(fixed_ratio)*compressed_df['pos_count']
    else:
        neg_count = np.random.randint(1,3,size=(X.shape[0]))
        compressed_df['neg_count'] = neg_count 
        
    compressed_df['total_count'] = compressed_df['pos_count']+compressed_df['neg_count']

    compressed_df['weight'] = compressed_df['pos_count']+compressed_df['neg_count']
    compressed_df['label'] = compressed_df['pos_count']/compressed_df['total_count']
    
    return compressed_df
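
For a quick look at what the simulated compressed frame contains (a throwaway sketch, not part of the experiments; the first 30 columns are the untouched cancer features):

# sketch: inspect the columns added by make_compressed_df
demo_df = make_compressed_df(X[:5])
print(demo_df[['pos_count', 'neg_count', 'total_count', 'weight', 'label']])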


def restore_data(df):
    """
    restore the original features, labels and weights based on pos_count and neg_count.
    instances with the same feature are repeated (pos_count + neg_count) times, the labels become
    [1]*pos_count + [0]*neg_count, and the weight becomes weight/(pos_count+neg_count)
    
    ex.
    
        compressed_data:                                     original_data: 
    
        feature, label, pos_count, neg_count, weight         feature, label, weight
                                                                x1,    1,     1
        x1,    2/3,       2,         1,       3        -->      x1,    1,     1
                                                                x1,    0,     1
        -------------------------------------------------
                                                                x2,    0,     1
        x2,    1/2,       1,         1,       2        -->      x2,    1,     1
        -------------------------------------------------                 
                                                                x3,    1,     1
        x3,    2/2,       2,         0,       2        -->      x3,    1,     1
        
        
    """     
    pos_df = df.loc[df.index.repeat(df['pos_count'])]
    pos_df['label'] = 1
    
    neg_df = df.loc[df.index.repeat(df['neg_count'])]
    neg_df['label'] = 0
    
    df = pd.concat([pos_df, neg_df], axis=0)
    del pos_df, neg_df
    df['weight'] = df['weight']/df['total_count']
    df = df.drop(['pos_count', 'neg_count', 'total_count'], axis=1)    
    return df
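
Continuing the sketch above, a round trip through restore_data should give back one row per original instance while preserving the total weight (and the per-group weighted label mean):

# sketch: restoring the demo frame preserves the total weight
demo_restored = restore_data(demo_df)
assert np.isclose(demo_restored['weight'].sum(), demo_df['weight'].sum())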


def make_compressed_and_restored_data(X, fixed_ratio):
    np.random.seed(42)
    compressed_df = make_compressed_df(X, fixed_ratio)
    compressed_train_df, compressed_test_df = train_test_split(
        compressed_df, test_size=0.3, random_state=1)
    
    restored_train_df = restore_data(compressed_train_df)
    restored_test_df = restore_data(compressed_test_df)
    
    return (compressed_train_df, compressed_test_df), (restored_train_df, restored_test_df)


# when the ratio of pos_count/neg_count is not fixed, the objectives differ
(compressed_train_random_ratio_df, compressed_test_df), \
    (restored_train_random_ratio_df, restored_test_random_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=None)

model1 = LGBMClassifier(objective='binary')
model1.set_params(**lgbm_params)

model1.fit(
    restored_train_random_ratio_df.iloc[:,:30], 
    restored_train_random_ratio_df['label'], 
    sample_weight=restored_train_random_ratio_df['weight']
)

model2 = LGBMRegressor(objective='cross_entropy')
model2.set_params(**lgbm_params)
model2.fit(
    compressed_train_random_ratio_df.iloc[:,:30], 
    compressed_train_random_ratio_df['label'], 
    sample_weight=compressed_train_random_ratio_df['weight']
)

y1 = model1.predict_proba(compressed_test_df.iloc[:,:30])[:,1]
y2 = model2.predict(compressed_test_df.iloc[:,:30])
# this assertion fails
np.testing.assert_almost_equal(y1, y2, decimal=4)
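
Rather than only asserting, one can inspect how far apart the two prediction vectors actually are (a diagnostic sketch; the concrete numbers depend on the data and parameters):

# sketch: quantify the disagreement between the two objectives
diff = y1 - y2
print('max abs diff:', np.abs(diff).max(), 'mean abs diff:', np.abs(diff).mean())
print(pd.Series(y1).describe())
print(pd.Series(y2).describe())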


# when the ratio of pos_count/neg_count is fixed, the objectives are the same
(compressed_train_fixed_ratio_df, compressed_test_fixed_ratio_df), \
    (restored_train_fixed_ratio_df, restored_test_fixed_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=2)

model3 = LGBMClassifier(objective='binary')
model3.set_params(**lgbm_params)

model3.fit(
    restored_train_fixed_ratio_df.iloc[:,:30], 
    restored_train_fixed_ratio_df['label'], 
    sample_weight=restored_train_fixed_ratio_df['weight']
)

model4 = LGBMRegressor(objective='cross_entropy')
model4.set_params(**lgbm_params)
model4.fit(
    compressed_train_fixed_ratio_df.iloc[:,:30], 
    compressed_train_fixed_ratio_df['label'], 
    sample_weight=compressed_train_fixed_ratio_df['weight']
)

y3 = model3.predict_proba(compressed_test_fixed_ratio_df.iloc[:,:30])[:,1]
y4 = model4.predict(compressed_test_fixed_ratio_df.iloc[:,:30])
# this assertion passes
np.testing.assert_almost_equal(y3, y4, decimal=4)

Comments:

Answer 1:

It looks like this question was cross-posted here and in the official LightGBM repo.

The LightGBM maintainers have provided an answer here: https://github.com/microsoft/LightGBM/issues/3576.

Discussion:

You're right, that is the one I posted on GitHub; it hasn't been resolved yet.
