如何将箱线图与平均线连接起来

Posted

技术标签:

【中文标题】如何将箱线图与平均线连接起来【英文标题】:How to connect boxplots with a mean line 【发布时间】:2021-08-30 15:47:33 【问题描述】:

以下代码:

import pandas as pd
import numpy as np

# Sample data: three methods ('Best fit', 'MDP', 'Q-Learning') measured for
# three values of 'parametrized_factor'. Each row of a list below holds the
# 20 observations of one factor level (1.0, then 0.2, then 2.0).
# The curly braces of the dict literal were missing, which made this line a
# SyntaxError.
data_dict = {
    'Best fit': [
        395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0,
        9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0,
        # the last observation is missing on purpose (NaN)
        864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan,
    ],
    'MDP': [
        332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318,
        3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8,
        770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831,
    ],
    'Q-Learning': [
        358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334,
        3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8,
        771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806,
    ],
    'parametrized_factor': [
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
        2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2,
    ],
}
# Wide-format frame: one column per method plus the factor column.
data = pd.DataFrame(data_dict)

# pyplot and seaborn are imported here because this snippet uses ``plt`` and
# ``sns`` without importing them above (it would stop with a NameError).
import matplotlib.pyplot as plt
import seaborn as sns

# figure size
plt.figure(figsize=(12, 8))

# melt the dataframe into a long form: the method columns become the
# 'variable' / 'value' pair, keyed by 'parametrized_factor'
dfm = data.melt(id_vars='parametrized_factor')

# plot: one group of boxes per method on the x-axis, one box per
# parametrized_factor value inside each group
ax = sns.boxplot(data=dfm, x='variable', y='value', hue='parametrized_factor', linewidth=0.7, palette="Set3")

ax.yaxis.grid(True)  # Show the horizontal gridlines
ax.xaxis.grid(True)  # Show the vertical gridlines
# Anchor the legend's upper-left corner just right of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Set the y-axis label and blank out the x-axis label.
ax.set_ylabel('Rejection ratio')
ax.set_xlabel('')

plt.show()

绘制以下内容:

有没有办法按每个图例类别,把“Best fit”、“MDP”和“Q-Learning”三组箱线图连接起来?

换句话说,如何通过连接平均值的线来连接相同颜色的箱线图?

【问题讨论】:

【参考方案1】:

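您可以在箱线图上叠加一个点图(pointplot),并重新计算其 dodge 宽度。对于箱线图,3 个箱子均匀分布在默认的 0.8 总宽度内,每个箱子宽 0.8/3,因此最外侧两个箱子的中心相距 0.8 - 0.8/3。对于点图,各条线被放置在 dodge 宽度的两端,所以需要把 dodge 缩小为 0.8 - 0.8/3,才能让均值点落在各个箱子的中心。更多信息请参见this github issue。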

请注意,您不需要自己计算均值,因为均值就是 pointplot 的默认估计量(default estimator for pointplot)。可以使用 ci=None 去掉均值的误差线。

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Sample data: three methods ('Best fit', 'MDP', 'Q-Learning') measured for
# three values of 'parametrized_factor'. Each row of a list below holds the
# 20 observations of one factor level (1.0, then 0.2, then 2.0).
# The curly braces of the dict literal were missing, which made this line a
# SyntaxError.
data_dict = {
    'Best fit': [
        395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0,
        9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0,
        # the last observation is missing on purpose (NaN)
        864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan,
    ],
    'MDP': [
        332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318,
        3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8,
        770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831,
    ],
    'Q-Learning': [
        358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334,
        3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8,
        771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806,
    ],
    'parametrized_factor': [
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
        2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2,
    ],
}
# Wide-format frame: one column per method plus the factor column.
data = pd.DataFrame(data_dict)

# Dark-grid theme and a 12x8 inch figure for the combined plot.
sns.set_style('darkgrid')
plt.figure(figsize=(12, 8))

# Long format: the method columns become 'variable' / 'value' pairs.
dfm = data.melt(id_vars='parametrized_factor')

# Column mapping shared by the boxplot and the overlaid pointplot.
shared_mapping = dict(data=dfm, x='variable', y='value', hue='parametrized_factor')

ax = sns.boxplot(**shared_mapping, linewidth=0.7, palette="Set3")

# The three hue levels share a total box width of 0.8, so the dodge of the
# mean markers is shrunk by one box width to line them up with the boxes.
total_box_width = .8
hue_level_count = 3
sns.pointplot(
    **shared_mapping,
    ci=None,  # no error bars around the means
    dodge=total_box_width - total_box_width / hue_level_count,
    scale=0.3,
    color='black',
    marker='D',
)

# Keep the three box entries plus a single entry standing for the mean lines.
legend_handles, legend_labels = ax.get_legend_handles_labels()
ax.legend(
    handles=legend_handles[:4],
    labels=legend_labels[:3] + ["means"],
    title="parametrized factor",
    bbox_to_anchor=(1.02, 1.02),
    loc='upper left',
)

ax.set_xlabel('')
ax.set_ylabel('Rejection ratio')
plt.tight_layout()
plt.show()

【讨论】:

【参考方案2】: 计算每个组的平均值,然后用 seaborn.lineplot 将它们添加到现有的 ax 上。在 seaborn.boxplot 中设置 dodge=False。请记住,箱线图中的线是中位数,而不是平均值。如果需要,可以使用 showmeans=True 在 boxplot 中显示均值,然后从 lineplot 中删除 marker='o'。正如 JohanC 的答案(JohanC's answer)所指出的:使用 sns.pointplot(data=dfm, x='variable', y='value', hue='parametrized_factor', ax=ax) 可以不用计算 dfm_mean,但是它没有 legend=False 参数,这就需要手动管理图例。另外,我认为使用 dodge=False 比计算偏移量更直接。两个答案都是可行的,具体取决于您的要求。
# Mean of every method column per parametrized_factor value, then reshaped to
# long format so it matches the layout of ``dfm``.
group_means = data.groupby('parametrized_factor', as_index=False).mean()
dfm_mean = group_means.melt(id_vars='parametrized_factor')

# 12x8 inch figure for the plot.
plt.figure(figsize=(12, 8))

# Boxplot with dodge switched off, so the boxes of all hue levels are drawn
# at the same x position for each method.
ax = sns.boxplot(
    x='variable',
    y='value',
    hue='parametrized_factor',
    data=dfm,
    dodge=False,
    palette="Set3",
    linewidth=0.7,
)

# Draw the group means on the same axes as lines with round markers; the line
# plot adds no legend entries of its own.
sns.lineplot(
    x='variable',
    y='value',
    hue='parametrized_factor',
    data=dfm_mean,
    ax=ax,
    marker='o',
    legend=False,
)

# Put the legend outside the axes, to the right.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title='Factor')

如果dodge 不是False,则结果为:

【讨论】:

以上是关于如何将箱线图与平均线连接起来的主要内容,如果未能解决你的问题,请参考以下文章

箱线图(boxplot)简介与举例

使用多个连接的箱线图更改 Matplotlib 中的轴刻度

如何看箱线图??

如何在ggplot的箱线图中按组绘制平均值

Boxplot ggplot2:在分组箱线图中显示平均值和观察次数

如何使用ggplot2显示箱线图中的所有平均值? [复制]