如何将箱线图与平均线连接起来
Posted
技术标签:
【中文标题】如何将箱线图与平均线连接起来【英文标题】:How to connect boxplots with a mean line 【发布时间】:2021-08-30 15:47:33 【问题描述】:以下代码:
import pandas as pd
import numpy as np
# Sample results: 60 observations for each of the three methods ('Best fit',
# 'MDP', 'Q-Learning'), tagged with the 'parametrized_factor' they were
# measured under (1.0, 0.2 and 2.0, 20 rows each).
# The last 'Best fit' observation is missing (NaN).
data_dict = {
    'Best fit': [395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0, 9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0, 864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan],
    'MDP': [332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831],
    'Q-Learning': [358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806],
    'parametrized_factor': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2],
}
# Wide-format frame: one column per method plus the grouping factor.
data = pd.DataFrame(data_dict)
# NOTE(review): this snippet uses `plt` and `sns` without importing them;
# presumably matplotlib.pyplot and seaborn — confirm.
# figure size
plt.figure(figsize=(12, 8))
# melt the dataframe into a long form; the plot below reads the resulting
# 'variable' and 'value' columns
dfm = data.melt(id_vars='parametrized_factor')
# plot: one group of boxes per variable, one box per parametrized_factor value
ax = sns.boxplot(data=dfm, x='variable', y='value', hue='parametrized_factor', linewidth=0.7, palette="Set3")
ax.yaxis.grid(True) # Show the horizontal gridlines
ax.xaxis.grid(True) # Show the vertical gridlines
# place the legend outside the axes, anchored by its upper-left corner
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Set the y-axis label and blank out the x-axis label.
ax.set_ylabel('Rejection ratio')
ax.set_xlabel('')
plt.show()
绘制以下内容:
有没有办法把每个图例类别(parametrized_factor)在 'Best fit'、'MDP' 和 'Q-Learning' 三组中对应的箱线图连接起来?
换句话说,如何通过连接平均值的线来连接相同颜色的箱线图?
【问题讨论】:
【参考方案1】:您可以叠加一个点图(pointplot),并重新计算 dodge(并排错开)宽度。对于箱线图,3 个箱子平均分布在默认的 0.8 宽度内;而对于点图,各条线被放置在该宽度的两端,因此需要把 dodge 缩小为 0.8 - 0.8/3,才能与箱线图的位置对齐。更多信息请参见相关的 GitHub issue。
请注意,您不需要自行计算均值,因为均值就是 pointplot 的默认估计量(default estimator)。可以使用 ci=None 来隐藏均值的误差线。
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Sample results: 60 observations for each of the three methods ('Best fit',
# 'MDP', 'Q-Learning'), tagged with the 'parametrized_factor' they were
# measured under (1.0, 0.2 and 2.0, 20 rows each).
# The last 'Best fit' observation is missing (NaN).
data_dict = {
    'Best fit': [395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0, 9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0, 864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan],
    'MDP': [332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831],
    'Q-Learning': [358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806],
    'parametrized_factor': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2],
}
# Wide-format frame: one column per method plus the grouping factor.
data = pd.DataFrame(data_dict)
# Apply seaborn's 'darkgrid' style before creating the figure.
sns.set_style('darkgrid')
plt.figure(figsize=(12, 8))
# Melt to long form; the plots below read the resulting 'variable' and
# 'value' columns, grouped by 'parametrized_factor'.
dfm = data.melt(id_vars='parametrized_factor')
ax = sns.boxplot(data=dfm, x='variable', y='value', hue='parametrized_factor', linewidth=0.7, palette="Set3")
# Overlay the group means as small black diamonds joined by lines.
# ci=None suppresses the error bars around each mean.
# dodge: the 3 boxes are spread evenly over the default width 0.8, while
# pointplot puts its lines at the extremes of the dodge width, so the width
# is shrunk by one box (0.8 / 3) to line the points up with the box centres.
# No ax= is passed; presumably this draws on the current axes (the boxplot's).
sns.pointplot(data=dfm, x='variable', y='value', hue='parametrized_factor', ci=None,
dodge=.8 - .8 / 3, scale=0.3, color='black', marker='D')
# Rebuild the legend from the first four handles, labelled with the first
# three existing labels plus a single "means" entry.
# NOTE(review): presumably handles[:3] are the box hue levels and handles[3]
# is the first pointplot series — confirm.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[:4], labels=labels[:3] + ["means"], title="parametrized factor",
bbox_to_anchor=(1.02, 1.02), loc='upper left')
# Set the y-axis label and blank out the x-axis label.
ax.set_ylabel('Rejection ratio')
ax.set_xlabel('')
plt.tight_layout()
plt.show()
【讨论】:
【参考方案2】: 计算每个组的平均值,然后用 seaborn.lineplot 将它们添加到现有的 ax 上。
在 seaborn.boxplot 中设置 dodge=False。
请记住,箱线图中的线是中位数,而不是平均值。
使用 showmeans=True 可以把均值标记添加到 boxplot 中,然后可根据需要从 lineplot 中删除 marker='o'。
正如 JohanC 的回答所指出的:
sns.pointplot(data=dfm, x='variable', y='value', hue='parametrized_factor', ax=ax)
可以不用计算 dfm_mean,但 pointplot 没有 legend=False 参数,因此需要手动管理图例。
另外,我认为使用 dodge=False 比计算偏移量更直接。
任何一个答案都是可行的,具体取决于您的要求。
# NOTE: this snippet relies on `data` (the wide DataFrame) and `dfm` (its
# melted long form), which are not defined here; presumably they come from
# the earlier snippets — confirm.
# calculate the mean for each group and convert to long format with melt
dfm_mean = data.groupby('parametrized_factor', as_index=False).mean().melt(id_vars='parametrized_factor')
# plot
# figure size
plt.figure(figsize=(12, 8))
# create the boxplot but set dodge to false, so all plots are on the same x-axis line
ax = sns.boxplot(data=dfm, x='variable', y='value', hue='parametrized_factor', linewidth=0.7, palette="Set3", dodge=False)
# plot a line plot with markers for the means on the same axes;
# legend=False keeps the line plot from adding its own legend entries
sns.lineplot(data=dfm_mean, x='variable', y='value', hue='parametrized_factor', marker='o', ax=ax, legend=False)
# set the legend outside
ax.legend(title='Factor', bbox_to_anchor=(1.05, 1), loc='upper left')
如果 dodge 不是 False,则结果为:
【讨论】:
以上是关于如何将箱线图与平均线连接起来的主要内容,如果未能解决你的问题,请参考以下文章