如何将箱线图与平均线连接起来

How to connect boxplots with a mean line

以下代码:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Sample data: three measured columns ('Best fit', 'MDP', 'Q-Learning') plus
# the grouping column 'parametrized_factor' (values 1.0, 0.2 and 2.0).
# The last 'Best fit' entry is NaN.
data_dict = {'Best fit': [395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0, 9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0, 864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan], 'MDP': [332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831], 'Q-Learning': [358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806], 'parametrized_factor': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2]}
data = pd.DataFrame(data_dict)

# figure size
plt.figure(figsize=(12, 8))

# melt the dataframe into a long form: one row per measurement, with the
# original column name in 'variable' and the measurement in 'value'
dfm = data.melt(id_vars='parametrized_factor')

# plot one box per (variable, parametrized_factor) combination
ax = sns.boxplot(data=dfm, x='variable', y='value', hue='parametrized_factor', linewidth=0.7, palette="Set3")

ax.yaxis.grid(True)  # Show the horizontal gridlines
ax.xaxis.grid(True)  # Show the vertical gridlines
# anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Label the y-axis and clear the x-axis label.
ax.set_ylabel('Rejection ratio')
ax.set_xlabel('')

plt.show()

绘制以下内容:

有没有办法把每个图例类别的 'Best fit'、'MDP' 和 'Q-Learning' 连接起来?

换句话说,如何通过连接其平均值的线连接相同颜色的箱线图?

  • 计算每个组的平均值,然后使用 seaborn.lineplot 将它们添加到现有的 ax 上。
  • 在 seaborn.boxplot 中设置 dodge=False。
  • 请记住,箱线图中的线是中位数,而不是均值。
    • 使用 showmeans=True 可以在 boxplot 中显示均值标记,然后可根据需要从 lineplot 中删除 marker='o'
  • 正如 JohanC 的回答所指出的:
    • sns.pointplot(data=dfm, x='variable', y='value', hue='parametrized_factor', ax=ax)可以不用计算dfm_mean,但是没有legend=False参数,需要手动管理图例。
    • 此外,我认为使用 dodge=False 比计算偏移量更直接。
    • 根据您的要求,任一答案都可行。
# Mean of every measured column per 'parametrized_factor' group, reshaped to
# long form so it shares the 'variable' / 'value' columns used by the boxes.
group_means = data.groupby('parametrized_factor', as_index=False).mean()
dfm_mean = group_means.melt(id_vars='parametrized_factor')

# Start a fresh 12x8 figure for the combined plot.
plt.figure(figsize=(12, 8))

# Draw the boxes with dodge disabled so that all hue levels of one variable
# sit at the same x position.
ax = sns.boxplot(
    data=dfm,
    x='variable',
    y='value',
    hue='parametrized_factor',
    linewidth=0.7,
    palette="Set3",
    dodge=False,
)

# Overlay one line per factor through the group means, marking each mean
# with a circle; the line plot adds no legend entries of its own.
sns.lineplot(
    data=dfm_mean,
    x='variable',
    y='value',
    hue='parametrized_factor',
    marker='o',
    ax=ax,
    legend=False,
)

# Place the legend outside the axes, anchored at the upper left corner.
ax.legend(title='Factor', bbox_to_anchor=(1.05, 1), loc='upper left')

  • 如果 dodge 不是 False,结果是:

您可以创建点图(pointplot)并重新计算 dodge(错开)宽度。对于箱线图,3 个箱子在默认宽度 0.8 内平均分布。对于点图,线条位于该宽度的两端,因此需要缩放 dodge 值以使其与箱线图对齐。有关更多信息,请参阅相关的 GitHub issue。

请注意,您不需要自己计算均值,因为均值是 pointplot 的默认估计器(estimator)。可以使用 ci=None 来抑制均值的误差线。

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Sample data: three measured columns ('Best fit', 'MDP', 'Q-Learning') plus
# the grouping column 'parametrized_factor' (values 1.0, 0.2 and 2.0).
# The last 'Best fit' entry is NaN.
data_dict = {'Best fit': [395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0, 9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0, 864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan], 'MDP': [332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831], 'Q-Learning': [358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806], 'parametrized_factor': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2]}
data = pd.DataFrame(data_dict)

sns.set_style('darkgrid')
plt.figure(figsize=(12, 8))

# Long form: one row per measurement, with the original column name in
# 'variable' and the measurement in 'value'.
dfm = data.melt(id_vars='parametrized_factor')

ax = sns.boxplot(data=dfm, x='variable', y='value', hue='parametrized_factor', linewidth=0.7, palette="Set3")
# Overlay the per-group means (pointplot's default estimator) on the same axes.
# dodge: the three boxes share the default width of 0.8, while pointplot puts
# its outermost lines at the extremes of the dodge width, so the width is
# reduced by one box (0.8 / 3) to line the points up with the box centres.
# `markers` is the documented pointplot parameter for the marker symbol.
sns.pointplot(data=dfm, x='variable', y='value', hue='parametrized_factor', ci=None,
              dodge=.8 - .8 / 3, scale=0.3, color='black', markers='D', ax=ax)

# Keep only the first four legend handles and relabel the fourth as "means".
# NOTE(review): presumably handles[:3] are the boxes and handles[3] is the
# first pointplot entry — confirm against the installed seaborn version.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[:4], labels=labels[:3] + ["means"], title="parametrized factor",
          bbox_to_anchor=(1.02, 1.02), loc='upper left')

ax.set_ylabel('Rejection ratio')
ax.set_xlabel('')
plt.tight_layout()
plt.show()