绘制直方图 bin 的箱线图以进行直方图比较
Drawing a boxplot of the bins of histograms for histogram comparison
我正在尝试比较五个直方图(可能以非正统、非传统的方式)。
我已经绘制了这些直方图,并让它们的区间(bin)在同一张图中重叠。现在我想为每个区间绘制一个箱线图,用来展示该区间内计数的分布(每个区间都有来自 5 个直方图的 5 个计数,所以我需要的是一种嵌套分布)。从图形上看,结果应该类似于所附图片。
这是我创建重叠直方图的代码。
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Diameter measurements, one Series per photo (five photos in total).
u30_diameter_01 = pd.Series([6.76, 5.03, 6.57, 5.52, 8.81, 9.50, 4.96, 6.19, 5.61, 4.86, 4.41, 6.04, 6.39, 4.61, 6.90, 6.38, 6.20, 6.75, 4.76, 6.18, 6.13, 7.95, 4.22, 6.16, 8.40, 6.90, 6.77, 6.11, 6.15, 6.43])
u30_diameter_02 = pd.Series([4.98, 5.16, 6.43, 3.93, 5.41, 6.39, 6.35, 5.43, 6.69, 7.10, 7.55, 5.92, 6.50, 5.36, 7.27, 4.10, 6.89, 4.82, 4.34, 5.67, 8.95, 7.53, 4.91, 5.30, 6.62, 6.90, 5.98, 6.05, 4.37, 6.42, 5.03, 7.78, 6.28, 7.81, 5.81, 5.99, 6.22, 6.48, 4.43, 5.18])
u30_diameter_03 = pd.Series([5.80, 5.72, 6.19, 5.69, 4.19, 4.82, 6.32, 6.14, 6.59, 4.90, 6.39, 5.93, 5.17, 6.00, 6.64, 7.48, 5.61, 5.79, 4.79, 5.99, 5.11, 6.08, 6.68, 5.84, 6.88, 6.81, 6.07, 4.55, 6.20, 5.50])
u30_diameter_04 = pd.Series([5.21, 7.85, 6.48, 4.44, 6.91, 6.30, 8.51, 5.99, 7.74, 5.47, 6.19, 8.44, 8.03, 8.37, 6.80, 6.49, 5.65, 6.06, 7.04, 4.55, 6.76, 7.99, 5.64, 5.97, 6.21])
u30_diameter_05 = pd.Series([7.24, 7.42, 6.85, 8.14, 7.03, 6.52, 5.82, 6.92, 7.44, 7.91, 7.18, 7.99, 6.19, 7.44, 7.37, 6.54, 7.11, 7.77, 8.67, 8.35, 7.12, 5.10, 8.29, 6.36, 7.81])

# Map each 'Photo' label to its sample; insertion order fixes the row order.
photo_samples = {
    '1': u30_diameter_01,
    '2': u30_diameter_02,
    '3': u30_diameter_03,
    '4': u30_diameter_04,
    '5': u30_diameter_05,
}

# Long format: one row per measurement, re-indexed 0..n-1.
u30_diameter = pd.concat(list(photo_samples.values()), ignore_index=True).to_frame()
u30_diameter.columns = ['Value']

# Derive the labels from the sample lengths instead of typing them by hand,
# so they cannot drift out of sync when a sample gains or loses values.
photo_labels = []
for photo_label, photo_sample in photo_samples.items():
    photo_labels.extend([photo_label] * len(photo_sample))
u30_diameter['Photo'] = pd.Series(photo_labels)

# Evenly spaced grid with step 0.5 starting at 3.75 (stop value 9.75 excluded).
# NOTE(review): nothing in the visible snippets reads this; the histogram
# call uses bins=12 instead. Kept for backward compatibility.
diameter_range = np.arange(3.75, 9.75, 0.5)
# Overlay one step-outlined histogram per 'Photo' label on the same axes,
# splitting the 'Value' column into 12 bins.
sns.histplot(
    data=u30_diameter,
    x="Value",
    hue="Photo",
    element="step",
    bins=12,
)
这是绘制箱线图的额外代码(每个区间的计数是我手动统计的)。
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Hand-tallied histogram counts: one Series per bin (12 bins), each holding
# five counts (one per histogram).
u30_d_01 = pd.Series([1,4,1,0,0])
u30_d_02 = pd.Series([3,2,3,2,0])
u30_d_03 = pd.Series([3,6,3,1,1])
u30_d_04 = pd.Series([2,4,4,3,0])
u30_d_05 = pd.Series([8,6,11,5,2])
u30_d_06 = pd.Series([4,9,5,3,3])
u30_d_07 = pd.Series([5,3,2,4,5])
u30_d_08 = pd.Series([0,3,1,0,1])
u30_d_09 = pd.Series([1,2,0,4,4])
u30_d_10 = pd.Series([1,0,0,3,3])
u30_d_11 = pd.Series([1,1,0,0,1])
u30_d_12 = pd.Series([1,0,0,0,0])

# Keep the per-bin series in bin order; the position (1-based) is the label.
per_bin_counts = [
    u30_d_01, u30_d_02, u30_d_03, u30_d_04, u30_d_05, u30_d_06,
    u30_d_07, u30_d_08, u30_d_09, u30_d_10, u30_d_11, u30_d_12,
]

# Long format: one row per (bin, histogram) count, re-indexed 0..n-1.
u30_d = pd.concat(per_bin_counts, ignore_index=True).to_frame()
u30_d.columns = ['Value']

# NOTE: despite its name, the 'Photo' column holds the bin number
# ('1'..'12'); the name is kept because the plotting code reads it.
# Labels are derived from the series lengths so they cannot fall out of
# step with the data.
bin_labels = []
for bin_number, bin_values in enumerate(per_bin_counts, start=1):
    bin_labels.extend([str(bin_number)] * len(bin_values))
u30_d['Photo'] = pd.Series(bin_labels)
# Square figure with one box per label in the 'Photo' column, summarising
# the 'Value' entries that share that label.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=u30_d, x='Photo', y='Value', ax=ax)

# Set both axis captions in a single call.
ax.set(xlabel='Diameter, \u03BCm', ylabel='Count')

# Write the figure to disk on a white background with no edge colour.
fig.savefig(
    'U30_d_box.png',
    facecolor='white',
    edgecolor='none',
)
这是一张图片,大致代表了我的目标。
谢谢!
在您的直方图中,x 轴表示数值,y 轴表示计数。因此,箱线图(表示某个量的分位数分布)不应是垂直的,而应是水平的,因为它必须表示 'Value' 轴上的分位数分布。
话虽这么说,你可以用一个子图来做到这一点:
# Two stacked panels sharing the x axis: histograms on top, box plots below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(6, 8), sharex='all')
hist_ax, box_ax = ax

# Top panel: overlaid step histograms of 'Value', one per 'Photo' label.
sns.histplot(
    data=u30_diameter,
    x="Value",
    hue="Photo",
    element="step",
    bins=12,
    ax=hist_ax,
)

# Bottom panel: one horizontal box per 'Photo' label. 'Photo' is used for
# both the row and the colour, with dodging switched off.
sns.boxplot(
    data=u30_diameter,
    x='Value',
    y='Photo',
    hue='Photo',
    dodge=False,
    ax=box_ax,
)
plt.show()
如果您想为每个区间(bin)的计数绘制箱线图,则必须先计算区间边界以及每个区间内的计数:
n_bins = 12
fig, ax = plt.subplots()

# Histogram of all values together; only the bin edges are kept, so that
# every photo below is counted against the same bins.
_, bins, _ = ax.hist(u30_diameter['Value'], bins=n_bins)

# One row of per-bin counts for each photo, in order of first appearance.
counts = []
for photo in u30_diameter['Photo'].unique():
    # Loop body re-indented: as pasted it sat at column 0, which is an
    # IndentationError.
    n, _, _ = ax.hist(u30_diameter[u30_diameter['Photo'] == photo]['Value'], bins=bins)
    counts.append(n)
counts = np.array(counts)  # shape: (number of photos, n_bins)

# Rebind `bins` from the n_bins + 1 edges to the n_bins bin centres
# (midpoint of each pair of consecutive edges).
bins = [(upper + lower) / 2 for lower, upper in zip(bins[:-1], bins[1:])]
现在您可以清除之前仅用于计算区间和计数的图形,然后绘制您真正想要的图:
# Clear the axes: the histograms drawn so far were only needed for their
# bin edges and counts.
ax.cla()

# Redraw the overlaid per-photo histograms. Use n_bins rather than a second
# hard-coded 12 so the overlay uses the same number of bins as `counts`.
sns.histplot(ax=ax, data=u30_diameter, x="Value", hue="Photo", element="step", bins=n_bins)

# One box per column of `counts` (i.e. per bin), placed at the bin centres
# held in `bins`. manage_ticks=False leaves the x ticks and limits alone;
# patch_artist=True is needed for the boxes to take a face colour.
ax.boxplot(
    counts,
    positions=bins,
    manage_ticks=False,
    patch_artist=True,
    boxprops={'facecolor': 'blue', 'alpha': 0.5},
)
plt.show()
我正在尝试比较五个直方图(可能以非正统、非传统的方式)。我已经绘制了这些直方图,并让它们的区间(bin)在同一张图中重叠。现在我想为每个区间绘制一个箱线图,用来展示该区间内计数的分布(每个区间都有来自 5 个直方图的 5 个计数,所以我需要的是一种嵌套分布)。从图形上看,结果应该类似于所附图片。
这是我创建重叠直方图的代码。
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Diameter measurements, one Series per photo (five photos in total).
u30_diameter_01 = pd.Series([6.76, 5.03, 6.57, 5.52, 8.81, 9.50, 4.96, 6.19, 5.61, 4.86, 4.41, 6.04, 6.39, 4.61, 6.90, 6.38, 6.20, 6.75, 4.76, 6.18, 6.13, 7.95, 4.22, 6.16, 8.40, 6.90, 6.77, 6.11, 6.15, 6.43])
u30_diameter_02 = pd.Series([4.98, 5.16, 6.43, 3.93, 5.41, 6.39, 6.35, 5.43, 6.69, 7.10, 7.55, 5.92, 6.50, 5.36, 7.27, 4.10, 6.89, 4.82, 4.34, 5.67, 8.95, 7.53, 4.91, 5.30, 6.62, 6.90, 5.98, 6.05, 4.37, 6.42, 5.03, 7.78, 6.28, 7.81, 5.81, 5.99, 6.22, 6.48, 4.43, 5.18])
u30_diameter_03 = pd.Series([5.80, 5.72, 6.19, 5.69, 4.19, 4.82, 6.32, 6.14, 6.59, 4.90, 6.39, 5.93, 5.17, 6.00, 6.64, 7.48, 5.61, 5.79, 4.79, 5.99, 5.11, 6.08, 6.68, 5.84, 6.88, 6.81, 6.07, 4.55, 6.20, 5.50])
u30_diameter_04 = pd.Series([5.21, 7.85, 6.48, 4.44, 6.91, 6.30, 8.51, 5.99, 7.74, 5.47, 6.19, 8.44, 8.03, 8.37, 6.80, 6.49, 5.65, 6.06, 7.04, 4.55, 6.76, 7.99, 5.64, 5.97, 6.21])
u30_diameter_05 = pd.Series([7.24, 7.42, 6.85, 8.14, 7.03, 6.52, 5.82, 6.92, 7.44, 7.91, 7.18, 7.99, 6.19, 7.44, 7.37, 6.54, 7.11, 7.77, 8.67, 8.35, 7.12, 5.10, 8.29, 6.36, 7.81])

# Map each 'Photo' label to its sample; insertion order fixes the row order.
photo_samples = {
    '1': u30_diameter_01,
    '2': u30_diameter_02,
    '3': u30_diameter_03,
    '4': u30_diameter_04,
    '5': u30_diameter_05,
}

# Long format: one row per measurement, re-indexed 0..n-1.
u30_diameter = pd.concat(list(photo_samples.values()), ignore_index=True).to_frame()
u30_diameter.columns = ['Value']

# Derive the labels from the sample lengths instead of typing them by hand,
# so they cannot drift out of sync when a sample gains or loses values.
photo_labels = []
for photo_label, photo_sample in photo_samples.items():
    photo_labels.extend([photo_label] * len(photo_sample))
u30_diameter['Photo'] = pd.Series(photo_labels)

# Evenly spaced grid with step 0.5 starting at 3.75 (stop value 9.75 excluded).
# NOTE(review): nothing in the visible snippets reads this; the histogram
# call uses bins=12 instead. Kept for backward compatibility.
diameter_range = np.arange(3.75, 9.75, 0.5)
# Overlay one step-outlined histogram per 'Photo' label on the same axes,
# splitting the 'Value' column into 12 bins.
sns.histplot(
    data=u30_diameter,
    x="Value",
    hue="Photo",
    element="step",
    bins=12,
)
这是绘制箱线图的额外代码(每个区间的计数是我手动统计的)。
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Hand-tallied histogram counts: one Series per bin (12 bins), each holding
# five counts (one per histogram).
u30_d_01 = pd.Series([1,4,1,0,0])
u30_d_02 = pd.Series([3,2,3,2,0])
u30_d_03 = pd.Series([3,6,3,1,1])
u30_d_04 = pd.Series([2,4,4,3,0])
u30_d_05 = pd.Series([8,6,11,5,2])
u30_d_06 = pd.Series([4,9,5,3,3])
u30_d_07 = pd.Series([5,3,2,4,5])
u30_d_08 = pd.Series([0,3,1,0,1])
u30_d_09 = pd.Series([1,2,0,4,4])
u30_d_10 = pd.Series([1,0,0,3,3])
u30_d_11 = pd.Series([1,1,0,0,1])
u30_d_12 = pd.Series([1,0,0,0,0])

# Keep the per-bin series in bin order; the position (1-based) is the label.
per_bin_counts = [
    u30_d_01, u30_d_02, u30_d_03, u30_d_04, u30_d_05, u30_d_06,
    u30_d_07, u30_d_08, u30_d_09, u30_d_10, u30_d_11, u30_d_12,
]

# Long format: one row per (bin, histogram) count, re-indexed 0..n-1.
u30_d = pd.concat(per_bin_counts, ignore_index=True).to_frame()
u30_d.columns = ['Value']

# NOTE: despite its name, the 'Photo' column holds the bin number
# ('1'..'12'); the name is kept because the plotting code reads it.
# Labels are derived from the series lengths so they cannot fall out of
# step with the data.
bin_labels = []
for bin_number, bin_values in enumerate(per_bin_counts, start=1):
    bin_labels.extend([str(bin_number)] * len(bin_values))
u30_d['Photo'] = pd.Series(bin_labels)
# Square figure with one box per label in the 'Photo' column, summarising
# the 'Value' entries that share that label.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=u30_d, x='Photo', y='Value', ax=ax)

# Set both axis captions in a single call.
ax.set(xlabel='Diameter, \u03BCm', ylabel='Count')

# Write the figure to disk on a white background with no edge colour.
fig.savefig(
    'U30_d_box.png',
    facecolor='white',
    edgecolor='none',
)
这是一张图片,大致代表了我的目标。
谢谢!
在您的直方图中,x 轴表示数值,y 轴表示计数。因此,箱线图(表示某个量的分位数分布)不应是垂直的,而应是水平的,因为它必须表示 'Value' 轴上的分位数分布。
话虽这么说,你可以用一个子图来做到这一点:
# Two stacked panels sharing the x axis: histograms on top, box plots below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(6, 8), sharex='all')
hist_ax, box_ax = ax

# Top panel: overlaid step histograms of 'Value', one per 'Photo' label.
sns.histplot(
    data=u30_diameter,
    x="Value",
    hue="Photo",
    element="step",
    bins=12,
    ax=hist_ax,
)

# Bottom panel: one horizontal box per 'Photo' label. 'Photo' is used for
# both the row and the colour, with dodging switched off.
sns.boxplot(
    data=u30_diameter,
    x='Value',
    y='Photo',
    hue='Photo',
    dodge=False,
    ax=box_ax,
)
plt.show()
如果您想为每个区间(bin)的计数绘制箱线图,则必须先计算区间边界以及每个区间内的计数:
n_bins = 12
fig, ax = plt.subplots()

# Histogram of all values together; only the bin edges are kept, so that
# every photo below is counted against the same bins.
_, bins, _ = ax.hist(u30_diameter['Value'], bins=n_bins)

# One row of per-bin counts for each photo, in order of first appearance.
counts = []
for photo in u30_diameter['Photo'].unique():
    # Loop body re-indented: as pasted it sat at column 0, which is an
    # IndentationError.
    n, _, _ = ax.hist(u30_diameter[u30_diameter['Photo'] == photo]['Value'], bins=bins)
    counts.append(n)
counts = np.array(counts)  # shape: (number of photos, n_bins)

# Rebind `bins` from the n_bins + 1 edges to the n_bins bin centres
# (midpoint of each pair of consecutive edges).
bins = [(upper + lower) / 2 for lower, upper in zip(bins[:-1], bins[1:])]
现在您可以清除之前仅用于计算区间和计数的图形,然后绘制您真正想要的图:
# Clear the axes: the histograms drawn so far were only needed for their
# bin edges and counts.
ax.cla()

# Redraw the overlaid per-photo histograms. Use n_bins rather than a second
# hard-coded 12 so the overlay uses the same number of bins as `counts`.
sns.histplot(ax=ax, data=u30_diameter, x="Value", hue="Photo", element="step", bins=n_bins)

# One box per column of `counts` (i.e. per bin), placed at the bin centres
# held in `bins`. manage_ticks=False leaves the x ticks and limits alone;
# patch_artist=True is needed for the boxes to take a face colour.
ax.boxplot(
    counts,
    positions=bins,
    manage_ticks=False,
    patch_artist=True,
    boxprops={'facecolor': 'blue', 'alpha': 0.5},
)
plt.show()