跨多列数据框 ggplot 创建多个箱线图

Creating multiple boxplots across multiple columns dataframe ggplot

我有一个数据框和一个数据框列表,我想用它们来制作多个箱线图。数据框包含来自多个实验室的结果的多种分析物的化学信息。

数据框列表存储汇总统计结果、平均值、中位数等。此信息单独存储,因为异常值已从计算统计数据中删除。

我要制作的箱线图将显示所有值

我的数据框示例如下

structure(list(Determination_No = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), LAB.ID = c(2L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 
5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 
10L, 10L, 10L, 10L, 10L, 10L, 12L, 12L, 12L, 12L, 12L, 12L), 
    Fe = c(55.94, 55.7, 56.59, 56.5, 55.98, 55.93, 56.83, 56.54, 
    56.18, 56.5, 56.51, 56.34, 56.39, 56.43, 56.53, 56.31, 56.47, 
    56.35, 56.32, 56.29, 56.31, 56.32, 56.39, 56.32, 56.48, 56.4, 
    56.54, 56.43, 56.73, 56.62, 56.382, 56.258, 56.442, 56.258, 
    56.532, 56.264, 56.3, 56.5, 56.2, 56.5, 56.7, 56.5, 56.11, 
    56.46, 56.1, 56.35, 56.36, 56.37), SiO2 = c(7.63, 7.65, 7.73, 
    7.67, 7.67, 7.67, 7.84, 7.69, 7.59, 7.77, 7.74, 7.64, 7.67, 
    7.74, 7.62, 7.81, 7.66, 7.8, 7.91, 7.84, 7.96, 7.87, 7.84, 
    7.92, 7.77, 7.83, 7.76, 7.78, 7.65, 7.74, 7.936, 7.685, 7.863, 
    7.838, 7.828, 7.767, 7.872684992, 7.851291827, 7.872684992, 
    7.722932832, 7.680146501, 7.615967003, 7.64, 7.71, 7.71, 
    7.65, 7.82, 7.68), Al2O3 = c(2.01, 2.02, 2.03, 2.01, 2.02, 
    2, 2.01, 2.01, 2, 2.02, 2.02, 2.03, 2, 2.03, 1.99, 2.01, 
    2.01, 2.01, 2.02, 2.02, 2.05, 2.03, 2.02, 2.03, 1.88, 1.9, 
    1.89, 1.88, 1.88, 1.87, 2.053, 2.044, 2.041, 2.038, 2.008, 
    2.02, 2.002830415, 2.021725042, 2.021725042, 1.983935789, 
    2.002830415, 2.021725042, 2.09, 2.05, 1.96, 2.09, 2.06, 2.02
    )), row.names = c(NA, -48L), class = "data.frame")


下面是我的数据框列表示例,主要是出于显示原因放在一起,但是,我已经意识到我的图表需要一些信息

df.summary<-
list(Fe = structure(c("Min", "Max", "Median", "Mean", "Std Dev", 
"Coeff. Variation", "Dev. From Cert Mean", "   NA", "   NA", 
"   NA", "  NaN", "   NA", "  NA", "  NaN", "56.18", "56.83", 
"56.50", "56.48", "0.218", "0.39", " 0.13", "56.31", "56.53", 
"56.41", "56.41", "0.080", "0.14", " 0.01", "56.29", "56.39", 
"56.32", "56.33", "0.034", "0.06", "-0.15", "56.40", "56.73", 
"56.51", "56.53", "0.125", "0.22", " 0.22", "56.26", "56.53", 
"56.32", "56.36", "0.116", "0.20", "-0.09", "56.20", "56.70", 
"56.50", "56.45", "0.176", "0.31", " 0.08", "56.10", "56.46", 
"56.36", "56.29", "0.150", "0.27", "-0.21", "56.10", "56.83", 
"56.41", "56.41", "0.153", "0.27", ""), .Dim = c(7L, 10L), .Dimnames = list(
    c("LabMinSummary", "LabMaxSummary", "LabMedianSummary", "LabMeanSummary", 
    "lab.SDSummary", "cv.summmary", "LabDevMean.Summary"), c("", 
    "2", "3", "4", "5", "7", "8", "10", "12", ""))), SiO2 = structure(c("Min", 
"Max", "Median", "Mean", "Std Dev", "Coeff. Variation", "Dev. From Cert Mean", 
"7.63", "7.73", "7.67", "7.67", "0.033", "0.44", "-1.09", "7.59", 
"7.84", "7.72", "7.71", "0.091", "1.18", "-0.55", "7.62", "7.81", 
"7.70", "7.72", "0.079", "1.02", "-0.48", "7.84", "7.96", "7.89", 
"7.89", "0.048", "0.61", " 1.75", "7.65", "7.83", "7.76", "7.76", 
"0.060", "0.77", " 0.01", "7.68", "7.94", "7.83", "7.82", "0.086", 
"1.10", " 0.84", "7.62", "7.87", "7.79", "7.77", "0.111", "1.43", 
" 0.19", "7.64", "7.82", "7.70", "7.70", "0.065", "0.84", "-0.68", 
"7.59", "7.96", "7.74", "7.74", "0.097", "1.25", ""), .Dim = c(7L, 
10L), .Dimnames = list(c("LabMinSummary", "LabMaxSummary", "LabMedianSummary", 
"LabMeanSummary", "lab.SDSummary", "cv.summmary", "LabDevMean.Summary"
), c("", "2", "3", "4", "5", "7", "8", "10", "12", ""))), Al2O3 = structure(c("Min", 
"Max", "Median", "Mean", "Std Dev", "Coeff. Variation", "Dev. From Cert Mean", 
"2.00", "2.03", "2.01", "2.01", "0.010", "0.52", "-0.16", "2.00", 
"2.03", "2.01", "2.01", "0.010", "0.52", "-0.16", "1.99", "2.03", 
"2.01", "2.01", "0.013", "0.66", "-0.49", "2.02", "2.05", "2.02", 
"2.03", "0.012", "0.58", " 0.50", "  NA", "  NA", "  NA", " NaN", 
"   NA", "  NA", "  NaN", "2.01", "2.05", "2.04", "2.03", "0.017", 
"0.82", " 0.78", "1.98", "2.02", "2.01", "2.01", "0.015", "0.77", 
"-0.45", "  NA", "  NA", "  NA", " NaN", "   NA", "  NA", "  NaN", 
"1.98", "2.05", "2.01", "2.01", "0.016", "0.77", ""), .Dim = c(7L, 
10L), .Dimnames = list(c("LabMinSummary", "LabMaxSummary", "LabMedianSummary", 
"LabMeanSummary", "lab.SDSummary", "cv.summmary", "LabDevMean.Summary"
), c("", "2", "3", "4", "5", "7", "8", "10", "12", ""))))

对于单个分析物,我在下面有以下代码,可以为每个实验室生成我想要的单个分析物。

Plotlaborder <- unique(df$LAB.ID)

df %>%
  mutate(LAB.ID = factor(LAB.ID, levels = Plotlaborder)) %>%
  ggplot(outlier.shape = NA, mapping = aes(x = LAB.ID, y = df2$Fe, color = LAB.ID)) +
  stat_boxplot(geom = 'errorbar')+
  geom_boxplot(outlier.shape = NA) +
  geom_hline(linetype = 'dashed', color = 'blue', size = 0.75,
             mapping = aes(yintercept = as.numeric(df.summary[["Fe"]][[4,10]]))) + # Add a line for the accepted mean
  geom_hline(linetype = 'dashed', color = 'firebrick', size = 0.75,
             mapping = aes(yintercept = as.numeric(df.summary[["Fe"]][[4,10]]) - (as.numeric(df.summary[["Fe"]][[5,10]])) * 3)) +
  geom_hline(linetype = 'dashed', color = 'firebrick', size = 0.75,
             mapping = aes(yintercept = as.numeric(df.summary[["Fe"]][[4,10]]) + (as.numeric(df.summary[["Fe"]][[5,10]])) * 3)) +
  ggtitle("Fe Box Plot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab(label = "Lab No") +
  ylab("Fe values %")

我想使用 lapply 或 Map 为每个分析物生成相同的图表,我想从我的数据框列表 (df.summary) 中传递平均值和 sd 值来创建我的 geo_hline 值并将分析物名称分配给标题和 ylab。

最好的前进方向是什么?

我认为一个简单的解决方案是将您的代码包装到一个函数中并使用 df.summary 的名称作为参数:

library(tidyverse)

Plotlaborder <- unique(df$LAB.ID)
mycompound <- names(df.summary)
df <- df %>%
  mutate(LAB.ID = factor(LAB.ID, levels = Plotlaborder))
myplot <- function(compound) {
    png(filename=paste0("boxplot_",compound,".png"),width = 480, height = 480,units = "px")
    print(ggplot(df,outlier.shape = NA, mapping = aes(x = LAB.ID, y = .data[[compound]], color = LAB.ID)) +
    stat_boxplot(geom = 'errorbar')+
    geom_boxplot(outlier.shape = NA) +
    geom_hline(linetype = 'dashed', color = 'blue', size = 0.75,
               mapping = aes(yintercept = as.numeric(df.summary[[compound]][[4,10]]))) + # Add a line for the accepted mean
    geom_hline(linetype = 'dashed', color = 'firebrick', size = 0.75,
               mapping = aes(yintercept = as.numeric(df.summary[[compound]][[4,10]]) - (as.numeric(df.summary[[compound]][[5,10]])) * 3)) +
    geom_hline(linetype = 'dashed', color = 'firebrick', size = 0.75,
               mapping = aes(yintercept = as.numeric(df.summary[[compound]][[4,10]]) + (as.numeric(df.summary[[compound]][[5,10]])) * 3)) +
    ggtitle(paste0(compound," Box Plot")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    xlab(label = "Lab No") +
    ylab(paste0(compound," values %")))
    dev.off()
}


lapply(mycompound, myplot)

这将在您的工作目录中保存与 length(df.summary) 一样多的绘图。

list.files(pattern = "png")
[1] "boxplot_Al2O3.png" "boxplot_Fe.png"    "boxplot_SiO2.png"

生成的输出示例

您可以在 df.summary 上使用 imap :

library(tidyverse)

df <- df %>% mutate(LAB.ID = factor(LAB.ID, levels = Plotlaborder)) 

imap(df.summary, ~{
  ggplot(df, outlier.shape = NA, 
        mapping = aes(x = LAB.ID, y = .data[[.y]], color = LAB.ID)) +
    stat_boxplot(geom = 'errorbar')+
    geom_boxplot(outlier.shape = NA) +
    geom_hline(linetype = 'dashed', color = 'blue', size = 0.75,
               mapping = aes(yintercept = as.numeric(.x[[4,10]]))) + 
    geom_hline(linetype = 'dashed', color = 'firebrick', size = 0.75,
               mapping = aes(yintercept = as.numeric(.x[[4,10]]) - (as.numeric(.x[[5,10]])) * 3)) +
    geom_hline(linetype = 'dashed', color = 'firebrick', size = 0.75,
               mapping = aes(yintercept = as.numeric(.x[[4,10]]) + (as.numeric(.x[[5,10]])) * 3)) +
    ggtitle(paste0(.y, " Box Plot")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    xlab(label = "Lab No") +
    ylab(paste0(.y, 'values %'))
}) -> list_plot

这将在 list_plot 中生成地块列表,并且可以通过 list_plot[[1]]list_plot[[2]] 等访问各个地块。