在闪避 ggplot2 箱线图中的组内和组间添加显着性条

Adding significance bars within and between groups in dodged ggplot2 boxplots

我有一些数据想要 1) 绘制为分组箱线图,以及 2) 添加显着性条 A) 在每个组内的箱线图之间和 B) 在不同组的特定箱线图之间。我的数据看起来像这样:

library("ggplot2")

df <- data.frame(enzyme = c(rep("A", 9), rep("B", 9), rep("C", 9)),
                 substrate = c(rep("1", 3), rep("2", 3), rep("3", 3),
                               rep("1", 3), rep("4", 3), rep("5", 3),
                               rep("1", 3), rep("4", 3), rep("5", 3)),
                 AUC = c(6.64, 6.56, 6.21, 5.96, 6.12, 6.24, 6.02, 6.32, 6.12,
                        0, 0, 0, 5.99, 6.26, 5.94, 0, 0, 0,
                        0, 0, 0, 5.99, 6.11, 6.13, 0, 0, 0))

q <- ggplot(df, aes(x = enzyme, y = AUC, color = substrate)) +
  geom_boxplot(show.legend = F,
               position = position_dodge2(width = 0.75, preserve = "single")) +       
  geom_point(show.legend = F, size = 2, position = position_dodge2(width = 0.75, preserve = "single"))

plot(q)

我知道我可以使用以下方法在组之间添加显着性条:

q + geom_signif(comparisons = list(c("A", "B"), c("A", "C"), c("B", "C")),
                test = "t.test", map_signif_level = T)

但是,这些比较对我的数据没有意义。

相反,我想 A) 在同一组的箱线图之间添加显着性条。我想我可以听从 Simon 的建议,他建议我通过为条形图定义 p 值、标签和 y 坐标 () 来手动添加条形图,尽管对于我的数据集来说这会更加困难,因为我每个组有三个子组,而不是两个。

最后,我还想 B) 添加比较来自不同组的两个特定子组的显着性条。

我的问题是,有没有使用现有 functions/packages 的简单方法来做到这一点?如果我必须手动执行此操作,有人可以提出一个好的策略吗?我将不胜感激!

我想了想,想出了一个冗长的解决方案。如果有人有更简洁的方法,请告诉我!

## significance bars within and between subgroups

# rearrange df, one unique sample per column, rows are replicates
df.split <- do.call(cbind, sapply(split(df, df$enzyme), function(x) {
  sapply(split(x, x$substrate), function(x) {x$AUC}) }) )
# keep track of sample names
sample.names <- do.call(c, lapply(split(df, df$enzyme), function(x) {
  unique(paste0(x$enzyme, ".", x$substrate)) }) )
colnames(df.split) <- sample.names
# perform statistical tests between every pairwise combination of 
# samples/columns in df.split
df.tests <- apply(combn(seq_along(sample.names), 2), 2, 
                       function(x) {
                         t.test(df.split[ ,x[1]], df.split[ ,x[2]])$p.value })
# keep track of sample pairs
sample.pairs <- apply(combn(seq_along(sample.names), 2), 2, 
                      function(x) {
                        paste0(colnames(df.split)[x[1]], "X", 
                               colnames(df.split)[x[2]]) })
names(df.tests) <- sample.pairs

# think about how the significance bars will be laid out: because there are
# three subgroups per enzyme, the bars for the three pairwise comparisons on
# the same plot would overlap. This needs to be done in layers

# select tests of interest for each layer
within.tests.1 <- c("A.1XA.2", "A.2XA.3", 
                    "B.1XB.4", "B.4XB.5", 
                    "C.1XC.4", "C.4XC.5")
within.tests.2 <- c("A.1XA.3", "B.1XB.5","C.1XC.5")
between.tests.1 <- c("A.1XB.4", "B.4XC.4")
between.tests.2 <- c("A.1XC.4")
  
p.values.1 <- df.tests[which(names(df.tests) %in% within.tests.1)]
p.values.2 <- df.tests[which(names(df.tests) %in% within.tests.2)]
p.values.3 <- df.tests[which(names(df.tests) %in% between.tests.1)]
p.values.4 <- df.tests[which(names(df.tests) %in% between.tests.2)]

# convert p-values into easily read labels, with NaN values omitted
p.values.1 <- replace(p.values.1, is.na(p.values.1), 1)
p.values.2 <- replace(p.values.2, is.na(p.values.2), 1)
p.values.3 <- replace(p.values.3, is.na(p.values.3), 1)
p.values.4 <- replace(p.values.4, is.na(p.values.4), 1)
labels.1 <- symnum(p.values.1, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))
labels.2 <- symnum(p.values.2, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))
labels.3 <- symnum(p.values.3, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))
labels.4 <- symnum(p.values.4, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))

# determine coordinates for significance bars 

# y values for layer 1 should all be just above the highest data point of all
# samples being compared
y.values.1 <- do.call(max, lapply(unlist(strsplit(names(labels.1), "X")), 
                                  function(x) {
                                    df.split[, which(colnames(df.split) %in% x)] }) ) + 0.3 %>% 
  rep(times = length(labels.1))
# y values for layer 2 should be higher than those of layer 1
y.values.2 <- y.values.1[c(1, 3, 5)] + 0.4
# y values for layer 3 should all be above the highest data point of all
# samples being compared, and higher than layer 2
y.values.3 <- do.call(max, lapply(unlist(strsplit(names(labels.3), "X")), 
                                  function(x) {
  df.split[, which(colnames(df.split) %in% x)] }) ) + 1.2 %>% 
  rep(times = length(labels.3))
# y values for layer 4 should be higher than those of layer 3
y.values.4 <- y.values.3[1] + 0.5

# for x values, first boxplot is always at x = 1
# since there are three groups per x = 1 and preserve = "single", the width of
# each subgroup boxplot is 0.25
x.min.values.1 <- c(0.75, 1, 1.75, 2, 2.75, 3)
x.max.values.1 <- x.min.values.1 + 0.25
x.min.values.2 <- c(0.75, 1.75, 2.75)
x.max.values.2 <- x.min.values.2 + 0.50
x.min.values.3 <- c(0.75, 2)
x.max.values.3 <- c(2, 3)
x.min.values.4 <- c(0.75)
x.max.values.4 <- c(3)


# finally, plot the significance bars for each layer, one on top of the other
q + geom_signif(y_position = y.values.1, 
                xmin = x.min.values.1, 
                xmax = x.max.values.1, 
                annotations = labels.1,
                tip_length = rep(0.02, length(labels.1)),
                vjust = 0.5 ) +
  geom_signif(y_position = y.values.2, 
              xmin = x.min.values.2, 
              xmax = x.max.values.2, 
              annotations = labels.2,
              tip_length = rep(0.04, length(labels.2)),
              vjust = 0.5 ) +
  geom_signif(y_position = y.values.3, 
              xmin = x.min.values.3, 
              xmax = x.max.values.3, 
              annotations = labels.3,
              tip_length = rep(0.04, length(labels.3)),
              vjust = 0.5 ) +
  geom_signif(y_position = y.values.4, 
              xmin = x.min.values.4, 
              xmax = x.max.values.4, 
              annotations = labels.4,
              tip_length = rep(0.06, length(labels.4)),
              vjust = 0.5 )

输出如下所示:

Barplot_with_significance_bars_within_and_between_groups