在闪避 ggplot2 箱线图中的组内和组间添加显着性条
Adding significance bars within and between groups in dodged ggplot2 boxplots
我有一些数据想要 1) 绘制为分组箱线图,以及 2) 添加显着性条 A) 在每个组内的箱线图之间和 B) 在不同组的特定箱线图之间。我的数据看起来像这样:
library("ggplot2")
df <- data.frame(enzyme = c(rep("A", 9), rep("B", 9), rep("C", 9)),
substrate = c(rep("1", 3), rep("2", 3), rep("3", 3),
rep("1", 3), rep("4", 3), rep("5", 3),
rep("1", 3), rep("4", 3), rep("5", 3)),
AUC = c(6.64, 6.56, 6.21, 5.96, 6.12, 6.24, 6.02, 6.32, 6.12,
0, 0, 0, 5.99, 6.26, 5.94, 0, 0, 0,
0, 0, 0, 5.99, 6.11, 6.13, 0, 0, 0))
q <- ggplot(df, aes(x = enzyme, y = AUC, color = substrate)) +
geom_boxplot(show.legend = F,
position = position_dodge2(width = 0.75, preserve = "single")) +
geom_point(show.legend = F, size = 2, position = position_dodge2(width = 0.75, preserve = "single"))
plot(q)
我知道我可以使用以下方法在组之间添加显着性条:
q + geom_signif(comparisons = list(c("A", "B"), c("A", "C"), c("B", "C")),
test = "t.test", map_signif_level = T)
但是,这些比较对我的数据没有意义。
相反,我想 A) 在同一组的箱线图之间添加显着性条。我想我可以听从 Simon 的建议,他建议我通过为条形图定义 p 值、标签和 y 坐标 () 来手动添加条形图,尽管对于我的数据集来说这会更加困难,因为我每个组有三个子组,而不是两个。
最后,我还想 B) 添加比较来自不同组的两个特定子组的显着性条。
我的问题是,有没有使用现有 functions/packages 的简单方法来做到这一点?如果我必须手动执行此操作,有人可以提出一个好的策略吗?我将不胜感激!
我想了想,想出了一个冗长的解决方案。如果有人有更简洁的方法,请告诉我!
## significance bars within and between subgroups
# rearrange df, one unique sample per column, rows are replicates
df.split <- do.call(cbind, sapply(split(df, df$enzyme), function(x) {
sapply(split(x, x$substrate), function(x) {x$AUC}) }) )
# keep track of sample names
sample.names <- do.call(c, lapply(split(df, df$enzyme), function(x) {
unique(paste0(x$enzyme, ".", x$substrate)) }) )
colnames(df.split) <- sample.names
# perform statistical tests between every pairwise combination of
# samples/columns in df.split
df.tests <- apply(combn(seq_along(sample.names), 2), 2,
function(x) {
t.test(df.split[ ,x[1]], df.split[ ,x[2]])$p.value })
# keep track of sample pairs
sample.pairs <- apply(combn(seq_along(sample.names), 2), 2,
function(x) {
paste0(colnames(df.split)[x[1]], "X",
colnames(df.split)[x[2]]) })
names(df.tests) <- sample.pairs
# think about how the significance bars will be laid out: because there are
# three subgroups per enzyme, the bars for the three pairwise comparisons on
# the same plot would overlap. This needs to be done in layers
# select tests of interest for each layer
within.tests.1 <- c("A.1XA.2", "A.2XA.3",
"B.1XB.4", "B.4XB.5",
"C.1XC.4", "C.4XC.5")
within.tests.2 <- c("A.1XA.3", "B.1XB.5","C.1XC.5")
between.tests.1 <- c("A.1XB.4", "B.4XC.4")
between.tests.2 <- c("A.1XC.4")
p.values.1 <- df.tests[which(names(df.tests) %in% within.tests.1)]
p.values.2 <- df.tests[which(names(df.tests) %in% within.tests.2)]
p.values.3 <- df.tests[which(names(df.tests) %in% between.tests.1)]
p.values.4 <- df.tests[which(names(df.tests) %in% between.tests.2)]
# convert p-values into easily read labels, with NaN values omitted
p.values.1 <- replace(p.values.1, is.na(p.values.1), 1)
p.values.2 <- replace(p.values.2, is.na(p.values.2), 1)
p.values.3 <- replace(p.values.3, is.na(p.values.3), 1)
p.values.4 <- replace(p.values.4, is.na(p.values.4), 1)
labels.1 <- symnum(p.values.1, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
labels.2 <- symnum(p.values.2, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
labels.3 <- symnum(p.values.3, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
labels.4 <- symnum(p.values.4, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
# determine coordinates for significance bars
# y values for layer 1 should all be just above the highest data point of all
# samples being compared
y.values.1 <- do.call(max, lapply(unlist(strsplit(names(labels.1), "X")),
function(x) {
df.split[, which(colnames(df.split) %in% x)] }) ) + 0.3 %>%
rep(times = length(labels.1))
# y values for layer 2 should be higher than those of layer 1
y.values.2 <- y.values.1[c(1, 3, 5)] + 0.4
# y values for layer 3 should all be above the highest data point of all
# samples being compared, and higher than layer 2
y.values.3 <- do.call(max, lapply(unlist(strsplit(names(labels.3), "X")),
function(x) {
df.split[, which(colnames(df.split) %in% x)] }) ) + 1.2 %>%
rep(times = length(labels.3))
# y values for layer 4 should be higher than those of layer 3
y.values.4 <- y.values.3[1] + 0.5
# for x values, first boxplot is always at x = 1
# since there are three groups per x = 1 and preserve = "single", the width of
# each subgroup boxplot is 0.25
x.min.values.1 <- c(0.75, 1, 1.75, 2, 2.75, 3)
x.max.values.1 <- x.min.values.1 + 0.25
x.min.values.2 <- c(0.75, 1.75, 2.75)
x.max.values.2 <- x.min.values.2 + 0.50
x.min.values.3 <- c(0.75, 2)
x.max.values.3 <- c(2, 3)
x.min.values.4 <- c(0.75)
x.max.values.4 <- c(3)
# finally, plot the significance bars for each layer, one on top of the other
q + geom_signif(y_position = y.values.1,
xmin = x.min.values.1,
xmax = x.max.values.1,
annotations = labels.1,
tip_length = rep(0.02, length(labels.1)),
vjust = 0.5 ) +
geom_signif(y_position = y.values.2,
xmin = x.min.values.2,
xmax = x.max.values.2,
annotations = labels.2,
tip_length = rep(0.04, length(labels.2)),
vjust = 0.5 ) +
geom_signif(y_position = y.values.3,
xmin = x.min.values.3,
xmax = x.max.values.3,
annotations = labels.3,
tip_length = rep(0.04, length(labels.3)),
vjust = 0.5 ) +
geom_signif(y_position = y.values.4,
xmin = x.min.values.4,
xmax = x.max.values.4,
annotations = labels.4,
tip_length = rep(0.06, length(labels.4)),
vjust = 0.5 )
输出如下所示:
Barplot_with_significance_bars_within_and_between_groups
我有一些数据想要 1) 绘制为分组箱线图,以及 2) 添加显着性条 A) 在每个组内的箱线图之间和 B) 在不同组的特定箱线图之间。我的数据看起来像这样:
library("ggplot2")
df <- data.frame(enzyme = c(rep("A", 9), rep("B", 9), rep("C", 9)),
substrate = c(rep("1", 3), rep("2", 3), rep("3", 3),
rep("1", 3), rep("4", 3), rep("5", 3),
rep("1", 3), rep("4", 3), rep("5", 3)),
AUC = c(6.64, 6.56, 6.21, 5.96, 6.12, 6.24, 6.02, 6.32, 6.12,
0, 0, 0, 5.99, 6.26, 5.94, 0, 0, 0,
0, 0, 0, 5.99, 6.11, 6.13, 0, 0, 0))
q <- ggplot(df, aes(x = enzyme, y = AUC, color = substrate)) +
geom_boxplot(show.legend = F,
position = position_dodge2(width = 0.75, preserve = "single")) +
geom_point(show.legend = F, size = 2, position = position_dodge2(width = 0.75, preserve = "single"))
plot(q)
我知道我可以使用以下方法在组之间添加显着性条:
q + geom_signif(comparisons = list(c("A", "B"), c("A", "C"), c("B", "C")),
test = "t.test", map_signif_level = T)
但是,这些比较对我的数据没有意义。
相反,我想 A) 在同一组的箱线图之间添加显着性条。我想我可以听从 Simon 的建议,他建议我通过为条形图定义 p 值、标签和 y 坐标 (
最后,我还想 B) 添加比较来自不同组的两个特定子组的显着性条。
我的问题是,有没有使用现有 functions/packages 的简单方法来做到这一点?如果我必须手动执行此操作,有人可以提出一个好的策略吗?我将不胜感激!
我想了想,想出了一个冗长的解决方案。如果有人有更简洁的方法,请告诉我!
## significance bars within and between subgroups
# rearrange df, one unique sample per column, rows are replicates
df.split <- do.call(cbind, sapply(split(df, df$enzyme), function(x) {
sapply(split(x, x$substrate), function(x) {x$AUC}) }) )
# keep track of sample names
sample.names <- do.call(c, lapply(split(df, df$enzyme), function(x) {
unique(paste0(x$enzyme, ".", x$substrate)) }) )
colnames(df.split) <- sample.names
# perform statistical tests between every pairwise combination of
# samples/columns in df.split
df.tests <- apply(combn(seq_along(sample.names), 2), 2,
function(x) {
t.test(df.split[ ,x[1]], df.split[ ,x[2]])$p.value })
# keep track of sample pairs
sample.pairs <- apply(combn(seq_along(sample.names), 2), 2,
function(x) {
paste0(colnames(df.split)[x[1]], "X",
colnames(df.split)[x[2]]) })
names(df.tests) <- sample.pairs
# think about how the significance bars will be laid out: because there are
# three subgroups per enzyme, the bars for the three pairwise comparisons on
# the same plot would overlap. This needs to be done in layers
# select tests of interest for each layer
within.tests.1 <- c("A.1XA.2", "A.2XA.3",
"B.1XB.4", "B.4XB.5",
"C.1XC.4", "C.4XC.5")
within.tests.2 <- c("A.1XA.3", "B.1XB.5","C.1XC.5")
between.tests.1 <- c("A.1XB.4", "B.4XC.4")
between.tests.2 <- c("A.1XC.4")
p.values.1 <- df.tests[which(names(df.tests) %in% within.tests.1)]
p.values.2 <- df.tests[which(names(df.tests) %in% within.tests.2)]
p.values.3 <- df.tests[which(names(df.tests) %in% between.tests.1)]
p.values.4 <- df.tests[which(names(df.tests) %in% between.tests.2)]
# convert p-values into easily read labels, with NaN values omitted
p.values.1 <- replace(p.values.1, is.na(p.values.1), 1)
p.values.2 <- replace(p.values.2, is.na(p.values.2), 1)
p.values.3 <- replace(p.values.3, is.na(p.values.3), 1)
p.values.4 <- replace(p.values.4, is.na(p.values.4), 1)
labels.1 <- symnum(p.values.1, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
labels.2 <- symnum(p.values.2, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
labels.3 <- symnum(p.values.3, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
labels.4 <- symnum(p.values.4, corr = FALSE, cutpoints = c(0, .001,.01,.05, 1),
symbols = c("***","**","*",""))
# determine coordinates for significance bars
# y values for layer 1 should all be just above the highest data point of all
# samples being compared
y.values.1 <- do.call(max, lapply(unlist(strsplit(names(labels.1), "X")),
function(x) {
df.split[, which(colnames(df.split) %in% x)] }) ) + 0.3 %>%
rep(times = length(labels.1))
# y values for layer 2 should be higher than those of layer 1
y.values.2 <- y.values.1[c(1, 3, 5)] + 0.4
# y values for layer 3 should all be above the highest data point of all
# samples being compared, and higher than layer 2
y.values.3 <- do.call(max, lapply(unlist(strsplit(names(labels.3), "X")),
function(x) {
df.split[, which(colnames(df.split) %in% x)] }) ) + 1.2 %>%
rep(times = length(labels.3))
# y values for layer 4 should be higher than those of layer 3
y.values.4 <- y.values.3[1] + 0.5
# for x values, first boxplot is always at x = 1
# since there are three groups per x = 1 and preserve = "single", the width of
# each subgroup boxplot is 0.25
x.min.values.1 <- c(0.75, 1, 1.75, 2, 2.75, 3)
x.max.values.1 <- x.min.values.1 + 0.25
x.min.values.2 <- c(0.75, 1.75, 2.75)
x.max.values.2 <- x.min.values.2 + 0.50
x.min.values.3 <- c(0.75, 2)
x.max.values.3 <- c(2, 3)
x.min.values.4 <- c(0.75)
x.max.values.4 <- c(3)
# finally, plot the significance bars for each layer, one on top of the other
q + geom_signif(y_position = y.values.1,
xmin = x.min.values.1,
xmax = x.max.values.1,
annotations = labels.1,
tip_length = rep(0.02, length(labels.1)),
vjust = 0.5 ) +
geom_signif(y_position = y.values.2,
xmin = x.min.values.2,
xmax = x.max.values.2,
annotations = labels.2,
tip_length = rep(0.04, length(labels.2)),
vjust = 0.5 ) +
geom_signif(y_position = y.values.3,
xmin = x.min.values.3,
xmax = x.max.values.3,
annotations = labels.3,
tip_length = rep(0.04, length(labels.3)),
vjust = 0.5 ) +
geom_signif(y_position = y.values.4,
xmin = x.min.values.4,
xmax = x.max.values.4,
annotations = labels.4,
tip_length = rep(0.06, length(labels.4)),
vjust = 0.5 )
输出如下所示:
Barplot_with_significance_bars_within_and_between_groups