根据数据集的条件设置颜色和框(箱线图)

Set color and boxes on conditions from data set (boxplot)

所以我有一个名为 Carbon 的数据集(Carbon$Graphite 确实包含一些在头部看不到的值):

       Mesa  TIC   TOC Graphite    TC
Kx V17   Ja 5.26 15.64       NA 20.90
Mu V17  Nej 4.08 11.32       NA 15.40
Ob V17  Nej 5.22 12.68       NA 17.90
Vä V17   Ja 6.45  6.35       NA 12.80
Ös V17  Nej 3.90  2.54       NA  6.44
Ig V17   Ja 8.20  3.20       NA 11.40

我想要一个显示 8 个框的箱线图,一个框只包含满足 Carbon$TIC[Carbon$Mesa=="Ja, ], one that equals Carbon$TIC[Carbon$Mesa=="Ja, ] 的值,一个等于 Carbon$TOC[Carbon$ Mesa=="Ja, ] and so fort. And the colour (fill) of the boxes is Carbon$Mesa so "Ja"=red box 和 "Nej"=blue box。我已经设法在不使用 ggpplot 的情况下做到这一点,但我需要用 ggplot 做到这一点(所以我所有的图表看起来都一样剩下的图表是用 ggplot 完成的。

我以前在没有 ggplot 的情况下制作的代码(这是我想要的,但像往常一样在 ggplot 的旁边有图例):

MesaJa <-Carbon[Carbon$Mesa=="Ja", ]
MesaNej <-Carbon[Carbon$Mesa=="Nej", ]

col.box<- c( rep("red", 3),   rep("blue", 3))

boxplot( list(MesaJa [, "TIC"], MesaJa [, "TOC"], MesaJa [, "TC"], 
              MesaNej[, "TIC"], MesaNej[, "TOC"], MesaNej[, "TC"] ),
         names=c("TIC", "TOC", "TC","TIC", "TOC", "TC") , 
         col=col.box
)  
legend("topleft", legend= c("Lime mud", "No lime mud"), pch=19, col=c("red","blue"), cex=0.7)

我尝试了几种不同的方法,但仍然无法正常工作。我得到的最接近的是:

Carbon$TIC_Ja <- ifelse(Carbon$Mesa=="Ja",Carbon$TIC, NA)
Carbon$TIC_Nej <- ifelse(Carbon$Mesa=="Nej",Carbon$TIC, NA)
Carbon$TOC_Ja <- ifelse(Carbon$Mesa=="Ja",Carbon$TOC, NA)
Carbon$TOC_Nej <- ifelse(Carbon$Mesa=="Nej",Carbon$TOC, NA)
Carbon$TC_Ja <- ifelse(Carbon$Mesa=="Ja",Carbon$TC, NA)
Carbon$TC_Nej <- ifelse(Carbon$Mesa=="Nej",Carbon$TC, NA)

Carbon.plot<-Carbon[ , c(1, 6:11)]
Carbon.key <- colnames(Carbon)  

ggplot(
  gather(Carbon.plot, key=Carbon.key, value="value", -"Mesa"),
  aes(x=factor(Carbon.key), y=as.numeric(value), fill= Carbon.key) 
  ) + 
  geom_boxplot() +
  scale_fill_manual(values=c("red", "blue", "red", "blue", "red", "blue"),
                    labels=c("Lime mud added", "No lime mud")
                    )

但它仍然不好,因为我只想要图例中的上面两个 post 并且想删除 "NA"。而且我认为必须有一种不涉及使用 ifelse 对数据框进行排序的更简单的方法。我已经搜索过了,但到目前为止还没有看到任何类似的例子。那么,请帮忙?

编辑:添加了有关 df 和会话信息的信息。但是在 post 解决了这个问题之后,我开始更新了一些软件包,但是,事情并不顺利,所以目前我什至没有 ggplot 工作。

> str(Carbon)
'data.frame':   70 obs. of  5 variables:
 $ Mesa    : chr  "Ja" "Nej" "Nej" "Ja" ...
 $ TIC     : num  5.26 4.08 5.22 6.45 3.9 ...
 $ TOC     : num  15.64 11.32 12.68 6.35 2.54 ...
 $ Graphite: num  NA NA NA NA NA NA NA NA NA NA ...
 $ TC      : num  20.9 15.4 17.9 12.8 6.44 11.4 12.9 21.6 11.8 15.3 ...

> dput(Carbon)
structure(list(Mesa = c("Ja", "Nej", "Nej", "Ja", "Nej", "Ja", 
"Ja", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", "Ja", "Ja", "Nej", 
"Nej", "Nej", "Nej", "Ja", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", 
"Ja", "Ja", "Ja", "Nej", "Nej", "Nej", "Nej", "Ja", "Ja", "Ja", 
"Ja", "Nej", "Nej", "Nej", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", 
"Ja", "Nej", "Ja", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", "Ja", 
"Ja", "Ja", "Nej", "Nej", "Nej", "Nej", "Ja", "Ja", "Ja", "Ja", 
"Nej", "Nej", "Nej", "Ja"), TIC = c(5.26, 4.08, 5.22, 6.45, 3.9, 
8.2, 10.67, 7.43, 9.55, 8.19, 7.83, 4.04, 2.66, 4.93, 7.41, 3.25, 
4.47, 4.385, 3.48, 8.01, 9.49, 8.93, 6.03, 7.32, 3.84, 2.42, 
5.01, 3.87, 7, 4.8, 5.64, 5.76, 5.69, 8.7, 10.2, 9.78, 6.1, 8.07, 
4.33, 3.98, 7.39, 9.68, 9.67, 3.75, 5.07, 4.7, 4.86, 2.98, 8.05, 
10.29, 9.99, 6.65, 8.85, 4.82, 3.84, 8.49, 3.86, 6.63, 3.49, 
3.01, 4.83, 3.78, 8.95, 10.1, 8.15, 6.16, 8.15, 4.27, 3.96, 4.96
), TOC = c(15.64, 11.32, 12.68, 6.35, 2.54, 3.2, 2.23, 14.17, 
2.25, 7.11, 2.37, 17.16, 36.14, 13.47, 5.29, 17.95, 14.63, 3.85, 
6.31, 3.19, 2.81, 7.27, 23.07, 1.94, 26.19, 36.63, 23.19, 11.37, 
5.1, 2.39, 18.46, 2.17, 2.45, 1.7, 4.2, 4.92, 20.2, 0.86, 20.67, 
33.32, 5.11, 2.01, 0.53, 6.48, 29.51, 2.5, 9.41, 3.42, 3.04, 
4, 4.1, 11.15, 1.94, 20.66, 31.73, 1.81, 16.39, 20.75, 6.61, 
33.98, 2.48, 3.15, 1.65, 2.4, 12.63, 17.33, 0.99, 25.62, 38.63, 
17.93), Graphite = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 0.13, NA, NA, NA, NA, NA, 0.07, 
0.05, NA, 0.06, NA, 0.03, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 0.07, 0.02, 0.08, 0.03, 0.33, NA, NA, NA, NA, 
NA, 0.02, 0.13, NA, 0.05, 0.02, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA), TC = c(20.9, 15.4, 17.9, 12.8, 6.44, 11.4, 12.9, 
21.6, 11.8, 15.3, 10.2, 21.2, 38.8, 18.4, 12.7, 21.2, 19.1, 8.235, 
9.92, 11.2, 12.3, 16.2, 29.1, 9.26, 30.1, 39.1, 28.2, 15.3, 12.1, 
7.22, 24.1, 7.93, 8.14, 10.4, 14.4, 14.7, 26.3, 8.93, 25, 37.3, 
12.5, 11.7, 10.2, 10.3, 34.6, 7.28, 14.3, 6.73, 11.1, 14.3, 14.1, 
17.8, 10.8, 25.5, 35.7, 10.3, 20.3, 27.4, 10.1, 37, 7.33, 6.93, 
10.6, 12.5, 20.8, 23.5, 9.14, 29.9, 42.6, 22.9)), class = "data.frame", row.names = c("Kx V17", 
"Mu V17", "Ob V17", "Vä V17", "Ös V17", "Ig V17", "Va V17", "Gä V17", 
"Sk V17", "Fr V17", "Gr V17", "Bi V17", "As V17", "Kx H17", "Pi H17", 
"Mu H17", "Ob H17", "Do H17", "Ös H17", "Ig H17", "Va H17", "Gä H17", 
"Fr H17", "Gr H17", "Bi H17", "As H17", "So H17", "Kx V18", "Pi V18", 
"Mu V18", "Ob V18", "Do V18", "Ös V18", "Ig V18", "Va V18", "Gä V18", 
"Fr V18", "Gr V18", "Bi V18", "As V18", "So V18", "Kx H18", "Pi H18", 
"Mu H18", "Ob H18", "Do H18", "Vä H18", "Ös H18", "Ig H18", "Va H18", 
"Gä H18", "Fr H18", "Gr H18", "Bi H18", "As H18", "So H18", "Kx V19", 
"Pi V19", "Mu V19", "Ob V19", "Do V19", "Ös V19", "Ig V19", "Va V19", 
"Gä V19", "Fr V19", "Gr V19", "Bi V19", "As V19", "So V19"))

> sessionInfo()
R version 3.5.3 (2019-03-11)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 18362)

Matrix products: default

locale:
[1] LC_COLLATE=Swedish_Sweden.1252  LC_CTYPE=Swedish_Sweden.1252   
[3] LC_MONETARY=Swedish_Sweden.1252 LC_NUMERIC=C                   
[5] LC_TIME=Swedish_Sweden.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] crayon_1.3.4     grid_3.5.3       R6_2.4.1         lifecycle_0.2.0  gtable_0.3.0    
 [6] magrittr_1.5     scales_1.1.0     rlang_0.4.5      rstudioapi_0.11  ellipsis_0.3.0  
[11] tools_3.5.3      glue_1.4.0       munsell_0.5.0    compiler_3.5.3   colorspace_1.4-1

这里一种可能的方法是使用例如 tidyr 中的 pivot_longer 函数将数据帧转换为更长的格式(pivot_longertidyr [=16 中可用=]):

library(tidyr)
library(dplyr)

Carbon %>% pivot_longer(cols = c(TIC, TOC, TC), names_to = "var",values_to = "val") %>%
  mutate(var = factor(var,levels = c("TIC","TOC","TC"))) 

# A tibble: 210 x 4
   Mesa  Graphite var     val
   <chr>    <dbl> <fct> <dbl>
 1 Ja          NA TIC    5.26
 2 Ja          NA TOC   15.6 
 3 Ja          NA TC    20.9 
 4 Nej         NA TIC    4.08
 5 Nej         NA TOC   11.3 
 6 Nej         NA TC    15.4 
 7 Nej         NA TIC    5.22
 8 Nej         NA TOC   12.7 
 9 Nej         NA TC    17.9 
10 Ja          NA TIC    6.45
# … with 200 more rows

然后,你可以使用interaction将"Mesa"和"var"分组为x值并在ggplot2中得到以下boxplot

library(tidyr)
library(dplyr)
library(ggplot)

Carbon %>% pivot_longer(cols = c(TIC, TOC, TC), names_to = "var",values_to = "val") %>%
  mutate(var = factor(var,levels = c("TIC","TOC","TC"))) %>%
  ggplot(aes(x = interaction(var, Mesa), y = val, fill = Mesa))+
  geom_boxplot()+
  scale_x_discrete(labels = rep(c("TIC","TOC","TC"),2))+
  scale_fill_manual(values = c("red","blue"), labels = c("Lime mud", "No lime mud"))

它是否回答了您的问题?