在 R 中根据 csv 文件创建分组箱线图

Creating a grouped box plot from csv in R

已编辑:我正在尝试制作分组箱线图。我使用的数据以 csv 长格式粘贴在下面:Blog 分为三组(Individual、Multiwriter、Column),每组下有两个地区(PK、US)及对应的 Dim1 数值。

Blog,Region,Dim1
Individual,PK,-4.75
Individual,PK,-5.69
Individual,PK,-0.27
Individual,PK,-2.76
Individual,PK,-8.24
Individual,PK,-12.51
Individual,PK,-1.28
Individual,PK,0.95
Individual,PK,-5.96
Individual,PK,-8.81
Individual,PK,-8.46
Individual,PK,-6.15
Individual,PK,-13.98
Individual,PK,-16.43
Individual,PK,-4.09
Individual,PK,-11.06
Individual,PK,-9.04
Individual,PK,-8.56
Individual,PK,-8.13
Individual,PK,-14.46
Individual,PK,-4.21
Individual,PK,-4.96
Individual,PK,-5.48
Multiwriter,PK,-3.31
Multiwriter,PK,-5.62
Multiwriter,PK,-4.48
Multiwriter,PK,-6.08
Multiwriter,PK,-4.68
Multiwriter,PK,-6.92
Multiwriter,PK,-11.29
Multiwriter,PK,6.66
Multiwriter,PK,1.66
Multiwriter,PK,3.39
Multiwriter,PK,0.06
Multiwriter,PK,4.11
Multiwriter,PK,-1.57
Multiwriter,PK,1.33
Multiwriter,PK,-6.91
Multiwriter,PK,4.87
Multiwriter,PK,-10.87
Multiwriter,PK,6.25
Multiwriter,PK,-0.68
Multiwriter,PK,0.11
Multiwriter,PK,0.71
Multiwriter,PK,-3.8
Multiwriter,PK,-1.75
Multiwriter,PK,-5.38
Multiwriter,PK,1.24
Multiwriter,PK,-5.59
Multiwriter,PK,4.98
Multiwriter,PK,0.98
Multiwriter,PK,7.47
Multiwriter,PK,-5.25
Multiwriter,PK,-14.24
Multiwriter,PK,-1.55
Multiwriter,PK,-8.44
Multiwriter,PK,-7.67
Multiwriter,PK,5.85
Multiwriter,PK,6
Multiwriter,PK,-7.53
Multiwriter,PK,1.59
Multiwriter,PK,-9.48
Multiwriter,PK,-3.99
Multiwriter,PK,-5.82
Multiwriter,PK,1.62
Multiwriter,PK,-4.14
Multiwriter,PK,1.06
Multiwriter,PK,4.52
Multiwriter,PK,-5.6
Multiwriter,PK,-3.38
Multiwriter,PK,4.82
Multiwriter,PK,0.76
Multiwriter,PK,-4.95
Multiwriter,PK,-2.05
Column,PK,1.64
Column,PK,5.2
Column,PK,2.8
Column,PK,1.93
Column,PK,2.36
Column,PK,4.77
Column,PK,-1.92
Column,PK,-2.94
Column,PK,4.58
Column,PK,2.98
Column,PK,9.07
Column,PK,8.5
Column,PK,1.23
Column,PK,8.97
Column,PK,4.1
Column,PK,7.25
Column,PK,0.02
Column,PK,-3.48
Column,PK,1.01
Column,PK,2.7
Column,PK,-2.32
Column,PK,3.22
Column,PK,-2.37
Column,PK,-13.28
Column,PK,-4.36
Column,PK,2.91
Column,PK,4.4
Column,PK,-5.07
Column,PK,-10.24
Column,PK,12.8
Column,PK,1.92
Column,PK,13.24
Column,PK,12.32
Column,PK,12.7
Column,PK,9.95
Column,PK,12.11
Column,PK,7.63
Column,PK,11.09
Column,PK,13.04
Column,PK,12.06
Column,PK,9.49
Column,PK,8.64
Column,PK,10.05
Column,PK,6.4
Column,PK,9.64
Column,PK,3.53
Column,PK,4.78
Column,PK,9.54
Column,PK,8.49
Column,PK,2.56
Column,PK,8.82
Column,PK,-3.59
Column,PK,-3.31
Column,PK,10.05
Column,PK,-0.28
Column,PK,-0.5
Column,PK,-6.37
Column,PK,2.97
Column,PK,4.49
Column,PK,9.14
Column,PK,4.5
Column,PK,8.6
Column,PK,6.76
Column,PK,3.67
Column,PK,6.79
Column,PK,5.77
Column,PK,10.5
Column,PK,1.57
Column,PK,9.47
Individual,US,-9.85
Individual,US,-2.73
Individual,US,-0.32
Individual,US,-0.94
Individual,US,-7.51
Individual,US,-8.21
Individual,US,-7.33
Individual,US,-5.1
Individual,US,-1.58
Individual,US,-2.49
Individual,US,-1.36
Individual,US,-5.76
Individual,US,-0.48
Individual,US,-3.38
Individual,US,2.42
Individual,US,-1.71
Individual,US,-2.17
Individual,US,-2.81
Individual,US,-0.64
Individual,US,-8.88
Individual,US,-1.53
Individual,US,-1.42
Individual,US,-17.89
Individual,US,7.1
Individual,US,-4.12
Individual,US,-0.83
Individual,US,2.05
Individual,US,-5.87
Individual,US,-0.15
Individual,US,5.78
Individual,US,-1.96
Individual,US,1.77
Individual,US,-0.67
Individual,US,-10.23
Individual,US,3.37
Individual,US,-1.18
Individual,US,6.94
Individual,US,-3.86
Individual,US,2.21
Individual,US,-11.64
Individual,US,-14.71
Individual,US,-12.74
Individual,US,-6.24
Individual,US,-13.64
Individual,US,-8.53
Individual,US,-10.4
Individual,US,-6.24
Individual,US,-12.15
Individual,US,-15.96
Multiwriter,US,11.27
Multiwriter,US,3.51
Multiwriter,US,4.05
Multiwriter,US,3.81
Multiwriter,US,8.56
Multiwriter,US,6.36
Multiwriter,US,-8.99
Multiwriter,US,3.36
Multiwriter,US,3.18
Multiwriter,US,-5.22
Multiwriter,US,-8.61
Multiwriter,US,-9.02
Multiwriter,US,-6.32
Multiwriter,US,0.53
Multiwriter,US,11.03
Multiwriter,US,-5.7
Multiwriter,US,4
Multiwriter,US,-3.55
Multiwriter,US,2.79
Multiwriter,US,4.61
Multiwriter,US,-3.8
Multiwriter,US,-9.62
Multiwriter,US,-8.37
Multiwriter,US,-2.18
Multiwriter,US,-1.64
Multiwriter,US,-9.99
Multiwriter,US,-1.44
Multiwriter,US,-4.45
Multiwriter,US,-7.84
Multiwriter,US,-11.6
Multiwriter,US,-2.71
Multiwriter,US,1.2
Multiwriter,US,-6.44
Multiwriter,US,-2.64
Multiwriter,US,-11.59
Multiwriter,US,-5.9
Multiwriter,US,-3.78
Multiwriter,US,-14.99
Multiwriter,US,1.32
Multiwriter,US,-6.55
Multiwriter,US,0.92
Multiwriter,US,-5.61
Multiwriter,US,-14.16
Multiwriter,US,-10.03
Multiwriter,US,-7.08
Multiwriter,US,0.62
Multiwriter,US,-5.43
Multiwriter,US,-1.11
Multiwriter,US,-11.37
Multiwriter,US,-13.37
Multiwriter,US,-12.71
Multiwriter,US,1.86
Multiwriter,US,14.11
Multiwriter,US,-5.24
Multiwriter,US,-6.77
Multiwriter,US,-4.79
Multiwriter,US,-6.22
Multiwriter,US,3.66
Multiwriter,US,-2.65
Multiwriter,US,-2.87
Multiwriter,US,-12.32
Multiwriter,US,-7.48
Multiwriter,US,-4.84
Multiwriter,US,0.44
Column,US,8.93
Column,US,10.29
Column,US,8.31
Column,US,5.88
Column,US,8.87
Column,US,-2.9
Column,US,3.71
Column,US,8.43
Column,US,1.47
Column,US,3.05
Column,US,-1.78
Column,US,1.14
Column,US,7.2
Column,US,5.22
Column,US,5.53
Column,US,8.14
Column,US,-2.22
Column,US,0.89
Column,US,2.5
Column,US,6.77
Column,US,3.63
Column,US,2.86
Column,US,3.7
Column,US,7.52
Column,US,3.12
Column,US,0
Column,US,0.28
Column,US,6.86
Column,US,-0.32
Column,US,2.92
Column,US,-1.14
Column,US,-1.11
Column,US,4.42
Column,US,4.37
Column,US,1.09
Column,US,-3.66
Column,US,7.09
Column,US,-11.02
Column,US,-0.78
Column,US,8.44
Column,US,4.88
Column,US,-3.9
Column,US,-0.21
Column,US,6.48
Column,US,4.49
Column,US,-8.89
Column,US,-0.73
Column,US,1.76
Column,US,-4.31
Column,US,4.63
Column,US,8.91
Column,US,3.55
Column,US,6.69
Column,US,-4.45
Column,US,9.82
Column,US,6.79
Column,US,1.84
Column,US,8.97
Column,US,2.38
Column,US,4.68
Column,US,9.23
Column,US,2.85
Column,US,4.19
Column,US,2.43
Column,US,5.48
Column,US,-1.08
Column,US,7.47
Column,US,3.13
Column,US,-0.42
Column,US,-0.71
Column,US,6.51
Column,US,6.34
Column,US,3.94
Column,US,5.46
Column,US,0.39
Column,US,8.15
Column,US,7.99
Column,US,6.26
Column,US,7.91
Column,US,14.18
Column,US,7.41
Column,US,7.16
Column,US,5.6
Column,US,7.51
Column,US,6.24
Column,US,3.67
Column,US,3.84
Column,US,2.37
Column,US,-3.5
Column,US,5.02
Column,US,-6.04
Column,US,5.36
Column,US,1.98
Column,US,7.79
Column,US,0.02
Column,US,-1.9
Column,US,-2.81
Column,US,10.69
Column,US,1.65
Column,US,8.19
Column,US,1.92

以下是我参考本论坛及其他地方的帮助后,用来创建分组箱线图的代码。

dim1 <- read.csv("path\to\dim1.csv", fileEncoding="UTF-8-BOM")
summary(dim1)
>>summary is shown just fine.
ggplot2::ggplot(data = dim1) +
+     aes(x = Blog, y = Dim1, colour = Region) +
+     geom_boxplot()

但是在执行最后一条命令时出现以下错误:

Error in +aes(x = Blog, y = Dim1, colour = Region): invalid argument to unary operator

看来我在使用自己创建的数据框时出了错。有什么想法吗?更新:我写了一个 R 脚本来保存图表,效果很好。

# Start from a clean workspace: remove the objects in the current environment.
rm(list = ls())
# ggplot2 supplies ggplot(), aes(), geom_boxplot() and ggsave() used below.
library(ggplot2)
# Read the data; the "UTF-8-BOM" encoding drops a leading byte-order mark so
# it does not end up in the first column name.
dims <- read.csv("DataBlogs.csv", fileEncoding="UTF-8-BOM")
# Put the columns of dims on the search path so they can be referred to by
# bare name (Blog, Dim1, Region) in the plotgraph() call further down.
attach(dims)
# Per-column summary of the imported data (shown when run at the console).
summary(dims)
# Save a ggplot object to an image file inside the output directory.
#
# Arguments:
#   plot      - the plot object to write (passed to ggsave's `plot`).
#   file_name - name of the output file, e.g. "Dimension1.png"; ggsave picks
#               the graphics device from the file extension.
#
# NOTE: defining this function masks base::save() for the rest of the session.
save <- function(plot, file_name){
    # "path/" is a placeholder for the real output directory. A forward slash
    # is used because a trailing backslash ("path\") escapes the closing quote,
    # leaving the string unterminated so the script fails to parse.
    ggsave(file_name, plot = plot, path = "path/")
    # NOTE(review): this removes `file_name` relative to the working directory,
    # not the copy ggsave wrote under `path` - confirm that is intended.
    unlink(file_name)
}
# Build a grouped box plot from the `dims` data frame.
#
# Arguments:
#   x      - values mapped to the x axis (axis titled "Blog Type").
#   y      - values mapped to the y axis (axis titled "Dimension Score").
#   colour - values mapped to the box colour (legend titled "Region").
#   title  - text used as the plot title.
#
# Returns the assembled ggplot object.
plotgraph <- function(x, y, colour, title)
{
    # Assemble every layer in a single chain; the value of the last
    # expression is the function's return value.
    ggplot(dims, aes(x = x, y = y, colour = colour)) +
        geom_boxplot() +
        scale_y_continuous(name = "Dimension Score") +
        scale_x_discrete(name = "Blog Type") +
        ggtitle(title) +
        labs(color = "Region")
}
# Box plot of Dim1 per Blog type, coloured by Region; the bare column names
# resolve because dims was attach()ed earlier in the script.
plot1 <- plotgraph (Blog, Dim1, Region, "Title")
# Write the plot to Dimension1.png through the save() helper defined above.
save(plot1, "Dimension1.png")

现在我正在寻找方法,为每个箱线图标注样本计数并添加均值。

将您的数据转化为长格式,然后进行分析。

下面是一些示例代码(请下次提供一些可用的示例数据):

> # Dummy data
> df <- data.frame(Group = rep(c('Individual', 'Multiwriter', 'News'), each = 20),
+                  Sub_group = rep(c('US', 'PK'), each = 10),
+                  Data = c(rnorm(9, 3, 1), NA, rnorm(10, 6, 1), # Individual
+                           rnorm(10, 5, 2), rnorm(4, 3, 1), rep(NA, 6), # Multiwriter
+                           rnorm(7, 9, 3), rep(NA, 3), rnorm(4, 7, 2), rep(NA, 6))) # News
> 
> # View
> df
         Group Sub_group       Data
1   Individual        US  3.7638540
2   Individual        US  2.3400850
3   Individual        US  2.0956558
4   Individual        US  2.3174383
5   Individual        US  3.0939467
6   Individual        US  2.4215387
7   Individual        US  2.5750679
8   Individual        US  3.4976811
9   Individual        US  4.3177465
10  Individual        US         NA
11  Individual        PK  5.4029275
12  Individual        PK  6.8948472
13  Individual        PK  5.7083497
14  Individual        PK  6.5364870
15  Individual        PK  8.6442513
16  Individual        PK  5.8469505
17  Individual        PK  5.6308073
18  Individual        PK  6.0677232
19  Individual        PK  7.5343209
20  Individual        PK  6.5114774
21 Multiwriter        US  5.2797776
22 Multiwriter        US  7.5760614
23 Multiwriter        US  5.1100036
24 Multiwriter        US  6.5244811
25 Multiwriter        US  7.2320747
26 Multiwriter        US  6.8812925
27 Multiwriter        US  4.8357571
28 Multiwriter        US  1.5791503
29 Multiwriter        US  2.3667888
30 Multiwriter        US  4.8180301
31 Multiwriter        PK  1.4981379
32 Multiwriter        PK  1.1078890
33 Multiwriter        PK  1.3776812
34 Multiwriter        PK  0.1700818
35 Multiwriter        PK         NA
36 Multiwriter        PK         NA
37 Multiwriter        PK         NA
38 Multiwriter        PK         NA
39 Multiwriter        PK         NA
40 Multiwriter        PK         NA
41        News        US 11.6376160
42        News        US 11.6902192
43        News        US  5.0811126
44        News        US 13.0634139
45        News        US  2.6509108
46        News        US  8.2467842
47        News        US 10.6461310
48        News        US         NA
49        News        US         NA
50        News        US         NA
51        News        PK  6.5860009
52        News        PK  9.4336251
53        News        PK  6.6071028
54        News        PK 10.4564444
55        News        PK         NA
56        News        PK         NA
57        News        PK         NA
58        News        PK         NA
59        News        PK         NA
60        News        PK         NA
> 
> # Plot
> library(ggplot2)
> ggplot2::ggplot(data = df) +
+     aes(x = Group, y = Data, colour = Sub_group) +
+     geom_boxplot()
Warning message:
Removed 16 rows containing non-finite values (stat_boxplot). 

根据评论更新

导入数据

> data <- read.csv('DataBlogs.csv')

查看数据

> str(data)
'data.frame':   3674 obs. of  6 variables:
 $ Blog  : Factor w/ 5 levels "Column","Individual",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Region: Factor w/ 2 levels "PK","US": 1 1 1 1 1 1 1 1 1 1 ...
 $ Dim1  : num  -4.75 -5.69 -0.27 -2.76 -8.24 ...
 $ Dim2  : num  13.47 6.08 6.09 7.35 12.75 ...
 $ Dim3  : num  8.47 1.51 0.03 5.62 3.71 2.01 7.56 3.01 6.5 4.76 ...
 $ Dim4  : num  -1.29 -1.65 1.65 3.13 3.78 0.21 2.16 3.35 1.49 1.98 ...
>
> summary(data)
          Blog      Region         Dim1                 Dim2          
 Column     :1173   PK:1835   Min.   :-29.330000   Min.   :-17.55000  
 Individual :1188   US:1839   1st Qu.: -4.282500   1st Qu.: -4.41000  
 Multiwriter: 405             Median :  0.885000   Median : -0.74500  
 News       : 508             Mean   : -0.000054   Mean   :  0.00004  
 Tech       : 400             3rd Qu.:  5.147500   3rd Qu.:  3.44000  
                              Max.   : 22.020000   Max.   : 47.36000  
      Dim3                 Dim4           
 Min.   :-17.260000   Min.   :-15.050000  
 1st Qu.: -3.340000   1st Qu.: -3.280000  
 Median : -0.530000   Median : -0.460000  
 Mean   : -0.000035   Mean   :  0.000024  
 3rd Qu.:  2.845000   3rd Qu.:  2.647500  
 Max.   : 29.080000   Max.   : 29.640000  

我不确定您为什么要把数据取子集放进一个新的数据框,但我发现给数据框的列显式命名会有帮助。

这是您尝试过的方法

> dim1 <- data.frame(data$Blog, data$Region, data$Dim1) 
> names(dim1)
[1] "data.Blog"   "data.Region" "data.Dim1" 

……下面是显式为列命名后的版本。

> dim1 <- data.frame(Blog = data$Blog, Region = data$Region, Dim1 = data$Dim1) 
> names(dim1)
[1] "Blog"   "Region" "Dim1"  

对于绘图,有两个可能导致错误的原因:i) 您只在第一个函数调用前加了 ggplot2::,其余函数调用也需要同样处理(ggplot2::aes、ggplot2::geom_boxplot),或者干脆在绘图之前先加载该包;ii) 如果您直接复制粘贴我的代码,绘图的每个步骤之间会出现两个加号('++')。行首的加号是终端输出中的续行提示符,并不属于代码本身。把代码粘贴到编辑器时,请确保各步骤之间只有一个加号。后一个问题最有可能是该错误消息的原因。

修改后的绘图代码

> library(ggplot2)
>
> ggplot(data = dim1) + 
+     aes(x = Blog, y = Dim1, colour = Region) + 
+     geom_boxplot()
>