如何从 R 中的数据帧生成频率 table
How to make a frequency table from a data frame in R
数据框是这样的:
enter image description here
header: system
Row 1: 00000000000000000503_0
Row 2: 00000000000000000503_1
Row 3: 00000000000000000503_2
Row 4: 00000000000000000503_3
Row 5: 000000000000000004e7_0
Row 6: 000000000000000004e7_1
Row 7: 00000000000000000681_0
Row 8: 00000000000000000681_1
Row 9: 00000000000000000681_2
我想用字符串“_”之前的代码数量生成一个频率 table,这样:
“00000000000000000503”出现4次,“000000000000000004e7”出现2次,以此类推。
我如何在 R 中执行此操作?
删除下划线后的所有内容并使用table
计算频率
table(sub("_.*", "", data$col1))
#Also
#table(sub("(.*)_.*", "\1", data$col1))
#000000000000000004e7 00000000000000000503 00000000000000000681
# 2 4 3
如果最终输出需要是数据帧,请使用stack
stack(table(sub("_.*", "", data$col1)))
# values ind
#1 2 000000000000000004e7
#2 4 00000000000000000503
#3 3 00000000000000000681
数据
data <- structure(list(col1 = structure(c(3L, 4L, 5L, 6L, 1L, 2L, 7L,
8L, 9L), .Label = c("000000000000000004e7_0", "000000000000000004e7_1",
"00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3", "00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"), class = "factor")), class = "data.frame",
row.names = c(NA, -9L))
一个dplyr
-tidyr
备选方案:
df %>%
tidyr::separate(V3, c("target", "non_target")) %>%
count(target)
# A tibble: 3 x 2
target n
<chr> <int>
1 000000000000000004e7 2
2 00000000000000000503 4
3 00000000000000000681 3
与base
:
table(sapply(strsplit(df$system, "_"),"[[", 1))
数据:
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
使用 stringr
库的另一个选项 tidyverse
> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
"00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3",
"000000000000000004e7_0",
"000000000000000004e7_1",
"00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"))
> mydata
system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split data using str_split
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
>
> mydata
system leftside rightside
1 00000000000000000503_0 00000000000000000503 0
2 00000000000000000503_1 00000000000000000503 1
3 00000000000000000503_2 00000000000000000503 2
4 00000000000000000503_3 00000000000000000503 3
5 000000000000000004e7_0 000000000000000004e7 0
6 000000000000000004e7_1 000000000000000004e7 1
7 00000000000000000681_0 00000000000000000681 0
8 00000000000000000681_1 00000000000000000681 1
9 00000000000000000681_2 00000000000000000681 2
> # alternative tabulate fuction than base::table(). Can Provide nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681
2 4 3
整洁的答案是
my_data <- mydata %>%
mutate_if(is.factor, as.character) %>%
mutate(system = gsub('_[^_]*$', '', system)) %>%
group_by(system) %>%
count() %>%
ungroup()
my_data
带有 str_remove
和 group_by
的选项
library(stringr)
library(dplyr)
df %>%
group_by(V3 = str_remove(V3, "_\d+$")) %>%
summarise(n = n())
# A tibble: 3 x 2
# V3 n
# <chr> <int>
#1 000000000000000004e7 2
#2 00000000000000000503 4
#3 00000000000000000681 3
或在 base R
中使用 table
和 trimws
table(trimws(df$V3, whitespace = "_[0-9]+"))
数据
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
数据框是这样的: enter image description here
header: system
Row 1: 00000000000000000503_0
Row 2: 00000000000000000503_1
Row 3: 00000000000000000503_2
Row 4: 00000000000000000503_3
Row 5: 000000000000000004e7_0
Row 6: 000000000000000004e7_1
Row 7: 00000000000000000681_0
Row 8: 00000000000000000681_1
Row 9: 00000000000000000681_2
我想用字符串“_”之前的代码数量生成一个频率 table,这样:
“00000000000000000503”出现4次,“000000000000000004e7”出现2次,以此类推。
我如何在 R 中执行此操作?
删除下划线后的所有内容并使用table
计算频率
table(sub("_.*", "", data$col1))
#Also
#table(sub("(.*)_.*", "\1", data$col1))
#000000000000000004e7 00000000000000000503 00000000000000000681
# 2 4 3
如果最终输出需要是数据帧,请使用stack
stack(table(sub("_.*", "", data$col1)))
# values ind
#1 2 000000000000000004e7
#2 4 00000000000000000503
#3 3 00000000000000000681
数据
data <- structure(list(col1 = structure(c(3L, 4L, 5L, 6L, 1L, 2L, 7L,
8L, 9L), .Label = c("000000000000000004e7_0", "000000000000000004e7_1",
"00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3", "00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"), class = "factor")), class = "data.frame",
row.names = c(NA, -9L))
一个dplyr
-tidyr
备选方案:
df %>%
tidyr::separate(V3, c("target", "non_target")) %>%
count(target)
# A tibble: 3 x 2
target n
<chr> <int>
1 000000000000000004e7 2
2 00000000000000000503 4
3 00000000000000000681 3
与base
:
table(sapply(strsplit(df$system, "_"),"[[", 1))
数据:
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
使用 stringr
库的另一个选项 tidyverse
> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
"00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3",
"000000000000000004e7_0",
"000000000000000004e7_1",
"00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"))
> mydata
system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split data using str_split
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
>
> mydata
system leftside rightside
1 00000000000000000503_0 00000000000000000503 0
2 00000000000000000503_1 00000000000000000503 1
3 00000000000000000503_2 00000000000000000503 2
4 00000000000000000503_3 00000000000000000503 3
5 000000000000000004e7_0 000000000000000004e7 0
6 000000000000000004e7_1 000000000000000004e7 1
7 00000000000000000681_0 00000000000000000681 0
8 00000000000000000681_1 00000000000000000681 1
9 00000000000000000681_2 00000000000000000681 2
> # alternative tabulate fuction than base::table(). Can Provide nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681
2 4 3
整洁的答案是
my_data <- mydata %>%
mutate_if(is.factor, as.character) %>%
mutate(system = gsub('_[^_]*$', '', system)) %>%
group_by(system) %>%
count() %>%
ungroup()
my_data
带有 str_remove
和 group_by
library(stringr)
library(dplyr)
df %>%
group_by(V3 = str_remove(V3, "_\d+$")) %>%
summarise(n = n())
# A tibble: 3 x 2
# V3 n
# <chr> <int>
#1 000000000000000004e7 2
#2 00000000000000000503 4
#3 00000000000000000681 3
或在 base R
中使用 table
和 trimws
table(trimws(df$V3, whitespace = "_[0-9]+"))
数据
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))