在 R 中,如何从每个分组中选取一半的记录?
How can I select half of the counts in R?
我有一个包含 2 列的数据集,如下面的屏幕截图所示:
个人姓名、公司名称
> dput(df)
structure(list(Name = c("ABC", "BCD", "CDE", "DEF", "EFG", "FGH",
"GHI", "HIJ", "IJK", "JKL"), Company = c("A", "A", "A", "A",
"B", "B", "C", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-10L))
> df
Name Company
1 ABC A
2 BCD A
3 CDE A
4 DEF A
5 EFG B
6 FGH B
7 GHI C
8 HIJ C
9 IJK C
10 JKL C
如何将数据集拆分为 2 个子集,每个子集都有相同数量的来自同一公司的人员?
比如A公司一共有4个人,A公司组内有两个子集,即2人在子集1,另外2人在子集2。B、C公司同理。
如果同一公司(Company)的人数为奇数,多出的那个人可以随机分到任一子集。
谢谢!
试试这个 tidyverse
方法,将数据随机划分为两个子集。
> df
Name Company
1 ABC A
2 BCD A
3 CDE A
4 DEF A
5 EFG B
6 FGH B
7 GHI C
8 HIJ C
9 IJK C
10 JKL C
APPROACH-1 如果你想在一个包含两个独立数据帧的列表中得到结果
library(tidyverse)
set.seed(123) # fix the seed so the random split can be reproduced later on

# Randomly flag part of each company's rows as subset 1, join the flags back
# onto the full data by row id, mark every unflagged row as subset 2, and
# return the two subsets as a list of tibbles.
df %>%
  mutate(id = row_number()) %>%
  group_by(Company) %>%
  mutate(n = n()) %>%
  sample_n(n * 0.5) %>%
  mutate(d = 1) %>%
  ungroup() %>%
  select(id, d) %>%
  right_join(df %>% mutate(id = row_number()), by = "id") %>%
  mutate(d = coalesce(d, 2)) %>% # rows that were not sampled have d = NA
  group_split(d)
<list_of<
tbl_df<
id : integer
d : double
Name : character
Company: character
>
>[2]>
[[1]]
# A tibble: 5 x 4
id d Name Company
<int> <dbl> <chr> <chr>
1 3 1 CDE A
2 4 1 DEF A
3 5 1 EFG B
4 8 1 HIJ C
5 9 1 IJK C
[[2]]
# A tibble: 5 x 4
id d Name Company
<int> <dbl> <chr> <chr>
1 1 2 ABC A
2 2 2 BCD A
3 6 2 FGH B
4 7 2 GHI C
5 10 2 JKL C
当您需要两个单独的 csv 文件中的结果时。
set.seed(123)
# Same split as above, but each subset is written to its own CSV file.
df %>%
  mutate(id = row_number()) %>%
  group_by(Company) %>%
  mutate(n = n()) %>%
  sample_n(n * 0.5) %>%
  mutate(d = 1) %>%
  ungroup() %>%
  select(id, d) %>%
  right_join(df %>% mutate(id = row_number()), by = "id") %>%
  mutate(d = if_else(is.na(d), 2, d)) %>%
  group_split(d) %>%
  # walk2() is the side-effect counterpart of map2(): the files are written
  # without collecting and printing write.csv()'s NULL return values.
  walk2(c("file1.csv", "file2.csv"), ~ write.csv(.x, .y))
以上代码将在您的 getwd() 目录中写入 2 个 csv 文件。
APPROACH -2你也可以做
set.seed(123)
# Rows drawn for Set1, still carrying their original row id. The id must be
# kept here: Set2 is defined as "every row whose id was not drawn for Set1".
# (Previously id was dropped from df1 before df2 was built, so `df1$id` was
# NULL and df2 ended up containing all rows.)
set1_keyed <- df %>%
  mutate(id = row_number()) %>%
  group_by(Company) %>%
  mutate(n = n()) %>%
  sample_n(n * 0.5) %>%
  ungroup()

# Set1 without the helper columns.
df1 <- set1_keyed %>%
  mutate(set_no = "Set1") %>%
  select(-n, -id)

# Set2: the complement of Set1, matched on row id.
df2 <- df %>%
  mutate(id = row_number()) %>%
  filter(!id %in% set1_keyed$id) %>%
  mutate(set_no = "Set2") %>%
  select(-id)
将结果写入csv
# Save each subset to its own CSV file.
write.csv(df1, file = "subset1.csv")
write.csv(df2, file = "subset2.csv")
列 id
和 d
是辅助列。 id
列将确保没有 row/record 被留下,以防任何组中有奇数条目。 d
将有助于识别子组编号。
如果任何公司的记录数为奇数,则子集的行数可能不同且不相等。
您可以使用 ave
.
计算子集序列的模 2
# Label the rows of each company alternately; the position counter restarts
# for every company because ave() applies FUN per group.
r <- d
r$sub <- ave(
  d$company, d$company,
  FUN = function(members) paste0("sub", seq_along(members) %% 2 + 1)
)
r
# name company sub
# 1 alex A sub2
# 2 malan A sub1
# 3 matteis A sub2
# 4 fenwick A sub1
# 5 nicolas B sub2
# 6 cleary B sub1
# 7 fin C sub2
# 8 stijn C sub1
# 9 antoine C sub2
检查:
with(r, table(company, sub))
# company sub1 sub2
# A 2 2
# B 1 1
# C 1 2
实际上,如果数据已像示例中那样按公司排序,您只需执行以下操作:
# Alternate the two labels down the rows. rep_len() cycles "sub1"/"sub2" to
# exactly nrow(d) values, so an odd row count no longer triggers the
# "not a multiple of replacement length" recycling warning.
d$sub <- rep_len(paste0("sub", 1:2), nrow(d))
d
# name company sub
# 1 alex A sub1
# 2 malan A sub2
# 3 matteis A sub1
# 4 fenwick A sub2
# 5 nicolas B sub1
# 6 cleary B sub2
# 7 fin C sub1
# 8 stijn C sub2
# 9 antoine C sub1
数据:
d <- structure(list(name = c("alex", "malan", "matteis", "fenwick",
"nicolas", "cleary", "fin", "stijn", "antoine"), company = c("A",
"A", "A", "A", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-9L))
您可以像这样使用 ave
和 sample
:
set.seed(42)
# For each company, build a pool of alternating 1/2 labels at least as long
# as the group, then draw one label per member without replacement.
df$su <- ave(df$Company, df$Company, FUN = function(members) {
  group_size <- length(members)
  label_pool <- rep(1:2, ceiling(group_size / 2))
  sample(label_pool, group_size)
})
df
# Name Company su
#1 ABC A 1
#2 BCD A 2
#3 CDE A 1
#4 DEF A 2
#5 EFG B 2
#6 FGH B 1
#7 GHI C 2
#8 HIJ C 2
#9 IJK C 1
#10 JKL C 1
我有一个包含 2 列的数据集,如下面的屏幕截图所示: 个人姓名、公司名称
> dput(df)
structure(list(Name = c("ABC", "BCD", "CDE", "DEF", "EFG", "FGH",
"GHI", "HIJ", "IJK", "JKL"), Company = c("A", "A", "A", "A",
"B", "B", "C", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-10L))
> df
Name Company
1 ABC A
2 BCD A
3 CDE A
4 DEF A
5 EFG B
6 FGH B
7 GHI C
8 HIJ C
9 IJK C
10 JKL C
如何将数据集拆分为 2 个子集,每个子集都有相同数量的来自同一公司的人员?
比如A公司一共有4个人,A公司组内有两个子集,即2人在子集1,另外2人在子集2。B、C公司同理。
如果同一公司(Company)的人数为奇数,多出的那个人可以随机分到任一子集。
谢谢!
试试这个 tidyverse
方法,将数据随机划分为两个子集。
> df
Name Company
1 ABC A
2 BCD A
3 CDE A
4 DEF A
5 EFG B
6 FGH B
7 GHI C
8 HIJ C
9 IJK C
10 JKL C
APPROACH-1 如果你想在一个包含两个独立数据帧的列表中得到结果
library(tidyverse)
set.seed(123) # fix the seed so the random split can be reproduced later on

# Randomly flag part of each company's rows as subset 1, join the flags back
# onto the full data by row id, mark every unflagged row as subset 2, and
# return the two subsets as a list of tibbles.
df %>%
  mutate(id = row_number()) %>%
  group_by(Company) %>%
  mutate(n = n()) %>%
  sample_n(n * 0.5) %>%
  mutate(d = 1) %>%
  ungroup() %>%
  select(id, d) %>%
  right_join(df %>% mutate(id = row_number()), by = "id") %>%
  mutate(d = coalesce(d, 2)) %>% # rows that were not sampled have d = NA
  group_split(d)
<list_of<
tbl_df<
id : integer
d : double
Name : character
Company: character
>
>[2]>
[[1]]
# A tibble: 5 x 4
id d Name Company
<int> <dbl> <chr> <chr>
1 3 1 CDE A
2 4 1 DEF A
3 5 1 EFG B
4 8 1 HIJ C
5 9 1 IJK C
[[2]]
# A tibble: 5 x 4
id d Name Company
<int> <dbl> <chr> <chr>
1 1 2 ABC A
2 2 2 BCD A
3 6 2 FGH B
4 7 2 GHI C
5 10 2 JKL C
当您需要两个单独的 csv 文件中的结果时。
set.seed(123)
# Same split as above, but each subset is written to its own CSV file.
df %>%
  mutate(id = row_number()) %>%
  group_by(Company) %>%
  mutate(n = n()) %>%
  sample_n(n * 0.5) %>%
  mutate(d = 1) %>%
  ungroup() %>%
  select(id, d) %>%
  right_join(df %>% mutate(id = row_number()), by = "id") %>%
  mutate(d = if_else(is.na(d), 2, d)) %>%
  group_split(d) %>%
  # walk2() is the side-effect counterpart of map2(): the files are written
  # without collecting and printing write.csv()'s NULL return values.
  walk2(c("file1.csv", "file2.csv"), ~ write.csv(.x, .y))
以上代码将在您的 getwd() 目录中写入 2 个 csv 文件。
APPROACH -2你也可以做
set.seed(123)
# Rows drawn for Set1, still carrying their original row id. The id must be
# kept here: Set2 is defined as "every row whose id was not drawn for Set1".
# (Previously id was dropped from df1 before df2 was built, so `df1$id` was
# NULL and df2 ended up containing all rows.)
set1_keyed <- df %>%
  mutate(id = row_number()) %>%
  group_by(Company) %>%
  mutate(n = n()) %>%
  sample_n(n * 0.5) %>%
  ungroup()

# Set1 without the helper columns.
df1 <- set1_keyed %>%
  mutate(set_no = "Set1") %>%
  select(-n, -id)

# Set2: the complement of Set1, matched on row id.
df2 <- df %>%
  mutate(id = row_number()) %>%
  filter(!id %in% set1_keyed$id) %>%
  mutate(set_no = "Set2") %>%
  select(-id)
将结果写入csv
# Save each subset to its own CSV file.
write.csv(df1, file = "subset1.csv")
write.csv(df2, file = "subset2.csv")
列 id
和 d
是辅助列。 id
列将确保没有 row/record 被留下,以防任何组中有奇数条目。 d
将有助于识别子组编号。
如果任何公司的记录数为奇数,则子集的行数可能不同且不相等。
您可以使用 ave
.
# Label the rows of each company alternately; the position counter restarts
# for every company because ave() applies FUN per group.
r <- d
r$sub <- ave(
  d$company, d$company,
  FUN = function(members) paste0("sub", seq_along(members) %% 2 + 1)
)
r
# name company sub
# 1 alex A sub2
# 2 malan A sub1
# 3 matteis A sub2
# 4 fenwick A sub1
# 5 nicolas B sub2
# 6 cleary B sub1
# 7 fin C sub2
# 8 stijn C sub1
# 9 antoine C sub2
检查:
with(r, table(company, sub))
# company sub1 sub2
# A 2 2
# B 1 1
# C 1 2
实际上,如果数据已像示例中那样按公司排序,您只需执行以下操作:
# Alternate the two labels down the rows. rep_len() cycles "sub1"/"sub2" to
# exactly nrow(d) values, so an odd row count no longer triggers the
# "not a multiple of replacement length" recycling warning.
d$sub <- rep_len(paste0("sub", 1:2), nrow(d))
d
# name company sub
# 1 alex A sub1
# 2 malan A sub2
# 3 matteis A sub1
# 4 fenwick A sub2
# 5 nicolas B sub1
# 6 cleary B sub2
# 7 fin C sub1
# 8 stijn C sub2
# 9 antoine C sub1
数据:
d <- structure(list(name = c("alex", "malan", "matteis", "fenwick",
"nicolas", "cleary", "fin", "stijn", "antoine"), company = c("A",
"A", "A", "A", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-9L))
您可以像这样使用 ave
和 sample
:
set.seed(42)
# For each company, build a pool of alternating 1/2 labels at least as long
# as the group, then draw one label per member without replacement.
df$su <- ave(df$Company, df$Company, FUN = function(members) {
  group_size <- length(members)
  label_pool <- rep(1:2, ceiling(group_size / 2))
  sample(label_pool, group_size)
})
df
# Name Company su
#1 ABC A 1
#2 BCD A 2
#3 CDE A 1
#4 DEF A 2
#5 EFG B 2
#6 FGH B 1
#7 GHI C 2
#8 HIJ C 2
#9 IJK C 1
#10 JKL C 1