整洁; %>% group_by() 变异(foo = fill() )
tidyr; %>% group_by() mutate(foo = fill() )
我正在努力创建一个新变量来指示字母 LET
、某些组 grp
在 id id
中以什么开头。
下面我将说明我的问题。我有这样的数据,
library(dplyr); library(tidyr)
df <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)))
#> # A tibble: 17 x 3
#> id grp LET
#> <int> <dbl> <chr>
#> 1 0 0 A
#> 2 0 0 B
#> 3 0 0 B
#> 4 0 1 B
#> 5 0 1 B
#> 6 0 1 A
#> 7 0 1 A
#> 8 1 0 A
#> 9 1 0 B
#> 10 1 1 B
#> 11 1 1 B
#> 12 1 1 A
#> 13 1 1 A
#> 14 1 1 A
#> 15 1 2 A
#> 16 1 2 B
#> 17 1 2 B
我现在想创建一个新变量 %>% group_by(id, grp)
,我想我可以用 fill()
和 mutate(grp_LET = …
填充它,像这样;
df %>% group_by(id, grp) %>% fill(LET) %>% mutate(grp_LET = factor)
但我想不通。我希望获得的是这样的东西。我想要的结果,
dfd <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)),
grp_LET = rep(c('A', 'B', 'A', 'B', 'A'), c(3, 4, 2, 5, 3)));dfd
#> # A tibble: 17 x 4
#> id grp LET grp_LET
#> <int> <dbl> <chr> <chr>
#> 1 0 0 A A
#> 2 0 0 B A
#> 3 0 0 B A
#> 4 0 1 B B
#> 5 0 1 B B
#> 6 0 1 A B
#> 7 0 1 A B
#> 8 1 0 A A
#> 9 1 0 B A
#> 10 1 1 B B
#> 11 1 1 B B
#> 12 1 1 A B
#> 13 1 1 A B
#> 14 1 1 A B
#> 15 1 2 A A
#> 16 1 2 B A
#> 17 1 2 B A
如有任何帮助,我们将不胜感激。
更新2017-10-2611:03:40Z,我做了microbenchmark三个答案的比较你喜欢,
tbl <- df
dim(tbl)
#> [1] 17 3
# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)
R_base_by_lmo <- function(x) {dat <- x
dat$grp_LET <- ave(dat$LET, dat[c("id", "grp")],
FUN=function(x) head(x, 1)); as_tibble(dat)
}
# mapply(all.equal, R_base_by_lmo(tbl), dfd)
# install.packages(c("data.table"), dependencies = TRUE)
library(data.table)
dt_by_akrun <- function(x) {foo <- copy(x)
setDT(foo)[, grp_LET := LET[1], .(id, grp)]
as_tibble(foo)
}
# mapply(all.equal, dt_by_akrun(tbl), dfd)
tidyverse_by_Psidom <- function(x) x %>% group_by(id,grp) %>% mutate(grp_LET=first(LET))
# mapply(all.equal, tidyverse_by_Psidom(df), dfd)
# install.packages(c("microbenchmark"), dependencies = TRUE)
require(microbenchmark)
x <- tbl
res <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 67)
## Print results:
print(res)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 1.338758 1.419860 1.620292 1.547867 1.640043 4.098088 67 a
dt_by_akrun(x) 1.670019 1.776765 2.123219 1.859477 1.972842 11.922270 67 a
tidyverse_by_Psidom(x) 3.964432 4.065466 4.718041 4.128942 4.478950 15.939186 67 b
### Plot results:
boxplot(res)
更新 2017-10-26 12:10:55Z,在 的鼓励下,我对我的生产数据重新运行了微基准测试。 超过 的人对结果表示敬意。
dim('my production-data')
#> [1] 46104 11
x <- 'my production-data'
res2 <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 8)
print(res2)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 28976.46868 29236.19450 29468.63955 29464.51339 29591.25206 30188.72785 8 b
dt_by_akrun(x) 74.18023 76.69274 85.75983 87.15791 91.62508 100.94692 8 a
tidyverse_by_Psidom(x) 38.38051 41.15552 42.83667 41.92207 44.53830 49.08109 8 a
箱线图(res2)
在基础 R 中,只需使用 ave
和 head
:
dat$grp_let <- ave(dat$LET, dat[c("id", "grp")], FUN=function(x) head(x, 1))
这个returns
dat
id grp LET grp_let
1 0 0 A A
2 0 0 B A
3 0 0 B A
4 0 1 B B
5 0 1 B B
6 0 1 A B
7 0 1 A B
8 1 0 A A
9 1 0 B A
10 1 1 B B
11 1 1 B B
12 1 1 A B
13 1 1 A B
14 1 1 A B
15 1 2 A A
16 1 2 B A
17 1 2 B A
似乎每个组都需要第一个 LET;您可以从每个组的向量 LET 中提取 first
元素,mutate
将 broadcast/cycle 组内的值:
df %>% group_by(id, grp) %>% mutate(grp_LET = first(LET))
# A tibble: 17 x 4
# Groups: id, grp [5]
# id grp LET grp_LET
# <int> <dbl> <chr> <chr>
# 1 0 0 A A
# 2 0 0 B A
# 3 0 0 B A
# 4 0 1 B B
# 5 0 1 B B
# 6 0 1 A B
# 7 0 1 A B
# 8 1 0 A A
# 9 1 0 B A
#10 1 1 B B
#11 1 1 B B
#12 1 1 A B
#13 1 1 A B
#14 1 1 A B
#15 1 2 A A
#16 1 2 B A
#17 1 2 B A
或者我们可以使用data.table
library(data.table)
setDT(df)[, grp_LET := LET[1], .(id, grp)]
df
# id grp LET grp_LET
# 1: 0 0 A A
# 2: 0 0 B A
# 3: 0 0 B A
# 4: 0 1 B B
# 5: 0 1 B B
# 6: 0 1 A B
# 7: 0 1 A B
# 8: 1 0 A A
# 9: 1 0 B A
#10: 1 1 B B
#11: 1 1 B B
#12: 1 1 A B
#13: 1 1 A B
#14: 1 1 A B
#15: 1 2 A A
#16: 1 2 B A
#17: 1 2 B A
我正在努力创建一个新变量来指示字母 LET
、某些组 grp
在 id id
中以什么开头。
下面我将说明我的问题。我有这样的数据,
library(dplyr); library(tidyr)
df <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)))
#> # A tibble: 17 x 3
#> id grp LET
#> <int> <dbl> <chr>
#> 1 0 0 A
#> 2 0 0 B
#> 3 0 0 B
#> 4 0 1 B
#> 5 0 1 B
#> 6 0 1 A
#> 7 0 1 A
#> 8 1 0 A
#> 9 1 0 B
#> 10 1 1 B
#> 11 1 1 B
#> 12 1 1 A
#> 13 1 1 A
#> 14 1 1 A
#> 15 1 2 A
#> 16 1 2 B
#> 17 1 2 B
我现在想创建一个新变量 %>% group_by(id, grp)
,我想我可以用 fill()
和 mutate(grp_LET = …
填充它,像这样;
df %>% group_by(id, grp) %>% fill(LET) %>% mutate(grp_LET = factor)
但我想不通。我希望获得的是这样的东西。我想要的结果,
dfd <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)),
grp_LET = rep(c('A', 'B', 'A', 'B', 'A'), c(3, 4, 2, 5, 3)));dfd
#> # A tibble: 17 x 4
#> id grp LET grp_LET
#> <int> <dbl> <chr> <chr>
#> 1 0 0 A A
#> 2 0 0 B A
#> 3 0 0 B A
#> 4 0 1 B B
#> 5 0 1 B B
#> 6 0 1 A B
#> 7 0 1 A B
#> 8 1 0 A A
#> 9 1 0 B A
#> 10 1 1 B B
#> 11 1 1 B B
#> 12 1 1 A B
#> 13 1 1 A B
#> 14 1 1 A B
#> 15 1 2 A A
#> 16 1 2 B A
#> 17 1 2 B A
如有任何帮助,我们将不胜感激。
更新2017-10-2611:03:40Z,我做了microbenchmark三个答案的比较你喜欢,
tbl <- df
dim(tbl)
#> [1] 17 3
# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)
R_base_by_lmo <- function(x) {dat <- x
dat$grp_LET <- ave(dat$LET, dat[c("id", "grp")],
FUN=function(x) head(x, 1)); as_tibble(dat)
}
# mapply(all.equal, R_base_by_lmo(tbl), dfd)
# install.packages(c("data.table"), dependencies = TRUE)
library(data.table)
dt_by_akrun <- function(x) {foo <- copy(x)
setDT(foo)[, grp_LET := LET[1], .(id, grp)]
as_tibble(foo)
}
# mapply(all.equal, dt_by_akrun(tbl), dfd)
tidyverse_by_Psidom <- function(x) x %>% group_by(id,grp) %>% mutate(grp_LET=first(LET))
# mapply(all.equal, tidyverse_by_Psidom(df), dfd)
# install.packages(c("microbenchmark"), dependencies = TRUE)
require(microbenchmark)
x <- tbl
res <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 67)
## Print results:
print(res)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 1.338758 1.419860 1.620292 1.547867 1.640043 4.098088 67 a
dt_by_akrun(x) 1.670019 1.776765 2.123219 1.859477 1.972842 11.922270 67 a
tidyverse_by_Psidom(x) 3.964432 4.065466 4.718041 4.128942 4.478950 15.939186 67 b
### Plot results:
boxplot(res)
更新 2017-10-26 12:10:55Z,在 的鼓励下,我对我的生产数据重新运行了微基准测试。 超过 的人对结果表示敬意。
dim('my production-data')
#> [1] 46104 11
x <- 'my production-data'
res2 <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 8)
print(res2)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 28976.46868 29236.19450 29468.63955 29464.51339 29591.25206 30188.72785 8 b
dt_by_akrun(x) 74.18023 76.69274 85.75983 87.15791 91.62508 100.94692 8 a
tidyverse_by_Psidom(x) 38.38051 41.15552 42.83667 41.92207 44.53830 49.08109 8 a
箱线图(res2)
在基础 R 中,只需使用 ave
和 head
:
dat$grp_let <- ave(dat$LET, dat[c("id", "grp")], FUN=function(x) head(x, 1))
这个returns
dat
id grp LET grp_let
1 0 0 A A
2 0 0 B A
3 0 0 B A
4 0 1 B B
5 0 1 B B
6 0 1 A B
7 0 1 A B
8 1 0 A A
9 1 0 B A
10 1 1 B B
11 1 1 B B
12 1 1 A B
13 1 1 A B
14 1 1 A B
15 1 2 A A
16 1 2 B A
17 1 2 B A
似乎每个组都需要第一个 LET;您可以从每个组的向量 LET 中提取 first
元素,mutate
将 broadcast/cycle 组内的值:
df %>% group_by(id, grp) %>% mutate(grp_LET = first(LET))
# A tibble: 17 x 4
# Groups: id, grp [5]
# id grp LET grp_LET
# <int> <dbl> <chr> <chr>
# 1 0 0 A A
# 2 0 0 B A
# 3 0 0 B A
# 4 0 1 B B
# 5 0 1 B B
# 6 0 1 A B
# 7 0 1 A B
# 8 1 0 A A
# 9 1 0 B A
#10 1 1 B B
#11 1 1 B B
#12 1 1 A B
#13 1 1 A B
#14 1 1 A B
#15 1 2 A A
#16 1 2 B A
#17 1 2 B A
或者我们可以使用data.table
library(data.table)
setDT(df)[, grp_LET := LET[1], .(id, grp)]
df
# id grp LET grp_LET
# 1: 0 0 A A
# 2: 0 0 B A
# 3: 0 0 B A
# 4: 0 1 B B
# 5: 0 1 B B
# 6: 0 1 A B
# 7: 0 1 A B
# 8: 1 0 A A
# 9: 1 0 B A
#10: 1 1 B B
#11: 1 1 B B
#12: 1 1 A B
#13: 1 1 A B
#14: 1 1 A B
#15: 1 2 A A
#16: 1 2 B A
#17: 1 2 B A