Rdata.table。如果列 x 那么行计数,否则求和
R data.table. If column x then row count, else sum
下午
假设我有这个 table:
df <- data.table(date = rep(c(1,2), each = 2)
, user = rep(c(1,2), 2)
, turnover = 2:5
, profit = 1:4
); df
date user turnover profit
1 1 2 1
1 2 3 2
2 1 4 3
2 2 5 4
如果我想对多列求和,我会:
# metrics
x <- c('user', 'turnover', 'profit')
# apply
df[, lapply(.SD, function(x) sum(x)), .SDcols=x, by=date]
给出:
date user turnover profit
1 3 5 3
2 3 9 7
但是,请注意对用户求和没有意义,相反我想要“用户”列的行数,即
date user turnover profit
1 2 5 3
2 2 9 7
假设我不想做一个 1 的虚拟列并求和,而是我坚持使用 apply
和 data.table
。我该怎么做?
谢谢。
这是一种可能。您可以将 c
和 lapply
函数组合在一起,如下所示(注意 .N
是每个组的行数):
df[, c(.(user=.N), lapply(.SD, sum)), by=date, .SDcols=c("turnover", "profit")]
# date user turnover profit
# 1: 1 2 5 3
# 2: 2 2 9 7
使用dplyr
和across
,做这些操作更灵活
library(dplyr)
df %>%
group_by(date) %>%
summarise(user = n(), across(c(turnover, profit), sum))
-输出
# A tibble: 2 x 4
date user turnover profit
<dbl> <int> <int> <int>
1 1 2 5 3
2 2 2 9 7
或者 collapse
中的另一种选择,来自同一团队构建 data.table
的唯一目的是提高效率。
library(collapse)
collap(df, ~ date, custom = list(fsum = c("turnover", "profit"),
fNobs = "turnover"))
date fsum.turnover fNobs.turnover fsum.profit
1: 1 5 2 3
2: 2 9 2 7
基准
在更大的数据集上测试
library(data.table)
library(dplyr)
library(collapse)
library(purrr)
# input data
set.seed(24)
df1 <- data.table(date = rep(1:1e6, each = 20),
user = rep(1:1e6, 20),
turnover = rnorm(1e6 * 20),
profit = rnorm(1e6 * 20))
# benchmarks
# - B. Christian Kamgang
system.time({
df1[, c(.(user=.N), lapply(.SD, sum)), by=date, .SDcols=c("turnover", "profit")]
})
#user system elapsed
#0.558 0.110 0.670
# - Uwe
# - first
system.time({
df1[, c(.SD[, lapply(.SD, length), .SDcols = c("user")],
.SD[, lapply(.SD, sum), .SDcols = c("turnover", "profit")]), by = date]
})
#Timing stopped at: 245.9 3.336 249.4 0 stopped as it was taking time
# - second
system.time({
df1[, purrr::map2(list(length, sum, sum), .SD, \(fn, args) purrr::exec(fn, args)), by = date]
})
#user system elapsed
#37.816 0.138 38.016
# - third
system.time({
df1[, {
fct <- c("length", "sum", "sum")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, by = date]
})
#user system elapsed
#134.966 1.530 136.620
# - fourth
system.time({
df1[, {
fct <- c("length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, .SDcols = 2:3, by = date]
})
#user system elapsed
#128.036 1.426 129.610
# - fifth
system.time({
df1[, {
fct <- c(N = "length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
given_names <- names(fct)
created_names <- paste(names(.SD), fct, sep = "_")
setnames(res,
if (is.null(given_names))
created_names
else
fifelse(given_names == "", created_names, given_names))
}, .SDcols = 2:3, by = date]
})
#user system elapsed
#131.960 1.552 133.595
-此 post
的解决方案时间
# - akrun
# - first
system.time({
df1 %>%
group_by(date) %>%
summarise(user = n(), across(c(turnover, profit), sum))
})
#user system elapsed
#15.920 0.372 16.322
# - second
system.time({
collap(df1, ~ date, custom = list(fsum = c("turnover", "profit"),
fNobs = "turnover"))
})
#user system elapsed
#0.311 0.005 0.316
以下是将 不同 函数应用于 data.table as requested by the OP in .
的 不同 列的方法
1。使用 c()
并分别调用 .SD
和 .SDcols
df[, c(.SD[, lapply(.SD, length), .SDcols = c("user")],
.SD[, lapply(.SD, sum), .SDcols = c("turnover", "profit")]), by = date]
date user turnover profit
1: 1 2 5 3
2: 2 2 9 7
这不是很优雅,很冗长,可能会降低性能,但可以完成工作 - 并且它保留了列名。
2。使用 purrr::map2()
df[, purrr::map2(list(length, sum, sum), .SD, \(fn, args) purrr::exec(fn, args)), by = date]
date V1 V2 V3
1: 1 2 5 3
2: 2 2 9 7
这不那么冗长,但不幸的是列名丢失了。
3。使用 purrr::map2()
并适当地命名列
df[, {
fct <- c("length", "sum", "sum")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, by = date]
date user_length turnover_sum profit_sum
1: 1 2 5 3
2: 2 2 9 7
如果使用 .SDcols
:
选择列,这也将起作用
df[, {
fct <- c("length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, .SDcols = 2:3, by = date]
date user_length turnover_mean
1: 1 2 2.5
2: 2 2 4.5
4。使用 purrr::map2()
和灵活的列命名
如果 fct
是命名向量并且函数已命名,给定的名称将用于相应的列。否则,将使用创建的名称:
df[, {
fct <- c(N = "length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
given_names <- names(fct)
created_names <- paste(names(.SD), fct, sep = "_")
setnames(res,
if (is.null(given_names))
created_names
else
fifelse(given_names == "", created_names, given_names))
}, .SDcols = 2:3, by = date]
date N turnover_mean
1: 1 2 2.5
2: 2 2 4.5
下午
假设我有这个 table:
df <- data.table(date = rep(c(1,2), each = 2)
, user = rep(c(1,2), 2)
, turnover = 2:5
, profit = 1:4
); df
date user turnover profit
1 1 2 1
1 2 3 2
2 1 4 3
2 2 5 4
如果我想对多列求和,我会:
# metrics
x <- c('user', 'turnover', 'profit')
# apply
df[, lapply(.SD, function(x) sum(x)), .SDcols=x, by=date]
给出:
date user turnover profit
1 3 5 3
2 3 9 7
但是,请注意对用户求和没有意义,相反我想要“用户”列的行数,即
date user turnover profit
1 2 5 3
2 2 9 7
假设我不想做一个 1 的虚拟列并求和,而是我坚持使用 apply
和 data.table
。我该怎么做?
谢谢。
这是一种可能。您可以将 c
和 lapply
函数组合在一起,如下所示(注意 .N
是每个组的行数):
df[, c(.(user=.N), lapply(.SD, sum)), by=date, .SDcols=c("turnover", "profit")]
# date user turnover profit
# 1: 1 2 5 3
# 2: 2 2 9 7
使用dplyr
和across
,做这些操作更灵活
library(dplyr)
df %>%
group_by(date) %>%
summarise(user = n(), across(c(turnover, profit), sum))
-输出
# A tibble: 2 x 4
date user turnover profit
<dbl> <int> <int> <int>
1 1 2 5 3
2 2 2 9 7
或者 collapse
中的另一种选择,来自同一团队构建 data.table
的唯一目的是提高效率。
library(collapse)
collap(df, ~ date, custom = list(fsum = c("turnover", "profit"),
fNobs = "turnover"))
date fsum.turnover fNobs.turnover fsum.profit
1: 1 5 2 3
2: 2 9 2 7
基准
在更大的数据集上测试
library(data.table)
library(dplyr)
library(collapse)
library(purrr)
# input data
set.seed(24)
df1 <- data.table(date = rep(1:1e6, each = 20),
user = rep(1:1e6, 20),
turnover = rnorm(1e6 * 20),
profit = rnorm(1e6 * 20))
# benchmarks
# - B. Christian Kamgang
system.time({
df1[, c(.(user=.N), lapply(.SD, sum)), by=date, .SDcols=c("turnover", "profit")]
})
#user system elapsed
#0.558 0.110 0.670
# - Uwe
# - first
system.time({
df1[, c(.SD[, lapply(.SD, length), .SDcols = c("user")],
.SD[, lapply(.SD, sum), .SDcols = c("turnover", "profit")]), by = date]
})
#Timing stopped at: 245.9 3.336 249.4 0 stopped as it was taking time
# - second
system.time({
df1[, purrr::map2(list(length, sum, sum), .SD, \(fn, args) purrr::exec(fn, args)), by = date]
})
#user system elapsed
#37.816 0.138 38.016
# - third
system.time({
df1[, {
fct <- c("length", "sum", "sum")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, by = date]
})
#user system elapsed
#134.966 1.530 136.620
# - fourth
system.time({
df1[, {
fct <- c("length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, .SDcols = 2:3, by = date]
})
#user system elapsed
#128.036 1.426 129.610
# - fifth
system.time({
df1[, {
fct <- c(N = "length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
given_names <- names(fct)
created_names <- paste(names(.SD), fct, sep = "_")
setnames(res,
if (is.null(given_names))
created_names
else
fifelse(given_names == "", created_names, given_names))
}, .SDcols = 2:3, by = date]
})
#user system elapsed
#131.960 1.552 133.595
-此 post
的解决方案时间# - akrun
# - first
system.time({
df1 %>%
group_by(date) %>%
summarise(user = n(), across(c(turnover, profit), sum))
})
#user system elapsed
#15.920 0.372 16.322
# - second
system.time({
collap(df1, ~ date, custom = list(fsum = c("turnover", "profit"),
fNobs = "turnover"))
})
#user system elapsed
#0.311 0.005 0.316
以下是将 不同 函数应用于 data.table as requested by the OP in
1。使用 c()
并分别调用 .SD
和 .SDcols
df[, c(.SD[, lapply(.SD, length), .SDcols = c("user")],
.SD[, lapply(.SD, sum), .SDcols = c("turnover", "profit")]), by = date]
date user turnover profit 1: 1 2 5 3 2: 2 2 9 7
这不是很优雅,很冗长,可能会降低性能,但可以完成工作 - 并且它保留了列名。
2。使用 purrr::map2()
df[, purrr::map2(list(length, sum, sum), .SD, \(fn, args) purrr::exec(fn, args)), by = date]
date V1 V2 V3 1: 1 2 5 3 2: 2 2 9 7
这不那么冗长,但不幸的是列名丢失了。
3。使用 purrr::map2()
并适当地命名列
df[, {
fct <- c("length", "sum", "sum")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, by = date]
date user_length turnover_sum profit_sum 1: 1 2 5 3 2: 2 2 9 7
如果使用 .SDcols
:
df[, {
fct <- c("length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
setnames(res, paste(names(.SD), fct, sep = "_"))
}, .SDcols = 2:3, by = date]
date user_length turnover_mean 1: 1 2 2.5 2: 2 2 4.5
4。使用 purrr::map2()
和灵活的列命名
如果 fct
是命名向量并且函数已命名,给定的名称将用于相应的列。否则,将使用创建的名称:
df[, {
fct <- c(N = "length", "mean")
res <- setDT(purrr::map2(fct, .SD, \(fn, args) purrr::exec(fn, args)))
given_names <- names(fct)
created_names <- paste(names(.SD), fct, sep = "_")
setnames(res,
if (is.null(given_names))
created_names
else
fifelse(given_names == "", created_names, given_names))
}, .SDcols = 2:3, by = date]
date N turnover_mean 1: 1 2 2.5 2: 2 2 4.5