Data.table 而不是 dplyr
Data.table instead of dplyr
我想更改 data.table
而不是 dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop'
中的 dplyr
。另外,我想知道您是否认为 data.table
的处理时间比 dplyr
快?
library(dplyr)
library(tidyr)
library(lubridate)
#database
df1 <- data.frame( Id = rep(1:5, length=100000),
date1 = as.Date( "2021-12-01"),
date2= rep(seq( as.Date("2021-01-01"), length.out=50000, by=1), each = 2),
Category = rep(c("ABC", "EFG"), length.out = 100000),
Week = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
"Saturday", "Sunday"), length.out = 100000),
DR1 = sample( 200:250, 100000, repl=TRUE),
setNames( replicate(365, { sample(0:100000, 100000)}, simplify=FALSE),
paste0("DRM", formatC(1:365, width = 2, format = "d", flag = "0"))))
subsetDRM<- df1 %>% select(starts_with("DRM"))
DR1_subsetDRM<-cbind (df1, setNames(df1$DR1 - subsetDRM, paste0(names(subsetDRM), "_PV")))
subset_PV<-select(DR1_subsetDRM,Id, date2,Week, Category, DR1, ends_with("PV"))
result_median<-subset_PV %>%
group_by(Id,Category,Week) %>%
dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop')
如果我们想直接翻译成 data.table
,也许下面的代码会有所帮助
library(data.table)
# convert data.frame to data.table - setDT
setDT(df1)
# subset the columns of DRM to create subsetDRM
subsetDRM <- df1[, .SD, .SDcols = patterns("^DRM")]
# subtract the subsetDRM from df1
DR1_subsetDRM <- cbind(df1, setNames(df1$DR1 -
subsetDRM, paste0(names(subsetDRM), "_PV")))
pv_nm1 <- names(DR1_subsetDRM)[endsWith(names(DR1_subsetDRM), "PV")]
nm1 <- c("Id", "date2", "Week", "Category", "DR1", pv_nm1)
# create the subset_PV
subset_PV <- DR1_subsetDRM[, .SD, .SDcols = nm1]
# do a group by median
result_median <- subset_PV[, lapply(.SD, median),
by = .(Id, Category, Week), .SDcols = pv_nm1]
我认为 akrun 的回答提供了很好的 expression-for-expression 翻译。不过,如果您不需要重复这些步骤,您可以试试这个:
library(data.table)
dt1 <- as.data.table(df1)
cols <- grep("^DRM", colnames(dt1), value = TRUE)
dt1_results_median <-
dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]
相对表现,两个答案都有小幅提高 (33-41%):
bench::mark(OP = {...}, akrun = {...}, r2evans = {...}, check = FALSE, iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 OP 1.61s 1.72s 0.583 2.01GB 3.15 10 54 17.2s <NULL> <Rprofmem~ <benc~ <tibbl~
# 2 akrun 1.24s 1.29s 0.773 2.29GB 1.47 10 19 12.9s <NULL> <Rprofmem~ <benc~ <tibbl~
# 3 r2evans 1.19s 1.21s 0.823 1.88GB 1.65 10 20 12.2s <NULL> <Rprofmem~ <benc~ <tibbl~
我认识到 akrun 的回答可能更像是一个教学时刻,将 R 的一种方言翻译成另一种方言,因此速度差异是“没有实际意义的”。在这个比较中 run-time,我认为更重要的是使用更具可读性和易理解性的代码,这使得它更易于维护并且更容易排除故障 if/when 你需要改变你的方法。
我想更改 data.table
而不是 dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop'
中的 dplyr
。另外,我想知道您是否认为 data.table
的处理时间比 dplyr
快?
library(dplyr)
library(tidyr)
library(lubridate)
#database
df1 <- data.frame( Id = rep(1:5, length=100000),
date1 = as.Date( "2021-12-01"),
date2= rep(seq( as.Date("2021-01-01"), length.out=50000, by=1), each = 2),
Category = rep(c("ABC", "EFG"), length.out = 100000),
Week = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
"Saturday", "Sunday"), length.out = 100000),
DR1 = sample( 200:250, 100000, repl=TRUE),
setNames( replicate(365, { sample(0:100000, 100000)}, simplify=FALSE),
paste0("DRM", formatC(1:365, width = 2, format = "d", flag = "0"))))
subsetDRM<- df1 %>% select(starts_with("DRM"))
DR1_subsetDRM<-cbind (df1, setNames(df1$DR1 - subsetDRM, paste0(names(subsetDRM), "_PV")))
subset_PV<-select(DR1_subsetDRM,Id, date2,Week, Category, DR1, ends_with("PV"))
result_median<-subset_PV %>%
group_by(Id,Category,Week) %>%
dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop')
如果我们想直接翻译成 data.table
,也许下面的代码会有所帮助
library(data.table)
# convert data.frame to data.table - setDT
setDT(df1)
# subset the columns of DRM to create subsetDRM
subsetDRM <- df1[, .SD, .SDcols = patterns("^DRM")]
# subtract the subsetDRM from df1
DR1_subsetDRM <- cbind(df1, setNames(df1$DR1 -
subsetDRM, paste0(names(subsetDRM), "_PV")))
pv_nm1 <- names(DR1_subsetDRM)[endsWith(names(DR1_subsetDRM), "PV")]
nm1 <- c("Id", "date2", "Week", "Category", "DR1", pv_nm1)
# create the subset_PV
subset_PV <- DR1_subsetDRM[, .SD, .SDcols = nm1]
# do a group by median
result_median <- subset_PV[, lapply(.SD, median),
by = .(Id, Category, Week), .SDcols = pv_nm1]
我认为 akrun 的回答提供了很好的 expression-for-expression 翻译。不过,如果您不需要重复这些步骤,您可以试试这个:
library(data.table)
dt1 <- as.data.table(df1)
cols <- grep("^DRM", colnames(dt1), value = TRUE)
dt1_results_median <-
dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]
相对表现,两个答案都有小幅提高 (33-41%):
bench::mark(OP = {...}, akrun = {...}, r2evans = {...}, check = FALSE, iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 OP 1.61s 1.72s 0.583 2.01GB 3.15 10 54 17.2s <NULL> <Rprofmem~ <benc~ <tibbl~
# 2 akrun 1.24s 1.29s 0.773 2.29GB 1.47 10 19 12.9s <NULL> <Rprofmem~ <benc~ <tibbl~
# 3 r2evans 1.19s 1.21s 0.823 1.88GB 1.65 10 20 12.2s <NULL> <Rprofmem~ <benc~ <tibbl~
我认识到 akrun 的回答可能更像是一个教学时刻,将 R 的一种方言翻译成另一种方言,因此速度差异是“没有实际意义的”。在这个比较中 run-time,我认为更重要的是使用更具可读性和易理解性的代码,这使得它更易于维护并且更容易排除故障 if/when 你需要改变你的方法。