如何使用正则表达式 group_by 和汇总多个变量?
How to group_by and summarize multiple variables using regex?
我想使用正则表达式来确定用于 group_by 的变量,并高效地汇总我的数据。我无法逐个变量手动处理,因为需要汇总的变量很多,而且每次都需要动态传入 group_by 的变量。data.table
可以通过正则表达式传入分组变量,但不能以同样的方式传入要汇总的变量。到目前为止,我用 tidyverse 所做的尝试也没有成功。任何帮助都将不胜感激。
My data:
tempDF <- structure(list(d1 = c("A", "B", "C", "A", "C"), d2 = c(40L, 50L, 20L, 50L, 20L),
d3 = c(20L, 40L, 50L, 40L, 50L), d4 = c(60L, 30L, 30L,60L, 30L), p_A = c(1L,
3L, 2L, 3L, 2L), p_B = c(3L, 4L, 3L, 3L, 4L), p_C = c(2L, 1L, 1L,2L, 1L), p4 = c(5L,
5L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA, -5L))
View(tempDF)
lLevels<-c("d1")
lContinuum<-c("p_A", "p_B", "p_C")
My attempts:
setDT(tempDF)[ , list(group_means = mean(eval((paste0(lContinuum)))), by=eval((paste0(lLevels))))]
group_means by
1: NA d1
Warning message:
In mean.default(eval((paste0(lContinuum)))) :
argument is not numeric or logical: returning NA
But a single variable works:
setDT(tempDF)[ , list(group_means = mean(p_A)), by=eval((paste0(lLevels)))]
setDT(tempDF)[ , list(group_means = mean(p_B)), by=eval((paste0(lLevels)))]
setDT(tempDF)[ , list(group_means = mean(p_C)), by=eval((paste0(lLevels)))]
Expected output:
tempDF %>%
group_by(d1) %>%
summarise(p_A_mean = mean(p_A), p_B_mean = mean(p_B), p_C_mean = mean(p_C))
# A tibble: 3 x 4
d1 p_A_mean p_B_mean p_C_mean
<chr> <dbl> <dbl> <dbl>
1 A 2 3 2
2 B 3 4 1
3 C 2 3.5 1
虽然看起来有点迂回,但先将数据重塑为长格式后,不仅可以按 d1 分组,还可以对 p_A … p_C 等多个变量一并进行分组汇总。
编辑:还添加了通过正则表达式保留某些列(d_cols)的代码。
library(tidyverse)
tempDF <- structure(
list(d1 = c("A", "B", "C", "A", "C"),
d2 = c(40L, 50L, 20L, 50L, 20L),
d3 = c(20L, 40L, 50L, 40L, 50L),
d4 = c(60L, 30L, 30L,60L, 30L),
d5 = c("AA", "BB", "CC", "AA", "CC"),
p_A = c(1L, 3L, 2L, 3L, 2L),
p_B = c(3L, 4L, 3L, 3L, 4L),
p_C = c(2L, 1L, 1L,2L, 1L),
p4 = c(5L, 5L, 4L, 5L, 4L)),
class = "data.frame",
row.names = c(NA, -5L))
# Names of the d* columns to keep as grouping keys, picked by regex.
d_cols <- str_subset(colnames(tempDF), "d[15]")
tempDF %>%
  # Stack every column whose name matches "p_" into name/value pairs.
  pivot_longer(cols = matches("p_")) %>%
  group_by(!!!syms(d_cols), name) %>%
  summarize(mean = mean(value)) %>%
  # all_of() marks d_cols as an external character vector of column names;
  # passing the bare vector to a tidyselect argument is ambiguous/deprecated.
  pivot_wider(id_cols = all_of(d_cols),
              names_from = name,
              values_from = mean,
              names_prefix = "mean_")
#> # A tibble: 3 x 5
#> # Groups: d1, d5 [3]
#> d1 d5 mean_p_A mean_p_B mean_p_C
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 A AA 2 3 2
#> 2 B BB 3 4 1
#> 3 C CC 2 3.5 1
由 reprex package (v0.3.0) 于 2019-10-19 创建
我确信这段代码还可以写得更高效、更简洁,但它已经满足要求:
# Group-wise means of every numeric column (base R split-apply-combine).
#
# Args:
#   df:           a data.frame.
#   grouping_var: the grouping column, written as `df$col` (or a bare column
#                 name); only the text after the last `$` is used, as a name.
# Returns:
#   A named list with one element per group; each element is a named numeric
#   vector of column means. Combine with do.call(rbind, ...) to get a matrix.
summarise_df <- function(df, grouping_var) {
  # Recover the column name from the unevaluated argument (`df$d1` -> "d1").
  grouping_vec <- gsub(".*[$]", "", deparse(substitute(grouping_var)))
  # vapply() always yields a logical mask, whatever the input looks like.
  is_num <- vapply(df, is.numeric, logical(1))
  # drop = FALSE keeps a data.frame even when only one numeric column exists;
  # without it the column collapses to a vector and the per-group "means"
  # would be computed element by element.
  groups <- split(df[, is_num, drop = FALSE], df[[grouping_vec]])
  # End on the value itself (not an assignment) so the result is visible.
  lapply(groups, function(x) vapply(x, mean, numeric(1)))
}
# NOTE(review): assumes `df` already holds the input data frame
# (e.g. `df <- tempDF`) -- confirm before running.
# Stack the per-group mean vectors into a matrix; row names are the groups.
tmp <- do.call(rbind, summarise_df(df, df$d1))
# Hand the matrix to data.frame() directly: cbind()-ing it with the character
# group labels first would coerce every mean to a string.
df <- data.frame(d1 = row.names(tmp), tmp, row.names = NULL)
汇总变量也可以动态指定:
#
# Group-wise means of a caller-chosen set of columns (base R).
#
# Args:
#   df:           a data.frame.
#   grouping_var: the grouping column, written as `df$col` (or a bare column
#                 name); only the text after the last `$` is used, as a name.
#   summary_vars: character vector naming the columns to average.
# Returns:
#   A named list with one element per group; each element is a named numeric
#   vector of column means. Combine with do.call(rbind, ...) to get a matrix.
summarise_df <- function(df, grouping_var, summary_vars) {
  # Recover the column name from the unevaluated argument (`df$d1` -> "d1").
  grouping_vec <- gsub(".*[$]", "", deparse(substitute(grouping_var)))
  # drop = FALSE keeps a data.frame when summary_vars names a single column;
  # without it the column collapses to a vector and the per-group "means"
  # would be computed element by element.
  groups <- split(df[, summary_vars, drop = FALSE], df[[grouping_vec]])
  # End on the value itself (not an assignment) so the result is visible.
  lapply(groups, function(x) vapply(x, mean, numeric(1)))
}
# NOTE(review): assumes `df` already holds the input data frame
# (e.g. `df <- tempDF`) -- confirm before running.
# Stack the per-group mean vectors into a matrix; row names are the groups.
tmp <- do.call(rbind, summarise_df(df, df$d1, c("p_A", "p_B", "p_C")))
# Hand the matrix to data.frame() directly: cbind()-ing it with the character
# group labels first would coerce every mean to a string.
tmp_df <- data.frame(d1 = row.names(tmp), tmp, row.names = NULL)
data.table方法非常简单:
library(data.table)
# Convert tempDF to a data.table by reference, then, within each group given
# by the column names in lLevels, average every column listed in lContinuum.
setDT(tempDF)[, lapply(.SD, mean), by = lLevels, .SDcols = lContinuum]
d1 p_A p_B p_C
1: A 2 3.0 2
2: B 3 4.0 1
3: C 2 3.5 1
dplyr 中的类似方法是:
library(dplyr)
# Group by the columns named in lLevels and average the columns named in
# lContinuum. across() + all_of() replaces the superseded *_at() verbs and
# makes it explicit that both are external character vectors of column names.
tempDF %>%
  group_by(across(all_of(lLevels))) %>%
  summarise(across(all_of(lContinuum), mean))
# A tibble: 3 x 4
d1 p_A p_B p_C
<chr> <dbl> <dbl> <dbl>
1 A 2 3 2
2 B 3 4 1
3 C 2 3.5 1
无论哪种情况,您都可以将 lLevels 和 lContinuum 替换为正则表达式。dplyr 方案还可以使用 select 辅助函数,例如 starts_with() 和 ends_with():
https://www.rdocumentation.org/packages/tidyselect/versions/0.2.5/topics/select_helpers
我想使用正则表达式来确定用于 group_by 的变量,并高效地汇总我的数据。我无法逐个变量手动处理,因为需要汇总的变量很多,而且每次都需要动态传入 group_by 的变量。data.table
可以通过正则表达式传入分组变量,但不能以同样的方式传入要汇总的变量。到目前为止,我用 tidyverse 所做的尝试也没有成功。任何帮助都将不胜感激。
My data:
tempDF <- structure(list(d1 = c("A", "B", "C", "A", "C"), d2 = c(40L, 50L, 20L, 50L, 20L),
d3 = c(20L, 40L, 50L, 40L, 50L), d4 = c(60L, 30L, 30L,60L, 30L), p_A = c(1L,
3L, 2L, 3L, 2L), p_B = c(3L, 4L, 3L, 3L, 4L), p_C = c(2L, 1L, 1L,2L, 1L), p4 = c(5L,
5L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA, -5L))
View(tempDF)
lLevels<-c("d1")
lContinuum<-c("p_A", "p_B", "p_C")
My attempts:
setDT(tempDF)[ , list(group_means = mean(eval((paste0(lContinuum)))), by=eval((paste0(lLevels))))]
group_means by
1: NA d1
Warning message:
In mean.default(eval((paste0(lContinuum)))) :
argument is not numeric or logical: returning NA
But a single variable works:
setDT(tempDF)[ , list(group_means = mean(p_A)), by=eval((paste0(lLevels)))]
setDT(tempDF)[ , list(group_means = mean(p_B)), by=eval((paste0(lLevels)))]
setDT(tempDF)[ , list(group_means = mean(p_C)), by=eval((paste0(lLevels)))]
Expected output:
tempDF %>%
group_by(d1) %>%
summarise(p_A_mean = mean(p_A), p_B_mean = mean(p_B), p_C_mean = mean(p_C))
# A tibble: 3 x 4
d1 p_A_mean p_B_mean p_C_mean
<chr> <dbl> <dbl> <dbl>
1 A 2 3 2
2 B 3 4 1
3 C 2 3.5 1
虽然看起来有点迂回,但先将数据重塑为长格式后,不仅可以按 d1 分组,还可以对 p_A … p_C 等多个变量一并进行分组汇总。
编辑:还添加了通过正则表达式保留某些列(d_cols)的代码。
library(tidyverse)
tempDF <- structure(
list(d1 = c("A", "B", "C", "A", "C"),
d2 = c(40L, 50L, 20L, 50L, 20L),
d3 = c(20L, 40L, 50L, 40L, 50L),
d4 = c(60L, 30L, 30L,60L, 30L),
d5 = c("AA", "BB", "CC", "AA", "CC"),
p_A = c(1L, 3L, 2L, 3L, 2L),
p_B = c(3L, 4L, 3L, 3L, 4L),
p_C = c(2L, 1L, 1L,2L, 1L),
p4 = c(5L, 5L, 4L, 5L, 4L)),
class = "data.frame",
row.names = c(NA, -5L))
# Names of the d* columns to keep as grouping keys, picked by regex.
d_cols <- str_subset(colnames(tempDF), "d[15]")
tempDF %>%
  # Stack every column whose name matches "p_" into name/value pairs.
  pivot_longer(cols = matches("p_")) %>%
  group_by(!!!syms(d_cols), name) %>%
  summarize(mean = mean(value)) %>%
  # all_of() marks d_cols as an external character vector of column names;
  # passing the bare vector to a tidyselect argument is ambiguous/deprecated.
  pivot_wider(id_cols = all_of(d_cols),
              names_from = name,
              values_from = mean,
              names_prefix = "mean_")
#> # A tibble: 3 x 5
#> # Groups: d1, d5 [3]
#> d1 d5 mean_p_A mean_p_B mean_p_C
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 A AA 2 3 2
#> 2 B BB 3 4 1
#> 3 C CC 2 3.5 1
由 reprex package (v0.3.0) 于 2019-10-19 创建
我确信这段代码还可以写得更高效、更简洁,但它已经满足要求:
# Group-wise means of every numeric column (base R split-apply-combine).
#
# Args:
#   df:           a data.frame.
#   grouping_var: the grouping column, written as `df$col` (or a bare column
#                 name); only the text after the last `$` is used, as a name.
# Returns:
#   A named list with one element per group; each element is a named numeric
#   vector of column means. Combine with do.call(rbind, ...) to get a matrix.
summarise_df <- function(df, grouping_var) {
  # Recover the column name from the unevaluated argument (`df$d1` -> "d1").
  grouping_vec <- gsub(".*[$]", "", deparse(substitute(grouping_var)))
  # vapply() always yields a logical mask, whatever the input looks like.
  is_num <- vapply(df, is.numeric, logical(1))
  # drop = FALSE keeps a data.frame even when only one numeric column exists;
  # without it the column collapses to a vector and the per-group "means"
  # would be computed element by element.
  groups <- split(df[, is_num, drop = FALSE], df[[grouping_vec]])
  # End on the value itself (not an assignment) so the result is visible.
  lapply(groups, function(x) vapply(x, mean, numeric(1)))
}
# NOTE(review): assumes `df` already holds the input data frame
# (e.g. `df <- tempDF`) -- confirm before running.
# Stack the per-group mean vectors into a matrix; row names are the groups.
tmp <- do.call(rbind, summarise_df(df, df$d1))
# Hand the matrix to data.frame() directly: cbind()-ing it with the character
# group labels first would coerce every mean to a string.
df <- data.frame(d1 = row.names(tmp), tmp, row.names = NULL)
汇总变量也可以动态指定:
#
# Group-wise means of a caller-chosen set of columns (base R).
#
# Args:
#   df:           a data.frame.
#   grouping_var: the grouping column, written as `df$col` (or a bare column
#                 name); only the text after the last `$` is used, as a name.
#   summary_vars: character vector naming the columns to average.
# Returns:
#   A named list with one element per group; each element is a named numeric
#   vector of column means. Combine with do.call(rbind, ...) to get a matrix.
summarise_df <- function(df, grouping_var, summary_vars) {
  # Recover the column name from the unevaluated argument (`df$d1` -> "d1").
  grouping_vec <- gsub(".*[$]", "", deparse(substitute(grouping_var)))
  # drop = FALSE keeps a data.frame when summary_vars names a single column;
  # without it the column collapses to a vector and the per-group "means"
  # would be computed element by element.
  groups <- split(df[, summary_vars, drop = FALSE], df[[grouping_vec]])
  # End on the value itself (not an assignment) so the result is visible.
  lapply(groups, function(x) vapply(x, mean, numeric(1)))
}
# NOTE(review): assumes `df` already holds the input data frame
# (e.g. `df <- tempDF`) -- confirm before running.
# Stack the per-group mean vectors into a matrix; row names are the groups.
tmp <- do.call(rbind, summarise_df(df, df$d1, c("p_A", "p_B", "p_C")))
# Hand the matrix to data.frame() directly: cbind()-ing it with the character
# group labels first would coerce every mean to a string.
tmp_df <- data.frame(d1 = row.names(tmp), tmp, row.names = NULL)
data.table方法非常简单:
library(data.table)
# Convert tempDF to a data.table by reference, then, within each group given
# by the column names in lLevels, average every column listed in lContinuum.
setDT(tempDF)[, lapply(.SD, mean), by = lLevels, .SDcols = lContinuum]
d1 p_A p_B p_C
1: A 2 3.0 2
2: B 3 4.0 1
3: C 2 3.5 1
dplyr 中的类似方法是:
library(dplyr)
# Group by the columns named in lLevels and average the columns named in
# lContinuum. across() + all_of() replaces the superseded *_at() verbs and
# makes it explicit that both are external character vectors of column names.
tempDF %>%
  group_by(across(all_of(lLevels))) %>%
  summarise(across(all_of(lContinuum), mean))
# A tibble: 3 x 4
d1 p_A p_B p_C
<chr> <dbl> <dbl> <dbl>
1 A 2 3 2
2 B 3 4 1
3 C 2 3.5 1
无论哪种情况,您都可以将 lLevels 和 lContinuum 替换为正则表达式。dplyr 方案还可以使用 select 辅助函数,例如 starts_with() 和 ends_with():
https://www.rdocumentation.org/packages/tidyselect/versions/0.2.5/topics/select_helpers