R 中 data.table 多列数据的累积计数
Cumulative Count of Multiple Columns of a data.table in R
对于下面给出的示例数据表,我能够计算出各分类列的累积计数,但是当数据集大得多时,cumcount 函数非常慢。我正在寻找比下面给出的 cumcount 函数更快的替代方法。预期输出是下面的 totalCount 变量。
library(purrrlyr)
require(stringr)
require(data.table)
# Cumulative count function
#' Zero-based cumulative count of each value in a vector
#'
#' For every position `i`, returns how many earlier elements of `x` are equal
#' to `x[i]` (so the first occurrence of a value gets 0, the second 1, ...).
#'
#' @param x An atomic vector or factor of category labels.
#' @return A double vector the same length as `x`, named by the values of
#'   `x`. Empty input yields an empty vector. `NA` is counted as a category
#'   of its own rather than poisoning every later count.
cumcount <- function(x) {
  n <- length(x)
  counts <- numeric(n)
  if (n > 0L) {
    # match() maps each element to the index of its first occurrence among
    # the unique values; NA matches NA, so the group ids never contain NA.
    group_id <- match(x, unique(x))
    # Number the elements within each group in order of appearance; this
    # replaces the quadratic "rescan the prefix" loop.
    counts[] <- stats::ave(seq_len(n), group_id, FUN = seq_along) - 1
  }
  names(counts) <- x
  counts
}
# Example data: two categorical columns plus a binary target column
cat_var <- c("rock", "indie", "rock", "rock", "pop", "indie", "pop", "rock")
cat_var_2 <- c("blue", "green", "red", "red", "blue", "red", "green", "blue")
target_var <- c(0, 0, 1, 1, 1, 1, 0, 1)
df <- data.table(
  categorical_variable = cat_var,
  categorical_variable_2 = cat_var_2,
  target_variable = target_var
)

# Apply cumcount() to each categorical column; sapply() binds the per-column
# results into a matrix with one column per entry of `nms`
nms <- c("categorical_variable", "categorical_variable_2")
totalCount <- sapply(df[, nms, with = FALSE], cumcount)
可以尝试 data.table 的内置函数 rowid:
# rowid() numbers the rows within each group of equal values starting at 1,
# so subtracting 1 gives a zero-based cumulative count for every column
# listed in `nms`
df[, lapply(.SD, function(col) rowid(col) - 1), .SDcols = nms]
对于下面给出的示例数据表,我能够计算出各分类列的累积计数,但是当数据集大得多时,cumcount 函数非常慢。我正在寻找比下面给出的 cumcount 函数更快的替代方法。预期输出是下面的 totalCount 变量。
library(purrrlyr)
require(stringr)
require(data.table)
# Cumulative count function
#' Zero-based cumulative count of each value in a vector
#'
#' For every position `i`, returns how many earlier elements of `x` are equal
#' to `x[i]` (so the first occurrence of a value gets 0, the second 1, ...).
#'
#' @param x An atomic vector or factor of category labels.
#' @return A double vector the same length as `x`, named by the values of
#'   `x`. Empty input yields an empty vector. `NA` is counted as a category
#'   of its own rather than poisoning every later count.
cumcount <- function(x) {
  n <- length(x)
  counts <- numeric(n)
  if (n > 0L) {
    # match() maps each element to the index of its first occurrence among
    # the unique values; NA matches NA, so the group ids never contain NA.
    group_id <- match(x, unique(x))
    # Number the elements within each group in order of appearance; this
    # replaces the quadratic "rescan the prefix" loop.
    counts[] <- stats::ave(seq_len(n), group_id, FUN = seq_along) - 1
  }
  names(counts) <- x
  counts
}
# Example data: two categorical columns plus a binary target column
cat_var <- c("rock", "indie", "rock", "rock", "pop", "indie", "pop", "rock")
cat_var_2 <- c("blue", "green", "red", "red", "blue", "red", "green", "blue")
target_var <- c(0, 0, 1, 1, 1, 1, 0, 1)
df <- data.table(
  categorical_variable = cat_var,
  categorical_variable_2 = cat_var_2,
  target_variable = target_var
)

# Apply cumcount() to each categorical column; sapply() binds the per-column
# results into a matrix with one column per entry of `nms`
nms <- c("categorical_variable", "categorical_variable_2")
totalCount <- sapply(df[, nms, with = FALSE], cumcount)
可以尝试 data.table 的内置函数 rowid:
# rowid() numbers the rows within each group of equal values starting at 1,
# so subtracting 1 gives a zero-based cumulative count for every column
# listed in `nms`
df[, lapply(.SD, function(col) rowid(col) - 1), .SDcols = nms]