无法将 lapply 与 data.table 一起使用
unable to use lapply with data.table
我正在尝试为 data.table 中的所有字符型变量生成摘要,主要是想得到观测总数、缺失值个数、出现频率最高的类别等。但是我无法正确使用 lapply。下面是一个可重现的例子。
library(data.table)
# Summarise a single variable.
#
# Args:
#   x: a one-column data.table holding the variable to analyse.
#
# Returns:
#   A one-row data.table with the columns total_obs, missing_obs,
#   unique_cats (NA counts as a category), top_cat and top_freq.
analyze_char_var <- function(x) {
  y <- names(x)
  # Frequency table: one row per distinct value, its count in column N.
  z <- x[, .N, by = y]
  # setorder() sorts by reference, so the result needs no re-assignment.
  setorder(z, -N)
  data.table(
    total_obs = nrow(x),
    missing_obs = sum(is.na(x)),
    unique_cats = nrow(z),
    # Pull out plain scalars with [[ ]]. z[1, 1] gives a nested one-cell
    # data.table (or, in old data.table releases where a numeric j was treated
    # as a constant, just the number 1), which garbles the result columns.
    top_cat = z[[1L]][1L],
    top_freq = z[[2L]][1L]
  )
}
# Analyse every character column of a data.table with an explicit loop.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char <- function(dt) {
  # vapply() + is.character() always yields one logical per column, whereas
  # sapply(dt, class) returns a list as soon as a column has several classes.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  mylist <- vector("list", length(dt.char))
  # seq_along() runs zero times for zero columns; 1:length() would visit
  # the indices 1 and 0 and fail on the missing column.
  for (i in seq_along(dt.char)) {
    x <- dt.char[, i, with = FALSE]
    mylist[[i]] <- analyze_char_var(x)
  }
  mylist
}
# Sample data: two character columns (var3 contains a missing value) and one
# integer column that the analysis is expected to skip.
dt <- data.table(
  var1 = c("a", "a", "b", "b", "c"),
  var2 = 1:5,
  var3 = c("low", "low", "high", "med", NA)
)

# Summarise every character column of the sample data.
dt.analysis <- analyze_all_char(dt)
如果直接使用 dt.analysis = dt.char[,lapply(.SD,analyze_char_var)],
会产生错误 Error in x[, .N, by = y] : incorrect number of dimensions。
我尝试了几种变体,但都无法正常工作。
编辑:最终下面的写法对我有效,不过看起来很笨拙:先把输入向量重新转换为 data.table,然后像对 data.frame 那样使用 lapply。
# Summarise one column passed as a plain vector (as lapply() hands it over).
#
# Args:
#   x: an atomic vector.
#
# Returns:
#   A one-row data.table with the columns total_obs, missing_obs,
#   unique_cats (NA counts as a category), top_cat and top_freq.
test_func <- function(x) {
  # Name the column explicitly instead of depending on the name that
  # as.data.table() derives from the argument expression.
  dt <- data.table(x = x)
  dt.summ <- dt[, .N, by = "x"]
  # setorder() sorts by reference: most frequent value first.
  setorder(dt.summ, -N)
  data.table(
    total_obs = nrow(dt),
    missing_obs = sum(is.na(x)),
    unique_cats = nrow(dt.summ),
    # [[ ]] extracts scalars; dt.summ[1, 1] would embed a one-cell data.table.
    top_cat = dt.summ[["x"]][1L],
    top_freq = dt.summ[["N"]][1L]
  )
}
# Keep only the character columns, then summarise each one as a vector.
# vapply() + is.character() always returns one logical per column; comparing
# sapply(dt, class) with "character" is fragile for multi-class columns.
dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
lapply(dt.char, test_func)
应该这样做:
# Analyse every character column of a data.table using lapply().
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # seq_along() is empty for zero columns, unlike 1:length().
  lapply(seq_along(dt.char), function(i) {
    x <- dt.char[, i, with = FALSE]
    analyze_char_var(x)
  })
}
对它做基准测试,你不会看到明显的性能提升。如果追求性能,我建议直接使用 data.table 操作来完成计算。
我增大了数据集的规模,并比较了 for 循环、lapply 和 @Frank 的解决方案。明显的赢家是 data.table!
Unit: milliseconds
expr min lq mean median uq max neval cld
forloop 4.070700 4.685024 7.220436 6.709425 8.564480 35.81166 500 b
lapply 3.988765 4.750347 7.367764 6.815147 8.613754 56.58692 500 b
lapply1 4.008022 4.728257 7.390874 6.786074 8.551453 51.31551 500 b
dtf 2.984400 3.320825 5.451909 4.699372 6.661660 40.85501 500 a
完整代码:
# Benchmark input: the five-row sample repeated 100 times.
dt <- data.table(
  var1 = rep(c("a", "a", "b", "b", "c"), times = 100),
  var2 = rep(1:5, times = 100),
  var3 = rep(c("low", "low", "high", "med", NA), times = 100)
)
# Benchmark variant "forloop": analyse each character column in a for loop.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # Preallocate the result instead of growing the list inside the loop.
  mylist <- vector("list", length(dt.char))
  # seq_along() is empty for zero columns, unlike 1:length().
  for (i in seq_along(dt.char)) {
    x <- dt.char[, i, with = FALSE]
    mylist[[i]] <- analyze_char_var(x)
  }
  mylist
}
# Benchmark variant "lapply": analyse each character column via lapply(),
# binding the one-column subset to a local name first.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char_l <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # seq_along() is empty for zero columns, unlike 1:length().
  lapply(seq_along(dt.char), function(i) {
    x <- dt.char[, i, with = FALSE]
    analyze_char_var(x)
  })
}
# Benchmark variant "lapply1": like analyze_all_char_l(), but the one-column
# subset is passed straight to analyze_char_var() without a local name.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char_l1 <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # seq_along() is empty for zero columns, unlike 1:length().
  lapply(seq_along(dt.char), function(i) {
    analyze_char_var(dt.char[, i, with = FALSE])
  })
}
# Benchmark variant "dtf": summarise all character columns in one grouped
# data.table pass over the long-format data.
#
# Args:
#   data: the table to summarise. Defaults to the `dt` object found from the
#     function's enclosing environment, so a bare dtf() call still works.
#
# Returns:
#   A data.table with one row per character column and the columns
#   variable, NOBS, NNA, NVALS, HIVAL and NHI.
dtf <- function(data = dt) {
  # Select the character columns first, as a separate statement, rather than
  # assigning inside the melt() call.
  dt.char <- Filter(is.character, data)
  # Spell out measure.vars; the abbreviated `meas` relied on partial matching.
  long <- melt(dt.char, measure.vars = names(dt.char))
  long[, {
    # Per-variable frequency table, most frequent value first (ties by value).
    tabula <- setDT(list(value))[, .N, by = "V1"][order(-N, V1)]
    .(
      NOBS = .N,
      NNA = sum(is.na(value)),
      NVALS = nrow(tabula),
      HIVAL = tabula$V1[1L],
      NHI = tabula$N[1L]
    )
  }, by = variable]
}
# Run each implementation once so its output can be inspected.
analyze_all_char(dt)
analyze_all_char_l(dt)
analyze_all_char_l1(dt)
dtf()

# Time the four implementations, 500 evaluations each.
library(microbenchmark)
mc <- microbenchmark(
  forloop = analyze_all_char(dt),
  lapply = analyze_all_char_l(dt),
  lapply1 = analyze_all_char_l1(dt),
  dtf = dtf(),
  times = 500
)
mc
I am trying to create a summary of all character variables in a data.table. Basically to get total observation count, missing values, category with highest frequency etc.
由于所有感兴趣的列类型相同,您可以用 melt 将数据转换为长格式:
# Select the character columns, stack them into long format (variable,
# value), then summarise each original column in one grouped pass.
dt.char <- Filter(is.character, dt)
# measure.vars is spelled out; the abbreviated `meas` relied on partial
# argument matching.
melt(dt.char, measure.vars = names(dt.char))[, {
  # Per-variable frequency table, most frequent value first (ties by value).
  tabula <- setDT(list(value))[, .N, by = "V1"][order(-N, V1)]
  .(
    NOBS = .N,
    NNA = sum(is.na(value)),
    NVALS = nrow(tabula),
    HIVAL = tabula$V1[1L],
    NHI = tabula$N[1L]
  )
}, by = variable]
# variable NOBS NNA NVALS HIVAL NHI
# 1: var1 5 0 3 a 2
# 2: var3 5 1 4 low 2
如果不想把 NA 算作一个类别(它会计入 NVALS,也可能出现在 HIVAL、NHI 中),请将上面的 [, .N, by="V1"] 改为 [!is.na(V1), .N, by="V1"]。
我不确定性能对这项任务是否重要,不过这种写法应该相当高效。
我正在尝试为 data.table 中的所有字符型变量生成摘要,主要是想得到观测总数、缺失值个数、出现频率最高的类别等。但是我无法正确使用 lapply。下面是一个可重现的例子。
library(data.table)
# Summarise a single variable.
#
# Args:
#   x: a one-column data.table holding the variable to analyse.
#
# Returns:
#   A one-row data.table with the columns total_obs, missing_obs,
#   unique_cats (NA counts as a category), top_cat and top_freq.
analyze_char_var <- function(x) {
  y <- names(x)
  # Frequency table: one row per distinct value, its count in column N.
  z <- x[, .N, by = y]
  # setorder() sorts by reference, so the result needs no re-assignment.
  setorder(z, -N)
  data.table(
    total_obs = nrow(x),
    missing_obs = sum(is.na(x)),
    unique_cats = nrow(z),
    # Pull out plain scalars with [[ ]]. z[1, 1] gives a nested one-cell
    # data.table (or, in old data.table releases where a numeric j was treated
    # as a constant, just the number 1), which garbles the result columns.
    top_cat = z[[1L]][1L],
    top_freq = z[[2L]][1L]
  )
}
# Analyse every character column of a data.table with an explicit loop.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char <- function(dt) {
  # vapply() + is.character() always yields one logical per column, whereas
  # sapply(dt, class) returns a list as soon as a column has several classes.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  mylist <- vector("list", length(dt.char))
  # seq_along() runs zero times for zero columns; 1:length() would visit
  # the indices 1 and 0 and fail on the missing column.
  for (i in seq_along(dt.char)) {
    x <- dt.char[, i, with = FALSE]
    mylist[[i]] <- analyze_char_var(x)
  }
  mylist
}
# Sample data: two character columns (var3 contains a missing value) and one
# integer column that the analysis is expected to skip.
dt <- data.table(
  var1 = c("a", "a", "b", "b", "c"),
  var2 = 1:5,
  var3 = c("low", "low", "high", "med", NA)
)

# Summarise every character column of the sample data.
dt.analysis <- analyze_all_char(dt)
如果直接使用 dt.analysis = dt.char[,lapply(.SD,analyze_char_var)],
会产生错误 Error in x[, .N, by = y] : incorrect number of dimensions。
我尝试了几种变体,但都无法正常工作。
编辑:最终下面的写法对我有效,不过看起来很笨拙:先把输入向量重新转换为 data.table,然后像对 data.frame 那样使用 lapply。
# Summarise one column passed as a plain vector (as lapply() hands it over).
#
# Args:
#   x: an atomic vector.
#
# Returns:
#   A one-row data.table with the columns total_obs, missing_obs,
#   unique_cats (NA counts as a category), top_cat and top_freq.
test_func <- function(x) {
  # Name the column explicitly instead of depending on the name that
  # as.data.table() derives from the argument expression.
  dt <- data.table(x = x)
  dt.summ <- dt[, .N, by = "x"]
  # setorder() sorts by reference: most frequent value first.
  setorder(dt.summ, -N)
  data.table(
    total_obs = nrow(dt),
    missing_obs = sum(is.na(x)),
    unique_cats = nrow(dt.summ),
    # [[ ]] extracts scalars; dt.summ[1, 1] would embed a one-cell data.table.
    top_cat = dt.summ[["x"]][1L],
    top_freq = dt.summ[["N"]][1L]
  )
}
# Keep only the character columns, then summarise each one as a vector.
# vapply() + is.character() always returns one logical per column; comparing
# sapply(dt, class) with "character" is fragile for multi-class columns.
dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
lapply(dt.char, test_func)
应该这样做:
# Analyse every character column of a data.table using lapply().
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # seq_along() is empty for zero columns, unlike 1:length().
  lapply(seq_along(dt.char), function(i) {
    x <- dt.char[, i, with = FALSE]
    analyze_char_var(x)
  })
}
对它做基准测试,你不会看到明显的性能提升。如果追求性能,我建议直接使用 data.table 操作来完成计算。
我增大了数据集的规模,并比较了 for 循环、lapply 和 @Frank 的解决方案。明显的赢家是 data.table!
Unit: milliseconds
expr min lq mean median uq max neval cld
forloop 4.070700 4.685024 7.220436 6.709425 8.564480 35.81166 500 b
lapply 3.988765 4.750347 7.367764 6.815147 8.613754 56.58692 500 b
lapply1 4.008022 4.728257 7.390874 6.786074 8.551453 51.31551 500 b
dtf 2.984400 3.320825 5.451909 4.699372 6.661660 40.85501 500 a
完整代码:
# Benchmark input: the five-row sample repeated 100 times.
dt <- data.table(
  var1 = rep(c("a", "a", "b", "b", "c"), times = 100),
  var2 = rep(1:5, times = 100),
  var3 = rep(c("low", "low", "high", "med", NA), times = 100)
)
# Benchmark variant "forloop": analyse each character column in a for loop.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # Preallocate the result instead of growing the list inside the loop.
  mylist <- vector("list", length(dt.char))
  # seq_along() is empty for zero columns, unlike 1:length().
  for (i in seq_along(dt.char)) {
    x <- dt.char[, i, with = FALSE]
    mylist[[i]] <- analyze_char_var(x)
  }
  mylist
}
# Benchmark variant "lapply": analyse each character column via lapply(),
# binding the one-column subset to a local name first.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char_l <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # seq_along() is empty for zero columns, unlike 1:length().
  lapply(seq_along(dt.char), function(i) {
    x <- dt.char[, i, with = FALSE]
    analyze_char_var(x)
  })
}
# Benchmark variant "lapply1": like analyze_all_char_l(), but the one-column
# subset is passed straight to analyze_char_var() without a local name.
#
# Args:
#   dt: a data.table.
#
# Returns:
#   A list holding one analyze_char_var() result per character column, in
#   column order; an empty list when `dt` has no character columns.
analyze_all_char_l1 <- function(dt) {
  # vapply() + is.character() always yields one logical per column.
  dt.char <- dt[, vapply(dt, is.character, logical(1L)), with = FALSE]
  # seq_along() is empty for zero columns, unlike 1:length().
  lapply(seq_along(dt.char), function(i) {
    analyze_char_var(dt.char[, i, with = FALSE])
  })
}
# Benchmark variant "dtf": summarise all character columns in one grouped
# data.table pass over the long-format data.
#
# Args:
#   data: the table to summarise. Defaults to the `dt` object found from the
#     function's enclosing environment, so a bare dtf() call still works.
#
# Returns:
#   A data.table with one row per character column and the columns
#   variable, NOBS, NNA, NVALS, HIVAL and NHI.
dtf <- function(data = dt) {
  # Select the character columns first, as a separate statement, rather than
  # assigning inside the melt() call.
  dt.char <- Filter(is.character, data)
  # Spell out measure.vars; the abbreviated `meas` relied on partial matching.
  long <- melt(dt.char, measure.vars = names(dt.char))
  long[, {
    # Per-variable frequency table, most frequent value first (ties by value).
    tabula <- setDT(list(value))[, .N, by = "V1"][order(-N, V1)]
    .(
      NOBS = .N,
      NNA = sum(is.na(value)),
      NVALS = nrow(tabula),
      HIVAL = tabula$V1[1L],
      NHI = tabula$N[1L]
    )
  }, by = variable]
}
# Run each implementation once so its output can be inspected.
analyze_all_char(dt)
analyze_all_char_l(dt)
analyze_all_char_l1(dt)
dtf()

# Time the four implementations, 500 evaluations each.
library(microbenchmark)
mc <- microbenchmark(
  forloop = analyze_all_char(dt),
  lapply = analyze_all_char_l(dt),
  lapply1 = analyze_all_char_l1(dt),
  dtf = dtf(),
  times = 500
)
mc
I am trying to create a summary of all character variables in a data.table. Basically to get total observation count, missing values, category with highest frequency etc.
由于所有感兴趣的列类型相同,您可以用 melt 将数据转换为长格式:
# Select the character columns, stack them into long format (variable,
# value), then summarise each original column in one grouped pass.
dt.char <- Filter(is.character, dt)
# measure.vars is spelled out; the abbreviated `meas` relied on partial
# argument matching.
melt(dt.char, measure.vars = names(dt.char))[, {
  # Per-variable frequency table, most frequent value first (ties by value).
  tabula <- setDT(list(value))[, .N, by = "V1"][order(-N, V1)]
  .(
    NOBS = .N,
    NNA = sum(is.na(value)),
    NVALS = nrow(tabula),
    HIVAL = tabula$V1[1L],
    NHI = tabula$N[1L]
  )
}, by = variable]
# variable NOBS NNA NVALS HIVAL NHI
# 1: var1 5 0 3 a 2
# 2: var3 5 1 4 low 2
如果不想把 NA 算作一个类别(它会计入 NVALS,也可能出现在 HIVAL、NHI 中),请将上面的 [, .N, by="V1"] 改为 [!is.na(V1), .N, by="V1"]。
我不确定性能对这项任务是否重要,不过这种写法应该相当高效。