通过在函数中使用 apply 来翻译多个字符串来加速问题
Speedproblems by using apply in a function to translate multiple strings
我写了一个小函数 dictTranslator
来将多个字符串翻译成缩写或将它们分组到类别中...我使用 list
作为字典并有一个带有缩写的数据框列,我想要一个具有相关类别名称的列类别。
dictTranslator <- function(x, dict) {
sapply(x, function(a) {
result <- names(which(sapply(dict, function(b) {a %in% b})))
if(identical(result, character(0))) {
warning(sprintf('NAs are introduced, "%s" not found!', a), call.=FALSE)
NA
} else {
result
}
})
}
my_dictionary <-
list(embryo=c('00h','01h','02h','e02','03h','04h','05h','06h','e06',
'08h','10h','12h','e12','14h','16h','18h','20h','e20'),
larvae=c('L1','L2','L3e','L3l'),
pupae=c('p1','p2','p3','p4','p5'),
adult=c('vm','m','vf','f'))
示例数据:
df <- data.frame(abbreviation=rep(unlist(my_dictionary), 30000))
nrow(df)
# [1] 930000
system.time(df$category <- dictTranslator(df$abbreviation, my_dictionary))
函数按预期工作,但性能非常慢(大约一分钟)。有没有人想加快这个速度或者是否有更好的解决方案?
结果如下:
> head(df,40)
abbreviation category
1 00h embryo
2 01h embryo
...
19 L1 larvae
20 L2 larvae
21 L3e larvae
22 L3l larvae
23 p1 pupae
24 p2 pupae
25 p3 pupae
26 p4 pupae
27 p5 pupae
28 vm adult
29 m adult
30 vf adult
31 f adult
32 00h embryo
33 01h embryo
34 02h embryo
35 e02 embryo
36 03h embryo
我将为此使用命名向量:
a2c <- c ("larvae", "larvae", "larvae", "pupae", "pupae", ...)
names (a2e) <- c ('L1','L2','L3e','p1', 'p2', ...)
在您的情况下,您可以轻松构建向量:
a2c <- rep (names (my_dictionary), times = sapply (my_dictionary, length))
names (a2c) <- unlist (my_dictionary)
确保它们匹配:
cbind (names (a2c), a2c)
然后:
df[,"category"] <- a2c[df$abbreviation]
在你的例子中 df$abbreviation 是一个因素所以你可能需要把它变成字符
df[,"category"] <- a2c[as.character (df$abbreviation)]
我建议字典使用另一种数据格式,也使用data.table
库:
library(data.table)
md <- list()
for(i in 1:length(my_dictionary)) {
md[[i]] <- data.table(abbreviation = my_dictionary[[i]], category = names(my_dictionary[i]))
}
md <- rbindlist(md)
那么您只需加入:
df <- data.table(df, key = 'abbreviation')
df <- df[md]
比较。我对三种方法进行了 运行 比较,结果如下:
资料准备:
@docendo-discimus 和@fdetsch 使用相同的数据格式
md1 <- unlist(my_dictionary)
df1 <- df
@danas.zuokas 使用
library(data.table)
md2 <- list()
for(i in 1:length(my_dictionary)) {
md2[[i]] <- data.table(abbreviation = my_dictionary[[i]], category = names(my_dictionary[i]))
}
md2 <- rbindlist(md2)
df2 <- data.table(df, key = 'abbreviation')
这里有三个函数可以比较
f_dd <- function(x, y) { x$category <- sub('\d+$', '', names(y)[match(x$abbreviation, y)]); x } #docendo-discimus
f_dz <- function(x, y) { x <- x[y]; x } #danas.zuokas
f_fd <- function(x, y) { x$category <- gsub('\d', '', sapply(x$abbreviation, function(i) names(which(i == y)))); x } #fdetsch
这是结果
library(microbenchmark)
microbenchmark(f_dd(df1, md1), f_fd(df1, md1), f_dz(df2, md2), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f_dd(df1, md1) 1041.9195 1142.8361 1236.2033 1224.498 1266.9600 1469.7119 10
f_fd(df1, md1) 7106.6641 7417.5538 7924.3541 7868.716 8304.7760 8961.2615 10
f_dz(df2, md2) 35.6389 41.7524 77.2347 63.478 70.1699 183.9867 10
我发现这种方法在我的机器上完成得最快。请注意,输入数据是 character
,与 factor
输入相比,这会导致速度提升。
## non-factor sample data
my_dictionary <- unlist(my_dictionary)
df <- data.frame(abbreviation = rep(my_dictionary, 30000),
stringsAsFactors = FALSE)
system.time({
## names
result <- sapply(df$abbreviation, function(i) {
names(which(i == my_dictionary))
})
## discard numbers
df$category <- gsub("\d", "", result)
})
# user system elapsed
# 3.993 0.000 3.991
关于您提供的代码(即结合使用多个 sapply
循环和 %in%
),请记住 %in%
(或 match
)执行当您搜索单个条目时相当糟糕(参见 this question)。
我会在命名向量上使用 match
,并删除通过取消列出 my_dictionary
(我使用 sub
)生成的数字。这样您就可以避免代价高昂的循环。
x <- unlist(my_dictionary)
df$category <- sub('\d+$', '', names(x)[match(df$abbreviation, x)])
运行时间不到一秒:
df <- data.frame(abbreviation=rep(unlist(my_dictionary), 30000),
stringsAsFactors = FALSE)
system.time({df$category <- sub('\d+$', '', names(x)[match(df$abbreviation, x)])})
# User System elapsed
# 0.634 0.003 0.639
我写了一个小函数 dictTranslator
来将多个字符串翻译成缩写或将它们分组到类别中...我使用 list
作为字典并有一个带有缩写的数据框列,我想要一个具有相关类别名称的列类别。
dictTranslator <- function(x, dict) {
sapply(x, function(a) {
result <- names(which(sapply(dict, function(b) {a %in% b})))
if(identical(result, character(0))) {
warning(sprintf('NAs are introduced, "%s" not found!', a), call.=FALSE)
NA
} else {
result
}
})
}
my_dictionary <-
list(embryo=c('00h','01h','02h','e02','03h','04h','05h','06h','e06',
'08h','10h','12h','e12','14h','16h','18h','20h','e20'),
larvae=c('L1','L2','L3e','L3l'),
pupae=c('p1','p2','p3','p4','p5'),
adult=c('vm','m','vf','f'))
示例数据:
df <- data.frame(abbreviation=rep(unlist(my_dictionary), 30000))
nrow(df)
# [1] 930000
system.time(df$category <- dictTranslator(df$abbreviation, my_dictionary))
函数按预期工作,但性能非常慢(大约一分钟)。有没有人想加快这个速度或者是否有更好的解决方案?
结果如下:
> head(df,40)
abbreviation category
1 00h embryo
2 01h embryo
...
19 L1 larvae
20 L2 larvae
21 L3e larvae
22 L3l larvae
23 p1 pupae
24 p2 pupae
25 p3 pupae
26 p4 pupae
27 p5 pupae
28 vm adult
29 m adult
30 vf adult
31 f adult
32 00h embryo
33 01h embryo
34 02h embryo
35 e02 embryo
36 03h embryo
我将为此使用命名向量:
a2c <- c ("larvae", "larvae", "larvae", "pupae", "pupae", ...)
names (a2e) <- c ('L1','L2','L3e','p1', 'p2', ...)
在您的情况下,您可以轻松构建向量:
a2c <- rep (names (my_dictionary), times = sapply (my_dictionary, length))
names (a2c) <- unlist (my_dictionary)
确保它们匹配:
cbind (names (a2c), a2c)
然后:
df[,"category"] <- a2c[df$abbreviation]
在你的例子中 df$abbreviation 是一个因素所以你可能需要把它变成字符
df[,"category"] <- a2c[as.character (df$abbreviation)]
我建议字典使用另一种数据格式,也使用data.table
库:
library(data.table)
md <- list()
for(i in 1:length(my_dictionary)) {
md[[i]] <- data.table(abbreviation = my_dictionary[[i]], category = names(my_dictionary[i]))
}
md <- rbindlist(md)
那么您只需加入:
df <- data.table(df, key = 'abbreviation')
df <- df[md]
比较。我对三种方法进行了 运行 比较,结果如下:
资料准备: @docendo-discimus 和@fdetsch 使用相同的数据格式
md1 <- unlist(my_dictionary)
df1 <- df
@danas.zuokas 使用
library(data.table)
md2 <- list()
for(i in 1:length(my_dictionary)) {
md2[[i]] <- data.table(abbreviation = my_dictionary[[i]], category = names(my_dictionary[i]))
}
md2 <- rbindlist(md2)
df2 <- data.table(df, key = 'abbreviation')
这里有三个函数可以比较
f_dd <- function(x, y) { x$category <- sub('\d+$', '', names(y)[match(x$abbreviation, y)]); x } #docendo-discimus
f_dz <- function(x, y) { x <- x[y]; x } #danas.zuokas
f_fd <- function(x, y) { x$category <- gsub('\d', '', sapply(x$abbreviation, function(i) names(which(i == y)))); x } #fdetsch
这是结果
library(microbenchmark)
microbenchmark(f_dd(df1, md1), f_fd(df1, md1), f_dz(df2, md2), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f_dd(df1, md1) 1041.9195 1142.8361 1236.2033 1224.498 1266.9600 1469.7119 10
f_fd(df1, md1) 7106.6641 7417.5538 7924.3541 7868.716 8304.7760 8961.2615 10
f_dz(df2, md2) 35.6389 41.7524 77.2347 63.478 70.1699 183.9867 10
我发现这种方法在我的机器上完成得最快。请注意,输入数据是 character
,与 factor
输入相比,这会导致速度提升。
## non-factor sample data
my_dictionary <- unlist(my_dictionary)
df <- data.frame(abbreviation = rep(my_dictionary, 30000),
stringsAsFactors = FALSE)
system.time({
## names
result <- sapply(df$abbreviation, function(i) {
names(which(i == my_dictionary))
})
## discard numbers
df$category <- gsub("\d", "", result)
})
# user system elapsed
# 3.993 0.000 3.991
关于您提供的代码(即结合使用多个 sapply
循环和 %in%
),请记住 %in%
(或 match
)执行当您搜索单个条目时相当糟糕(参见 this question)。
我会在命名向量上使用 match
,并删除通过取消列出 my_dictionary
(我使用 sub
)生成的数字。这样您就可以避免代价高昂的循环。
x <- unlist(my_dictionary)
df$category <- sub('\d+$', '', names(x)[match(df$abbreviation, x)])
运行时间不到一秒:
df <- data.frame(abbreviation=rep(unlist(my_dictionary), 30000),
stringsAsFactors = FALSE)
system.time({df$category <- sub('\d+$', '', names(x)[match(df$abbreviation, x)])})
# User System elapsed
# 0.634 0.003 0.639