R:加速双循环
R: Speeding up double loop
我正在寻找加速我的代码的解决方案。我正在处理大约的数据集。 100.000 行,目前正在使用双 for-loop。我猜这会减慢我的代码速度。
Example data:
dt<-structure(list(name = c("Marcus", "Tina", "Jack", "George"),
address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.",
"London Str."), number = c(1, 1, 20, 15), suffix = c("a",
"a", NA, "b"), child = c("Tina", NA, "George", NA)), .Names = c("name",
"address", "number", "suffix", "child"), row.names = c(NA, -4L
), class = "data.frame")
Example DataFrame:
name address number suffix child
1 Marcus Oxford Str. 1 a Tina
2 Tina Oxford Str. 1 a
3 Jack Waterloo Sq. 20 George
4 George London Str. 15 b
我正在对每一行进行迭代以检查 child 是否位于同一地址并在新列 'Output' 中放置一个“1”。默认为“0”。结果应该是:
Example result:
name address number suffix child output
1 Marcus Oxford Str. 1 a Tina 1
2 Tina Oxford Str. 1 a
3 Jack Waterloo Sq. 20 George 0
4 George London Str. 15 b
我当前的代码:
df$output = 0
n = NROW(df)
for(i in 1:n) {
childID = df[i,5]
address = df[i,2]
number = df[i,3]
suffix = df[i,4]
for(j in 1:n) {
if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
&(suffix %in% df[j,4]))
(df[i,6] = 1)
}
}
我尝试过将 Rcpp 与 C++ 代码一起使用。它也在工作,但仍然很慢。任何加快速度的想法或者我应该接受它需要一些时间才能 运行 吗?
我会尝试连接地址然后使用 match
,像这样:
# recreate your input (I put NAs where you have blanks)
DF <-
data.frame(name=c('Marcus','Tina','Jack','George'),
address=c('Oxford Str.','Oxford Str.','Waterloo Sq.','London Str.'),
number=c(1,1,20,15),
suffix=c('a','a',NA,'b'),
child=c('Tina',NA,'George',NA))
# create a single character address by concatenating address,number and suffix
fulladdr <- paste(DF$address,DF$number,DF$suffix,sep='||')
# initialize output to 0
DF$output <- 0
# set 1 where concatenated addresses match
DF$output[fulladdr[match(DF$child,DF$name)] == fulladdr] <- 1
> DF
name address number suffix child output
1 Marcus Oxford Str. 1 a Tina 1
2 Tina Oxford Str. 1 a <NA> 0
3 Jack Waterloo Sq. 20 <NA> George 0
4 George London Str. 15 b <NA> 0
我已经实施了一个 data.table
解决方案,对于这个特定的数据集,它比 @digEmAll 解决方案慢,但也许仍然有用。
此外,我提供了一些小的基准,这对这个小数据集没有多大意义,所以请在更大的数据集上测试它。
library(data.table)
name = c("Marcus", "Tina", "Jack", "George")
address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.", "London Str.")
number = c(1, 1, 20, 15)
suffix = c("a", "a", "", "b")
child = c("Tina", "", "George", "")
dt <- data.table(name
, address
,number
,suffix
,child
)
dt[, FullAddr := paste0(address, " " , number, suffix)]
dt[ FullAddr[match(child,name)] == FullAddr, output := 1 ]
dt[is.na(output), output := 0]
dt
name address number suffix child FullAddr output
1: Marcus Oxford Str. 1 a Tina Oxford Str. 1a 1
2: Tina Oxford Str. 1 a Oxford Str. 1a 0
3: Jack Waterloo Sq. 20 George Waterloo Sq. 20 0
4: George London Str. 15 b London Str. 15b 0
library(microbenchmark)
microbenchmark(
a = {dt[ FullAddr[match(child,name)] == FullAddr, output := 1 ]}
, b= {df$output = 0
n = NROW(df)
for(i in 1:n) {
childID = df[i,5]
address = df[i,2]
number = df[i,3]
suffix = df[i,4]
for(j in 1:n) {
if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
&(suffix %in% df[j,4]))
(df[i,6] = 1)
}
}}
, c = df$output[fulladdr[match(df$child,df$name)] == fulladdr] <- 1
, times = 100L
)
Unit: microseconds
expr min lq mean median uq max neval cld
a 298.842 348.347 427.59415 413.6995 489.4665 903.467 100 a
b 15042.275 15494.461 17983.16735 15864.5405 16257.7130 162306.656 100 b
c 39.847 46.487 58.82731 59.1655 64.7495 165.420 100 a
这里有一个基于 hashmap
的解决方案,如评论中所述:
df <- read.csv(text = 'name,address,number,suffix,child
Marcus,Oxford Str.,1,a,Tina
Tina,Oxford Str.,1,a,
Jack,Waterloo Sq.,20,,George
George,London Str.,15,b,', stringsAsFactors = FALSE)
df
library(hashmap)
address <- paste(df$address, df$number, df$suffix)
name_address <- hashmap(df$name, address)
child_address <- name_address[[df$child]]
output <- as.integer(child_address == address)
output <- ifelse(is.na(output), '', as.character(output))
df$output <- output
df
输出:
> df
name address number suffix child output
1 Marcus Oxford Str. 1 a Tina 1
2 Tina Oxford Str. 1 a
3 Jack Waterloo Sq. 20 George 0
4 George London Str. 15 b
我正在寻找加速我的代码的解决方案。我正在处理大约的数据集。 100.000 行,目前正在使用双 for-loop。我猜这会减慢我的代码速度。
Example data:
dt<-structure(list(name = c("Marcus", "Tina", "Jack", "George"),
address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.",
"London Str."), number = c(1, 1, 20, 15), suffix = c("a",
"a", NA, "b"), child = c("Tina", NA, "George", NA)), .Names = c("name",
"address", "number", "suffix", "child"), row.names = c(NA, -4L
), class = "data.frame")
Example DataFrame:
name address number suffix child
1 Marcus Oxford Str. 1 a Tina
2 Tina Oxford Str. 1 a
3 Jack Waterloo Sq. 20 George
4 George London Str. 15 b
我正在对每一行进行迭代以检查 child 是否位于同一地址并在新列 'Output' 中放置一个“1”。默认为“0”。结果应该是:
Example result:
name address number suffix child output
1 Marcus Oxford Str. 1 a Tina 1
2 Tina Oxford Str. 1 a
3 Jack Waterloo Sq. 20 George 0
4 George London Str. 15 b
我当前的代码:
df$output = 0
n = NROW(df)
for(i in 1:n) {
childID = df[i,5]
address = df[i,2]
number = df[i,3]
suffix = df[i,4]
for(j in 1:n) {
if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
&(suffix %in% df[j,4]))
(df[i,6] = 1)
}
}
我尝试过将 Rcpp 与 C++ 代码一起使用。它也在工作,但仍然很慢。任何加快速度的想法或者我应该接受它需要一些时间才能 运行 吗?
我会尝试连接地址然后使用 match
,像这样:
# recreate your input (I put NAs where you have blanks)
DF <-
data.frame(name=c('Marcus','Tina','Jack','George'),
address=c('Oxford Str.','Oxford Str.','Waterloo Sq.','London Str.'),
number=c(1,1,20,15),
suffix=c('a','a',NA,'b'),
child=c('Tina',NA,'George',NA))
# create a single character address by concatenating address,number and suffix
fulladdr <- paste(DF$address,DF$number,DF$suffix,sep='||')
# initialize output to 0
DF$output <- 0
# set 1 where concatenated addresses match
DF$output[fulladdr[match(DF$child,DF$name)] == fulladdr] <- 1
> DF
name address number suffix child output
1 Marcus Oxford Str. 1 a Tina 1
2 Tina Oxford Str. 1 a <NA> 0
3 Jack Waterloo Sq. 20 <NA> George 0
4 George London Str. 15 b <NA> 0
我已经实施了一个 data.table
解决方案,对于这个特定的数据集,它比 @digEmAll 解决方案慢,但也许仍然有用。
此外,我提供了一些小的基准,这对这个小数据集没有多大意义,所以请在更大的数据集上测试它。
library(data.table)
name = c("Marcus", "Tina", "Jack", "George")
address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.", "London Str.")
number = c(1, 1, 20, 15)
suffix = c("a", "a", "", "b")
child = c("Tina", "", "George", "")
dt <- data.table(name
, address
,number
,suffix
,child
)
dt[, FullAddr := paste0(address, " " , number, suffix)]
dt[ FullAddr[match(child,name)] == FullAddr, output := 1 ]
dt[is.na(output), output := 0]
dt
name address number suffix child FullAddr output
1: Marcus Oxford Str. 1 a Tina Oxford Str. 1a 1
2: Tina Oxford Str. 1 a Oxford Str. 1a 0
3: Jack Waterloo Sq. 20 George Waterloo Sq. 20 0
4: George London Str. 15 b London Str. 15b 0
library(microbenchmark)
microbenchmark(
a = {dt[ FullAddr[match(child,name)] == FullAddr, output := 1 ]}
, b= {df$output = 0
n = NROW(df)
for(i in 1:n) {
childID = df[i,5]
address = df[i,2]
number = df[i,3]
suffix = df[i,4]
for(j in 1:n) {
if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
&(suffix %in% df[j,4]))
(df[i,6] = 1)
}
}}
, c = df$output[fulladdr[match(df$child,df$name)] == fulladdr] <- 1
, times = 100L
)
Unit: microseconds
expr min lq mean median uq max neval cld
a 298.842 348.347 427.59415 413.6995 489.4665 903.467 100 a
b 15042.275 15494.461 17983.16735 15864.5405 16257.7130 162306.656 100 b
c 39.847 46.487 58.82731 59.1655 64.7495 165.420 100 a
这里有一个基于 hashmap
的解决方案,如评论中所述:
df <- read.csv(text = 'name,address,number,suffix,child
Marcus,Oxford Str.,1,a,Tina
Tina,Oxford Str.,1,a,
Jack,Waterloo Sq.,20,,George
George,London Str.,15,b,', stringsAsFactors = FALSE)
df
library(hashmap)
address <- paste(df$address, df$number, df$suffix)
name_address <- hashmap(df$name, address)
child_address <- name_address[[df$child]]
output <- as.integer(child_address == address)
output <- ifelse(is.na(output), '', as.character(output))
df$output <- output
df
输出:
> df
name address number suffix child output
1 Marcus Oxford Str. 1 a Tina 1
2 Tina Oxford Str. 1 a
3 Jack Waterloo Sq. 20 George 0
4 George London Str. 15 b