R：加速双循环

Question

我正在寻找加速我的代码的解决方案。我正在处理大约的数据集。 100.000 行，目前正在使用双 for-loop。我猜这会减慢我的代码速度。

Example data:

dt<-structure(list(name = c("Marcus", "Tina", "Jack", "George"), 
  address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.", 
  "London Str."), number = c(1, 1, 20, 15), suffix = c("a", 
  "a", NA, "b"), child = c("Tina", NA, "George", NA)), .Names = c("name", 
  "address", "number", "suffix", "child"), row.names = c(NA, -4L
  ), class = "data.frame")

Example DataFrame:
     name       address      number   suffix   child
1    Marcus     Oxford Str.  1        a        Tina
2    Tina       Oxford Str.  1        a     
3    Jack       Waterloo Sq. 20                George
4    George     London Str.  15       b

我正在对每一行进行迭代以检查 child 是否位于同一地址并在新列 'Output' 中放置一个“1”。默认为“0”。结果应该是：

Example result:
     name       address      number   suffix   child   output
1    Marcus     Oxford Str.  1        a        Tina    1
2    Tina       Oxford Str.  1        a     
3    Jack       Waterloo Sq. 20                George  0
4    George     London Str.  15       b

我当前的代码：

df$output = 0
n = NROW(df)

for(i in 1:n) {
 childID = df[i,5]
 address = df[i,2]
 number = df[i,3]
 suffix = df[i,4]
   for(j in 1:n) {
       if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
         &(suffix %in% df[j,4]))
           (df[i,6] = 1)
    }
}

我尝试过将 Rcpp 与 C++ 代码一起使用。它也在工作，但仍然很慢。任何加快速度的想法或者我应该接受它需要一些时间才能运行吗？

Answer 1

我会尝试连接地址然后使用 match，像这样：

# recreate your input (I put NAs where you have blanks)
DF <- 
data.frame(name=c('Marcus','Tina','Jack','George'),
           address=c('Oxford Str.','Oxford Str.','Waterloo Sq.','London Str.'),
           number=c(1,1,20,15),
           suffix=c('a','a',NA,'b'),
           child=c('Tina',NA,'George',NA))

# create a single character address by concatenating address,number and suffix
fulladdr <- paste(DF$address,DF$number,DF$suffix,sep='||')
# initialize output to 0
DF$output <- 0
# set 1 where concatenated addresses match
DF$output[fulladdr[match(DF$child,DF$name)] == fulladdr] <- 1

> DF
    name      address number suffix  child output
1 Marcus  Oxford Str.      1      a   Tina      1
2   Tina  Oxford Str.      1      a   <NA>      0
3   Jack Waterloo Sq.     20   <NA> George      0
4 George  London Str.     15      b   <NA>      0

Answer 2

我已经实施了一个 data.table 解决方案，对于这个特定的数据集，它比 @digEmAll 解决方案慢，但也许仍然有用。此外，我提供了一些小的基准，这对这个小数据集没有多大意义，所以请在更大的数据集上测试它。

library(data.table)
name = c("Marcus", "Tina", "Jack", "George")
address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.", "London Str.")
number = c(1, 1, 20, 15)
suffix = c("a", "a", "", "b")
child = c("Tina", "", "George", "")

dt <- data.table(name
                 , address
                 ,number
                 ,suffix
                 ,child
                 )
dt[, FullAddr := paste0(address, " " , number, suffix)]
dt[ FullAddr[match(child,name)] == FullAddr, output := 1  ]

dt[is.na(output), output := 0]
dt
   name      address number suffix  child        FullAddr output
1: Marcus  Oxford Str.      1      a   Tina  Oxford Str. 1a      1
2:   Tina  Oxford Str.      1      a         Oxford Str. 1a      0
3:   Jack Waterloo Sq.     20        George Waterloo Sq. 20      0
4: George  London Str.     15      b        London Str. 15b      0

library(microbenchmark)

microbenchmark(
        a = {dt[ FullAddr[match(child,name)] == FullAddr, output := 1  ]}
        , b= {df$output = 0
        n = NROW(df)

        for(i in 1:n) {
                childID = df[i,5]
                address = df[i,2]
                number = df[i,3]
                suffix = df[i,4]
                for(j in 1:n) {
                        if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
                           &(suffix %in% df[j,4]))
                                (df[i,6] = 1)
                }
        }}
        , c = df$output[fulladdr[match(df$child,df$name)] == fulladdr] <- 1

       , times = 100L

)

    Unit: microseconds
 expr       min        lq        mean     median         uq        max neval cld
    a   298.842   348.347   427.59415   413.6995   489.4665    903.467   100  a 
    b 15042.275 15494.461 17983.16735 15864.5405 16257.7130 162306.656   100   b
    c    39.847    46.487    58.82731    59.1655    64.7495    165.420   100  a

Answer 3

这里有一个基于 hashmap 的解决方案，如评论中所述：

df <- read.csv(text = 'name,address,number,suffix,child
Marcus,Oxford Str.,1,a,Tina
Tina,Oxford Str.,1,a,     
Jack,Waterloo Sq.,20,,George
George,London Str.,15,b,', stringsAsFactors = FALSE)
df

library(hashmap)
address <- paste(df$address, df$number, df$suffix)
name_address <- hashmap(df$name, address)
child_address <- name_address[[df$child]]
output <- as.integer(child_address == address)
output <- ifelse(is.na(output), '', as.character(output))              

df$output <- output
df

输出：

> df
    name      address number suffix  child output
1 Marcus  Oxford Str.      1      a   Tina      1
2   Tina  Oxford Str.      1      a              
3   Jack Waterloo Sq.     20        George      0
4 George  London Str.     15      b

R：加速双循环

R: Speeding up double loop

for-loop

r

rcpp