R - 为大型数据集增加 R 中 for 循环的运行时间

Question

考虑以下使用 R-

中的这 2 个数据帧的示例

original = data.frame(group = paste("G",c(1:5),sep=""), 
                field1 = c("A","B","C","D","E"),
                cost = round(runif(5,300,500),2), 
                slno = c("1 4 5 7","1 3","9","2 5 7 10","1 10"), stringsAsFactors = F)

alternative = data.frame(slno = c(1:10), 
                 name = paste("name",c(1:10),sep=""), 
                 cost = round(runif(10,50,100),2), stringsAsFactors = F)

我想执行以下步骤并在原始 -

中输入这些列

映射原始第4列中的每个slno（space分隔）替代数据帧中的数据帧并获取成本。
从原始成本中减去每项备选成本并计算节省。
original$max_alternative 列应具有最大节省的备选方案的名称。 原$max_saving应该有相应的节省。
original$oth_alt 列应该用分号分隔所有其他名称。 原始$oth_savings应该有相应的储蓄分号分隔。

数据集 -->

> original
group field1   cost     slno
1    G1      A 330.37  1 4 5 7
2    G2      B 463.80      1 3
3    G3      C 471.74        9
4    G4      D 465.71 2 5 7 10
5    G5      E 472.83     1 10

> alternative
    slno   name  cost
1     1  name1   64.98
2     2  name2   94.63
3     3  name3   98.96
4     4  name4   68.39
5     5  name5   61.48
6     6  name6   87.46
7     7  name7   75.91
8     8  name8   67.93
9     9  name9   55.29
10   10 name10   93.03

期望输出 -->

> original
group field1   cost     slno    max_alternative max_saving    oth_alt oth_sav
1    G1      A 330.37  1 4 5 7   name5           268.89   name1;name4;name7  265.39;261.98;254.46   
2    G2      B 463.80      1 3   name1           398.82   name3              364.84
3    G3      C 471.74        9   name9           416.45
4    G4      D 465.71 2 5 7 10   name5           404.23   name2;name7;name10 371.08;389.80;372.68
5    G5      E 472.83     1 10   name1           407.85   name10             379.80

注意事项： 我举了一个小例子来解释我的问题。就我而言，我有巨大的数据帧，每个数据帧都有近 100 万行。因此，for 循环在这种情况下效率不高，因为它需要数小时才能完成运行。有什么有效的方法可以做到这一点吗？

提前致谢！

Answer 1

使用 dplyr and tidyr 的解决方案。 original2 是最终输出。关键是用separate_rows展开slno列，在original和alternative之间根据slno进行join，然后用group_by和 summarize 总结所有信息。请注意 which.min 仅 return 是向量中的第一个最小值。如果您有多个值等于最小值，代码仍将 return 只是第一个最小值。

library(dplyr)
library(tidyr)

original2 <- original %>%
  separate_rows(slno, convert = TRUE) %>%
  left_join(alternative, by = "slno") %>%
  group_by(group, field1) %>%
  summarise(cost = first(cost.x),
            slno = paste(slno, collapse = " "),
            max_alternative = name[which.min(cost.y)],
            max_saving = first(cost.x) - cost.y[which.min(cost.y)],
            oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
            oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")) %>%
  ungroup() %>%
  as.data.frame(stringsAsFactors = FALSE)

original2 
#   group field1   cost     slno max_alternative max_saving            oth_alt              oth_sav
# 1    G1      A 330.37  1 4 5 7           name5     268.89  name1;name4;name7 265.39;261.98;254.46
# 2    G2      B 463.80      1 3           name1     398.82              name3               364.84
# 3    G3      C 471.74        9           name9     416.45                                        
# 4    G4      D 465.71 2 5 7 10           name5     404.23 name2;name7;name10  371.08;389.8;372.68
# 5    G5      E 472.83     1 10           name1     407.85             name10                379.8

这是使用 data.table and splitstackshape 的替代方法。 cSplit函数和separate_rows一样，也可以扩展数据框。

library(data.table)
library(splitstackshape)

setDT(alternative)

original2 <- cSplit(original, "slno", direction = "long", sep = " ")

original3 <- merge(original2, alternative, by = "slno", all.x = TRUE)

original4 <- original3[, .(cost = first(cost.x),
                       slno = paste(slno, collapse = " "),
                       max_alternative = name[which.min(cost.y)],
                       max_saving = first(cost.x) - cost.y[which.min(cost.y)],
                       oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
                       oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")),
                       by = .(group, field1)][order(group)]
original4[]
#    group field1   cost     slno max_alternative max_saving            oth_alt              oth_sav
# 1:    G1      A 330.37  1 4 5 7           name5     268.89  name1;name4;name7 265.39;261.98;254.46
# 2:    G2      B 463.80      1 3           name1     398.82              name3               364.84
# 3:    G3      C 471.74        9           name9     416.45                                        
# 4:    G4      D 465.71 2 5 7 10           name5     404.23 name2;name7;name10  371.08;389.8;372.68
# 5:    G5      E 472.83     1 10           name1     407.85             name10                379.8

性能评价

正如 OP 提到的那样，性能可能是一个问题。这里我使用了 microbenchmark 包和下面的代码，看看哪个更快。 m1 是 dplyr 方法，而 m2 是 data.table 方法。

library(microbenchmark)

# Create data.table object
alternative_dt <- as.data.table(alternative)
original_dt <- as.data.table(original)

# Evaluate performance
microbenchmark(m1 = {
  original2 <- original %>%
    separate_rows(slno, convert = TRUE) %>%
    left_join(alternative, by = "slno") %>%
    group_by(group, field1) %>%
    summarise(cost = first(cost.x),
              slno = paste(slno, collapse = " "),
              max_alternative = name[which.min(cost.y)],
              max_saving = first(cost.x) - cost.y[which.min(cost.y)],
              oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
              oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")) %>%
    ungroup()},
  m2 = {original2 <- cSplit(original_dt, "slno", direction = "long", sep = " ")

  original3 <- merge(original2, alternative, by = "slno", all.x = TRUE)

  original4 <- original3[, .(cost = first(cost.x),
                             slno = paste(slno, collapse = " "),
                             max_alternative = name[which.min(cost.y)],
                             max_saving = first(cost.x) - cost.y[which.min(cost.y)],
                             oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
                             oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")),
                         by = .(group, field1)][order(group)]})

# Unit: milliseconds
#  expr       min        lq      mean    median        uq      max neval
#    m1 21.106662 22.673250 23.978065 23.519644 25.005269 33.26359   100
#    m2  3.886784  4.418318  4.730305  4.702078  4.970674  7.61164   100

结果显示data.table比dplyr快。如果 OP 正在处理大量数据。 data.table 可能是首选。然而，虽然我没有开发 for-loop 方法并测试性能，但 data.table 和 dplyr 方法都可能比 for-loop 快得多。

数据

original <- read.table(text = "group field1   cost     slno
1    G1      A 330.37  '1 4 5 7'
                       2    G2      B 463.80      '1 3'
                       3    G3      C 471.74        '9'
                       4    G4      D 465.71 '2 5 7 10'
                       5    G5      E 472.83     '1 10'",
                       header = TRUE, stringsAsFactors = FALSE)

alternative <- read.table(text = "    slno   name  cost
1     1  name1   64.98
                          2     2  name2   94.63
                          3     3  name3   98.96
                          4     4  name4   68.39
                          5     5  name5   61.48
                          6     6  name6   87.46
                          7     7  name7   75.91
                          8     8  name8   67.93
                          9     9  name9   55.29
                          10   10 name10   93.03",
                          header = TRUE, stringsAsFactors = FALSE)

Answer 2

一个选项是使用 sqldf 和 dplyr。该解决方案按步骤显示以保持清晰。

#The data
library(sqldf)
library(dplyr)
original = data.frame(group = paste("G",c(1:5),sep=""), 
                      field1 = c("A","B","C","D","E"),
                      cost = round(runif(5,300,500),2), 
                      slno = c("1 4 5 7","1 3","9","2 5 7 10","1 10"), stringsAsFactors = F)

alternative = data.frame(slno = c(1:10), 
                         name = paste("name",c(1:10),sep=""), 
                         cost = round(runif(10,50,100),2), stringsAsFactors = F)

#> original
#  group field1   cost     slno
#1    G1      A 490.71  1 4 5 7
#2    G2      B 399.20      1 3
#3    G3      C 326.40        9
#4    G4      D 421.69 2 5 7 10
#5    G5      E 498.37     1 10

#> alternative
#   slno   name  cost
#1     1  name1 54.74
#2     2  name2 94.76
#3     3  name3 66.74
#4     4  name4 73.61
#5     5  name5 58.86
#6     6  name6 67.58
#7     7  name7 58.83
#8     8  name8 82.65
#9     9  name9 61.81
#10   10 name10 94.86


#join both data.frames
join_qury <- "select original.*, alternative.name as alternative, (original.cost - alternative.cost) as saving from original 
      inner join alternative where original.slno like '%' || alternative.slno || '%'"

df <- sqldf(join_qury,stringsAsFactors = FALSE)

#> df
#   group field1   cost     slno alternative saving
#1     G1      A 490.71  1 4 5 7       name1 435.97
#2     G1      A 490.71  1 4 5 7       name4 417.10
#3     G1      A 490.71  1 4 5 7       name5 431.85
#4     G1      A 490.71  1 4 5 7       name7 431.88
#5     G2      B 399.20      1 3       name1 344.46
#6     G2      B 399.20      1 3       name3 332.46
#7     G3      C 326.40        9       name9 264.59
#8     G4      D 421.69 2 5 7 10       name1 366.95
#9     G4      D 421.69 2 5 7 10       name2 326.93
#10    G4      D 421.69 2 5 7 10       name5 362.83
#11    G4      D 421.69 2 5 7 10       name7 362.86
#12    G4      D 421.69 2 5 7 10      name10 326.83
#13    G5      E 498.37     1 10       name1 443.63
#14    G5      E 498.37     1 10      name10 403.51

# Filter data to contain only max value for a name
df_maxval <- df %>%
  group_by(group,field1, cost, slno) %>%
  filter(saving == max(saving))

#Find and group other name and savings
df_other <- setdiff(df, df_maxval) %>%
  group_by(group,field1, cost, slno) %>%
  summarise_at(.vars = vars(alternative, saving), 
               .funs = c("toString")) %>%
  ungroup()

# finally join both max savings and other values
df_final <- df_maxval %>%
  inner_join(df_other, by = c("group", "field1", "slno")) %>%
  select(group, field1, cost = cost.x, slno, max_alternative = alternative.x, 
         max_saving = saving.x, oth_alt = alternative.y, oth_sav = saving.y)

#Result
#> df_final
# A tibble: 4 x 8
# Groups:   group, field1, cost, slno [4]
#  group field1   cost     slno max_alternative max_saving                     oth_alt                        oth_sav
#  <chr>  <chr>  <dbl>    <chr>           <chr>      <dbl>                       <chr>                          <chr>
#1    G1      A 490.71  1 4 5 7           name1     435.97         name4, name5, name7          417.1, 431.85, 431.88
#2    G2      B 399.20      1 3           name1     344.46                       name3                         332.46
#3    G4      D 421.69 2 5 7 10           name1     366.95 name2, name5, name7, name10 326.93, 362.83, 362.86, 326.83
#4    G5      E 498.37     1 10           name1     443.63                      name10                         403.51

Answer 3

这是一个使用基础 R 的试验：

 transform(df1,e=t(mapply(function(x,y){
   v=df2[x,][s<-which.min(df2$cost[x]),-1];
   w=df2[x,][-s,-1]
   cbind( c(v[1],y-v[2]),
         apply(cbind(w[,1],y-w[,2]),2,paste0,collapse=";"))},
   lapply(strsplit(df1$slno," "),as.numeric),df1$cost)))
  group field1   cost     slno   e.1    e.2                e.3                  e.4
1    G1      A 330.37  1 4 5 7 name5 268.89  name1;name4;name7 265.39;261.98;254.46
2    G2      B 463.80      1 3 name1 398.82              name3               364.84
3    G3      C 471.74        9 name9 416.45                                        
4    G4      D 465.71 2 5 7 10 name5 404.23 name2;name7;name10  371.08;389.8;372.68
5    G5      E 472.83     1 10 name1 407.85             name10                379.8
>

R - 为大型数据集增加 R 中 for 循环的运行时间

R - Increasing runtime for for loop in R for large datasets

for-loop

runtime

r

dataframe