R - 为大型数据集增加 R 中 for 循环的运行时间
R - Increasing runtime for for loop in R for large datasets
考虑以下在 R 中使用这 2 个数据帧的示例：
# Toy inputs: `original$slno` holds a space-separated list of candidate
# slno's per row; `alternative` maps each slno to a name and a cost.
# Costs come from runif(), so every run produces different numbers.
original <- data.frame(group = paste0("G", 1:5),
                       field1 = c("A", "B", "C", "D", "E"),
                       cost = round(runif(5, 300, 500), 2),
                       slno = c("1 4 5 7", "1 3", "9", "2 5 7 10", "1 10"),
                       stringsAsFactors = FALSE)
alternative <- data.frame(slno = 1:10,
                          name = paste0("name", 1:10),
                          cost = round(runif(10, 50, 100), 2),
                          stringsAsFactors = FALSE)
我想执行以下步骤并在 original 中加入这些列：
将 original 第 4 列中每个以空格分隔的 slno 映射到 alternative 数据帧中并获取对应成本。
从原始成本中减去每项备选成本并计算节省。
original$max_alternative 列应包含节省最大的备选方案的名称，original$max_saving 应包含相应的节省金额。
original$oth_alt 列应以分号分隔列出所有其他备选名称，original$oth_sav 应以分号分隔列出相应的节省金额。
数据集 -->
> original
group field1 cost slno
1 G1 A 330.37 1 4 5 7
2 G2 B 463.80 1 3
3 G3 C 471.74 9
4 G4 D 465.71 2 5 7 10
5 G5 E 472.83 1 10
> alternative
slno name cost
1 1 name1 64.98
2 2 name2 94.63
3 3 name3 98.96
4 4 name4 68.39
5 5 name5 61.48
6 6 name6 87.46
7 7 name7 75.91
8 8 name8 67.93
9 9 name9 55.29
10 10 name10 93.03
期望输出 -->
> original
group field1 cost slno max_alternative max_saving oth_alt oth_sav
1 G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
2 G2 B 463.80 1 3 name1 398.82 name3 364.84
3 G3 C 471.74 9 name9 416.45
4 G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.80;372.68
5 G5 E 472.83 1 10 name1 407.85 name10 379.80
注意事项:
我举了一个小例子来解释我的问题。就我而言,我有巨大的数据帧,每个数据帧都有近 100 万行。因此,for 循环在这种情况下效率不高,因为它需要数小时才能完成 运行。有什么有效的方法可以做到这一点吗?
提前致谢!
使用 dplyr and tidyr 的解决方案。 original2
是最终输出。关键是用separate_rows
展开slno
列,在original
和alternative
之间根据slno
进行join,然后用group_by
和 summarize
总结所有信息。请注意 which.min 只返回向量中第一个最小值的位置；如果有多个值同为最小值，代码仍只会返回第一个。
library(dplyr)
library(tidyr)
# dplyr/tidyr solution: expand the space-separated slno list to one row per
# slno, left-join the alternative costs (merge suffixes: cost.x = original
# cost, cost.y = alternative cost), then collapse back to one row per group.
# NOTE: which.min() returns only the FIRST index of the minimum, so exact
# ties keep just the first cheapest alternative.
original2 <- original %>%
# separate_rows() splits "1 4 5 7" into 4 rows; convert = TRUE parses slno as integer
separate_rows(slno, convert = TRUE) %>%
left_join(alternative, by = "slno") %>%
group_by(group, field1) %>%
# the cheapest alternative yields the largest saving, hence which.min on cost.y
summarise(cost = first(cost.x),
slno = paste(slno, collapse = " "),
max_alternative = name[which.min(cost.y)],
max_saving = first(cost.x) - cost.y[which.min(cost.y)],
oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")) %>%
ungroup() %>%
# stringsAsFactors is effectively a no-op here; columns are already character
as.data.frame(stringsAsFactors = FALSE)
original2
# group field1 cost slno max_alternative max_saving oth_alt oth_sav
# 1 G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
# 2 G2 B 463.80 1 3 name1 398.82 name3 364.84
# 3 G3 C 471.74 9 name9 416.45
# 4 G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.8;372.68
# 5 G5 E 472.83 1 10 name1 407.85 name10 379.8
这是使用 data.table and splitstackshape 的替代方法。 cSplit
函数和separate_rows
一样,也可以扩展数据框。
library(data.table)
library(splitstackshape)
# data.table/splitstackshape version of the same pipeline.
# setDT() converts `alternative` to a data.table by reference (in place).
setDT(alternative)
# cSplit(..., direction = "long") expands the space-separated slno column,
# like tidyr::separate_rows() above.
original2 <- cSplit(original, "slno", direction = "long", sep = " ")
# Left join keeps every expanded row; the shared `cost` column gets the
# suffixes cost.x (original) and cost.y (alternative).
original3 <- merge(original2, alternative, by = "slno", all.x = TRUE)
# Aggregate per group; which.min() keeps only the first minimum on ties.
original4 <- original3[, .(cost = first(cost.x),
slno = paste(slno, collapse = " "),
max_alternative = name[which.min(cost.y)],
max_saving = first(cost.x) - cost.y[which.min(cost.y)],
oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")),
by = .(group, field1)][order(group)]
original4[]
# group field1 cost slno max_alternative max_saving oth_alt oth_sav
# 1: G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
# 2: G2 B 463.80 1 3 name1 398.82 name3 364.84
# 3: G3 C 471.74 9 name9 416.45
# 4: G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.8;372.68
# 5: G5 E 472.83 1 10 name1 407.85 name10 379.8
性能评价
正如 OP 提到的那样,性能可能是一个问题。这里我使用了 microbenchmark 包和下面的代码,看看哪个更快。 m1
是 dplyr
方法,而 m2
是 data.table
方法。
library(microbenchmark)
# Create data.table copies up front so m2 benchmarks a pure data.table
# pipeline and neither timing is polluted by the conversion cost.
alternative_dt <- as.data.table(alternative)
original_dt <- as.data.table(original)
# Evaluate performance: m1 = dplyr/tidyr approach, m2 = data.table approach.
microbenchmark(m1 = {
  original2 <- original %>%
    separate_rows(slno, convert = TRUE) %>%
    left_join(alternative, by = "slno") %>%
    group_by(group, field1) %>%
    summarise(cost = first(cost.x),
              slno = paste(slno, collapse = " "),
              max_alternative = name[which.min(cost.y)],
              max_saving = first(cost.x) - cost.y[which.min(cost.y)],
              oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
              oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")) %>%
    ungroup()},
  m2 = {original2 <- cSplit(original_dt, "slno", direction = "long", sep = " ")
    # FIX: merge against the data.table copy (`alternative_dt`), not the
    # plain data.frame `alternative`, so m2 actually measures the
    # data.table merge it claims to benchmark.
    original3 <- merge(original2, alternative_dt, by = "slno", all.x = TRUE)
    original4 <- original3[, .(cost = first(cost.x),
                               slno = paste(slno, collapse = " "),
                               max_alternative = name[which.min(cost.y)],
                               max_saving = first(cost.x) - cost.y[which.min(cost.y)],
                               oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
                               oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")),
                           by = .(group, field1)][order(group)]})
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1 21.106662 22.673250 23.978065 23.519644 25.005269 33.26359 100
# m2 3.886784 4.418318 4.730305 4.702078 4.970674 7.61164 100
结果显示data.table
比dplyr
快。如果 OP 正在处理大量数据。 data.table
可能是首选。然而,虽然我没有开发 for-loop 方法并测试性能,但 data.table
和 dplyr
方法都可能比 for-loop 快得多。
数据
# Data used for the outputs above: fixed values (instead of runif) so the
# printed results are reproducible.  Quoting slno keeps each space-separated
# list in one character field; the leading row numbers become row names.
original <- read.table(text = "group field1 cost slno
1 G1 A 330.37 '1 4 5 7'
2 G2 B 463.80 '1 3'
3 G3 C 471.74 '9'
4 G4 D 465.71 '2 5 7 10'
5 G5 E 472.83 '1 10'",
header = TRUE, stringsAsFactors = FALSE)
# Lookup table: one name and one cost per slno.
alternative <- read.table(text = " slno name cost
1 1 name1 64.98
2 2 name2 94.63
3 3 name3 98.96
4 4 name4 68.39
5 5 name5 61.48
6 6 name6 87.46
7 7 name7 75.91
8 8 name8 67.93
9 9 name9 55.29
10 10 name10 93.03",
header = TRUE, stringsAsFactors = FALSE)
一个选项是使用 sqldf
和 dplyr
。该解决方案按步骤显示以保持清晰。
# The data ----
library(sqldf)
library(dplyr)
# Toy inputs: `original$slno` holds a space-separated list of candidate
# slno's per row; `alternative` maps each slno to a name and a cost.
# Costs come from runif(), so every run produces different numbers.
original <- data.frame(group = paste0("G", 1:5),
                       field1 = c("A", "B", "C", "D", "E"),
                       cost = round(runif(5, 300, 500), 2),
                       slno = c("1 4 5 7", "1 3", "9", "2 5 7 10", "1 10"),
                       stringsAsFactors = FALSE)
alternative <- data.frame(slno = 1:10,
                          name = paste0("name", 1:10),
                          cost = round(runif(10, 50, 100), 2),
                          stringsAsFactors = FALSE)
#> original
# group field1 cost slno
#1 G1 A 490.71 1 4 5 7
#2 G2 B 399.20 1 3
#3 G3 C 326.40 9
#4 G4 D 421.69 2 5 7 10
#5 G5 E 498.37 1 10
#> alternative
# slno name cost
#1 1 name1 54.74
#2 2 name2 94.76
#3 3 name3 66.74
#4 4 name4 73.61
#5 5 name5 58.86
#6 6 name6 67.58
#7 7 name7 58.83
#8 8 name8 82.65
#9 9 name9 61.81
#10 10 name10 94.86
# Join every original row to each alternative whose slno appears as a whole
# token in the space-separated `slno` list.  Both sides are padded with
# spaces because the naive  slno LIKE '%'||alt.slno||'%'  matches on raw
# substrings: slno 1 would wrongly match "2 5 7 10" (the "1" inside "10"),
# which is exactly the spurious G4/name1 row in the df output below.
join_query <- "select original.*, alternative.name as alternative,
                      (original.cost - alternative.cost) as saving
               from original inner join alternative
               on ' ' || original.slno || ' ' like '% ' || alternative.slno || ' %'"
df <- sqldf(join_query, stringsAsFactors = FALSE)
#> df
# group field1 cost slno alternative saving
#1 G1 A 490.71 1 4 5 7 name1 435.97
#2 G1 A 490.71 1 4 5 7 name4 417.10
#3 G1 A 490.71 1 4 5 7 name5 431.85
#4 G1 A 490.71 1 4 5 7 name7 431.88
#5 G2 B 399.20 1 3 name1 344.46
#6 G2 B 399.20 1 3 name3 332.46
#7 G3 C 326.40 9 name9 264.59
#8 G4 D 421.69 2 5 7 10 name1 366.95
#9 G4 D 421.69 2 5 7 10 name2 326.93
#10 G4 D 421.69 2 5 7 10 name5 362.83
#11 G4 D 421.69 2 5 7 10 name7 362.86
#12 G4 D 421.69 2 5 7 10 name10 326.83
#13 G5 E 498.37 1 10 name1 443.63
#14 G5 E 498.37 1 10 name10 403.51
# Keep, per group, only the row(s) with the maximum saving.
# NOTE(review): filter(saving == max(saving)) keeps ALL tied rows, so a
# group with an exact tie would appear more than once downstream.
df_maxval <- df %>%
  group_by(group, field1, cost, slno) %>%
  filter(saving == max(saving)) %>%
  ungroup()
# Collapse every non-maximal alternative into comma-separated strings.
df_other <- setdiff(df, df_maxval) %>%
  group_by(group, field1, cost, slno) %>%
  summarise_at(.vars = vars(alternative, saving),
               .funs = c("toString")) %>%
  ungroup()
# Attach the "other" columns.  BUG FIX: use left_join, not inner_join --
# a group whose slno list has a single entry (e.g. G3 with slno "9") has
# no "other" alternatives, and an inner join silently drops that group
# (the originally posted result is missing G3 for this reason).
df_final <- df_maxval %>%
  left_join(df_other, by = c("group", "field1", "slno")) %>%
  select(group, field1, cost = cost.x, slno, max_alternative = alternative.x,
         max_saving = saving.x, oth_alt = alternative.y, oth_sav = saving.y)
#Result
#> df_final
# A tibble: 4 x 8
# Groups: group, field1, cost, slno [4]
# group field1 cost slno max_alternative max_saving oth_alt oth_sav
# <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
#1 G1 A 490.71 1 4 5 7 name1 435.97 name4, name5, name7 417.1, 431.85, 431.88
#2 G2 B 399.20 1 3 name1 344.46 name3 332.46
#3 G4 D 421.69 2 5 7 10 name1 366.95 name2, name5, name7, name10 326.93, 362.83, 362.86, 326.83
#4 G5 E 498.37 1 10 name1 443.63 name10 403.51
这是一个使用基础 R 的试验:
# Base-R attempt.  NOTE(review): df1/df2 are presumably `original` and
# `alternative` -- confirm; this snippet is not self-contained.
# For each row: x = numeric slno vector, y = that row's original cost.
# s = index of the cheapest alternative (which.min keeps the first tie only);
# v = its (name, cost) pair, w = the remaining alternatives.
# NOTE(review): df2[x, ] indexes by ROW POSITION, so this is only correct
# while alternative$slno equals the row number 1..n.
transform(df1,e=t(mapply(function(x,y){
v=df2[x,][s<-which.min(df2$cost[x]),-1];
w=df2[x,][-s,-1]
cbind( c(v[1],y-v[2]),
apply(cbind(w[,1],y-w[,2]),2,paste0,collapse=";"))},
lapply(strsplit(df1$slno," "),as.numeric),df1$cost)))
group field1 cost slno e.1 e.2 e.3 e.4
1 G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
2 G2 B 463.80 1 3 name1 398.82 name3 364.84
3 G3 C 471.74 9 name9 416.45
4 G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.8;372.68
5 G5 E 472.83 1 10 name1 407.85 name10 379.8
>
考虑以下在 R 中使用这 2 个数据帧的示例：
# Toy inputs: `original$slno` holds a space-separated list of candidate
# slno's per row; `alternative` maps each slno to a name and a cost.
# Costs come from runif(), so every run produces different numbers.
original <- data.frame(group = paste0("G", 1:5),
                       field1 = c("A", "B", "C", "D", "E"),
                       cost = round(runif(5, 300, 500), 2),
                       slno = c("1 4 5 7", "1 3", "9", "2 5 7 10", "1 10"),
                       stringsAsFactors = FALSE)
alternative <- data.frame(slno = 1:10,
                          name = paste0("name", 1:10),
                          cost = round(runif(10, 50, 100), 2),
                          stringsAsFactors = FALSE)
我想执行以下步骤并在 original 中加入这些列：将 original 第 4 列中每个以空格分隔的 slno 映射到 alternative 数据帧中并获取对应成本。
从原始成本中减去每项备选成本并计算节省。
original$max_alternative 列应包含节省最大的备选方案的名称，original$max_saving 应包含相应的节省金额。
original$oth_alt 列应以分号分隔列出所有其他备选名称，original$oth_sav 应以分号分隔列出相应的节省金额。
数据集 -->
> original
group field1 cost slno
1 G1 A 330.37 1 4 5 7
2 G2 B 463.80 1 3
3 G3 C 471.74 9
4 G4 D 465.71 2 5 7 10
5 G5 E 472.83 1 10
> alternative
slno name cost
1 1 name1 64.98
2 2 name2 94.63
3 3 name3 98.96
4 4 name4 68.39
5 5 name5 61.48
6 6 name6 87.46
7 7 name7 75.91
8 8 name8 67.93
9 9 name9 55.29
10 10 name10 93.03
期望输出 -->
> original
group field1 cost slno max_alternative max_saving oth_alt oth_sav
1 G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
2 G2 B 463.80 1 3 name1 398.82 name3 364.84
3 G3 C 471.74 9 name9 416.45
4 G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.80;372.68
5 G5 E 472.83 1 10 name1 407.85 name10 379.80
注意事项: 我举了一个小例子来解释我的问题。就我而言,我有巨大的数据帧,每个数据帧都有近 100 万行。因此,for 循环在这种情况下效率不高,因为它需要数小时才能完成 运行。有什么有效的方法可以做到这一点吗?
提前致谢!
使用 dplyr and tidyr 的解决方案。 original2
是最终输出。关键是用separate_rows
展开slno
列,在original
和alternative
之间根据slno
进行join,然后用group_by
和 summarize
总结所有信息。请注意 which.min 只返回向量中第一个最小值的位置；如果有多个值同为最小值，代码仍只会返回第一个。
library(dplyr)
library(tidyr)
# dplyr/tidyr solution: expand the space-separated slno list to one row per
# slno, left-join the alternative costs (merge suffixes: cost.x = original
# cost, cost.y = alternative cost), then collapse back to one row per group.
# NOTE: which.min() returns only the FIRST index of the minimum, so exact
# ties keep just the first cheapest alternative.
original2 <- original %>%
# separate_rows() splits "1 4 5 7" into 4 rows; convert = TRUE parses slno as integer
separate_rows(slno, convert = TRUE) %>%
left_join(alternative, by = "slno") %>%
group_by(group, field1) %>%
# the cheapest alternative yields the largest saving, hence which.min on cost.y
summarise(cost = first(cost.x),
slno = paste(slno, collapse = " "),
max_alternative = name[which.min(cost.y)],
max_saving = first(cost.x) - cost.y[which.min(cost.y)],
oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")) %>%
ungroup() %>%
# stringsAsFactors is effectively a no-op here; columns are already character
as.data.frame(stringsAsFactors = FALSE)
original2
# group field1 cost slno max_alternative max_saving oth_alt oth_sav
# 1 G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
# 2 G2 B 463.80 1 3 name1 398.82 name3 364.84
# 3 G3 C 471.74 9 name9 416.45
# 4 G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.8;372.68
# 5 G5 E 472.83 1 10 name1 407.85 name10 379.8
这是使用 data.table and splitstackshape 的替代方法。 cSplit
函数和separate_rows
一样,也可以扩展数据框。
library(data.table)
library(splitstackshape)
# data.table/splitstackshape version of the same pipeline.
# setDT() converts `alternative` to a data.table by reference (in place).
setDT(alternative)
# cSplit(..., direction = "long") expands the space-separated slno column,
# like tidyr::separate_rows() above.
original2 <- cSplit(original, "slno", direction = "long", sep = " ")
# Left join keeps every expanded row; the shared `cost` column gets the
# suffixes cost.x (original) and cost.y (alternative).
original3 <- merge(original2, alternative, by = "slno", all.x = TRUE)
# Aggregate per group; which.min() keeps only the first minimum on ties.
original4 <- original3[, .(cost = first(cost.x),
slno = paste(slno, collapse = " "),
max_alternative = name[which.min(cost.y)],
max_saving = first(cost.x) - cost.y[which.min(cost.y)],
oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")),
by = .(group, field1)][order(group)]
original4[]
# group field1 cost slno max_alternative max_saving oth_alt oth_sav
# 1: G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
# 2: G2 B 463.80 1 3 name1 398.82 name3 364.84
# 3: G3 C 471.74 9 name9 416.45
# 4: G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.8;372.68
# 5: G5 E 472.83 1 10 name1 407.85 name10 379.8
性能评价
正如 OP 提到的那样,性能可能是一个问题。这里我使用了 microbenchmark 包和下面的代码,看看哪个更快。 m1
是 dplyr
方法,而 m2
是 data.table
方法。
library(microbenchmark)
# Create data.table copies up front so m2 benchmarks a pure data.table
# pipeline and neither timing is polluted by the conversion cost.
alternative_dt <- as.data.table(alternative)
original_dt <- as.data.table(original)
# Evaluate performance: m1 = dplyr/tidyr approach, m2 = data.table approach.
microbenchmark(m1 = {
  original2 <- original %>%
    separate_rows(slno, convert = TRUE) %>%
    left_join(alternative, by = "slno") %>%
    group_by(group, field1) %>%
    summarise(cost = first(cost.x),
              slno = paste(slno, collapse = " "),
              max_alternative = name[which.min(cost.y)],
              max_saving = first(cost.x) - cost.y[which.min(cost.y)],
              oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
              oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")) %>%
    ungroup()},
  m2 = {original2 <- cSplit(original_dt, "slno", direction = "long", sep = " ")
    # FIX: merge against the data.table copy (`alternative_dt`), not the
    # plain data.frame `alternative`, so m2 actually measures the
    # data.table merge it claims to benchmark.
    original3 <- merge(original2, alternative_dt, by = "slno", all.x = TRUE)
    original4 <- original3[, .(cost = first(cost.x),
                               slno = paste(slno, collapse = " "),
                               max_alternative = name[which.min(cost.y)],
                               max_saving = first(cost.x) - cost.y[which.min(cost.y)],
                               oth_alt = paste(name[-which.min(cost.y)], collapse = ";"),
                               oth_sav = paste(first(cost.x) - cost.y[-which.min(cost.y)], collapse = ";")),
                           by = .(group, field1)][order(group)]})
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1 21.106662 22.673250 23.978065 23.519644 25.005269 33.26359 100
# m2 3.886784 4.418318 4.730305 4.702078 4.970674 7.61164 100
结果显示data.table
比dplyr
快。如果 OP 正在处理大量数据。 data.table
可能是首选。然而,虽然我没有开发 for-loop 方法并测试性能,但 data.table
和 dplyr
方法都可能比 for-loop 快得多。
数据
# Data used for the outputs above: fixed values (instead of runif) so the
# printed results are reproducible.  Quoting slno keeps each space-separated
# list in one character field; the leading row numbers become row names.
original <- read.table(text = "group field1 cost slno
1 G1 A 330.37 '1 4 5 7'
2 G2 B 463.80 '1 3'
3 G3 C 471.74 '9'
4 G4 D 465.71 '2 5 7 10'
5 G5 E 472.83 '1 10'",
header = TRUE, stringsAsFactors = FALSE)
# Lookup table: one name and one cost per slno.
alternative <- read.table(text = " slno name cost
1 1 name1 64.98
2 2 name2 94.63
3 3 name3 98.96
4 4 name4 68.39
5 5 name5 61.48
6 6 name6 87.46
7 7 name7 75.91
8 8 name8 67.93
9 9 name9 55.29
10 10 name10 93.03",
header = TRUE, stringsAsFactors = FALSE)
一个选项是使用 sqldf
和 dplyr
。该解决方案按步骤显示以保持清晰。
# The data ----
library(sqldf)
library(dplyr)
# Toy inputs: `original$slno` holds a space-separated list of candidate
# slno's per row; `alternative` maps each slno to a name and a cost.
# Costs come from runif(), so every run produces different numbers.
original <- data.frame(group = paste0("G", 1:5),
                       field1 = c("A", "B", "C", "D", "E"),
                       cost = round(runif(5, 300, 500), 2),
                       slno = c("1 4 5 7", "1 3", "9", "2 5 7 10", "1 10"),
                       stringsAsFactors = FALSE)
alternative <- data.frame(slno = 1:10,
                          name = paste0("name", 1:10),
                          cost = round(runif(10, 50, 100), 2),
                          stringsAsFactors = FALSE)
#> original
# group field1 cost slno
#1 G1 A 490.71 1 4 5 7
#2 G2 B 399.20 1 3
#3 G3 C 326.40 9
#4 G4 D 421.69 2 5 7 10
#5 G5 E 498.37 1 10
#> alternative
# slno name cost
#1 1 name1 54.74
#2 2 name2 94.76
#3 3 name3 66.74
#4 4 name4 73.61
#5 5 name5 58.86
#6 6 name6 67.58
#7 7 name7 58.83
#8 8 name8 82.65
#9 9 name9 61.81
#10 10 name10 94.86
# Join every original row to each alternative whose slno appears as a whole
# token in the space-separated `slno` list.  Both sides are padded with
# spaces because the naive  slno LIKE '%'||alt.slno||'%'  matches on raw
# substrings: slno 1 would wrongly match "2 5 7 10" (the "1" inside "10"),
# which is exactly the spurious G4/name1 row in the df output below.
join_query <- "select original.*, alternative.name as alternative,
                      (original.cost - alternative.cost) as saving
               from original inner join alternative
               on ' ' || original.slno || ' ' like '% ' || alternative.slno || ' %'"
df <- sqldf(join_query, stringsAsFactors = FALSE)
#> df
# group field1 cost slno alternative saving
#1 G1 A 490.71 1 4 5 7 name1 435.97
#2 G1 A 490.71 1 4 5 7 name4 417.10
#3 G1 A 490.71 1 4 5 7 name5 431.85
#4 G1 A 490.71 1 4 5 7 name7 431.88
#5 G2 B 399.20 1 3 name1 344.46
#6 G2 B 399.20 1 3 name3 332.46
#7 G3 C 326.40 9 name9 264.59
#8 G4 D 421.69 2 5 7 10 name1 366.95
#9 G4 D 421.69 2 5 7 10 name2 326.93
#10 G4 D 421.69 2 5 7 10 name5 362.83
#11 G4 D 421.69 2 5 7 10 name7 362.86
#12 G4 D 421.69 2 5 7 10 name10 326.83
#13 G5 E 498.37 1 10 name1 443.63
#14 G5 E 498.37 1 10 name10 403.51
# Keep, per group, only the row(s) with the maximum saving.
# NOTE(review): filter(saving == max(saving)) keeps ALL tied rows, so a
# group with an exact tie would appear more than once downstream.
df_maxval <- df %>%
  group_by(group, field1, cost, slno) %>%
  filter(saving == max(saving)) %>%
  ungroup()
# Collapse every non-maximal alternative into comma-separated strings.
df_other <- setdiff(df, df_maxval) %>%
  group_by(group, field1, cost, slno) %>%
  summarise_at(.vars = vars(alternative, saving),
               .funs = c("toString")) %>%
  ungroup()
# Attach the "other" columns.  BUG FIX: use left_join, not inner_join --
# a group whose slno list has a single entry (e.g. G3 with slno "9") has
# no "other" alternatives, and an inner join silently drops that group
# (the originally posted result is missing G3 for this reason).
df_final <- df_maxval %>%
  left_join(df_other, by = c("group", "field1", "slno")) %>%
  select(group, field1, cost = cost.x, slno, max_alternative = alternative.x,
         max_saving = saving.x, oth_alt = alternative.y, oth_sav = saving.y)
#Result
#> df_final
# A tibble: 4 x 8
# Groups: group, field1, cost, slno [4]
# group field1 cost slno max_alternative max_saving oth_alt oth_sav
# <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
#1 G1 A 490.71 1 4 5 7 name1 435.97 name4, name5, name7 417.1, 431.85, 431.88
#2 G2 B 399.20 1 3 name1 344.46 name3 332.46
#3 G4 D 421.69 2 5 7 10 name1 366.95 name2, name5, name7, name10 326.93, 362.83, 362.86, 326.83
#4 G5 E 498.37 1 10 name1 443.63 name10 403.51
这是一个使用基础 R 的试验:
# Base-R attempt.  NOTE(review): df1/df2 are presumably `original` and
# `alternative` -- confirm; this snippet is not self-contained.
# For each row: x = numeric slno vector, y = that row's original cost.
# s = index of the cheapest alternative (which.min keeps the first tie only);
# v = its (name, cost) pair, w = the remaining alternatives.
# NOTE(review): df2[x, ] indexes by ROW POSITION, so this is only correct
# while alternative$slno equals the row number 1..n.
transform(df1,e=t(mapply(function(x,y){
v=df2[x,][s<-which.min(df2$cost[x]),-1];
w=df2[x,][-s,-1]
cbind( c(v[1],y-v[2]),
apply(cbind(w[,1],y-w[,2]),2,paste0,collapse=";"))},
lapply(strsplit(df1$slno," "),as.numeric),df1$cost)))
group field1 cost slno e.1 e.2 e.3 e.4
1 G1 A 330.37 1 4 5 7 name5 268.89 name1;name4;name7 265.39;261.98;254.46
2 G2 B 463.80 1 3 name1 398.82 name3 364.84
3 G3 C 471.74 9 name9 416.45
4 G4 D 465.71 2 5 7 10 name5 404.23 name2;name7;name10 371.08;389.8;372.68
5 G5 E 472.83 1 10 name1 407.85 name10 379.8
>