R: fast/parallel 多列多行查找
R: fast/parallel multicolumn multirow lookup
我有许多大型(30 万至 100 万行)数据框。我逐个遍历这些数据框(df_i),再逐行遍历,为每一行追加一个值:
在第二个数据框(do2)中查找纬度、经度、月份和深度都匹配的那个值。
纬度/经度/月份(Lat/lon/month)可以精确匹配;深度则更棘手,因为 do2 用 57 列分别存放各个递增深度箱(depth bin)的值:
我的逐行循环代码的核心是一个由三个条件用 & 连接的行子集,再加上一个列子集:
# Row-by-row lookup: for every row of df_i, copy into df_i$DO2 the value of
# do2 whose Latitude/Longitude/Month equal the row's latbin/lonbin/month and
# whose column corresponds to the depth bin nearest to the row's Depth.m.
#
# seq_len() is used instead of 1:nrow(df_i) so that a 0-row df_i skips the
# loop entirely (1:0 would iterate over c(1, 0)).
# NOTE(review): month() is presumably lubridate::month() — confirm it is
# attached before running.
for (j in seq_len(nrow(df_i))) {
  # Index of the closest entry in depthbins, shifted by 3 to step over the
  # three leading columns of do2.
  # NOTE(review): assumes do2's depth columns start at column 4 and follow
  # the order of depthbins — confirm against the real table.
  depth_col <- which.min(abs(depthbins - df_i[j, "Depth.m."])) + 3

  # Logical mask over do2's rows: all three keys must match exactly.
  row_match <- do2$Latitude == df_i[j, "latbin"] &
    do2$Longitude == df_i[j, "lonbin"] &
    do2$Month == month(df_i[j, "DateTimeUTCmin5"])

  df_i[j, "DO2"] <- do2[row_match, depth_col]
}
这行得通,但速度很慢。我知道它可以更快,但我的并行化尝试一直碰壁,而且并行代码的调试和回溯(debug/traceback)要困难得多。我在阅读 this 后尝试了 FBM,但得到如下错误:
value must be unique or of the dimension of x[i, j]
(在运行到大约 20 万行时出现。)我知道 data.table 的索引很快,所以类似的做法或许可行,也许是 data.table 中的多行子集?但这大概与我现有的方案是同一种思路(因为我还需要对列做 subset/lookup),只是可能稍快一点?
有谁知道更聪明的方法吗?我以前一直搞不懂 apply 系列函数,但如果其中有能派上用场的东西,我也不会感到惊讶。
提前致谢。
可重现示例(月份已简化;补充了之前遗漏的 depthbins):
# Reproducible example inputs for the lookup problem.

# Depth-bin values; each observation is matched to the closest one.
depthbins <- c(
  0, 5, 10, 15, 20, 25, 50, 75,
  100, 125, 150, 200, 250, 300, 350, 400
)

# Observations that need a DO2 value looked up.
df_i <- data.frame(
  latbin = c(-77.5, -78, -78.5),
  lonbin = c(-178.5, -177.5, -176.5),
  month = c(1, 2, 3),
  Depth.m. = c(130, 120, 110)
)

# Wide lookup table: three key columns followed by one column per depth bin
# (only the first three bins are included in this cut-down example).
do2 <- tibble(
  Month = rep(1, 3),
  Latitude = c(-78, -78, -79),
  Longitude = c(-178.5, -177.5, -177.5),
  `0` = c(214, 223, 345),
  `5` = c(123, 234, 345),
  `10` = c(345, 456, 567)
)
最终编辑:对 Marius 代码的一些调整:
# Final version: reshape do2 to long form, snap every observation to its
# nearest depth bin, then attach DO2 with a single join.
# NOTE(review): the join keys on a `Month` column in df_i, whereas the
# reproducible example names that column `month` — confirm against real data.

# Return the element of `bins` closest to a single `depth` (the first one on
# ties, as which.min() does); NA depths yield NA instead of a zero-length
# result so the vapply() below always gets exactly one number per row.
nearest_bin <- function(depth, bins) {
  if (is.na(depth)) {
    return(NA_real_)
  }
  bins[which.min(abs(bins - depth))]
}

# Wide -> long: every column after the three leading key columns becomes a
# (depbin, DO2) pair. -seq_len(3) replaces 4:length(colnames(do2)), which
# would count backwards if do2 had fewer than four columns. Plain assignment
# with %>% is used instead of magrittr's %<>%, which library(tidyverse) does
# not attach. gather() has no `.vars` argument, so the columns are passed as
# an ordinary (unnamed) selection.
do2 <- do2 %>%
  gather(key = "depbin", value = "DO2", colnames(do2)[-seq_len(3)])

# Column names arrive as character; convert them so they can be compared
# with numeric depths.
do2$depbin <- as.numeric(do2$depbin)
depthbins <- sort(unique(do2$depbin))

# vapply() guarantees a numeric vector even for empty input, where sapply()
# would return a list.
df_i$depbin <- vapply(df_i$Depth.m., nearest_bin, numeric(1), bins = depthbins)

# One join replaces the row loop; the key columns are dropped afterwards.
df_i <- df_i %>%
  left_join(
    do2,
    by = c(
      "Month" = "Month",
      "latbin" = "Latitude",
      "lonbin" = "Longitude",
      "depbin" = "depbin"
    )
  ) %>%
  select(-Month, -latbin, -lonbin, -depbin)
我认为只要稍作重组,就可以把这个问题转化为一次合并(merge/join)。合并本身应该比你的 for 循环快得多,不过 do2 转成长格式后体积会变大,
再加上准备数据的时间,会抵消一部分收益。请注意,我不得不稍微修改你的示例数据,好让每一行都确实有可匹配的内容:
# Attach the tidyverse before anything else: tibble(), gather(), mutate(),
# left_join() and %>% used below all come from it.
library(tidyverse)

# Example inputs, adjusted so that every row of df_i has a match in do2.
depthbins <- c(0, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, 250, 300, 350, 400)
df_i <- data.frame(
  latbin = c(-77.5, -78, -78.5),
  lonbin = c(-178.5, -177.5, -176.5),
  month = c(1, 2, 3),
  Depth.m. = c(130, 120, 110)
)
do2 <- tibble(
  Month = c(1, 2, 3),
  Latitude = c(-77.5, -78, -78.5),
  Longitude = c(-178.5, -177.5, -176.5),
  "100" = c(214, 223, 345),
  "125" = c(123, 234, 345),
  "150" = c(345, 456, 567)
)

# Return the element of `bins` closest to a single `depth` (the first one on
# ties, as which.min() does); NA depths yield NA instead of a zero-length
# result so the vapply() below always gets exactly one number per row.
nearest_bin <- function(depth, bins) {
  if (is.na(depth)) {
    return(NA_real_)
  }
  bins[which.min(abs(bins - depth))]
}

# Precalculate closest bin for each row. vapply() guarantees a numeric
# vector even for empty input, where sapply() would return a list.
df_i$bin <- vapply(df_i$Depth.m., nearest_bin, numeric(1), bins = depthbins)

# Convert do2 to long: one row per Month/Latitude/Longitude/bin combination.
# The bin names come from column names, so they are converted to numeric.
do2_long <- do2 %>%
  gather(bin, DO2, -Month, -Latitude, -Longitude) %>%
  mutate(bin = as.numeric(bin))

# Now everything can just be done as a merge
# The merge syntax would be a bit cleaner if you give the two df's
# matching column names to start with
df_i %>%
  left_join(
    do2_long,
    by = c(
      "month" = "Month", "latbin" = "Latitude",
      "lonbin" = "Longitude", "bin" = "bin"
    )
  )
我有许多大型(30 万至 100 万行)数据框。我逐个遍历这些数据框(df_i),再逐行遍历,为每一行追加一个值:
在第二个数据框(do2)中查找纬度、经度、月份和深度都匹配的那个值。
纬度/经度/月份(Lat/lon/month)可以精确匹配;深度则更棘手,因为 do2 用 57 列分别存放各个递增深度箱(depth bin)的值:
我的逐行循环代码的核心是一个由三个条件用 & 连接的行子集,再加上一个列子集:
# Row-by-row lookup: for every row of df_i, copy into df_i$DO2 the value of
# do2 whose Latitude/Longitude/Month equal the row's latbin/lonbin/month and
# whose column corresponds to the depth bin nearest to the row's Depth.m.
#
# seq_len() is used instead of 1:nrow(df_i) so that a 0-row df_i skips the
# loop entirely (1:0 would iterate over c(1, 0)).
# NOTE(review): month() is presumably lubridate::month() — confirm it is
# attached before running.
for (j in seq_len(nrow(df_i))) {
  # Index of the closest entry in depthbins, shifted by 3 to step over the
  # three leading columns of do2.
  # NOTE(review): assumes do2's depth columns start at column 4 and follow
  # the order of depthbins — confirm against the real table.
  depth_col <- which.min(abs(depthbins - df_i[j, "Depth.m."])) + 3

  # Logical mask over do2's rows: all three keys must match exactly.
  row_match <- do2$Latitude == df_i[j, "latbin"] &
    do2$Longitude == df_i[j, "lonbin"] &
    do2$Month == month(df_i[j, "DateTimeUTCmin5"])

  df_i[j, "DO2"] <- do2[row_match, depth_col]
}
这行得通,但速度很慢。我知道它可以更快,但我的并行化尝试一直碰壁,而且并行代码的调试和回溯(debug/traceback)要困难得多。我在阅读 this 后尝试了 FBM,但得到如下错误:
value must be unique or of the dimension of x[i, j]
(在运行到大约 20 万行时出现。)我知道 data.table 的索引很快,所以也许可以采用类似的做法……
有谁知道更聪明的方法吗?我以前一直搞不懂 apply 系列函数,但如果其中有能派上用场的东西,我也不会感到惊讶。
提前致谢。
可重现示例(月份已简化;补充了之前遗漏的 depthbins):
# Reproducible example inputs for the lookup problem.

# Depth-bin values; each observation is matched to the closest one.
depthbins <- c(
  0, 5, 10, 15, 20, 25, 50, 75,
  100, 125, 150, 200, 250, 300, 350, 400
)

# Observations that need a DO2 value looked up.
df_i <- data.frame(
  latbin = c(-77.5, -78, -78.5),
  lonbin = c(-178.5, -177.5, -176.5),
  month = c(1, 2, 3),
  Depth.m. = c(130, 120, 110)
)

# Wide lookup table: three key columns followed by one column per depth bin
# (only the first three bins are included in this cut-down example).
do2 <- tibble(
  Month = rep(1, 3),
  Latitude = c(-78, -78, -79),
  Longitude = c(-178.5, -177.5, -177.5),
  `0` = c(214, 223, 345),
  `5` = c(123, 234, 345),
  `10` = c(345, 456, 567)
)
最终编辑:对 Marius 代码的一些调整:
# Final version: reshape do2 to long form, snap every observation to its
# nearest depth bin, then attach DO2 with a single join.
# NOTE(review): the join keys on a `Month` column in df_i, whereas the
# reproducible example names that column `month` — confirm against real data.

# Return the element of `bins` closest to a single `depth` (the first one on
# ties, as which.min() does); NA depths yield NA instead of a zero-length
# result so the vapply() below always gets exactly one number per row.
nearest_bin <- function(depth, bins) {
  if (is.na(depth)) {
    return(NA_real_)
  }
  bins[which.min(abs(bins - depth))]
}

# Wide -> long: every column after the three leading key columns becomes a
# (depbin, DO2) pair. -seq_len(3) replaces 4:length(colnames(do2)), which
# would count backwards if do2 had fewer than four columns. Plain assignment
# with %>% is used instead of magrittr's %<>%, which library(tidyverse) does
# not attach. gather() has no `.vars` argument, so the columns are passed as
# an ordinary (unnamed) selection.
do2 <- do2 %>%
  gather(key = "depbin", value = "DO2", colnames(do2)[-seq_len(3)])

# Column names arrive as character; convert them so they can be compared
# with numeric depths.
do2$depbin <- as.numeric(do2$depbin)
depthbins <- sort(unique(do2$depbin))

# vapply() guarantees a numeric vector even for empty input, where sapply()
# would return a list.
df_i$depbin <- vapply(df_i$Depth.m., nearest_bin, numeric(1), bins = depthbins)

# One join replaces the row loop; the key columns are dropped afterwards.
df_i <- df_i %>%
  left_join(
    do2,
    by = c(
      "Month" = "Month",
      "latbin" = "Latitude",
      "lonbin" = "Longitude",
      "depbin" = "depbin"
    )
  ) %>%
  select(-Month, -latbin, -lonbin, -depbin)
我认为只要稍作重组,就可以把这个问题转化为一次合并(merge/join)。合并本身应该比你的 for 循环快得多,不过 do2 转成长格式后体积会变大,
再加上准备数据的时间,会抵消一部分收益。请注意,我不得不稍微修改你的示例数据,好让每一行都确实有可匹配的内容:
# Attach the tidyverse before anything else: tibble(), gather(), mutate(),
# left_join() and %>% used below all come from it.
library(tidyverse)

# Example inputs, adjusted so that every row of df_i has a match in do2.
depthbins <- c(0, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, 250, 300, 350, 400)
df_i <- data.frame(
  latbin = c(-77.5, -78, -78.5),
  lonbin = c(-178.5, -177.5, -176.5),
  month = c(1, 2, 3),
  Depth.m. = c(130, 120, 110)
)
do2 <- tibble(
  Month = c(1, 2, 3),
  Latitude = c(-77.5, -78, -78.5),
  Longitude = c(-178.5, -177.5, -176.5),
  "100" = c(214, 223, 345),
  "125" = c(123, 234, 345),
  "150" = c(345, 456, 567)
)

# Return the element of `bins` closest to a single `depth` (the first one on
# ties, as which.min() does); NA depths yield NA instead of a zero-length
# result so the vapply() below always gets exactly one number per row.
nearest_bin <- function(depth, bins) {
  if (is.na(depth)) {
    return(NA_real_)
  }
  bins[which.min(abs(bins - depth))]
}

# Precalculate closest bin for each row. vapply() guarantees a numeric
# vector even for empty input, where sapply() would return a list.
df_i$bin <- vapply(df_i$Depth.m., nearest_bin, numeric(1), bins = depthbins)

# Convert do2 to long: one row per Month/Latitude/Longitude/bin combination.
# The bin names come from column names, so they are converted to numeric.
do2_long <- do2 %>%
  gather(bin, DO2, -Month, -Latitude, -Longitude) %>%
  mutate(bin = as.numeric(bin))

# Now everything can just be done as a merge
# The merge syntax would be a bit cleaner if you give the two df's
# matching column names to start with
df_i %>%
  left_join(
    do2_long,
    by = c(
      "month" = "Month", "latbin" = "Latitude",
      "lonbin" = "Longitude", "bin" = "bin"
    )
  )