R:有条件地将数据从一个数据帧提取到另一个数据帧
R: Conditionally extract data from one dataframe to another
我有两个数据框,我想有条件地从一个数据框的一列中提取数据,并将其放入另一个数据框的新列中。
数据帧 1 如下所示:
df1 <- data.frame(date.start = c("2019-06-10 11:52:00",
"2019-06-11 11:52:00", "2019-06-12 11:51:00"), date.end =
c("2019-06-10 11:53:00", "2019-06-11 11:53:00", "2019-06-12 11:53:00"))
数据帧 2 如下所示:
df2 <- data.frame(date.start = c("2019-06-11 11:50:00",
"2019-06-10 11:51:00", "2019-06-12 11:50:00"), date.end =
c("2019-06-11 11:54:00", "2019-06-11 08:59:00", "2019-06-12 11:57:00"),
day = c(1, 15, 64))
如果 df.1 的 date.start
和 date.end
落在 df2
的任何行的 date.start
或 date.end
范围内,我想提取df2
中的变量 day
并将其放入 df1
.
的匹配行
预期结果如下所示:
expected.out <- data.frame(date.start = c("2019-06-10 11:52:00", "2019-06-11 11:52:00", "2019-06-12 11:51:00"),
date.end = c("2019-06-10 11:53:00", "2019-06-11 11:53:00", "2019-06-12 11:53:00"),
day = c(15, 1, 64))
我目前有以下循环可以工作,但是当我在我的大数据帧(行 = 1135133)上 运行 它时它非常慢,我想知道是否有更快的方法来做到这一点.
for(i in 1:nrow(df1)){
find.match <- which(df1$date.start[i] >= df2$date.start &
df1$date.end[i] <= df2$date.end)
if(length(find.match) !=0){
df1$day[i] <- df2$day[find.match]
}
}
使用library(fuzzyjoin)
library(tidyverse)
library(lubridate)
library(fuzzyjoin)
df1 <- data.frame(
date.start = c("2019-06-10 11:52:00", "2019-06-11 11:52:00", "2019-06-12 11:51:00"),
date.end = c("2019-06-10 11:53:00", "2019-06-11 11:53:00", "2019-06-12 11:53:00"), stringsAsFactors = F)
df2 <- data.frame(date.start = c("2019-06-11 11:50:00", "2019-06-10 11:51:00", "2019-06-12 11:50:00"),
date.end = c("2019-06-11 11:54:00", "2019-06-11 08:59:00", "2019-06-12 11:57:00"),
day = c(1, 15, 64), stringsAsFactors = F)
df1 <- df1 %>%
mutate(across(where(is.character), ymd_hms)) %>%
as_tibble()
df2 <- df2 %>%
mutate(across(where(is.character), ymd_hms)) %>%
as_tibble()
fuzzy_left_join(df1, df2, by = c("date.start", "date.end"), match_fun = list(`>=`, `<=`))
# A tibble: 3 x 5
date.start.x date.end.x date.start.y date.end.y day
<dttm> <dttm> <dttm> <dttm> <dbl>
1 2019-06-10 11:52:00 2019-06-10 11:53:00 2019-06-10 11:51:00 2019-06-11 08:59:00 15
2 2019-06-11 11:52:00 2019-06-11 11:53:00 2019-06-11 11:50:00 2019-06-11 11:54:00 1
3 2019-06-12 11:51:00 2019-06-12 11:53:00 2019-06-12 11:50:00 2019-06-12 11:57:00 64
由 reprex package (v0.3.0)
于 2020-09-23 创建
不确定该方法是否快速
您可以在 sapply
中使用 match
来获取 df2
的第一行,其中日期在给定时间范围内。
df1[] <- lapply(df1, as.POSIXct) #Convert character to POSIXct
df2[1:2] <- lapply(df2[1:2], as.POSIXct)
df1$day <- df2$day[sapply(asplit(df1, 1), function(x) {match(TRUE,
x[1] >= df2[,1] & x[2] <= df2[,2])})]
df1
# date.start date.end day
#1 2019-06-10 11:52:00 2019-06-10 11:53:00 15
#2 2019-06-11 11:52:00 2019-06-11 11:53:00 1
#3 2019-06-12 11:51:00 2019-06-12 11:53:00 64
将 data.table
中的 between
与 outer
结合使用。 which.max
扫描匹配矩阵中的 TRUE
值。
library(data.table)
FUN <- Vectorize(function(x, y) all(between(unlist(df1[x, ]), df2[y, 1], df2[y, 2])))
res <- transform(df1, day=df2[apply(outer(1:3, 1:3, FUN), 1, which.max), 3])
res
# date.start date.end day
# 1 2019-06-10 11:52:00 2019-06-10 11:53:00 15
# 2 2019-06-11 11:52:00 2019-06-11 11:53:00 1
# 3 2019-06-12 11:51:00 2019-06-12 11:53:00 64
您可能需要事先转换为 POSIXct
格式以应用解决方案。
df1[1:2] <- lapply(df1[1:2], as.POSIXct)
df2[1:2] <- lapply(df2[1:2], as.POSIXct)
数据:
df1 <- structure(list(date.start = structure(c(1560160320, 1560246720,
1560333060), class = c("POSIXct", "POSIXt"), tzone = ""), date.end = structure(c(1560160380,
1560246780, 1560333180), class = c("POSIXct", "POSIXt"), tzone = "")), row.names = c(NA,
-3L), class = "data.frame")
df2 <- structure(list(date.start = structure(c(1560246600, 1560160260,
1560333000), class = c("POSIXct", "POSIXt"), tzone = ""), date.end = structure(c(1560246840,
1560236340, 1560333420), class = c("POSIXct", "POSIXt"), tzone = ""),
day = c(1, 15, 64)), row.names = c(NA, -3L), class = "data.frame")
我有两个数据框,我想有条件地从一个数据框的一列中提取数据,并将其放入另一个数据框的新列中。
数据帧 1 如下所示:
df1 <- data.frame(date.start = c("2019-06-10 11:52:00",
"2019-06-11 11:52:00", "2019-06-12 11:51:00"), date.end =
c("2019-06-10 11:53:00", "2019-06-11 11:53:00", "2019-06-12 11:53:00"))
数据帧 2 如下所示:
df2 <- data.frame(date.start = c("2019-06-11 11:50:00",
"2019-06-10 11:51:00", "2019-06-12 11:50:00"), date.end =
c("2019-06-11 11:54:00", "2019-06-11 08:59:00", "2019-06-12 11:57:00"),
day = c(1, 15, 64))
如果 df.1 的 date.start
和 date.end
落在 df2
的任何行的 date.start
或 date.end
范围内,我想提取df2
中的变量 day
并将其放入 df1
.
预期结果如下所示:
expected.out <- data.frame(date.start = c("2019-06-10 11:52:00", "2019-06-11 11:52:00", "2019-06-12 11:51:00"),
date.end = c("2019-06-10 11:53:00", "2019-06-11 11:53:00", "2019-06-12 11:53:00"),
day = c(15, 1, 64))
我目前有以下循环可以工作,但是当我在我的大数据帧(行 = 1135133)上 运行 它时它非常慢,我想知道是否有更快的方法来做到这一点.
for(i in 1:nrow(df1)){
find.match <- which(df1$date.start[i] >= df2$date.start &
df1$date.end[i] <= df2$date.end)
if(length(find.match) !=0){
df1$day[i] <- df2$day[find.match]
}
}
使用library(fuzzyjoin)
library(tidyverse)
library(lubridate)
library(fuzzyjoin)
df1 <- data.frame(
date.start = c("2019-06-10 11:52:00", "2019-06-11 11:52:00", "2019-06-12 11:51:00"),
date.end = c("2019-06-10 11:53:00", "2019-06-11 11:53:00", "2019-06-12 11:53:00"), stringsAsFactors = F)
df2 <- data.frame(date.start = c("2019-06-11 11:50:00", "2019-06-10 11:51:00", "2019-06-12 11:50:00"),
date.end = c("2019-06-11 11:54:00", "2019-06-11 08:59:00", "2019-06-12 11:57:00"),
day = c(1, 15, 64), stringsAsFactors = F)
df1 <- df1 %>%
mutate(across(where(is.character), ymd_hms)) %>%
as_tibble()
df2 <- df2 %>%
mutate(across(where(is.character), ymd_hms)) %>%
as_tibble()
fuzzy_left_join(df1, df2, by = c("date.start", "date.end"), match_fun = list(`>=`, `<=`))
# A tibble: 3 x 5
date.start.x date.end.x date.start.y date.end.y day
<dttm> <dttm> <dttm> <dttm> <dbl>
1 2019-06-10 11:52:00 2019-06-10 11:53:00 2019-06-10 11:51:00 2019-06-11 08:59:00 15
2 2019-06-11 11:52:00 2019-06-11 11:53:00 2019-06-11 11:50:00 2019-06-11 11:54:00 1
3 2019-06-12 11:51:00 2019-06-12 11:53:00 2019-06-12 11:50:00 2019-06-12 11:57:00 64
由 reprex package (v0.3.0)
于 2020-09-23 创建不确定该方法是否快速
您可以在 sapply
中使用 match
来获取 df2
的第一行,其中日期在给定时间范围内。
df1[] <- lapply(df1, as.POSIXct) #Convert character to POSIXct
df2[1:2] <- lapply(df2[1:2], as.POSIXct)
df1$day <- df2$day[sapply(asplit(df1, 1), function(x) {match(TRUE,
x[1] >= df2[,1] & x[2] <= df2[,2])})]
df1
# date.start date.end day
#1 2019-06-10 11:52:00 2019-06-10 11:53:00 15
#2 2019-06-11 11:52:00 2019-06-11 11:53:00 1
#3 2019-06-12 11:51:00 2019-06-12 11:53:00 64
将 data.table
中的 between
与 outer
结合使用。 which.max
扫描匹配矩阵中的 TRUE
值。
library(data.table)
FUN <- Vectorize(function(x, y) all(between(unlist(df1[x, ]), df2[y, 1], df2[y, 2])))
res <- transform(df1, day=df2[apply(outer(1:3, 1:3, FUN), 1, which.max), 3])
res
# date.start date.end day
# 1 2019-06-10 11:52:00 2019-06-10 11:53:00 15
# 2 2019-06-11 11:52:00 2019-06-11 11:53:00 1
# 3 2019-06-12 11:51:00 2019-06-12 11:53:00 64
您可能需要事先转换为 POSIXct
格式以应用解决方案。
df1[1:2] <- lapply(df1[1:2], as.POSIXct)
df2[1:2] <- lapply(df2[1:2], as.POSIXct)
数据:
df1 <- structure(list(date.start = structure(c(1560160320, 1560246720,
1560333060), class = c("POSIXct", "POSIXt"), tzone = ""), date.end = structure(c(1560160380,
1560246780, 1560333180), class = c("POSIXct", "POSIXt"), tzone = "")), row.names = c(NA,
-3L), class = "data.frame")
df2 <- structure(list(date.start = structure(c(1560246600, 1560160260,
1560333000), class = c("POSIXct", "POSIXt"), tzone = ""), date.end = structure(c(1560246840,
1560236340, 1560333420), class = c("POSIXct", "POSIXt"), tzone = ""),
day = c(1, 15, 64)), row.names = c(NA, -3L), class = "data.frame")