根据唯一的不同时间间隔合并两个数据集,然后创建一个包含所有不匹配值的数据集(在 R 中)
Merge two datasets based on unique distinct time intervals and then create a dataset with all unmatched values (in R)
我有两个独立的数据集:df1 和 df2。我想创建一个新的数据集 df3:如果两个日期时间相差不超过 20 秒,就把 df1 的 endtime 列与 df2 的 sent 列匹配起来。最后,我想创建一个最终数据集,列出 df2 数据集(sent/ID 数据集)中与 df1 不匹配的所有值。
df1
endtime ID
1/7/2020 1:35:08 AM A
1/7/2020 1:39:00 AM B
1/20/2020 1:45:00 AM C
df2
sent ID
1/7/2020 1:35:20 AM E
1/7/2020 1:42:00 AM F
1/20/2020 1:55:00 AM G
1/20/2020 2:00:00 AM E
这是我想要的 df3 输出。只有一行,因为只有两个值符合结束时间和已发送列的 20 秒内条件。
endtime sent
1/7/2020 1:35:08 AM 1/7/2020 1:35:20 AM
不匹配值的期望输出
sent
1/7/2020 1:42:00 AM
1/20/2020 1:55:00 AM
1/20/2020 2:00:00 AM
这是 dput 的输出:
df1
structure(list(endtime = structure(c(2L, 3L, 1L), .Label = c("1/10/2020 1:45:00 AM",
"1/7/2020 1:35:08 AM", "1/7/2020 1:39:00 AM"), class = "factor"),
ID = structure(1:3, .Label = c("A", "B", "C"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2
structure(list(sent = structure(c(3L, 4L, 1L, 2L), .Label = c("1/20/2020 1:55:00 AM",
"1/20/2020 2:00:00 AM", "1/7/2020 1:35:20 AM", "1/7/2020 1:42:00 AM"
), class = "factor"), ID = structure(c(1L, 2L, 3L, 1L), .Label = c("E",
"F", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
这是我试过的:
我正在考虑执行左连接并匹配值,或者我可以使用 merge(),但棘手的部分是将值与条件语句匹配。任何建议表示赞赏。
# Pair every df1 end time with every df2 sent time, keep only the pairs that
# lie within 20 seconds of each other, turn both columns back into text and
# keep a single row per sent value.
df3 <- crossing(
  endtime = as.POSIXct(df1$endtime, format = "%m/%d/%Y %I:%M:%S %p"),
  sent = as.POSIXct(df2$sent, format = "%m/%d/%Y %I:%M:%S %p")
) %>%
  # sent must fall inside [endtime - 20s, endtime + 20s]
  filter(
    sent >= (endtime - seconds(20)),
    sent <= (endtime + seconds(20))
  ) %>%
  mutate_all(format, format = "%m/%d/%Y %I:%M:%S %p") %>%
  distinct(sent, .keep_all = TRUE)
这很好用,但我不知道如何在 df2 数据集中找到本质上 "left-over" 并且没有匹配的所有值。任何建议表示赞赏。
library(dplyr)
library(tidyr)
library(lubridate)
# Example data, rebuilt from dput() output.
# df1: 3 rows. `endtime` is a factor of 12-hour "m/d/Y h:M:S AM/PM" strings and
# `ID` is a factor. The integer codes index into .Label, so the row order is
# 1/7/2020 1:35:08 AM (A), 1/7/2020 1:39:00 AM (B), 1/10/2020 1:45:00 AM (C).
df1 <- structure(list(endtime = structure(c(2L, 3L, 1L), .Label = c("1/10/2020 1:45:00 AM", "1/7/2020 1:35:08 AM", "1/7/2020 1:39:00 AM"),class = "factor"),
ID = structure(1:3, .Label = c("A", "B", "C"), class = "factor")),
class = "data.frame", row.names = c(NA, -3L))
# df2: 4 rows. `sent` is a factor of timestamp strings in the same text layout
# and `ID` is a factor with levels E, F, G (E appears in the first and last row).
df2 <- structure(list(sent = structure(c(3L, 4L, 1L, 2L), .Label = c("1/20/2020 1:55:00 AM", "1/20/2020 2:00:00 AM", "1/7/2020 1:35:20 AM", "1/7/2020 1:42:00 AM"), class = "factor"),
ID = structure(c(1L, 2L, 3L, 1L), .Label = c("E", "F", "G"), class = "factor")),
class = "data.frame", row.names = c(NA, -4L))
编辑:
我不得不稍微修改日期的格式化方式,因为输出的日期时间字符串中没有显示 "AM"/"PM" 部分:
# A tibble: 1 x 2
endtime sent
<chr> <chr>
1 "01/07/2020 01:35:08 " "01/07/2020 01:35:20 "
之前:
mutate_all(format, format = "%m/%d/%Y %I:%M:%S %p")
之后:
mutate_all(format, format = "%m/%d/%Y %H:%M:%S")
修改后用于创建 df3 的代码:
# Build every (endtime, sent) combination from the parsed timestamps, keep the
# pairs that are at most 20 seconds apart, write both columns out as 24-hour
# text and keep one row per sent value.
df3 <- crossing(
  endtime = as.POSIXct(df1$endtime, format = "%m/%d/%Y %I:%M:%S %p"),
  sent = as.POSIXct(df2$sent, format = "%m/%d/%Y %I:%M:%S %p")
) %>%
  # sent must fall inside [endtime - 20s, endtime + 20s]
  filter(
    sent >= (endtime - seconds(20)),
    sent <= (endtime + seconds(20))
  ) %>%
  mutate_all(format, format = "%m/%d/%Y %H:%M:%S") %>%
  distinct(sent, .keep_all = TRUE)

# Print the matched pairs.
df3
现在:
# A tibble: 1 x 2
endtime sent
<chr> <chr>
1 01/07/2020 01:35:08 01/07/2020 01:35:20
评论:为什么 sent 和 endtime 在这里被重新转换为字符串?难道不应该让它们保持为日期时间类型吗?因为接下来我们还得再把它们转换回日期时间类型。
建议的解决方案:
识别不匹配的行:
# Rows of df1 / df2 whose timestamp does not appear among the matches in df3.
#
# df1$endtime and df2$sent still hold the original 12-hour strings with an
# AM/PM suffix, so they are parsed with "%I:%M:%S %p", the same format used
# when df3 was built from these columns. Parsing them with "%H:%M:%S" leaves
# the AM/PM suffix out of the format, so PM times (and 12:xx AM) are read as
# the wrong hour and never equal the corresponding df3 value.
# df3 was written out in 24-hour form, so it is read back with "%H:%M:%S".
# NOTE(review): "%p" parsing is locale dependent; this relies on the same
# locale behaviour as the df3 construction.
df1_unmatched <- df1 %>%
  mutate(
    endtime = as.POSIXct(
      as.character(endtime),
      format = "%m/%d/%Y %I:%M:%S %p"
    )
  ) %>%
  filter(!endtime %in% as.POSIXct(df3$endtime, format = "%m/%d/%Y %H:%M:%S"))

df2_unmatched <- df2 %>%
  mutate(
    sent = as.POSIXct(
      as.character(sent),
      format = "%m/%d/%Y %I:%M:%S %p"
    )
  ) %>%
  filter(!sent %in% as.POSIXct(df3$sent, format = "%m/%d/%Y %H:%M:%S"))
df1_unmatched
Returns:
endtime ID
1 2020-01-07 01:39:00 B
2 2020-01-10 01:45:00 C
和
df2_unmatched
Returns:
sent ID
1 2020-01-07 01:42:00 F
2 2020-01-20 01:55:00 G
3 2020-01-20 02:00:00 E
使用 data.table 的非等值连接(non-equi join):
# Non-equi join: give every df1 row a window of 20 seconds either side of
# endtime, then keep each (df1, df2) pair whose sent time lies in that window.
# `:=` adds the window columns st / et to df1 by reference.
df1[, `:=`(st = endtime - 20L, et = endtime + 20L)]
df3 <- df1[
  df2,
  .(ID1 = x.ID, ID2 = i.ID, endtime, sent),
  on = .(st <= sent, et >= sent),
  nomatch = 0L
]
输出:
ID1 ID2 endtime sent
1: A E 2020-01-07 01:35:08 2020-01-07 01:35:20
数据:
library(data.table)
# Convert both data frames to data.tables by reference, then replace each
# factor timestamp column with POSIXct values parsed from its 12-hour
# AM/PM text.
setDT(df1)
df1[, endtime := as.POSIXct(as.character(endtime), format = "%m/%d/%Y %I:%M:%S %p")]
setDT(df2)
df2[, sent := as.POSIXct(as.character(sent), format = "%m/%d/%Y %I:%M:%S %p")]
我有两个独立的数据集:df1 和 df2。我想创建一个新的数据集 df3:如果两个日期时间相差不超过 20 秒,就把 df1 的 endtime 列与 df2 的 sent 列匹配起来。最后,我想创建一个最终数据集,列出 df2 数据集(sent/ID 数据集)中与 df1 不匹配的所有值。
df1
endtime ID
1/7/2020 1:35:08 AM A
1/7/2020 1:39:00 AM B
1/20/2020 1:45:00 AM C
df2
sent ID
1/7/2020 1:35:20 AM E
1/7/2020 1:42:00 AM F
1/20/2020 1:55:00 AM G
1/20/2020 2:00:00 AM E
这是我想要的 df3 输出。只有一行,因为只有两个值符合结束时间和已发送列的 20 秒内条件。
endtime sent
1/7/2020 1:35:08 AM 1/7/2020 1:35:20 AM
不匹配值的期望输出
sent
1/7/2020 1:42:00 AM
1/20/2020 1:55:00 AM
1/20/2020 2:00:00 AM
这是 dput 的输出:
df1
structure(list(endtime = structure(c(2L, 3L, 1L), .Label = c("1/10/2020 1:45:00 AM",
"1/7/2020 1:35:08 AM", "1/7/2020 1:39:00 AM"), class = "factor"),
ID = structure(1:3, .Label = c("A", "B", "C"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2
structure(list(sent = structure(c(3L, 4L, 1L, 2L), .Label = c("1/20/2020 1:55:00 AM",
"1/20/2020 2:00:00 AM", "1/7/2020 1:35:20 AM", "1/7/2020 1:42:00 AM"
), class = "factor"), ID = structure(c(1L, 2L, 3L, 1L), .Label = c("E",
"F", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
这是我试过的:
我正在考虑执行左连接并匹配值,或者我可以使用 merge(),但棘手的部分是将值与条件语句匹配。任何建议表示赞赏。
# Pair every df1 end time with every df2 sent time, keep only the pairs that
# lie within 20 seconds of each other, turn both columns back into text and
# keep a single row per sent value.
df3 <- crossing(
  endtime = as.POSIXct(df1$endtime, format = "%m/%d/%Y %I:%M:%S %p"),
  sent = as.POSIXct(df2$sent, format = "%m/%d/%Y %I:%M:%S %p")
) %>%
  # sent must fall inside [endtime - 20s, endtime + 20s]
  filter(
    sent >= (endtime - seconds(20)),
    sent <= (endtime + seconds(20))
  ) %>%
  mutate_all(format, format = "%m/%d/%Y %I:%M:%S %p") %>%
  distinct(sent, .keep_all = TRUE)
这很好用,但我不知道如何在 df2 数据集中找到本质上 "left-over" 并且没有匹配的所有值。任何建议表示赞赏。
library(dplyr)
library(tidyr)
library(lubridate)
# Example data, rebuilt from dput() output.
# df1: 3 rows. `endtime` is a factor of 12-hour "m/d/Y h:M:S AM/PM" strings and
# `ID` is a factor. The integer codes index into .Label, so the row order is
# 1/7/2020 1:35:08 AM (A), 1/7/2020 1:39:00 AM (B), 1/10/2020 1:45:00 AM (C).
df1 <- structure(list(endtime = structure(c(2L, 3L, 1L), .Label = c("1/10/2020 1:45:00 AM", "1/7/2020 1:35:08 AM", "1/7/2020 1:39:00 AM"),class = "factor"),
ID = structure(1:3, .Label = c("A", "B", "C"), class = "factor")),
class = "data.frame", row.names = c(NA, -3L))
# df2: 4 rows. `sent` is a factor of timestamp strings in the same text layout
# and `ID` is a factor with levels E, F, G (E appears in the first and last row).
df2 <- structure(list(sent = structure(c(3L, 4L, 1L, 2L), .Label = c("1/20/2020 1:55:00 AM", "1/20/2020 2:00:00 AM", "1/7/2020 1:35:20 AM", "1/7/2020 1:42:00 AM"), class = "factor"),
ID = structure(c(1L, 2L, 3L, 1L), .Label = c("E", "F", "G"), class = "factor")),
class = "data.frame", row.names = c(NA, -4L))
编辑:
我不得不稍微修改日期的格式化方式,因为输出的日期时间字符串中没有显示 "AM"/"PM" 部分:
# A tibble: 1 x 2
endtime sent
<chr> <chr>
1 "01/07/2020 01:35:08 " "01/07/2020 01:35:20 "
之前:
mutate_all(format, format = "%m/%d/%Y %I:%M:%S %p")
之后:
mutate_all(format, format = "%m/%d/%Y %H:%M:%S")
修改后用于创建 df3 的代码:
# Build every (endtime, sent) combination from the parsed timestamps, keep the
# pairs that are at most 20 seconds apart, write both columns out as 24-hour
# text and keep one row per sent value.
df3 <- crossing(
  endtime = as.POSIXct(df1$endtime, format = "%m/%d/%Y %I:%M:%S %p"),
  sent = as.POSIXct(df2$sent, format = "%m/%d/%Y %I:%M:%S %p")
) %>%
  # sent must fall inside [endtime - 20s, endtime + 20s]
  filter(
    sent >= (endtime - seconds(20)),
    sent <= (endtime + seconds(20))
  ) %>%
  mutate_all(format, format = "%m/%d/%Y %H:%M:%S") %>%
  distinct(sent, .keep_all = TRUE)

# Print the matched pairs.
df3
现在:
# A tibble: 1 x 2
endtime sent
<chr> <chr>
1 01/07/2020 01:35:08 01/07/2020 01:35:20
评论:为什么 sent 和 endtime 在这里被重新转换为字符串?难道不应该让它们保持为日期时间类型吗?因为接下来我们还得再把它们转换回日期时间类型。
建议的解决方案:
识别不匹配的行:
# Rows of df1 / df2 whose timestamp does not appear among the matches in df3.
#
# df1$endtime and df2$sent still hold the original 12-hour strings with an
# AM/PM suffix, so they are parsed with "%I:%M:%S %p", the same format used
# when df3 was built from these columns. Parsing them with "%H:%M:%S" leaves
# the AM/PM suffix out of the format, so PM times (and 12:xx AM) are read as
# the wrong hour and never equal the corresponding df3 value.
# df3 was written out in 24-hour form, so it is read back with "%H:%M:%S".
# NOTE(review): "%p" parsing is locale dependent; this relies on the same
# locale behaviour as the df3 construction.
df1_unmatched <- df1 %>%
  mutate(
    endtime = as.POSIXct(
      as.character(endtime),
      format = "%m/%d/%Y %I:%M:%S %p"
    )
  ) %>%
  filter(!endtime %in% as.POSIXct(df3$endtime, format = "%m/%d/%Y %H:%M:%S"))

df2_unmatched <- df2 %>%
  mutate(
    sent = as.POSIXct(
      as.character(sent),
      format = "%m/%d/%Y %I:%M:%S %p"
    )
  ) %>%
  filter(!sent %in% as.POSIXct(df3$sent, format = "%m/%d/%Y %H:%M:%S"))
df1_unmatched
Returns:
endtime ID
1 2020-01-07 01:39:00 B
2 2020-01-10 01:45:00 C
和
df2_unmatched
Returns:
sent ID
1 2020-01-07 01:42:00 F
2 2020-01-20 01:55:00 G
3 2020-01-20 02:00:00 E
使用 data.table 的非等值连接(non-equi join):
# Non-equi join: give every df1 row a window of 20 seconds either side of
# endtime, then keep each (df1, df2) pair whose sent time lies in that window.
# `:=` adds the window columns st / et to df1 by reference.
df1[, `:=`(st = endtime - 20L, et = endtime + 20L)]
df3 <- df1[
  df2,
  .(ID1 = x.ID, ID2 = i.ID, endtime, sent),
  on = .(st <= sent, et >= sent),
  nomatch = 0L
]
输出:
ID1 ID2 endtime sent
1: A E 2020-01-07 01:35:08 2020-01-07 01:35:20
数据:
library(data.table)
# Convert both data frames to data.tables by reference, then replace each
# factor timestamp column with POSIXct values parsed from its 12-hour
# AM/PM text.
setDT(df1)
df1[, endtime := as.POSIXct(as.character(endtime), format = "%m/%d/%Y %I:%M:%S %p")]
setDT(df2)
df2[, sent := as.POSIXct(as.character(sent), format = "%m/%d/%Y %I:%M:%S %p")]