在 df2 的日期时间使用 df1 的 "hour" 和 "min" 条件合并 2 个数据帧
Merge 2 dataframes using conditions on "hour" and "min" of df1 in datetimes of df2
我有一个这样的数据框 df.sample
# Sample readings for a single id: one row per (date, hour, min) measurement.
id <- rep("A", 11)
date <- c(
  "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12",
  "2018-11-12", "2018-11-12", "2018-11-14", "2018-11-14", "2018-11-14",
  "2018-11-12"
)
hour <- c(8, 8, 9, 9, 13, 13, 16, 6, 7, 19, 7)
min <- c(47, 59, 6, 18, 22, 36, 12, 32, 12, 21, 47)
value <- c(70, 70, 86, 86, 86, 74, 81, 77, 79, 83, 91)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.sample <- data.frame(id, date, hour, min, value, stringsAsFactors = FALSE)
# Store the calendar day as a Date rather than a character string.
df.sample$date <- as.Date(df.sample$date, format = "%Y-%m-%d")
我有另一个数据框 df.state
像这样
# State intervals per id: each row is a [starttime, endtime] window with a label.
id <- rep("A", 3)
starttime <- c("2018-11-12 08:59:00", "2018-11-14 06:24:17", "2018-11-15 09:17:00")
endtime <- c("2018-11-12 15:57:00", "2018-11-14 17:22:16", "2018-11-15 12:17:32")
state <- rep("Pass", 3)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.state <- data.frame(id, starttime, endtime, state, stringsAsFactors = FALSE)
# No `tz` is given, so both bounds are parsed in the session's local time zone.
df.state$starttime <- as.POSIXct(df.state$starttime, format = "%Y-%m-%d %H:%M:%S")
df.state$endtime <- as.POSIXct(df.state$endtime, format = "%Y-%m-%d %H:%M:%S")
我正在尝试根据条件合并这 2 个数据帧:如果 df.sample 中的 hour 和 min 落在 df.state 的 starttime 和 endtime 之间,则把 state = Pass 合并到 df.sample 中。
例如,df.sample 中的第 2 行有 hour = 8、min = 59,由于它落在 df.state 中以 starttime = 2018-11-12 08:59:00 开始的时间区间内,因此添加值 Pass。
这是我的期望输出
id date hour min value state
A 2018-11-12 8 47 70
A 2018-11-12 8 59 70 Pass
A 2018-11-12 9 6 86 Pass
A 2018-11-12 9 18 86 Pass
A 2018-11-12 13 22 86 Pass
A 2018-11-12 13 36 74 Pass
A 2018-11-12 16 12 81
A 2018-11-14 6 32 77 Pass
A 2018-11-14 7 12 79 Pass
A 2018-11-14 19 21 83
A 2018-11-12 7 47 91
我可以像这样合并这两个数据帧,但无法在 df.state
的开始时间和结束时间查找 df.sample 的小时和分钟
# Load the tidyverse (supplies dplyr's left_join() and the %>% pipe).
library(tidyverse)
# Attempted merge: with no `by` argument, left_join() joins on every column
# name the two data frames share (here only `id`), so the start/end time
# window is not taken into account.
df.sample <- df.sample %>%
left_join(df.state)
谁能给我指出正确的方向
(重要准备说明:as.POSIXct
使用本地时区创建 POSIXct 值,而 lubridate::ymd
创建 UTC 时间。如果时区在下面的连接中有所不同,您将得到意想不到的结果。)
# Re-parse both interval bounds with lubridate so that they become UTC
# date-times, matching the UTC sample times built with lubridate::ymd_hm().
df.state <- transform(
  df.state,
  starttime = lubridate::ymd_hms(starttime),
  endtime = lubridate::ymd_hms(endtime)
)
这可以用 fuzzyjoin 来完成:
library(fuzzyjoin)
# Left-join every sample to the state interval that contains it:
#   * `sample_time` is assembled from the date/hour/min columns (parsed as UTC),
#   * ids must be equal, and
#   * starttime <= sample_time <= endtime.
# Samples that fall in no interval are kept, with NA in the df.state columns.
fuzzy_left_join(
  mutate(df.sample, sample_time = lubridate::ymd_hm(paste(date, hour, min))),
  df.state,
  by = c(
    id = "id",
    sample_time = "starttime",
    sample_time = "endtime"
  ),
  match_fun = list(`==`, `>=`, `<=`)
)
id.x date hour min value sample_time id.y starttime endtime state
1 A 2018-11-12 8 47 70 2018-11-12 08:47:00 <NA> <NA> <NA> <NA>
2 A 2018-11-12 8 59 70 2018-11-12 08:59:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
3 A 2018-11-12 9 6 86 2018-11-12 09:06:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
4 A 2018-11-12 9 18 86 2018-11-12 09:18:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
5 A 2018-11-12 13 22 86 2018-11-12 13:22:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
6 A 2018-11-12 13 36 74 2018-11-12 13:36:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
7 A 2018-11-12 16 12 81 2018-11-12 16:12:00 <NA> <NA> <NA> <NA>
8 A 2018-11-14 6 32 77 2018-11-14 06:32:00 A 2018-11-14 06:24:17 2018-11-14 17:22:16 Pass
9 A 2018-11-14 7 12 79 2018-11-14 07:12:00 A 2018-11-14 06:24:17 2018-11-14 17:22:16 Pass
10 A 2018-11-14 19 21 83 2018-11-14 19:21:00 <NA> <NA> <NA> <NA>
11 A 2018-11-12 7 47 91 2018-11-12 07:47:00 <NA> <NA> <NA> <NA>
如果您碰巧有大数据帧,使用 data.table 包的非等值连接(non-equi join)会更快更容易: Benchmark | Video
library(data.table)
## Turn both data.frames into data.tables by reference
setDT(df.sample)
setDT(df.state)
## Add a POSIXct `time` column assembled from the date, hour and min columns
df.sample[, time := as.POSIXct(paste(date, paste(hour, min, "00", sep = ":")))]
## Move `id` and `time` to the front; the other columns keep their order
setcolorder(df.sample, c("id", "time"))
# Non-equi join: for each sample row, look up the state row with the same id
# whose [starttime, endtime] window contains the sample's `time`.
# The "x." prefix refers explicitly to a column of df.state (the outer table).
df.state[
  df.sample,
  .(id, time, date, hour, min, value, state = x.state),
  on = .(id, starttime <= time, endtime >= time)
]
#> id time date hour min value state
#> 1: A 2018-11-12 08:47:00 2018-11-12 8 47 70 <NA>
#> 2: A 2018-11-12 08:59:00 2018-11-12 8 59 70 Pass
#> 3: A 2018-11-12 09:06:00 2018-11-12 9 6 86 Pass
#> 4: A 2018-11-12 09:18:00 2018-11-12 9 18 86 Pass
#> 5: A 2018-11-12 13:22:00 2018-11-12 13 22 86 Pass
#> 6: A 2018-11-12 13:36:00 2018-11-12 13 36 74 Pass
#> 7: A 2018-11-12 16:12:00 2018-11-12 16 12 81 <NA>
#> 8: A 2018-11-14 06:32:00 2018-11-14 6 32 77 Pass
#> 9: A 2018-11-14 07:12:00 2018-11-14 7 12 79 Pass
#> 10: A 2018-11-14 19:21:00 2018-11-14 19 21 83 <NA>
#> 11: A 2018-11-12 07:47:00 2018-11-12 7 47 91 <NA>
### Same non-equi join, but drop the samples that fall in no state window
### (nomatch = 0L turns the join into an inner join, so no NA rows remain)
df.state[
  df.sample,
  .(id, time, date, hour, min, value, state = x.state),
  on = .(id, starttime <= time, endtime >= time),
  nomatch = 0L
]
#> id time date hour min value state
#> 1: A 2018-11-12 08:59:00 2018-11-12 8 59 70 Pass
#> 2: A 2018-11-12 09:06:00 2018-11-12 9 6 86 Pass
#> 3: A 2018-11-12 09:18:00 2018-11-12 9 18 86 Pass
#> 4: A 2018-11-12 13:22:00 2018-11-12 13 22 86 Pass
#> 5: A 2018-11-12 13:36:00 2018-11-12 13 36 74 Pass
#> 6: A 2018-11-14 06:32:00 2018-11-14 6 32 77 Pass
#> 7: A 2018-11-14 07:12:00 2018-11-14 7 12 79 Pass
由 reprex package (v0.3.0)
于 2019-05-23 创建
也可以先向 df.sample data.frame 添加一个时间列,然后使用 sapply(或类似的逐行应用函数)根据您的标准进行判断,并将结果添加到 df.sample。
# Zero-padded "YYYY-MM-DD HH:MM:00" timestamp string for every sample row.
df.sample$time <- paste0(
  df.sample$date, " ",
  sprintf("%02d", df.sample$hour), ":",
  sprintf("%02d", df.sample$min), ":00"
)
# Flag each sample that lies inside at least one state window of the SAME id.
# vapply() guarantees a plain, unnamed character vector (sapply() would carry
# the time strings along as names and can change its return type on empty input).
df.sample$state <- vapply(
  seq_len(nrow(df.sample)),
  function(i) {
    # Parsed in the session's local time zone, like a character/POSIXct
    # comparison would do implicitly.
    sample_time <- as.POSIXct(df.sample$time[i])
    in_window <- df.state$id == df.sample$id[i] &
      sample_time >= df.state$starttime &
      sample_time <= df.state$endtime
    # NA comparisons (e.g. missing bounds) count as "not inside".
    if (any(in_window, na.rm = TRUE)) "PASS" else ""
  },
  character(1)
)
df.sample
id date hour min value time state
1 A 2018-11-12 8 47 70 2018-11-12 08:47:00
2 A 2018-11-12 8 59 70 2018-11-12 08:59:00 PASS
3 A 2018-11-12 9 6 86 2018-11-12 09:06:00 PASS
4 A 2018-11-12 9 18 86 2018-11-12 09:18:00 PASS
5 A 2018-11-12 13 22 86 2018-11-12 13:22:00 PASS
6 A 2018-11-12 13 36 74 2018-11-12 13:36:00 PASS
7 A 2018-11-12 16 12 81 2018-11-12 16:12:00
8 A 2018-11-14 6 32 77 2018-11-14 06:32:00 PASS
9 A 2018-11-14 7 12 79 2018-11-14 07:12:00 PASS
10 A 2018-11-14 19 21 83 2018-11-14 19:21:00
11 A 2018-11-12 7 47 91 2018-11-12 07:47:00
我所做的是从您提供的每个数据框中提取小数小时,这样我就可以询问是否在该小数小时内找到了一个值。但首先,您必须根据 id(假设您有其他 id)和日期(假设每天只有一个状态;或者换句话说 df.state 数据集中每天存在一个日期)合并数据集.
# Sample readings for a single id: one row per (date, hour, min) measurement.
id <- rep("A", 11)
date <- c(
  "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12",
  "2018-11-12", "2018-11-12", "2018-11-14", "2018-11-14", "2018-11-14",
  "2018-11-12"
)
hour <- c(8, 8, 9, 9, 13, 13, 16, 6, 7, 19, 7)
min <- c(47, 59, 6, 18, 22, 36, 12, 32, 12, 21, 47)
value <- c(70, 70, 86, 86, 86, 74, 81, 77, 79, 83, 91)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.sample <- data.frame(id, date, hour, min, value, stringsAsFactors = FALSE)
df.sample$date <- as.Date(df.sample$date, format = "%Y-%m-%d")
# Time of day as a decimal hour (e.g. 8:30 -> 8.5); hour and min are already
# numeric, so no conversion is needed.
df.sample$dec.hour <- df.sample$hour + df.sample$min / 60
我在上面添加的是最后几行,用于根据您提供的小时和分钟值计算十进制小时
# State intervals per id; the bounds are kept as character strings here.
id <- rep("A", 3)
starttime <- c("2018-11-12 08:59:00", "2018-11-14 06:24:17", "2018-11-15 09:17:00")
endtime <- c("2018-11-12 15:57:00", "2018-11-14 17:22:16", "2018-11-15 12:17:32")
state <- rep("Pass", 3)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.state <- data.frame(id, starttime, endtime, state, stringsAsFactors = FALSE)
我在这里添加了一个日期向量(用于合并)。我任意选择开始时间,假设开始和结束时间的日期始终相同。
# Calendar day of each window, read from the leading "YYYY-MM-DD" part of
# starttime; used as a merge key together with id.
df.state[["date"]] <- as.Date(df.state[["starttime"]], format = "%Y-%m-%d")
然后我得到那个日期的开始时间和结束时间的小数小时
# Parse the interval bounds; strptime() returns POSIXlt, whose hour/min/sec
# fields can be read directly.
t.str <- strptime(df.state$starttime, "%Y-%m-%d %H:%M:%S")
# Include the seconds: dropping them would move a 06:24:17 start back to
# 06:24:00 and let a 06:24 sample count as inside the window.
df.state$dec.hour.start <- t.str$hour + t.str$min / 60 + t.str$sec / 3600
t.end <- strptime(df.state$endtime, "%Y-%m-%d %H:%M:%S")
df.state$dec.hour.end <- t.end$hour + t.end$min / 60 + t.end$sec / 3600
按 ID 和日期合并数据帧
# Attach each sample to the state window of the same id and date.
# all.x = TRUE keeps samples whose date has no state row (their state columns
# are NA) instead of silently dropping them, as the default inner join would.
df <- merge(df.sample, df.state, by = c("id", "date"), all.x = TRUE)
如果样本的十进制小时落在开始和结束小数小时之间(对于该日期),则 state 返回 TRUE。
# TRUE when the sample's decimal hour lies within [dec.hour.start, dec.hour.end].
# Written in base R so that this step does not depend on dplyr being attached.
df$state <- df$dec.hour >= df$dec.hour.start & df$dec.hour <= df$dec.hour.end
现在,如果你想删除我创建的所有这些额外的列(所以它看起来像你想要的输出):
# Keep only the columns of the desired output. Selecting by name stays correct
# even if the helper columns change position.
df <- df[, c("id", "date", "hour", "min", "value", "state")]
因为 df$state 是逻辑型(logical),如果你想把 TRUE 改成 pass、FALSE 改成空白,得先把值转换成字符型:
# Translate the logical flag into its display label through a lookup table:
# TRUE -> "pass", FALSE -> "" (NA stays NA).
state_labels <- c("TRUE" = "pass", "FALSE" = "")
df$state <- unname(state_labels[as.character(df$state)])
看看:
df
> df
id date hour min value state
1 A 2018-11-12 8 47 70
2 A 2018-11-12 8 59 70 pass
3 A 2018-11-12 9 6 86 pass
4 A 2018-11-12 9 18 86 pass
5 A 2018-11-12 13 22 86 pass
6 A 2018-11-12 13 36 74 pass
7 A 2018-11-12 16 12 81
8 A 2018-11-12 7 47 91
9 A 2018-11-14 6 32 77 pass
10 A 2018-11-14 7 12 79 pass
11 A 2018-11-14 19 21 83
我用这个 post: extract hours and seconds from POSIXct for plotting purposes in R 来提取小数小时
还有这个: 看看你的采样时间是否在你的状态时间之内。
我有一个这样的数据框 df.sample
# Sample readings for a single id: one row per (date, hour, min) measurement.
id <- rep("A", 11)
date <- c(
  "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12",
  "2018-11-12", "2018-11-12", "2018-11-14", "2018-11-14", "2018-11-14",
  "2018-11-12"
)
hour <- c(8, 8, 9, 9, 13, 13, 16, 6, 7, 19, 7)
min <- c(47, 59, 6, 18, 22, 36, 12, 32, 12, 21, 47)
value <- c(70, 70, 86, 86, 86, 74, 81, 77, 79, 83, 91)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.sample <- data.frame(id, date, hour, min, value, stringsAsFactors = FALSE)
# Store the calendar day as a Date rather than a character string.
df.sample$date <- as.Date(df.sample$date, format = "%Y-%m-%d")
我有另一个数据框 df.state
像这样
# State intervals per id: each row is a [starttime, endtime] window with a label.
id <- rep("A", 3)
starttime <- c("2018-11-12 08:59:00", "2018-11-14 06:24:17", "2018-11-15 09:17:00")
endtime <- c("2018-11-12 15:57:00", "2018-11-14 17:22:16", "2018-11-15 12:17:32")
state <- rep("Pass", 3)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.state <- data.frame(id, starttime, endtime, state, stringsAsFactors = FALSE)
# No `tz` is given, so both bounds are parsed in the session's local time zone.
df.state$starttime <- as.POSIXct(df.state$starttime, format = "%Y-%m-%d %H:%M:%S")
df.state$endtime <- as.POSIXct(df.state$endtime, format = "%Y-%m-%d %H:%M:%S")
我正在尝试根据条件合并这 2 个数据帧:如果 df.sample 中的 hour 和 min 落在 df.state 的 starttime 和 endtime 之间,则把 state = Pass 合并到 df.sample 中。
例如,df.sample 中的第 2 行有 hour = 8、min = 59,由于它落在 df.state 中以 starttime = 2018-11-12 08:59:00 开始的时间区间内,因此添加值 Pass。
这是我的期望输出
id date hour min value state
A 2018-11-12 8 47 70
A 2018-11-12 8 59 70 Pass
A 2018-11-12 9 6 86 Pass
A 2018-11-12 9 18 86 Pass
A 2018-11-12 13 22 86 Pass
A 2018-11-12 13 36 74 Pass
A 2018-11-12 16 12 81
A 2018-11-14 6 32 77 Pass
A 2018-11-14 7 12 79 Pass
A 2018-11-14 19 21 83
A 2018-11-12 7 47 91
我可以像这样合并这两个数据帧,但无法在 df.state 的开始时间和结束时间之间查找 df.sample 的小时和分钟
library(tidyverse)
# Attempted merge: with no `by` argument, left_join() joins on every column
# name the two data frames share (here only `id`), so the start/end time
# window is not taken into account.
df.sample <- df.sample %>%
left_join(df.state)
谁能给我指出正确的方向
(重要准备说明:as.POSIXct
使用本地时区创建 POSIXct 值,而 lubridate::ymd
创建 UTC 时间。如果时区在下面的连接中有所不同,您将得到意想不到的结果。)
# Re-parse both interval bounds with lubridate so that they become UTC
# date-times, matching the UTC sample times built with lubridate::ymd_hm().
df.state <- transform(
  df.state,
  starttime = lubridate::ymd_hms(starttime),
  endtime = lubridate::ymd_hms(endtime)
)
这可以用 fuzzyjoin 来完成:
library(fuzzyjoin)
# Left-join every sample to the state interval that contains it:
#   * `sample_time` is assembled from the date/hour/min columns (parsed as UTC),
#   * ids must be equal, and
#   * starttime <= sample_time <= endtime.
# Samples that fall in no interval are kept, with NA in the df.state columns.
fuzzy_left_join(
  mutate(df.sample, sample_time = lubridate::ymd_hm(paste(date, hour, min))),
  df.state,
  by = c(
    id = "id",
    sample_time = "starttime",
    sample_time = "endtime"
  ),
  match_fun = list(`==`, `>=`, `<=`)
)
id.x date hour min value sample_time id.y starttime endtime state
1 A 2018-11-12 8 47 70 2018-11-12 08:47:00 <NA> <NA> <NA> <NA>
2 A 2018-11-12 8 59 70 2018-11-12 08:59:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
3 A 2018-11-12 9 6 86 2018-11-12 09:06:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
4 A 2018-11-12 9 18 86 2018-11-12 09:18:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
5 A 2018-11-12 13 22 86 2018-11-12 13:22:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
6 A 2018-11-12 13 36 74 2018-11-12 13:36:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
7 A 2018-11-12 16 12 81 2018-11-12 16:12:00 <NA> <NA> <NA> <NA>
8 A 2018-11-14 6 32 77 2018-11-14 06:32:00 A 2018-11-14 06:24:17 2018-11-14 17:22:16 Pass
9 A 2018-11-14 7 12 79 2018-11-14 07:12:00 A 2018-11-14 06:24:17 2018-11-14 17:22:16 Pass
10 A 2018-11-14 19 21 83 2018-11-14 19:21:00 <NA> <NA> <NA> <NA>
11 A 2018-11-12 7 47 91 2018-11-12 07:47:00 <NA> <NA> <NA> <NA>
如果您碰巧有大数据帧,使用 data.table 包的非等值连接(non-equi join)会更快更容易: Benchmark | Video
library(data.table)
## Turn both data.frames into data.tables by reference
setDT(df.sample)
setDT(df.state)
## Add a POSIXct `time` column assembled from the date, hour and min columns
df.sample[, time := as.POSIXct(paste(date, paste(hour, min, "00", sep = ":")))]
## Move `id` and `time` to the front; the other columns keep their order
setcolorder(df.sample, c("id", "time"))
# Non-equi join: for each sample row, look up the state row with the same id
# whose [starttime, endtime] window contains the sample's `time`.
# The "x." prefix refers explicitly to a column of df.state (the outer table).
df.state[
  df.sample,
  .(id, time, date, hour, min, value, state = x.state),
  on = .(id, starttime <= time, endtime >= time)
]
#> id time date hour min value state
#> 1: A 2018-11-12 08:47:00 2018-11-12 8 47 70 <NA>
#> 2: A 2018-11-12 08:59:00 2018-11-12 8 59 70 Pass
#> 3: A 2018-11-12 09:06:00 2018-11-12 9 6 86 Pass
#> 4: A 2018-11-12 09:18:00 2018-11-12 9 18 86 Pass
#> 5: A 2018-11-12 13:22:00 2018-11-12 13 22 86 Pass
#> 6: A 2018-11-12 13:36:00 2018-11-12 13 36 74 Pass
#> 7: A 2018-11-12 16:12:00 2018-11-12 16 12 81 <NA>
#> 8: A 2018-11-14 06:32:00 2018-11-14 6 32 77 Pass
#> 9: A 2018-11-14 07:12:00 2018-11-14 7 12 79 Pass
#> 10: A 2018-11-14 19:21:00 2018-11-14 19 21 83 <NA>
#> 11: A 2018-11-12 07:47:00 2018-11-12 7 47 91 <NA>
### Same non-equi join, but drop the samples that fall in no state window
### (nomatch = 0L turns the join into an inner join, so no NA rows remain)
df.state[
  df.sample,
  .(id, time, date, hour, min, value, state = x.state),
  on = .(id, starttime <= time, endtime >= time),
  nomatch = 0L
]
#> id time date hour min value state
#> 1: A 2018-11-12 08:59:00 2018-11-12 8 59 70 Pass
#> 2: A 2018-11-12 09:06:00 2018-11-12 9 6 86 Pass
#> 3: A 2018-11-12 09:18:00 2018-11-12 9 18 86 Pass
#> 4: A 2018-11-12 13:22:00 2018-11-12 13 22 86 Pass
#> 5: A 2018-11-12 13:36:00 2018-11-12 13 36 74 Pass
#> 6: A 2018-11-14 06:32:00 2018-11-14 6 32 77 Pass
#> 7: A 2018-11-14 07:12:00 2018-11-14 7 12 79 Pass
由 reprex package (v0.3.0) 于 2019-05-23 创建
也可以先向 df.sample data.frame 添加一个时间列,然后使用 sapply(或类似的逐行应用函数)根据您的标准进行判断,并将结果添加到 df.sample。
# Zero-padded "YYYY-MM-DD HH:MM:00" timestamp string for every sample row.
df.sample$time <- paste0(
  df.sample$date, " ",
  sprintf("%02d", df.sample$hour), ":",
  sprintf("%02d", df.sample$min), ":00"
)
# Flag each sample that lies inside at least one state window of the SAME id.
# vapply() guarantees a plain, unnamed character vector (sapply() would carry
# the time strings along as names and can change its return type on empty input).
df.sample$state <- vapply(
  seq_len(nrow(df.sample)),
  function(i) {
    # Parsed in the session's local time zone, like a character/POSIXct
    # comparison would do implicitly.
    sample_time <- as.POSIXct(df.sample$time[i])
    in_window <- df.state$id == df.sample$id[i] &
      sample_time >= df.state$starttime &
      sample_time <= df.state$endtime
    # NA comparisons (e.g. missing bounds) count as "not inside".
    if (any(in_window, na.rm = TRUE)) "PASS" else ""
  },
  character(1)
)
df.sample
id date hour min value time state
1 A 2018-11-12 8 47 70 2018-11-12 08:47:00
2 A 2018-11-12 8 59 70 2018-11-12 08:59:00 PASS
3 A 2018-11-12 9 6 86 2018-11-12 09:06:00 PASS
4 A 2018-11-12 9 18 86 2018-11-12 09:18:00 PASS
5 A 2018-11-12 13 22 86 2018-11-12 13:22:00 PASS
6 A 2018-11-12 13 36 74 2018-11-12 13:36:00 PASS
7 A 2018-11-12 16 12 81 2018-11-12 16:12:00
8 A 2018-11-14 6 32 77 2018-11-14 06:32:00 PASS
9 A 2018-11-14 7 12 79 2018-11-14 07:12:00 PASS
10 A 2018-11-14 19 21 83 2018-11-14 19:21:00
11 A 2018-11-12 7 47 91 2018-11-12 07:47:00
我所做的是从您提供的每个数据框中提取小数小时,这样我就可以询问是否在该小数小时内找到了一个值。但首先,您必须根据 id(假设您有其他 id)和日期(假设每天只有一个状态;或者换句话说 df.state 数据集中每天存在一个日期)合并数据集.
# Sample readings for a single id: one row per (date, hour, min) measurement.
id <- rep("A", 11)
date <- c(
  "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12",
  "2018-11-12", "2018-11-12", "2018-11-14", "2018-11-14", "2018-11-14",
  "2018-11-12"
)
hour <- c(8, 8, 9, 9, 13, 13, 16, 6, 7, 19, 7)
min <- c(47, 59, 6, 18, 22, 36, 12, 32, 12, 21, 47)
value <- c(70, 70, 86, 86, 86, 74, 81, 77, 79, 83, 91)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.sample <- data.frame(id, date, hour, min, value, stringsAsFactors = FALSE)
df.sample$date <- as.Date(df.sample$date, format = "%Y-%m-%d")
# Time of day as a decimal hour (e.g. 8:30 -> 8.5); hour and min are already
# numeric, so no conversion is needed.
df.sample$dec.hour <- df.sample$hour + df.sample$min / 60
我在上面添加的是最后几行,用于根据您提供的小时和分钟值计算十进制小时
# State intervals per id; the bounds are kept as character strings here.
id <- rep("A", 3)
starttime <- c("2018-11-12 08:59:00", "2018-11-14 06:24:17", "2018-11-15 09:17:00")
endtime <- c("2018-11-12 15:57:00", "2018-11-14 17:22:16", "2018-11-15 12:17:32")
state <- rep("Pass", 3)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.state <- data.frame(id, starttime, endtime, state, stringsAsFactors = FALSE)
我在这里添加了一个日期向量(用于合并)。我任意选择开始时间,假设开始和结束时间的日期始终相同。
# Calendar day of each window, read from the leading "YYYY-MM-DD" part of
# starttime; used as a merge key together with id.
df.state[["date"]] <- as.Date(df.state[["starttime"]], format = "%Y-%m-%d")
然后我得到那个日期的开始时间和结束时间的小数小时
# Parse the interval bounds; strptime() returns POSIXlt, whose hour/min/sec
# fields can be read directly.
t.str <- strptime(df.state$starttime, "%Y-%m-%d %H:%M:%S")
# Include the seconds: dropping them would move a 06:24:17 start back to
# 06:24:00 and let a 06:24 sample count as inside the window.
df.state$dec.hour.start <- t.str$hour + t.str$min / 60 + t.str$sec / 3600
t.end <- strptime(df.state$endtime, "%Y-%m-%d %H:%M:%S")
df.state$dec.hour.end <- t.end$hour + t.end$min / 60 + t.end$sec / 3600
按 ID 和日期合并数据帧
# Attach each sample to the state window of the same id and date.
# all.x = TRUE keeps samples whose date has no state row (their state columns
# are NA) instead of silently dropping them, as the default inner join would.
df <- merge(df.sample, df.state, by = c("id", "date"), all.x = TRUE)
如果样本的十进制小时落在开始和结束小数小时之间(对于该日期),则 state 返回 TRUE。
# TRUE when the sample's decimal hour lies within [dec.hour.start, dec.hour.end].
# Written in base R so that this step does not depend on dplyr being attached.
df$state <- df$dec.hour >= df$dec.hour.start & df$dec.hour <= df$dec.hour.end
现在,如果你想删除我创建的所有这些额外的列(所以它看起来像你想要的输出):
# Keep only the columns of the desired output. Selecting by name stays correct
# even if the helper columns change position.
df <- df[, c("id", "date", "hour", "min", "value", "state")]
因为 df$state 是逻辑型(logical),如果你想把 TRUE 改成 pass、FALSE 改成空白,得先把值转换成字符型:
# Translate the logical flag into its display label through a lookup table:
# TRUE -> "pass", FALSE -> "" (NA stays NA).
state_labels <- c("TRUE" = "pass", "FALSE" = "")
df$state <- unname(state_labels[as.character(df$state)])
看看:
df
> df
id date hour min value state
1 A 2018-11-12 8 47 70
2 A 2018-11-12 8 59 70 pass
3 A 2018-11-12 9 6 86 pass
4 A 2018-11-12 9 18 86 pass
5 A 2018-11-12 13 22 86 pass
6 A 2018-11-12 13 36 74 pass
7 A 2018-11-12 16 12 81
8 A 2018-11-12 7 47 91
9 A 2018-11-14 6 32 77 pass
10 A 2018-11-14 7 12 79 pass
11 A 2018-11-14 19 21 83
我用这个 post: extract hours and seconds from POSIXct for plotting purposes in R 来提取小数小时
还有这个: