在 R 中合并两个时间间隔不同的数据框,并在对应的时间范围内重复取值

Merge two dataframes with different time ranges by repeating values within time range in R

我有两个不同的数据帧,都有一个时间信息列,具有不同的时间间隔。第一个 df1 的时间间隔以秒为单位(~6s),另一个(df2)的时间间隔为 10min。 我想合并两个数据帧,保留来自两个 df 的信息,在 df1 的时间范围内重复 df2 值。 像这样:

df1

  x   y   z     time
-52 -39 -35 06:08:03
-47 -57 -36 06:08:08
-39   2 -40 06:08:13
-45 -23 -29 06:10:20
-51 -11 -31 06:10:29
-69 -28 -19 06:20:34

df2

time        Temp.ar  Ur ar  Vel. Vento
06:00:00    14.79    78.5   1.147
06:10:00    14.74    78.9   1.045
06:20:00    14.9     78.9   1.009
06:30:00    15.14    78.6   1.076
06:40:00    15.32    77.8   1.332
06:50:00    15.6     76.5   1.216   

我想要的输出

 x   y   z      time  Temp.ar   Ur ar   Vel. Vento
-52 -39 -35 06:08:03  14.79     78.5    1.147
-47 -57 -36 06:08:08  14.79     78.5    1.147
-39   2 -40 06:08:13  14.79     78.5    1.147
-45 -23 -29 06:10:20  14.74     78.9    1.045
-51 -11 -31 06:10:29  14.74     78.9    1.045
-69 -28 -19 06:20:34  14.9      78.9    1.009

时间列已经是 "POSIXct" 格式。

最通用的方法可能是先定义一组时间窗口,然后使用 findInterval 确定每个数据框中各个时间所属窗口的索引。之后,您可以使用 merge 按该索引将两者合并:

# Reproducible example ("reprex"): providing the data in loadable form makes
# it much easier for others to help.
df1 <- read.table(text = "  x   y   z     time
-52 -39 -35 06:08:03
-47 -57 -36 06:08:08
-39   2 -40 06:08:13
-45 -23 -29 06:10:20
-51 -11 -31 06:10:29
-69 -28 -19 06:20:34", header = TRUE, stringsAsFactors = FALSE)

df2 <- read.table(text = "time        Temp.ar  Ur.ar  Vel.Vento
06:00:00    14.79    78.5   1.147
06:10:00    14.74    78.9   1.045
06:20:00    14.9     78.9   1.009
06:30:00    15.14    78.6   1.076
06:40:00    15.32    77.8   1.332
06:50:00    15.6     76.5   1.216", header = TRUE, stringsAsFactors = FALSE)

# Parse the clock times as POSIXct rather than POSIXlt (strptime): POSIXlt is
# a list-based class that behaves poorly as a data-frame column. A fixed time
# zone keeps the result independent of the local DST rules, which could
# otherwise turn a non-existent local time into NA.
df1$time <- as.POSIXct(df1$time, format = "%H:%M:%S", tz = "UTC")
df2$time <- as.POSIXct(df2$time, format = "%H:%M:%S", tz = "UTC")

# The existing (sorted) sequence of times in df2 is used as the window
# boundaries here, but a different set of breaks could be supplied instead.
# findInterval() returns, for each time, the index of the last boundary that
# is <= that time (0 if the time precedes every boundary).
df1$interval <- findInterval(df1$time, df2$time)
df2$interval <- findInterval(df2$time, df2$time)

# Join on the window index; the two original time columns come back as
# time.x (from df1) and time.y (from df2).
df3 <- merge(df1, df2, by = "interval")

结果中会多出一些额外的列(分别来自 df1 和 df2 的 time 列以及 interval 列),您可以通过取子集将它们去掉。不过这些列可以用来检查合并结果是否正确——可以看到它确实有效。

您也可以使用 data.table 的滚动连接(rolling join):

library(data.table)

# Turn both data frames into data.tables in place (no copy is made).
setDT(df1)
setDT(df2)

# Rolling join on `time`: each row of df1 is matched to the df2 row with the
# most recent time at or before it (roll = Inf carries the last df2
# observation forward).
df2[df1, on = "time", roll = Inf]

#                   time Temp.ar Ur.ar Vel.Vento   x   y   z
# 1: 2019-12-11 06:08:03   14.79  78.5     1.147 -52 -39 -35
# 2: 2019-12-11 06:08:08   14.79  78.5     1.147 -47 -57 -36
# 3: 2019-12-11 06:08:13   14.79  78.5     1.147 -39   2 -40
# 4: 2019-12-11 06:10:20   14.74  78.9     1.045 -45 -23 -29
# 5: 2019-12-11 06:10:29   14.74  78.9     1.045 -51 -11 -31
# 6: 2019-12-11 06:20:34   14.90  78.9     1.009 -69 -28 -19

使用的数据

# Sample data read from inline text with data.table::fread().
# df1: readings roughly every few seconds.
df1 <- fread(text = "
x   y   z     time
-52 -39 -35 06:08:03
-47 -57 -36 06:08:08
-39   2 -40 06:08:13
-45 -23 -29 06:10:20
-51 -11 -31 06:10:29
-69 -28 -19 06:20:34
")

# df2: readings every 10 minutes.
df2 <- fread(text = "
time        Temp.ar  Ur.ar  Vel.Vento
06:00:00    14.79    78.5   1.147
06:10:00    14.74    78.9   1.045
06:20:00    14.9     78.9   1.009
06:30:00    15.14    78.6   1.076
06:40:00    15.32    77.8   1.332
06:50:00    15.6     76.5   1.216
")

如果只使用 base R,下面两种方法都可以实现:

  • 使用 findInterval():
# For each df1 time, look up the df2 row whose time is the latest one that is
# <= it (df2$time must be sorted increasingly), then bind its columns
# (all but the first, i.e. df2's own time) onto df1.
idx <- findInterval(df1$time, df2$time)
# findInterval() returns 0 for times earlier than the first df2 time. Indexing
# with 0 would drop that row and make cbind() fail on mismatched row counts,
# so use NA instead, which yields a row of NA values.
idx[idx == 0L] <- NA_integer_
df <- cbind(df1, df2[idx, -1])
# Repeated df2 rows get made-up row names; restore df1's row names.
rownames(df) <- rownames(df1)
  • 使用 which.max():
# For each df1 time, use which.max() to find the df2 row whose time is the
# latest one that is <= it (df2$time must be sorted increasingly), then bind
# its columns (all but the first, i.e. df2's own time) onto df1.
t1 <- as.numeric(as.POSIXct(df1$time))
t2 <- as.numeric(as.POSIXct(df2$time))
idx <- vapply(
  t1,
  function(x) {
    if (is.na(x)) {
      return(NA_integer_)
    }
    # The Inf sentinel guarantees at least one TRUE, so a time after every df2
    # time maps to the last df2 row instead of index 0. The strict `>` makes a
    # time that exactly equals a df2 time map to that row, not the previous one.
    which.max(c(t2, Inf) > x) - 1L
  },
  integer(1)
)
# Times before the first df2 time have no matching row: use NA (-> NA values)
# rather than index 0, which would drop the row and break cbind().
idx[idx == 0L] <- NA_integer_
df <- cbind(df1, df2[idx, -1])
# Repeated df2 rows get made-up row names; restore df1's row names.
rownames(df) <- rownames(df1)

这给出了

> df
    x   y   z                time Temp.ar Ur.ar Vel.Vento
1 -52 -39 -35 2019-12-11 06:08:03   14.79  78.5     1.147
2 -47 -57 -36 2019-12-11 06:08:08   14.79  78.5     1.147
3 -39   2 -40 2019-12-11 06:08:13   14.79  78.5     1.147
4 -45 -23 -29 2019-12-11 06:10:20   14.74  78.9     1.045
5 -51 -11 -31 2019-12-11 06:10:29   14.74  78.9     1.045
6 -69 -28 -19 2019-12-11 06:20:34   14.90  78.9     1.009

数据

# Sample data `df1` in dput() form: a 6-row data.frame with integer columns
# x, y, z and a `time` column stored as POSIXlt (a list of broken-down fields:
# sec, min, hour, mday, mon, year, ...). Per R's POSIXlt conventions, mon is
# 0-based and year counts from 1900, so mon = 11 / year = 119 / mday = 11
# encode 2019-12-11; the zone field is "CET" and gmtoff is unknown (NA).
df1 <- structure(list(x = c(-52L, -47L, -39L, -45L, -51L, -69L), y = c(-39L, 
-57L, 2L, -23L, -11L, -28L), z = c(-35L, -36L, -40L, -29L, -31L, 
-19L), time = structure(list(sec = c(3, 8, 13, 20, 29, 34), min = c(8L, 
8L, 8L, 10L, 10L, 20L), hour = c(6L, 6L, 6L, 6L, 6L, 6L), mday = c(11L, 
11L, 11L, 11L, 11L, 11L), mon = c(11L, 11L, 11L, 11L, 11L, 11L
), year = c(119L, 119L, 119L, 119L, 119L, 119L), wday = c(3L, 
3L, 3L, 3L, 3L, 3L), yday = c(344L, 344L, 344L, 344L, 344L, 344L
), isdst = c(0L, 0L, 0L, 0L, 0L, 0L), zone = c("CET", "CET", 
"CET", "CET", "CET", "CET"), gmtoff = c(NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
"POSIXt"))), row.names = c(NA, -6L), class = "data.frame")

# Sample data `df2` in dput() form: a 6-row data.frame whose `time` column is
# stored as POSIXlt (06:00:00 to 06:50:00 in 10-minute steps, with the same
# mday/mon/year fields as df1's sample data, i.e. 2019-12-11 under R's
# 0-based month / years-since-1900 conventions), followed by the numeric
# columns Temp.ar, Ur.ar and Vel.Vento.
df2 <- structure(list(time = structure(list(sec = c(0, 0, 0, 0, 0, 0
), min = c(0L, 10L, 20L, 30L, 40L, 50L), hour = c(6L, 6L, 6L, 
6L, 6L, 6L), mday = c(11L, 11L, 11L, 11L, 11L, 11L), mon = c(11L, 
11L, 11L, 11L, 11L, 11L), year = c(119L, 119L, 119L, 119L, 119L, 
119L), wday = c(3L, 3L, 3L, 3L, 3L, 3L), yday = c(344L, 344L, 
344L, 344L, 344L, 344L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L), zone = c("CET", 
"CET", "CET", "CET", "CET", "CET"), gmtoff = c(NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
"POSIXt")), Temp.ar = c(14.79, 14.74, 14.9, 15.14, 15.32, 15.6
), Ur.ar = c(78.5, 78.9, 78.9, 78.6, 77.8, 76.5), Vel.Vento = c(1.147, 
1.045, 1.009, 1.076, 1.332, 1.216)), row.names = c(NA, -6L), class = "data.frame")