根据 `df1` 的一个变量 (`df1$var1`) 在 `df1` 中创建一个变量,根据 `df1$var1` 可以改变的 `df2` 的一个变量
Create a variable in `df1` depending on one variable of `df1` (`df1$var1`) and one variable of `df2` that is changeable depending on `df1$var1`
我有数据框 df1
,它总结了一段时间内鱼类的深度。 df1$Site
告诉您鱼所在的位置,df1$Ind
告诉您个体,df1$Depth
告诉您鱼在特定时刻 df1$Datetime
的深度。
另一方面,我有 df2
总结了随着时间的推移(每三小时)从地表到 39 米深度的水流强度,间隔为 8 米(m0-7
, m8-15
、m16-23
、m24-31
和 m32-39
)。例如:
# Fish depth records from the question: timestamp, site, individual and depth.
df1 <- data.frame(
  Datetime = c(
    "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
    "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
# Parse the timestamp strings as UTC date-times.
df1[["Datetime"]] <- as.POSIXct(
  df1[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
> df1
Datetime Site Ind Depth
1 2016-08-01 15:34:07 BD 16 5.3
2 2016-08-01 16:25:16 HG 17 24.0
3 2016-08-01 17:29:16 BD 19 36.4
4 2016-08-01 18:33:16 BD 16 42.0
5 2016-08-01 20:54:16 BD 17 NA
6 2016-08-01 22:48:16 BD 16 22.1
# Current intensity from the question: one reading every three hours at site
# BD, with one column per depth band.
df2 <- data.frame(
  Datetime = c(
    "2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
    "2016-08-01 21:00:00", "2016-08-02 00:00:00"
  ),
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
# Parse the timestamp strings as UTC date-times.
df2[["Datetime"]] <- as.POSIXct(
  df2[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
# Rename the placeholder columns to the depth bands they stand for.
names(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")
> df2
Datetime Site m0-7 m8-15 m16-23 m24-31 m32-39
1 2016-08-01 12:00:00 BD 2.75 3.00 2.75 3.25 3.00
2 2016-08-01 15:00:00 BD 4.00 4.00 4.00 3.00 4.00
3 2016-08-01 18:00:00 BD 6.75 4.75 5.75 6.50 4.75
4 2016-08-01 21:00:00 BD 2.25 3.00 2.25 2.75 3.00
5 2016-08-02 00:00:00 BD 4.30 2.10 1.40 3.40 1.70
我想在 df1
中创建一个名为 df1$Current.Int
的新列,根据 df2
对洋流的描述,总结了鱼在何时何地的深度的洋流强度。
我想得到这个:
> df1
Datetime Site Ind Depth Current.Int
1 2016-08-01 15:34:07 BD 16 5.3 4.00
2 2016-08-01 16:25:16 HG 17 24.0 NA # Currents of this site are not included in df2
3 2016-08-01 17:29:16 BD 19 36.4 4.75
4 2016-08-01 18:33:16 BD 16 42.0 4.75
5 2016-08-01 20:54:16 BD 17 NA NA
6 2016-08-01 22:48:16 BD 16 22.1 1.40
需要指出的是,由于洋流记录是每三个小时一次,所以 df2$Datetime
中的每个时间点代表其前后各一个半小时的时段。即 df2
在 21:00:00
处记录的洋流强度反映了 19:30:00
和 22:30:00
之间的洋流。其他时间点同理。
有人知道怎么做吗?
只要您的数据量不是很大,您可能不必走条件联接这条路。相反,可以先仅按 Site 进行联接,然后过滤掉多余的观测值。这种做法效率不是特别高,但可能比改用 sqldf
更容易。
请注意,我对您提供的数据进行了一些更改,以便日期匹配。
library(tidyverse)
# Fish depth records; strings are kept as character rather than factors.
df1 <- data.frame(
  Datetime = paste(
    "2016-08-01",
    c("15:34:07", "16:25:16", "17:29:16", "18:33:16", "20:54:16", "22:48:16")
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1),
  stringsAsFactors = FALSE
)
df1[["Datetime"]] <- as.POSIXct(
  df1[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)

# Current intensity readings, one column per depth band.
df2 <- data.frame(
  Datetime = c(
    paste("2016-08-01", c("12:00:00", "15:00:00", "18:00:00", "21:00:00")),
    "2016-08-02 00:00:00"
  ),
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7),
  stringsAsFactors = FALSE
)
df2[["Datetime"]] <- as.POSIXct(
  df2[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
# The timestamp column is named Datetime_CI so it does not clash with
# df1$Datetime once the two tables are joined.
names(df2) <- c("Datetime_CI", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")
# Tidy df2 into long form: one row per time/site/depth band, with numeric
# lower and upper band limits and a single column for the current intensity.
df2 <- df2 %>%
  gather(-Datetime_CI, -Site, key = Depth, value = Current.Int) %>%
  separate(Depth, c("minDepth", "maxDepth")) %>%
  mutate(
    # Drop the leading "m" (e.g. "m16" -> 16).
    minDepth = as.numeric(str_sub(minDepth, 2)),
    # separate() returns character columns. Without this conversion the depth
    # filter below compares strings, so "36.4" <= "7" is TRUE and a fish at
    # 36.4 m is wrongly matched to the 0-7 m band.
    maxDepth = as.numeric(maxDepth)
  )

# Join df1 and df2 on Site alone, pairing each fish record with every
# band/time reading of its site, then filter down to the relevant reading.
# Fish records with no matching site, an NA depth, or a depth outside every
# band are dropped by the inner join and the filters.
df1 %>%
  inner_join(df2, by = "Site") %>%
  filter(
    # keep only the band that contains the fish depth
    Depth >= minDepth,
    Depth <= maxDepth,
    # keep only current readings taken before the fish observation
    Datetime > Datetime_CI
  ) %>%
  # of those, keep the most recent reading for each fish observation
  group_by(Datetime, Site, Ind, Depth) %>%
  filter(Datetime_CI == max(Datetime_CI)) %>%
  ungroup()
#> # A tibble: 3 x 8
#>   Datetime            Site    Ind Depth Datetime_CI         minDepth maxDepth Current.Int
#>   <dttm>              <chr> <dbl> <dbl> <dttm>                 <dbl>    <dbl>       <dbl>
#> 1 2016-08-01 15:34:07 BD       16   5.3 2016-08-01 15:00:00        0        7        4
#> 2 2016-08-01 17:29:16 BD       19  36.4 2016-08-01 15:00:00       32       39        4
#> 3 2016-08-01 22:48:16 BD       16  22.1 2016-08-01 21:00:00       16       23        2.25
日期不匹配,因此更改了示例。使用这种方法,您可以准确检查匹配的效果并确保它符合您的要求。
# Same fish records as in the question, moved to 2016-08-18 so that the dates
# line up with the current readings below.
df1 <- data.frame(
  Datetime = paste(
    "2016-08-18",
    c("15:34:07", "16:25:16", "17:29:16", "18:33:16", "20:54:16", "22:48:16")
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
df1[["Datetime"]] <- as.POSIXct(
  df1[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)

# Three-hourly current intensity readings, one column per depth band.
df2 <- data.frame(
  Datetime = c(
    paste("2016-08-18", c("12:00:00", "15:00:00", "18:00:00", "21:00:00")),
    "2016-08-19 00:00:00"
  ),
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
df2[["Datetime"]] <- as.POSIXct(
  df2[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
names(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")
library(dplyr)
library(lubridate)
# Build the two lookup keys on the fish table: the timestamp rounded with
# round_date(..., "3 hour") and the label of the depth band. Depths of 40 and
# above, and NA depths, get an NA label.
df1 <- df1 %>%
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    Depth_ind = case_when(
      Depth < 8 ~ "m0-7",
      Depth < 16 ~ "m8-15",
      Depth < 24 ~ "m16-23",
      Depth < 32 ~ "m24-31",
      Depth < 40 ~ "m32-39",
      TRUE ~ NA_character_
    )
  )

# Reshape the five depth-band columns into long form: one row per
# time/site/band with the band label in Depth_ind.
df2 <- df2 %>%
  tidyr::gather(key = "Depth_ind", value = "Intensity", 3:7)

# Look up the intensity for each fish record; records without a matching
# time, site or band keep NA.
df1 %>%
  left_join(
    df2,
    by = c(
      "Datetime_rounded" = "Datetime",
      "Site",
      "Depth_ind"
    )
  )
Datetime Site Ind Depth Datetime_rounded Depth_ind Intensity
1 2016-08-18 15:34:07 BD 16 5.3 2016-08-18 15:00:00 m0-7 4.00
2 2016-08-18 16:25:16 HG 17 24.0 2016-08-18 15:00:00 m24-31 NA
3 2016-08-18 17:29:16 BD 19 36.4 2016-08-18 18:00:00 m32-39 4.75
4 2016-08-18 18:33:16 BD 16 42.0 2016-08-18 18:00:00 <NA> NA
5 2016-08-18 20:54:16 BD 17 NA 2016-08-18 21:00:00 <NA> NA
6 2016-08-18 22:48:16 BD 16 22.1 2016-08-19 00:00:00 m16-23 1.40
# EDIT ----
## As requested, the deepest band can be widened as needed; here it is
## extended so that depths below 60 m still map to the "m32-39" column.
# Recompute the rounded timestamp and the depth-band label.
df1 <- df1 %>%
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    Depth_ind = case_when(
      Depth < 8 ~ "m0-7",
      Depth < 16 ~ "m8-15",
      Depth < 24 ~ "m16-23",
      Depth < 32 ~ "m24-31",
      Depth < 60 ~ "m32-39",
      TRUE ~ NA_character_
    )
  )
这可以直接在单个 SQL 语句中完成。我们将 df1
连接到 df2
,并按 df1
行指定的 on
条件分组。在指定的组上计算 max(b.Datetime)
将挑选出 df2
的适当行。 (如果 a.Datetime
、a.Site
没有唯一定义一行 df1
,则改为按 a.rowid
分组。)最后,我们使用 [-1]
删除该列。
我们使用了最后注释中显示的数据,因为问题中的数据在 df1
和 df2
中没有相应的日期。
library(sqldf)
# For each df1 row, take the latest df2 reading at the same Site whose
# timestamp is at or before the fish timestamp, then read the column of the
# fish's depth band. Depths above 31 (including those deeper than 39) use the
# m32-39 column.
# A NULL (NA) depth is handled explicitly: without the first `when`, it would
# fall through every comparison to the `else` branch and return an m32-39
# value instead of NA.
# NOTE: the join uses the most recent preceding reading, not the reading
# nearest in time.
# [-1] drops the helper column max(b.Datetime).
sqldf("select max(b.Datetime), a.*,
  case when a.Depth is null then null
       when a.Depth <= 7 then b.[m0-7]
       when a.Depth <= 15 then b.[m8-15]
       when a.Depth <= 23 then b.[m16-23]
       when a.Depth <= 31 then b.[m24-31]
       else b.[m32-39]
  end as [Current.Int]
  from df1 a
  left join df2 b on a.Site = b.Site and a.Datetime >= b.Datetime
  group by a.Datetime, a.Site")[-1]
# Result:
#>              Datetime Site Ind Depth Current.Int
#> 1 2016-08-01 15:34:07   BD  16   5.3        4.00
#> 2 2016-08-01 16:25:16   HG  17  24.0          NA
#> 3 2016-08-01 17:29:16   BD  19  36.4        4.00
#> 4 2016-08-01 18:33:16   BD  16  42.0        4.75
#> 5 2016-08-01 20:54:16   BD  17    NA          NA
#> 6 2016-08-01 22:48:16   BD  16  22.1        2.25
备注
这是使用的输入,与问题中的相同,除了:
去掉了 UTC 时区设置(tz="UTC")。如果您想保留 UTC 时区,请使用 Sys.setenv(TZ='UTC')
将您的会话时区更改为 UTC。处理时区的另一种可能性是对 Datetime
列使用字符串而不是 POSIXct,在这种情况下,您首先不会遇到时区问题。
添加最后一行是为了改进示例,因为日期不匹配。
这里是使用的输入。
# Input for the sqldf example. No tz is passed to as.POSIXct(), so the
# timestamps are parsed in the session time zone.
df1 <- data.frame(
  Datetime = c(
    "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
    "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
df1$Datetime <- as.POSIXct(df1$Datetime, format = "%Y-%m-%d %H:%M:%S")

df2 <- data.frame(
  Datetime = c(
    "2016-08-18 12:00:00", "2016-08-18 15:00:00", "2016-08-18 18:00:00",
    "2016-08-18 21:00:00", "2016-08-19 00:00:00"
  ),
  Site = c("BD", "BD", "BD", "BD", "BD"),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
df2$Datetime <- as.POSIXct(df2$Datetime, format = "%Y-%m-%d %H:%M:%S")
colnames(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")

# Move every df2 reading onto 2016-08-01, keeping its time of day, so that the
# dates line up with df1. The time of day is taken with format() rather than
# by stripping the date from the implicit as.character() result: since R 4.3,
# as.character() omits the time part of midnight values, so the "00:00:00"
# reading would not be extracted correctly.
df2$Datetime <- as.POSIXct(
  paste("2016-08-01", format(df2$Datetime, "%H:%M:%S"))
)
我有数据框 df1
,它总结了一段时间内鱼类的深度。 df1$Site
告诉您鱼所在的位置,df1$Ind
告诉您个体,df1$Depth
告诉您鱼在特定时刻 df1$Datetime
的深度。
另一方面,我有 df2
总结了随着时间的推移(每三小时)从地表到 39 米深度的水流强度,间隔为 8 米(m0-7
, m8-15
、m16-23
、m24-31
和 m32-39
)。例如:
# Fish depth records from the question: timestamp, site, individual and depth.
df1 <- data.frame(
  Datetime = c(
    "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
    "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
# Parse the timestamp strings as UTC date-times.
df1[["Datetime"]] <- as.POSIXct(
  df1[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
> df1
Datetime Site Ind Depth
1 2016-08-01 15:34:07 BD 16 5.3
2 2016-08-01 16:25:16 HG 17 24.0
3 2016-08-01 17:29:16 BD 19 36.4
4 2016-08-01 18:33:16 BD 16 42.0
5 2016-08-01 20:54:16 BD 17 NA
6 2016-08-01 22:48:16 BD 16 22.1
# Current intensity from the question: one reading every three hours at site
# BD, with one column per depth band.
df2 <- data.frame(
  Datetime = c(
    "2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
    "2016-08-01 21:00:00", "2016-08-02 00:00:00"
  ),
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
# Parse the timestamp strings as UTC date-times.
df2[["Datetime"]] <- as.POSIXct(
  df2[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
# Rename the placeholder columns to the depth bands they stand for.
names(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")
> df2
Datetime Site m0-7 m8-15 m16-23 m24-31 m32-39
1 2016-08-01 12:00:00 BD 2.75 3.00 2.75 3.25 3.00
2 2016-08-01 15:00:00 BD 4.00 4.00 4.00 3.00 4.00
3 2016-08-01 18:00:00 BD 6.75 4.75 5.75 6.50 4.75
4 2016-08-01 21:00:00 BD 2.25 3.00 2.25 2.75 3.00
5 2016-08-02 00:00:00 BD 4.30 2.10 1.40 3.40 1.70
我想在 df1
中创建一个名为 df1$Current.Int
的新列,根据 df2
对洋流的描述,总结了鱼在何时何地的深度的洋流强度。
我想得到这个:
> df1
Datetime Site Ind Depth Current.Int
1 2016-08-01 15:34:07 BD 16 5.3 4.00
2 2016-08-01 16:25:16 HG 17 24.0 NA # Currents of this site are not included in df2
3 2016-08-01 17:29:16 BD 19 36.4 4.75
4 2016-08-01 18:33:16 BD 16 42.0 4.75
5 2016-08-01 20:54:16 BD 17 NA NA
6 2016-08-01 22:48:16 BD 16 22.1 1.40
需要指出的是,由于洋流记录是每三个小时一次,所以 df2$Datetime
中的每个时间点代表其前后各一个半小时的时段。即 df2
在 21:00:00
处记录的洋流强度反映了 19:30:00
和 22:30:00
之间的洋流。其他时间点同理。
有人知道怎么做吗?
只要您的数据量不是很大,您可能不必走条件联接这条路。相反,可以先仅按 Site 进行联接,然后过滤掉多余的观测值。这种做法效率不是特别高,但可能比改用 sqldf
更容易。
请注意,我对您提供的数据进行了一些更改,以便日期匹配。
library(tidyverse)
# Fish depth records; strings are kept as character rather than factors.
df1 <- data.frame(
  Datetime = paste(
    "2016-08-01",
    c("15:34:07", "16:25:16", "17:29:16", "18:33:16", "20:54:16", "22:48:16")
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1),
  stringsAsFactors = FALSE
)
df1[["Datetime"]] <- as.POSIXct(
  df1[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)

# Current intensity readings, one column per depth band.
df2 <- data.frame(
  Datetime = c(
    paste("2016-08-01", c("12:00:00", "15:00:00", "18:00:00", "21:00:00")),
    "2016-08-02 00:00:00"
  ),
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7),
  stringsAsFactors = FALSE
)
df2[["Datetime"]] <- as.POSIXct(
  df2[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
# The timestamp column is named Datetime_CI so it does not clash with
# df1$Datetime once the two tables are joined.
names(df2) <- c("Datetime_CI", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")
# Tidy df2 into long form: one row per time/site/depth band, with numeric
# lower and upper band limits and a single column for the current intensity.
df2 <- df2 %>%
  gather(-Datetime_CI, -Site, key = Depth, value = Current.Int) %>%
  separate(Depth, c("minDepth", "maxDepth")) %>%
  mutate(
    # Drop the leading "m" (e.g. "m16" -> 16).
    minDepth = as.numeric(str_sub(minDepth, 2)),
    # separate() returns character columns. Without this conversion the depth
    # filter below compares strings, so "36.4" <= "7" is TRUE and a fish at
    # 36.4 m is wrongly matched to the 0-7 m band.
    maxDepth = as.numeric(maxDepth)
  )

# Join df1 and df2 on Site alone, pairing each fish record with every
# band/time reading of its site, then filter down to the relevant reading.
# Fish records with no matching site, an NA depth, or a depth outside every
# band are dropped by the inner join and the filters.
df1 %>%
  inner_join(df2, by = "Site") %>%
  filter(
    # keep only the band that contains the fish depth
    Depth >= minDepth,
    Depth <= maxDepth,
    # keep only current readings taken before the fish observation
    Datetime > Datetime_CI
  ) %>%
  # of those, keep the most recent reading for each fish observation
  group_by(Datetime, Site, Ind, Depth) %>%
  filter(Datetime_CI == max(Datetime_CI)) %>%
  ungroup()
#> # A tibble: 3 x 8
#>   Datetime            Site    Ind Depth Datetime_CI         minDepth maxDepth Current.Int
#>   <dttm>              <chr> <dbl> <dbl> <dttm>                 <dbl>    <dbl>       <dbl>
#> 1 2016-08-01 15:34:07 BD       16   5.3 2016-08-01 15:00:00        0        7        4
#> 2 2016-08-01 17:29:16 BD       19  36.4 2016-08-01 15:00:00       32       39        4
#> 3 2016-08-01 22:48:16 BD       16  22.1 2016-08-01 21:00:00       16       23        2.25
日期不匹配,因此更改了示例。使用这种方法,您可以准确检查匹配的效果并确保它符合您的要求。
# Same fish records as in the question, moved to 2016-08-18 so that the dates
# line up with the current readings below.
df1 <- data.frame(
  Datetime = paste(
    "2016-08-18",
    c("15:34:07", "16:25:16", "17:29:16", "18:33:16", "20:54:16", "22:48:16")
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
df1[["Datetime"]] <- as.POSIXct(
  df1[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)

# Three-hourly current intensity readings, one column per depth band.
df2 <- data.frame(
  Datetime = c(
    paste("2016-08-18", c("12:00:00", "15:00:00", "18:00:00", "21:00:00")),
    "2016-08-19 00:00:00"
  ),
  Site = rep("BD", 5),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
df2[["Datetime"]] <- as.POSIXct(
  df2[["Datetime"]],
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
names(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")
library(dplyr)
library(lubridate)
# Build the two lookup keys on the fish table: the timestamp rounded with
# round_date(..., "3 hour") and the label of the depth band. Depths of 40 and
# above, and NA depths, get an NA label.
df1 <- df1 %>%
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    Depth_ind = case_when(
      Depth < 8 ~ "m0-7",
      Depth < 16 ~ "m8-15",
      Depth < 24 ~ "m16-23",
      Depth < 32 ~ "m24-31",
      Depth < 40 ~ "m32-39",
      TRUE ~ NA_character_
    )
  )

# Reshape the five depth-band columns into long form: one row per
# time/site/band with the band label in Depth_ind.
df2 <- df2 %>%
  tidyr::gather(key = "Depth_ind", value = "Intensity", 3:7)

# Look up the intensity for each fish record; records without a matching
# time, site or band keep NA.
df1 %>%
  left_join(
    df2,
    by = c(
      "Datetime_rounded" = "Datetime",
      "Site",
      "Depth_ind"
    )
  )
Datetime Site Ind Depth Datetime_rounded Depth_ind Intensity
1 2016-08-18 15:34:07 BD 16 5.3 2016-08-18 15:00:00 m0-7 4.00
2 2016-08-18 16:25:16 HG 17 24.0 2016-08-18 15:00:00 m24-31 NA
3 2016-08-18 17:29:16 BD 19 36.4 2016-08-18 18:00:00 m32-39 4.75
4 2016-08-18 18:33:16 BD 16 42.0 2016-08-18 18:00:00 <NA> NA
5 2016-08-18 20:54:16 BD 17 NA 2016-08-18 21:00:00 <NA> NA
6 2016-08-18 22:48:16 BD 16 22.1 2016-08-19 00:00:00 m16-23 1.40
# EDIT ----
## As requested, the deepest band can be widened as needed; here it is
## extended so that depths below 60 m still map to the "m32-39" column.
# Recompute the rounded timestamp and the depth-band label.
df1 <- df1 %>%
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    Depth_ind = case_when(
      Depth < 8 ~ "m0-7",
      Depth < 16 ~ "m8-15",
      Depth < 24 ~ "m16-23",
      Depth < 32 ~ "m24-31",
      Depth < 60 ~ "m32-39",
      TRUE ~ NA_character_
    )
  )
这可以直接在单个 SQL 语句中完成。我们将 df1
连接到 df2
,并按 df1
行指定的 on
条件分组。在指定的组上计算 max(b.Datetime)
将挑选出 df2
的适当行。 (如果 a.Datetime
、a.Site
没有唯一定义一行 df1
,则改为按 a.rowid
分组。)最后,我们使用 [-1]
删除该列。
我们使用了最后注释中显示的数据,因为问题中的数据在 df1
和 df2
中没有相应的日期。
library(sqldf)
# For each df1 row, take the latest df2 reading at the same Site whose
# timestamp is at or before the fish timestamp, then read the column of the
# fish's depth band. Depths above 31 (including those deeper than 39) use the
# m32-39 column.
# A NULL (NA) depth is handled explicitly: without the first `when`, it would
# fall through every comparison to the `else` branch and return an m32-39
# value instead of NA.
# NOTE: the join uses the most recent preceding reading, not the reading
# nearest in time.
# [-1] drops the helper column max(b.Datetime).
sqldf("select max(b.Datetime), a.*,
  case when a.Depth is null then null
       when a.Depth <= 7 then b.[m0-7]
       when a.Depth <= 15 then b.[m8-15]
       when a.Depth <= 23 then b.[m16-23]
       when a.Depth <= 31 then b.[m24-31]
       else b.[m32-39]
  end as [Current.Int]
  from df1 a
  left join df2 b on a.Site = b.Site and a.Datetime >= b.Datetime
  group by a.Datetime, a.Site")[-1]
# Result:
#>              Datetime Site Ind Depth Current.Int
#> 1 2016-08-01 15:34:07   BD  16   5.3        4.00
#> 2 2016-08-01 16:25:16   HG  17  24.0          NA
#> 3 2016-08-01 17:29:16   BD  19  36.4        4.00
#> 4 2016-08-01 18:33:16   BD  16  42.0        4.75
#> 5 2016-08-01 20:54:16   BD  17    NA          NA
#> 6 2016-08-01 22:48:16   BD  16  22.1        2.25
备注
这是使用的输入,与问题中的相同,除了:
去掉了 UTC 时区设置(tz="UTC")。如果您想保留 UTC 时区,请使用
Sys.setenv(TZ='UTC')
将您的会话时区更改为 UTC。处理时区的另一种可能性是对Datetime
列使用字符串而不是 POSIXct,在这种情况下,您首先不会遇到时区问题。添加最后一行是为了改进示例,因为日期不匹配。
这里是使用的输入。
# Input for the sqldf example. No tz is passed to as.POSIXct(), so the
# timestamps are parsed in the session time zone.
df1 <- data.frame(
  Datetime = c(
    "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
    "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
  ),
  Site = c("BD", "HG", "BD", "BD", "BD", "BD"),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
df1$Datetime <- as.POSIXct(df1$Datetime, format = "%Y-%m-%d %H:%M:%S")

df2 <- data.frame(
  Datetime = c(
    "2016-08-18 12:00:00", "2016-08-18 15:00:00", "2016-08-18 18:00:00",
    "2016-08-18 21:00:00", "2016-08-19 00:00:00"
  ),
  Site = c("BD", "BD", "BD", "BD", "BD"),
  var1 = c(2.75, 4, 6.75, 2.25, 4.3),
  var2 = c(3, 4, 4.75, 3, 2.1),
  var3 = c(2.75, 4, 5.75, 2.25, 1.4),
  var4 = c(3.25, 3, 6.5, 2.75, 3.4),
  var5 = c(3, 4, 4.75, 3, 1.7)
)
df2$Datetime <- as.POSIXct(df2$Datetime, format = "%Y-%m-%d %H:%M:%S")
colnames(df2) <- c("Datetime", "Site", "m0-7", "m8-15", "m16-23", "m24-31", "m32-39")

# Move every df2 reading onto 2016-08-01, keeping its time of day, so that the
# dates line up with df1. The time of day is taken with format() rather than
# by stripping the date from the implicit as.character() result: since R 4.3,
# as.character() omits the time part of midnight values, so the "00:00:00"
# reading would not be extracted correctly.
df2$Datetime <- as.POSIXct(
  paste("2016-08-01", format(df2$Datetime, "%H:%M:%S"))
)