在 df1 中创建一个新的分类变量,它考虑了一个 df1 变量和多个具有多个条件的 df2 变量
Create a new categorical variable in df1 that takes into account one df1 variable and several df2 variables with multiple conditions
我有一个数据框,与我之前另外两个帖子中使用的数据框类似。之所以分开提问,是因为我需要在数据集中创建三个不同的变量,而每个变量的处理方式各不相同,所以我为每个问题单独发了帖子。
df1
总结了不同时间不同地点不同鱼类的深度。 df2
以 8 米的间隔总结了从水面到 39 米深度、每三小时一次的水流强度(m0-7
、m8-15
、m16-23
、m24-31
和 m32-39
) 在特定的地方。例如:
# Fish observations: one row per detection, with the detection time (UTC),
# the site code, the individual's id and the recorded depth (NA when missing).
df1 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
      "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 6),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
> df1
Datetime Site Ind Depth
1 2016-08-01 15:34:07 BD 16 5.3
2 2016-08-01 16:25:16 BD 17 24.0
3 2016-08-01 17:29:16 BD 19 36.4
4 2016-08-01 18:33:16 BD 16 42.0
5 2016-08-01 20:54:16 BD 17 NA
6 2016-08-01 22:48:16 BD 16 22.1
# Current intensity per depth layer, one row per 3-hour timestamp (UTC).
# check.names = FALSE keeps the hyphenated layer names exactly as written.
df2 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
      "2016-08-01 21:00:00", "2016-08-02 00:00:00"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 5),
  `m0-7` = c(2.75, 4, 6.75, 2.25, 4.3),
  `m8-15` = c(3, 4, 4.75, 3, 2.1),
  `m16-23` = c(2.75, 4, 9.8, 2.25, 1.4),
  `m24-31` = c(3.25, 3, 6.5, 8.9, 3.4),
  `m32-39` = c(3, 4, 2.3, 2.6, 1.7),
  check.names = FALSE
)
> df2
Datetime Site m0-7 m8-15 m16-23 m24-31 m32-39
1 2016-08-01 12:00:00 BD 2.75 3.00 2.75 3.25 3.0
2 2016-08-01 15:00:00 BD 4.00 4.00 4.00 3.00 4.0
3 2016-08-01 18:00:00 BD 6.75 4.75 9.80 6.50 2.3
4 2016-08-01 21:00:00 BD 2.25 3.00 2.25 8.90 2.6
5 2016-08-02 00:00:00 BD 4.30 2.10 1.40 3.40 1.7
我想在 df1
中创建一个名为 Outside_currents
的新变量,以反映鱼是否避开强水流。我将列 Outside_currents
定义为 "Tell me if the fish AVOID to be in a layer of high currents when HE HAS A CHANCE TO BE THERE OR NOT"。我的鱼总是在大于 15 米的深度移动,所以我只考虑最后三列的计算(m16-23
、m24-31
和 m32-39
)。
将其转化为数学:
Outside_currents
:"when one or two layers OUT OF THE THREE CONSIDERED (m16-23
, m24-31
and m32-39
) have current intensities THREE TIMES GREATER than the other one or two, is the fish OUTSIDE?"。
可能的答案是:
Yes
: 鱼所在层的水流强度不到其他一层或两层的三分之一(即鱼位于强水流层之外)。
No
: 鱼所在层的水流强度是其他层的三倍以上(即鱼位于强水流层之中)。
NA
:当该条件不成立时(没有任何一层的水流强度达到其他层的 3 倍),或者当变量 Depth 为 NA 时。
我希望这样:
> df1
Datetime Site Ind Depth Out_current
1 2016-08-01 15:34:07 BD 16 5.3 NA
2 2016-08-01 16:25:16 BD 17 24.0 NA
3 2016-08-01 17:29:16 BD 19 36.4 YES
4 2016-08-01 18:33:16 BD 16 42.0 YES
5 2016-08-01 19:33:16 BD 17 24.0 NO
6 2016-08-01 20:54:16 BD 16 NA NA
7 2016-08-01 22:48:16 BD 16 22.1 NA
我可能没有完全理解你的问题。看起来只需要查看 df2:只要某一深度的水流被标记为比另一深度高 3 倍,另一深度的水流就会相应地被标记为低 3 倍。我写了下面这段代码,看看它能否帮你起步。
library(tidyverse)
# Build one row per (timestamp, site, layer) so that each of the three deep
# layers can be compared with all three layers measured at the same time.
# Rows come out grouped by timestamp (three consecutive rows per timestamp).
outside_calcs <-
  df2 %>%
  # Hyphenated headers such as `m16-23` are not syntactic names; normalise
  # them to `m16_23` so the column references below work for either spelling.
  rename_with(~ gsub("-", "_", .x, fixed = TRUE)) %>%
  # Duplicate the three layers of interest so the originals survive the
  # reshape; this replaces the former gather() + implicit self-join.
  mutate(across(m16_23:m32_39, ~ .x, .names = "value_{.col}")) %>%
  pivot_longer(
    cols = starts_with("value_"),
    names_to = "depth",
    names_prefix = "value_",
    values_to = "value"
  ) %>%
  relocate(depth, value, .before = m16_23) %>%
  mutate(
    # Ratio of every layer to the layer named in `depth`.
    comp_16 = m16_23 / value,
    comp_24 = m24_31 / value,
    comp_32 = m32_39 / value,
    min_diff = pmin(comp_16, comp_24, comp_32),
    max_diff = pmax(comp_16, comp_24, comp_32)
  ) %>%
  mutate(
    outside_currents =
      case_when(
        # 1 / 3 is the exact reciprocal of the "> 3" test below; the previous
        # 0.33 cut-off missed ratios between 0.33 and 1/3.
        min_diff < 1 / 3 ~ "Yes",
        max_diff > 3 ~ "No",
        TRUE ~ NA_character_
      )
  )
# Datetime Site depth value m16_23 m24_31 m32_39 comp_16 comp_24 comp_32 min_diff max_diff outside_currents
# 2016-08-18 21:00:00 BD m16_23 2.25 2.25 8.9 2.6 1.000 3.96 1.156 1.000 3.96 No
# 2016-08-18 21:00:00 BD m24_31 8.90 2.25 8.9 2.6 0.253 1.00 0.292 0.253 1.00 Yes
# 2016-08-18 21:00:00 BD m32_39 2.60 2.25 8.9 2.6 0.865 3.42 1.000 0.865 3.42 No
# Reshape the per-layer flags back to one row per timestamp and site: the
# flag columns are named c16_23, c24_31, ... (first "m" replaced by "c") and
# are shown next to the measured layer columns of df2.
final_outside <-
  outside_calcs %>%
  mutate(depth = str_replace(depth, "m", "c")) %>%
  select(Datetime, Site, depth, outside_currents) %>%
  # pivot_wider() supersedes spread(); arrange() restores the row ordering
  # by the id columns that spread() used to apply.
  pivot_wider(names_from = depth, values_from = outside_currents) %>%
  arrange(Datetime, Site) %>%
  # Explicit keys: do not rely on whatever columns happen to be shared.
  left_join(df2, by = c("Datetime", "Site")) %>%
  select(Datetime, Site, starts_with("m"), starts_with("c"))
final_outside
# Datetime Site m16_23 m24_31 m32_39 c16_23 c24_31 c32_39
# 2016-08-18 12:00:00 BD 2.75 3.25 3.0 <NA> <NA> <NA>
# 2016-08-18 15:00:00 BD 4.00 3.00 4.0 <NA> <NA> <NA>
# 2016-08-18 18:00:00 BD 9.80 6.50 2.3 Yes <NA> No
# 2016-08-18 21:00:00 BD 2.25 8.90 2.6 No Yes No
# 2016-08-19 00:00:00 BD 1.40 3.40 1.7 <NA> <NA> <NA>
解决方案:
library(data.table)
library(lubridate)
library(dplyr)
# Fish detections for the worked solution: detection time (UTC), site code,
# individual id and depth (NA when the depth was not recorded).
df1 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 12:34:07", "2016-08-01 15:34:07", "2016-08-01 16:25:16",
      "2016-08-01 17:29:16", "2016-08-01 18:33:16", "2016-08-01 19:23:16",
      "2016-08-01 20:01:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16",
      "2016-08-01 23:48:16", "2016-08-02 01:07:16"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = c("BD", "BD", "HG", "BD", "BD", "BD", "BD", "BD", "BD", "HG", "BD"),
  Ind = c(16, 16, 17, 19, 16, 16, 17, 16, 16, 17, 16),
  Depth = c(15.5, 5.3, 24, 36.4, 42, 25, NA, 22.1, 54, 27, 21.5)
)
# Rounded copy of each detection time on a 3-hour unit; appended as the
# fifth column and used later as the join key against df2.
df1[["Datetime_rounded"]] <- round_date(df1[["Datetime"]], unit = "3 hour")
# Current intensity per depth layer for the worked solution, one row per
# 3-hour timestamp (UTC). check.names = FALSE keeps the hyphenated names.
df2 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
      "2016-08-01 21:00:00", "2016-08-02 00:00:00"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 5),
  `m0-7` = c(2.75, 4, 6.75, 2.25, 4.3),
  `m8-15` = c(3.8, 7.75, 4.75, 3, 2.1),
  `m16-23` = c(2.2, 4.3, 6.8, 2.25, 3.4),
  `m24-31` = c(5.4, 1.1, 2.25, 3.3, 6.5),
  `m32-39` = c(7.3, 5.2, 1.3, 2.6, 1.7),
  check.names = FALSE
)
# Put the rounded timestamp directly after the raw one.
df1 <- df1[, c("Datetime", "Datetime_rounded", "Site", "Ind", "Depth")]
# Convert both tables to data.table by reference.
setDT(df1)
setDT(df2)
# Key both tables on site plus timestamp so they can be joined on those keys.
setkeyv(df1, c("Site", "Datetime_rounded"))
setkeyv(df2, c("Site", "Datetime"))
# Keyed rolling join: look up the df2 row for every df1 detection.
df_merge <- df2[df1, roll = -Inf]
# Keep the detection columns first, followed by the five current layers.
df_merge <- df_merge[, c(
  "i.Datetime", "Site", "Ind", "Depth",
  "m0-7", "m8-15", "m16-23", "m24-31", "m32-39"
)]
# Classify every detection as "Yes" (fish is in a weak layer while a strong
# one exists), "No" (a weak layer exists but the fish is not in it) or "NA"
# (no such contrast, wrong site, missing or too shallow depth).
# NOTE: "NA" here is the literal string, kept as in the original output.
df_merge[, Outside_current := {
  # A layer counts as weak when three times its current is still below
  # another layer's current, or below the mean of the other two layers.
  weak_16 <- `m16-23` * 3 < `m24-31` | `m16-23` * 3 < `m32-39` |
    `m16-23` * 3 < (`m24-31` + `m32-39`) / 2
  weak_24 <- `m24-31` * 3 < `m16-23` | `m24-31` * 3 < `m32-39` |
    `m24-31` * 3 < (`m16-23` + `m32-39`) / 2
  weak_32 <- `m32-39` * 3 < `m16-23` | `m32-39` * 3 < `m24-31` |
    `m32-39` * 3 < (`m16-23` + `m24-31`) / 2

  # Layer occupied by the fish; depths of 32 m and more all map to the
  # deepest layer.
  in_16 <- Depth >= 15 & Depth < 24
  in_24 <- Depth >= 24 & Depth < 32
  in_32 <- Depth >= 32

  case_when(
    Site != "BD" ~ "NA",
    # is.na() is required: `Depth == "NA"` is never TRUE for a missing depth.
    is.na(Depth) ~ "NA",
    Depth < 15 ~ "NA",
    in_16 & weak_16 ~ "Yes",
    in_24 & weak_24 ~ "Yes",
    in_32 & weak_32 ~ "Yes",
    Depth >= 24 & weak_16 ~ "No",
    (in_16 | in_32) & weak_24 ~ "No",
    Depth < 32 & weak_32 ~ "No",
    TRUE ~ "NA"
  )
}]
> df_merge
i.Datetime Site Ind Depth m0-7 m8-15 m16-23 m24-31 m32-39 Outside_current
1: 2016-08-01 12:34:07 BD 16 15.5 2.75 3.80 2.20 5.40 7.3 Yes
2: 2016-08-01 15:34:07 BD 16 5.3 4.00 7.75 4.30 1.10 5.2 NA
3: 2016-08-01 17:29:16 BD 19 36.4 6.75 4.75 6.80 2.25 1.3 Yes
4: 2016-08-01 18:33:16 BD 16 42.0 6.75 4.75 6.80 2.25 1.3 Yes
5: 2016-08-01 19:23:16 BD 16 25.0 6.75 4.75 6.80 2.25 1.3 Yes
6: 2016-08-01 20:01:16 BD 17 NA 2.25 3.00 2.25 3.30 2.6 NA
7: 2016-08-01 20:54:16 BD 16 22.1 2.25 3.00 2.25 3.30 2.6 NA
8: 2016-08-01 22:48:16 BD 16 54.0 4.30 2.10 3.40 6.50 1.7 Yes
9: 2016-08-02 01:07:16 BD 16 21.5 4.30 2.10 3.40 6.50 1.7 No
10: 2016-08-01 16:25:16 HG 17 24.0 NA NA NA NA NA NA
11: 2016-08-01 23:48:16 HG 17 27.0 NA NA NA NA NA NA
我有一个类似于
df1
总结了不同时间不同地点不同鱼类的深度。 df2
以 8 米的间隔总结了从水面到 39 米深度、每三小时一次的水流强度(m0-7
、m8-15
、m16-23
、m24-31
和 m32-39
) 在特定的地方。例如:
# Fish observations: one row per detection, with the detection time (UTC),
# the site code, the individual's id and the recorded depth (NA when missing).
df1 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 15:34:07", "2016-08-01 16:25:16", "2016-08-01 17:29:16",
      "2016-08-01 18:33:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 6),
  Ind = c(16, 17, 19, 16, 17, 16),
  Depth = c(5.3, 24, 36.4, 42, NA, 22.1)
)
> df1
Datetime Site Ind Depth
1 2016-08-01 15:34:07 BD 16 5.3
2 2016-08-01 16:25:16 BD 17 24.0
3 2016-08-01 17:29:16 BD 19 36.4
4 2016-08-01 18:33:16 BD 16 42.0
5 2016-08-01 20:54:16 BD 17 NA
6 2016-08-01 22:48:16 BD 16 22.1
# Current intensity per depth layer, one row per 3-hour timestamp (UTC).
# check.names = FALSE keeps the hyphenated layer names exactly as written.
df2 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
      "2016-08-01 21:00:00", "2016-08-02 00:00:00"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 5),
  `m0-7` = c(2.75, 4, 6.75, 2.25, 4.3),
  `m8-15` = c(3, 4, 4.75, 3, 2.1),
  `m16-23` = c(2.75, 4, 9.8, 2.25, 1.4),
  `m24-31` = c(3.25, 3, 6.5, 8.9, 3.4),
  `m32-39` = c(3, 4, 2.3, 2.6, 1.7),
  check.names = FALSE
)
> df2
Datetime Site m0-7 m8-15 m16-23 m24-31 m32-39
1 2016-08-01 12:00:00 BD 2.75 3.00 2.75 3.25 3.0
2 2016-08-01 15:00:00 BD 4.00 4.00 4.00 3.00 4.0
3 2016-08-01 18:00:00 BD 6.75 4.75 9.80 6.50 2.3
4 2016-08-01 21:00:00 BD 2.25 3.00 2.25 8.90 2.6
5 2016-08-02 00:00:00 BD 4.30 2.10 1.40 3.40 1.7
我想在 df1
中创建一个名为 Outside_currents
的新变量,以反映鱼是否避开强水流。我将列 Outside_currents
定义为 "Tell me if the fish AVOID to be in a layer of high currents when HE HAS A CHANCE TO BE THERE OR NOT"。我的鱼总是在大于 15 米的深度移动,所以我只考虑最后三列的计算(m16-23
、m24-31
和 m32-39
)。
将其转化为数学:
Outside_currents
:"when one or two layers OUT OF THE THREE CONSIDERED (m16-23
, m24-31
and m32-39
) have current intensities THREE TIMES GREATER than the other one or two, is the fish OUTSIDE?"。
可能的答案是:
Yes
: 鱼所在层的水流强度不到其他一层或两层的三分之一(即鱼位于强水流层之外)。No
: 鱼所在层的水流强度是其他层的三倍以上(即鱼位于强水流层之中)。NA
:当该条件不成立时(没有任何一层的水流强度达到其他层的 3 倍),或者当变量 Depth 为 NA 时。
我希望这样:
> df1
Datetime Site Ind Depth Out_current
1 2016-08-01 15:34:07 BD 16 5.3 NA
2 2016-08-01 16:25:16 BD 17 24.0 NA
3 2016-08-01 17:29:16 BD 19 36.4 YES
4 2016-08-01 18:33:16 BD 16 42.0 YES
5 2016-08-01 19:33:16 BD 17 24.0 NO
6 2016-08-01 20:54:16 BD 16 NA NA
7 2016-08-01 22:48:16 BD 16 22.1 NA
我可能没有完全理解你的问题。看起来只需要查看 df2:只要某一深度的水流被标记为比另一深度高 3 倍,另一深度的水流就会相应地被标记为低 3 倍。我写了下面这段代码,看看它能否帮你起步。
library(tidyverse)
# Build one row per (timestamp, site, layer) so that each of the three deep
# layers can be compared with all three layers measured at the same time.
# Rows come out grouped by timestamp (three consecutive rows per timestamp).
outside_calcs <-
  df2 %>%
  # Hyphenated headers such as `m16-23` are not syntactic names; normalise
  # them to `m16_23` so the column references below work for either spelling.
  rename_with(~ gsub("-", "_", .x, fixed = TRUE)) %>%
  # Duplicate the three layers of interest so the originals survive the
  # reshape; this replaces the former gather() + implicit self-join.
  mutate(across(m16_23:m32_39, ~ .x, .names = "value_{.col}")) %>%
  pivot_longer(
    cols = starts_with("value_"),
    names_to = "depth",
    names_prefix = "value_",
    values_to = "value"
  ) %>%
  relocate(depth, value, .before = m16_23) %>%
  mutate(
    # Ratio of every layer to the layer named in `depth`.
    comp_16 = m16_23 / value,
    comp_24 = m24_31 / value,
    comp_32 = m32_39 / value,
    min_diff = pmin(comp_16, comp_24, comp_32),
    max_diff = pmax(comp_16, comp_24, comp_32)
  ) %>%
  mutate(
    outside_currents =
      case_when(
        # 1 / 3 is the exact reciprocal of the "> 3" test below; the previous
        # 0.33 cut-off missed ratios between 0.33 and 1/3.
        min_diff < 1 / 3 ~ "Yes",
        max_diff > 3 ~ "No",
        TRUE ~ NA_character_
      )
  )
# Datetime Site depth value m16_23 m24_31 m32_39 comp_16 comp_24 comp_32 min_diff max_diff outside_currents
# 2016-08-18 21:00:00 BD m16_23 2.25 2.25 8.9 2.6 1.000 3.96 1.156 1.000 3.96 No
# 2016-08-18 21:00:00 BD m24_31 8.90 2.25 8.9 2.6 0.253 1.00 0.292 0.253 1.00 Yes
# 2016-08-18 21:00:00 BD m32_39 2.60 2.25 8.9 2.6 0.865 3.42 1.000 0.865 3.42 No
# Reshape the per-layer flags back to one row per timestamp and site: the
# flag columns are named c16_23, c24_31, ... (first "m" replaced by "c") and
# are shown next to the measured layer columns of df2.
final_outside <-
  outside_calcs %>%
  mutate(depth = str_replace(depth, "m", "c")) %>%
  select(Datetime, Site, depth, outside_currents) %>%
  # pivot_wider() supersedes spread(); arrange() restores the row ordering
  # by the id columns that spread() used to apply.
  pivot_wider(names_from = depth, values_from = outside_currents) %>%
  arrange(Datetime, Site) %>%
  # Explicit keys: do not rely on whatever columns happen to be shared.
  left_join(df2, by = c("Datetime", "Site")) %>%
  select(Datetime, Site, starts_with("m"), starts_with("c"))
final_outside
# Datetime Site m16_23 m24_31 m32_39 c16_23 c24_31 c32_39
# 2016-08-18 12:00:00 BD 2.75 3.25 3.0 <NA> <NA> <NA>
# 2016-08-18 15:00:00 BD 4.00 3.00 4.0 <NA> <NA> <NA>
# 2016-08-18 18:00:00 BD 9.80 6.50 2.3 Yes <NA> No
# 2016-08-18 21:00:00 BD 2.25 8.90 2.6 No Yes No
# 2016-08-19 00:00:00 BD 1.40 3.40 1.7 <NA> <NA> <NA>
解决方案:
library(data.table)
library(lubridate)
library(dplyr)
# Fish detections for the worked solution: detection time (UTC), site code,
# individual id and depth (NA when the depth was not recorded).
df1 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 12:34:07", "2016-08-01 15:34:07", "2016-08-01 16:25:16",
      "2016-08-01 17:29:16", "2016-08-01 18:33:16", "2016-08-01 19:23:16",
      "2016-08-01 20:01:16", "2016-08-01 20:54:16", "2016-08-01 22:48:16",
      "2016-08-01 23:48:16", "2016-08-02 01:07:16"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = c("BD", "BD", "HG", "BD", "BD", "BD", "BD", "BD", "BD", "HG", "BD"),
  Ind = c(16, 16, 17, 19, 16, 16, 17, 16, 16, 17, 16),
  Depth = c(15.5, 5.3, 24, 36.4, 42, 25, NA, 22.1, 54, 27, 21.5)
)
# Rounded copy of each detection time on a 3-hour unit; appended as the
# fifth column and used later as the join key against df2.
df1[["Datetime_rounded"]] <- round_date(df1[["Datetime"]], unit = "3 hour")
# Current intensity per depth layer for the worked solution, one row per
# 3-hour timestamp (UTC). check.names = FALSE keeps the hyphenated names.
df2 <- data.frame(
  Datetime = as.POSIXct(
    c(
      "2016-08-01 12:00:00", "2016-08-01 15:00:00", "2016-08-01 18:00:00",
      "2016-08-01 21:00:00", "2016-08-02 00:00:00"
    ),
    format = "%Y-%m-%d %H:%M:%S",
    tz = "UTC"
  ),
  Site = rep("BD", 5),
  `m0-7` = c(2.75, 4, 6.75, 2.25, 4.3),
  `m8-15` = c(3.8, 7.75, 4.75, 3, 2.1),
  `m16-23` = c(2.2, 4.3, 6.8, 2.25, 3.4),
  `m24-31` = c(5.4, 1.1, 2.25, 3.3, 6.5),
  `m32-39` = c(7.3, 5.2, 1.3, 2.6, 1.7),
  check.names = FALSE
)
# Put the rounded timestamp directly after the raw one.
df1 <- df1[, c("Datetime", "Datetime_rounded", "Site", "Ind", "Depth")]
# Convert both tables to data.table by reference.
setDT(df1)
setDT(df2)
# Key both tables on site plus timestamp so they can be joined on those keys.
setkeyv(df1, c("Site", "Datetime_rounded"))
setkeyv(df2, c("Site", "Datetime"))
# Keyed rolling join: look up the df2 row for every df1 detection.
df_merge <- df2[df1, roll = -Inf]
# Keep the detection columns first, followed by the five current layers.
df_merge <- df_merge[, c(
  "i.Datetime", "Site", "Ind", "Depth",
  "m0-7", "m8-15", "m16-23", "m24-31", "m32-39"
)]
# Classify every detection as "Yes" (fish is in a weak layer while a strong
# one exists), "No" (a weak layer exists but the fish is not in it) or "NA"
# (no such contrast, wrong site, missing or too shallow depth).
# NOTE: "NA" here is the literal string, kept as in the original output.
df_merge[, Outside_current := {
  # A layer counts as weak when three times its current is still below
  # another layer's current, or below the mean of the other two layers.
  weak_16 <- `m16-23` * 3 < `m24-31` | `m16-23` * 3 < `m32-39` |
    `m16-23` * 3 < (`m24-31` + `m32-39`) / 2
  weak_24 <- `m24-31` * 3 < `m16-23` | `m24-31` * 3 < `m32-39` |
    `m24-31` * 3 < (`m16-23` + `m32-39`) / 2
  weak_32 <- `m32-39` * 3 < `m16-23` | `m32-39` * 3 < `m24-31` |
    `m32-39` * 3 < (`m16-23` + `m24-31`) / 2

  # Layer occupied by the fish; depths of 32 m and more all map to the
  # deepest layer.
  in_16 <- Depth >= 15 & Depth < 24
  in_24 <- Depth >= 24 & Depth < 32
  in_32 <- Depth >= 32

  case_when(
    Site != "BD" ~ "NA",
    # is.na() is required: `Depth == "NA"` is never TRUE for a missing depth.
    is.na(Depth) ~ "NA",
    Depth < 15 ~ "NA",
    in_16 & weak_16 ~ "Yes",
    in_24 & weak_24 ~ "Yes",
    in_32 & weak_32 ~ "Yes",
    Depth >= 24 & weak_16 ~ "No",
    (in_16 | in_32) & weak_24 ~ "No",
    Depth < 32 & weak_32 ~ "No",
    TRUE ~ "NA"
  )
}]
> df_merge
i.Datetime Site Ind Depth m0-7 m8-15 m16-23 m24-31 m32-39 Outside_current
1: 2016-08-01 12:34:07 BD 16 15.5 2.75 3.80 2.20 5.40 7.3 Yes
2: 2016-08-01 15:34:07 BD 16 5.3 4.00 7.75 4.30 1.10 5.2 NA
3: 2016-08-01 17:29:16 BD 19 36.4 6.75 4.75 6.80 2.25 1.3 Yes
4: 2016-08-01 18:33:16 BD 16 42.0 6.75 4.75 6.80 2.25 1.3 Yes
5: 2016-08-01 19:23:16 BD 16 25.0 6.75 4.75 6.80 2.25 1.3 Yes
6: 2016-08-01 20:01:16 BD 17 NA 2.25 3.00 2.25 3.30 2.6 NA
7: 2016-08-01 20:54:16 BD 16 22.1 2.25 3.00 2.25 3.30 2.6 NA
8: 2016-08-01 22:48:16 BD 16 54.0 4.30 2.10 3.40 6.50 1.7 Yes
9: 2016-08-02 01:07:16 BD 16 21.5 4.30 2.10 3.40 6.50 1.7 No
10: 2016-08-01 16:25:16 HG 17 24.0 NA NA NA NA NA NA
11: 2016-08-01 23:48:16 HG 17 27.0 NA NA NA NA NA NA