条件过滤：如果 A 列中的值超过阈值，则丢弃相应时间戳（B 列）之后的所有观测

Question

我正在使用 R 中的鱼类遥测数据，下面提供了一个简化的数据集：

# Example input: 30 detections from two tags (1310230 and 1311038); each tag
# reports through one "temp" sensor and one "depth" sensor.
df <- structure(list(DATE.TIME = structure(c(1560900051, 1560900101, 
1560927373, 1560927504, 1560927533, 1560927585, 1560927689, 1560899962, 
1560900026, 1560900026, 1560900076, 1560927328, 1560927498, 1560927529, 
1560927558, 1560907660, 1560907720, 1560908037, 1560925131, 1560925260, 
1560931034, 1560907630, 1560907695, 1560907746, 1560907804, 1560908189, 
1560908268, 1560925097, 1560925300, 1560925426), class = c("POSIXct", 
"POSIXt"), tzone = "Canada/Atlantic"), TAG = c(1310230L, 1310230L, 
1310230L, 1310230L, 1310230L, 1310230L, 1310230L, 1310230L, 1310230L, 
1310230L, 1310230L, 1310230L, 1310230L, 1310230L, 1310230L, 1311038L, 
1311038L, 1311038L, 1311038L, 1311038L, 1311038L, 1311038L, 1311038L, 
1311038L, 1311038L, 1311038L, 1311038L, 1311038L, 1311038L, 1311038L
), SENSOR.ID = c(5665L, 5665L, 5665L, 5665L, 5665L, 5665L, 5665L, 
5666L, 5666L, 5666L, 5666L, 5666L, 5666L, 5666L, 5666L, 5821L, 
5821L, 5821L, 5821L, 5821L, 5821L, 5822L, 5822L, 5822L, 5822L, 
5822L, 5822L, 5822L, 5822L, 5822L), SENSOR.VALUE = c(8.1796, 
8.1796, 35.0095, 35.0095, 35.0095, 35.0095, 35.0095, 0.9024, 
0, 0, 0, 34.2986, 0.9024, 18.9544, 18.9544, 8.4934, 8.4934, 8.4934, 
35.0095, 35.0095, 35.0095, 0, 0, 0, 0, 0, 0, 13.5388, 1.805, 
1.805), SENSOR = c("temp", "temp", "temp", "temp", "temp", "temp", 
"temp", "depth", "depth", "depth", "depth", "depth", "depth", 
"depth", "depth", "temp", "temp", "temp", "temp", "temp", "temp", 
"depth", "depth", "depth", "depth", "depth", "depth", "depth", 
"depth", "depth")), row.names = c(435151L, 435152L, 435203L, 
435204L, 435205L, 435206L, 435207L, 435614L, 435615L, 435616L, 
435617L, 435664L, 435665L, 435666L, 435667L, 455286L, 455287L, 
455288L, 455295L, 455296L, 455297L, 455553L, 455554L, 455555L, 
455556L, 455557L, 455558L, 455568L, 455569L, 455570L), class = "data.frame")

数据结构如下：

DATE.TIME=鱼检测时间戳
TAG=植入鱼体内的声学标签的唯一 ID
SENSOR.ID=每个 TAG 上每个传感器（温度和深度）的唯一 ID
SENSOR.VALUE=记录的温度（摄氏度）或深度（米）
SENSOR=表示传感器类型（温度或深度）的分类变量

我想做的是 subset/filter 这个数据，这样当温度升高到 30C 以上（表示捕食）时，将从温度和深度传感器中删除任何后续检测。植入鱼体内的每个 TAG 交替传输其温度或深度 SENSOR.ID 和 SENSOR.VALUE。我可以为温度传感器数据做这个过滤器：

# Keep only temperature readings below 30; every depth row is dropped as well.
dfsub <- subset(df, SENSOR == "temp" & SENSOR.VALUE < 30)

但这仍然会保留捕食事件之后深度传感器的检测记录，而这些记录此时反映的是捕食者的移动。理想情况下，过滤器应找出每条鱼（即每个 TAG）温度首次升高到 30C 以上的时间戳，并删除该时间之后的所有观测。过滤之后，我希望得到如下所示的数据集。

# Desired result after filtering (16 rows).
# NOTE: this assignment reuses the name `df`, so running it replaces the
# input data frame defined earlier.
df <- structure(list(DATE.TIME = structure(c(1560900051, 1560900101, 
1560899962, 1560900026, 1560900026, 1560900076, 1560907660, 1560907720, 
1560908037, 1560907630, 1560907695, 1560907746, 1560907804, 1560908189, 
1560908268, 1560925097), class = c("POSIXct", "POSIXt"), tzone = "Canada/Atlantic"), 
    TAG = c(1310230L, 1310230L, 1310230L, 1310230L, 1310230L, 
    1310230L, 1311038L, 1311038L, 1311038L, 1311038L, 1311038L, 
    1311038L, 1311038L, 1311038L, 1311038L, 1311038L), SENSOR.ID = c(5665L, 
    5665L, 5666L, 5666L, 5666L, 5666L, 5821L, 5821L, 5821L, 5822L, 
    5822L, 5822L, 5822L, 5822L, 5822L, 5822L), SENSOR.VALUE = c(8.1796, 
    8.1796, 0.9024, 0, 0, 0, 8.4934, 8.4934, 8.4934, 0, 0, 0, 
    0, 0, 0, 13.5388), SENSOR = c("temp", "temp", "depth", "depth", 
    "depth", "depth", "temp", "temp", "temp", "depth", "depth", 
    "depth", "depth", "depth", "depth", "depth")), row.names = c(435151L, 
435152L, 435614L, 435615L, 435616L, 435617L, 455286L, 455287L, 
455288L, 455553L, 455554L, 455555L, 455556L, 455557L, 455558L, 
455568L), class = "data.frame")

感谢您的见解！

Answer 1

很棒的数据集！这是一个使用 tidyr::fill 的方案。我对您的数据做了一些编辑和精简，以便做成更好的可复现示例（reprex）。

还出于教学目的将其分成多个步骤，但实际上，您应该在单个管道链中执行此操作。

library(tidyverse)

# Trimmed-down example data: two fish, each with one temp sensor (65, 21) and
# one depth sensor (66, 22). DATE.TIME is kept as text.
fishdat <- tibble::tribble(
  ~DATE.TIME,            ~FISH.TAG, ~SENSOR.ID, ~SENSOR.VALUE, ~SENSOR,
  "2019-06-18 20:19:41", 1,         65,         9,             "temp",
  "2019-06-18 20:20:51", 1,         65,         37,            "temp",
  "2019-06-18 20:19:22", 1,         66,         1,             "depth",
  "2019-06-18 20:21:16", 1,         66,         0,             "depth",
  "2019-06-18 22:27:40", 2,         21,         35,            "temp",
  "2019-06-18 22:33:57", 2,         21,         38,            "temp",
  "2019-06-18 22:27:10", 2,         22,         0,             "depth",
  "2019-06-19 3:18:17",  2,         22,         13,            "depth"
)

标记表示捕食的值

# Flag each temperature reading above the cut-off as "predated"; every other
# row (cooler temperatures and all depth readings) gets NA.
# NOTE(review): with `> 30` the 35-degree reading of fish 2 would also be
# flagged, yet the printed output shows it as NA. The output looks like it
# came from a higher cut-off (e.g. `> 35`); confirm which one is intended.
fishdat_marked <- mutate(
  fishdat,
  predated = ifelse(
    SENSOR.VALUE > 30 & SENSOR == "temp",
    "predated",
    NA_character_
  )
)
fishdat_marked

#> # A tibble: 8 × 6
#>   DATE.TIME           FISH.TAG SENSOR.ID SENSOR.VALUE SENSOR predated
#>   <chr>                  <dbl>     <dbl>        <dbl> <chr>  <chr>   
#> 1 2019-06-18 20:19:41        1        65            9 temp   <NA>    
#> 2 2019-06-18 20:20:51        1        65           37 temp   predated
#> 3 2019-06-18 20:19:22        1        66            1 depth  <NA>    
#> 4 2019-06-18 20:21:16        1        66            0 depth  <NA>    
#> 5 2019-06-18 22:27:40        2        21           35 temp   <NA>    
#> 6 2019-06-18 22:33:57        2        21           38 temp   predated
#> 7 2019-06-18 22:27:10        2        22            0 depth  <NA>    
#> 8 2019-06-19 3:18:17         2        22           13 depth  <NA>

沿着捕食标记向下级联

# Carry the "predated" flag forward in time within each fish, so that every
# detection after the first flagged temperature reading is flagged as well.
# DATE.TIME is stored as text, and text sorting misorders hours written
# without a leading zero (e.g. "3:18:17" sorts after "10:00:00"), so the rows
# are ordered by the parsed timestamp instead of by the raw string.
# The result is still grouped by FISH.TAG.
fishdat_filled <-
  fishdat_marked %>%
  group_by(FISH.TAG) %>% # one fill per fish
  arrange(
    as.POSIXct(DATE.TIME, format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
    .by_group = TRUE
  ) %>%
  fill(predated, .direction = "down")

fishdat_filled

#> # A tibble: 8 × 6
#> # Groups:   FISH.TAG [2]
#>   DATE.TIME           FISH.TAG SENSOR.ID SENSOR.VALUE SENSOR predated
#>   <chr>                  <dbl>     <dbl>        <dbl> <chr>  <chr>   
#> 1 2019-06-18 20:19:22        1        66            1 depth  <NA>    
#> 2 2019-06-18 20:19:41        1        65            9 temp   <NA>    
#> 3 2019-06-18 20:20:51        1        65           37 temp   predated
#> 4 2019-06-18 20:21:16        1        66            0 depth  predated
#> 5 2019-06-18 22:27:10        2        22            0 depth  <NA>    
#> 6 2019-06-18 22:27:40        2        21           35 temp   <NA>    
#> 7 2019-06-18 22:33:57        2        21           38 temp   predated
#> 8 2019-06-19 3:18:17         2        22           13 depth  predated

现在，过滤

# Keep only the rows that never received the "predated" flag.
filter(fishdat_filled, is.na(predated))
#> # A tibble: 4 × 6
#> # Groups:   FISH.TAG [2]
#>   DATE.TIME           FISH.TAG SENSOR.ID SENSOR.VALUE SENSOR predated
#>   <chr>                  <dbl>     <dbl>        <dbl> <chr>  <chr>   
#> 1 2019-06-18 20:19:22        1        66            1 depth  <NA>    
#> 2 2019-06-18 20:19:41        1        65            9 temp   <NA>    
#> 3 2019-06-18 22:27:10        2        22            0 depth  <NA>    
#> 4 2019-06-18 22:27:40        2        21           35 temp   <NA>

由 reprex 包 (v2.0.1) 于 2021-10-22 创建

有条件地过滤，如果 A 列中的值超出，则超出相应时间戳（B 列）的进一步观察将被丢弃

Filter conditionally, where if a value is exceeded in column A, further observations beyond the respective timestamp (column B) are dropped

r

subset

filter

conditional-statements

dplyr

标记表示捕食的值

沿着捕食标记向下级联

现在，过滤