将年度时间点与 R 中前 365 天的数据相匹配

matching yearly time points to preceding 365 days of data in R

我正在尝试合并两个数据集。调查数据集由不同地区每 1-5 年在某个月份进行的生物多样性调查组成(月份在地区内不变,但在地区间不变)。温度数据集包含每个调查区域的每日温度读数。

对于具有不同开始月份和时间范围的多项调查,我想将每个调查*年份组合与其之前十二个月的温度数据配对。换句话说,我想将 1983 年 5 月的一项调查与它之前的 12 个月(或 365 天——我不在乎是哪一天)的每日温度记录配对,截至 1983 年 4 月 30 日。与此同时,另一项于 8 月在其他地方进行的调查1983 需要与截至 1983 年 7 月 31 日的 365 天温度数据配对。

有(至少)两种方法可以做到这一点——一种是将调查数据与(更长的)温度数据结合起来,然后以某种方式子集化或确定哪些日期落在调查日期之前的 12 个月内.另一种方法是从调查数据开始,并尝试将温度数据与矩阵列配对到每一行——我尝试使用 tsibbletsModel 的时间序列工具来做到这一点,但做不到按地区分组时使其“滞后”正确的值。

我能够创建一个标识符来加入数据集,以便温度数据中的每个日期都与后续调查及时匹配。然而,并非所有这些都在 365 天内(例如,在下面创建的数据集中,日期 1983-06-03 与 ref_year aleutian_islands-5-1986 匹配,因为调查每 3-5 年才进行一次).

以下是我想要的单个区域行为的一些示例(来自下面的示例数据集),尽管我愿意接受实现相同但看起来并不完全像这样的解决方案:

对于这一行,我要生成的新列中的值 (ref_match) 应该是 NA;该日期早于 ref_year.

超过 365 天
  region           date        year month month_year ref_year                temperature     
  <chr>            <date>     <dbl> <dbl> <chr>      <chr>                         <dbl>
1 aleutian_islands 1982-06-09  1982     6 6-1982     aleutian_islands-5-1983           0   

对于这一行,ref_match 应该是 aleutian_islands-5-2014,因为日期在 ref_year 的 12 个月内。

  region           date        year month month_year ref_year                temperature
  <chr>            <date>     <dbl> <dbl> <chr>      <chr>                         <dbl>
1 aleutian_islands 2013-07-22  2013     7 7-2013     aleutian_islands-5-2014       0.998

以下脚本将生成一个数据集 temp_dat,其中的列与上面代码片段中的列类似,我希望从中生成 ref_match 列。

# load packages
library(tidyverse)
library(lubridate)
set.seed=10

# make survey dfs
ai_dat <- data.frame("year" = c(1983, 1986, 1991, 1994, 1997), "region" = "aleutian_islands", "startmonth" = 5)
ebs_dat <- data.frame("year" = seq(1983, 1999, 1), "region" = "eastern_bering_sea", "startmonth" = 5)

# join and create what will become ref_year column
surv_dat <- rbind(ai_dat, ebs_dat) %>% 
  mutate(month_year = paste0(startmonth,"-",year)) %>%
  select(region, month_year) %>%
  distinct() %>%
  mutate(region_month_year = paste0(region,"-",month_year))

# expand out to all possible month*year combinations for joining with temperature
surv_dat_exploded <- expand.grid(month=seq(1, 12, 1), year=seq(1982, 2000, 1), region=c('aleutian_islands','eastern_bering_sea')) %>% # get a factorial combo of every possible month*year; have to start in 1982 even though we can't use surveys before 1983 because we need to match to temperature data from 1982
  mutate(region_month_year = paste0(region,"-",month,"-",year)) %>% # create unique identifier
  mutate(ref_year = ifelse(region_month_year %in% surv_dat$region_month_year, region_month_year, NA),
         month_year = paste0(month,"-",year)) %>% 
  select(region, month_year, ref_year) %>% 
  distinct() %>% 
  group_by(region) %>% 
  fill(ref_year, .direction="up") %>%  # fill in each region with the survey to which env data from each month*year should correspond
  ungroup() 

# make temperature dataset and join in survey ref_year column 
temp_dat <- data.frame(expand.grid(date=seq(ymd("1982-01-01"), ymd("1999-12-31"), "days"), region=c('aleutian_islands','eastern_bering_sea'))) %>% 
  mutate(temperature = rnorm(nrow(.), 10, 5),  # fill in with fake data
         year = year(date),
         month = month(date),
         month_year = paste0(month,"-",year)) %>% 
  left_join(surv_dat_exploded, by=c('region','month_year')) %>% 
  filter(!is.na(ref_year))# get rid of dates that are after any ref_year

试试这个解决方案。

我基本上是使用你的参考列来生成一个 ref_date 并估计观察和参考之间的天数差异。然后,我使用一个简单的 ifelse 来测试日期是否在 365 天范围内,然后将它们复制到 temp_valid 列。


# load packages
library(tidyverse)
library(lubridate)
set.seed=10

# make survey dfs
ai_dat <- data.frame("year" = c(1983, 1986, 1991, 1994, 1997), "region" = "aleutian_islands", "startmonth" = 5)
ebs_dat <- data.frame("year" = seq(1983, 1999, 1), "region" = "eastern_bering_sea", "startmonth" = 5)

# join and create what will become ref_year column
surv_dat <-
  rbind(ai_dat, ebs_dat) %>% 
  mutate(year_month = paste0(year,"-",startmonth),
         region_year_month = paste0(region,"-",year,"-",startmonth)) 


# expand out to all possible month*year combinations for joining with temperature
surv_dat_exploded <-
  expand.grid(month=seq(01, 12, 1), year=seq(1982, 2000, 1), region=c('aleutian_islands','eastern_bering_sea')) %>% # get a factorial combo of every possible month*year; have to start in 1982 even though we can't use surveys before 1983 because we need to match to temperature data from 1982
  mutate(year_month = paste0(year,"-",month)) %>%
  mutate(region_year_month = paste0(region,"-",year,"-",month)) %>% 
  mutate(ref_year = ifelse(region_year_month %in% surv_dat$region_year_month, region_year_month,NA)) %>%
  group_by(region) %>% 
  fill(ref_year, .direction="up") %>%  # fill in each region with the survey to which env data from each month*year should correspond
  ungroup() 

# make temperature dataset and join in survey ref_year column 
temp_dat <- data.frame(expand.grid(date=seq(ymd("1982-01-01"), ymd("1999-12-31"), "days"), region=c('aleutian_islands','eastern_bering_sea'))) %>% 
  mutate(temperature = rnorm(nrow(.), 10, 5),  # fill in with fake data
         year = year(date),
         month = month(date),
         year_month = paste0(year,"-",month))

final_df <- 
  left_join(temp_dat, surv_dat_exploded, by=c('region','year_month')) %>% 
    #split ref_column in ref_year and ref_region
  separate(ref_year, c("ref_region","ref_year"), "-", extra="merge") %>%
    #convert ref_year into date
    mutate_at("ref_year", as.Date, format= "%Y-%M") %>% 
    #round it down to be in the first day of the month (not needed if the day matters)
    mutate_at("ref_year", floor_date, "month" ) %>% 
    #difference between observed and the reference
    mutate(diff_days = date - ref_year) %>% 
    # ifelse statement for capturing values of interest
    mutate(temp_valid = ifelse(between(diff_days, -365, 0),temperature,NA))

听起来你想要一个非 equi 连接。这很容易用 data.table 完成,而且速度非常快。这是一个轻微修改 MWE 的示例:

library(data.table)

# make survey dfs
ai_dat = data.table(year = c(1983, 1986, 1991, 1994, 1997), 
                    region = "aleutian_islands", "startmonth" = 5)
ebs_dat = data.table(year = seq(1983, 1999, 1), 
                     region = "eastern_bering_sea", "startmonth" = 5)

# bind together and create date (and cutoffdate) vars
surv_dat = rbind(ai_dat, ebs_dat)
surv_dat[, startdate := as.IDate(paste(year, startmonth, '01', sep = '-'))
         ][, cutoffdate := startdate - 365L]

# make temperature df
temp_dat = CJ(date=seq(as.IDate("1982-01-01"), as.IDate("1999-12-31"), "days"), 
              region=c('aleutian_islands','eastern_bering_sea'))
# add temperature var
temp_dat$temp = rnorm(nrow(temp_dat))
# create duplicate date variable (will make post-join processing easier)
temp_dat[, matchdate := date]

# Optional: Set keys for better join performance
setkey(surv_dat, region, startdate)
setkey(temp_dat, region, matchdate)

# Where the magic happens: Non-equi join
surv_dat = temp_dat[surv_dat, on = .(region == region, 
                                     matchdate <= startdate, 
                                     matchdate >= cutoffdate)]

# Optional: get rid of unneeded columns
surv_dat[, c('matchdate', 'matchdate.1') := NULL][]
#>             date             region       temp year startmonth
#>    1: 1982-05-01   aleutian_islands  0.3680810 1983          5
#>    2: 1982-05-02   aleutian_islands  0.8349334 1983          5
#>    3: 1982-05-03   aleutian_islands -1.3622227 1983          5
#>    4: 1982-05-04   aleutian_islands  1.4327587 1983          5
#>    5: 1982-05-05   aleutian_islands  0.5068226 1983          5
#>   ---                                                         
#> 8048: 1999-04-27 eastern_bering_sea -1.2924594 1999          5
#> 8049: 1999-04-28 eastern_bering_sea  0.7519078 1999          5
#> 8050: 1999-04-29 eastern_bering_sea -1.0185174 1999          5
#> 8051: 1999-04-30 eastern_bering_sea -1.4322252 1999          5
#> 8052: 1999-05-01 eastern_bering_sea -1.0412836 1999          5

reprex package (v2.0.0)

于 2021-05-20 创建