子集数据以删除不完整月份的数据

Subset data to remove data with incomplete months

我有一个数据框 df,我想将其分成 10-day 个间隔。我只想使用给定年份内具有完整月份的个人(例如,01-01-2011 - 01-31-2021)。我如何过滤数据以仅包含具有“完整”数据的数据?

library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("15-06-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)

df <- data.frame(date = date,
                 x = runif(length(date), min = 60000, max = 80000),
                 y = runif(length(date), min = 800000, max = 900000),
                 ID)



int <- df %>%
  # arrange(ID) %>%   # skipped for readability of result
  mutate(new = floor_date(date, '10 day')) %>%
  mutate(new = if_else(day(new) == 31, new - days(10), new)) %>% 
  group_by(new) %>%
  group_split()

您可能可以按年或按月执行此操作 - 我在这里按月进行。基础数据 - 输入的修改版本:

library(lubridate)
library(tidyverse)
library(ggplot2)

# Build base dataframe
date <- seq.Date(dmy("15-06-2010"), dmy("20-12-2013"), by = "days")
id <- rep(seq(1, 5), each = length(date))
df_raw <- data.frame(date = date,
                 x = runif(length(date), min = 60000, max = 80000),
                 y = runif(length(date), min = 800000, max = 900000),
                 id)

将一些值降低到 'break' 一两年,以便稍后验证代码

set.seed(1234)
drop_rows <- sapply(sample(1:nrow(df_raw), 3), function(i) {
  return(i:(i+100))
}, simplify = FALSE) %>% unlist()

df <- df_raw[-c(drop_rows), ]

查看这些中断后的数据:

ggplot(df, aes(x = date, y = id, col = factor(id), group = id)) +
  geom_point() +
  scale_x_date(breaks = "2 months", date_labels = "%Y-%b") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_vline(xintercept = ymd(paste0(2011:2014, "-01-01")))

看起来有完整的年份:

  • 编号 1: 2011
  • id 2: 2011, 2012
  • 编号 3: 2012
  • id 4: 2011, 2012
  • id 5: 2011, 2012

用 dplyr 分组识别那些跨度:

df %>%
  # Assign a year and month value for simplicity
  mutate(
    yr = year(date),
    mon = month(date),
    ) %>%
  
  # Summarize for a grouping of id, year, and month to get group length
  group_by(id, yr, mon) %>%
  summarise(days_measured_n = n()) %>%
  
  # Compare the actual days in each month to the days recorded
  mutate(
    actual_days_in_mon = lubridate::days_in_month(ymd(paste(yr, mon, "01", sep = "-"))),
    all_days_inc = days_measured_n == actual_days_in_mon
    ) %>%
  
  # Ungroup to base level, then regroup by id and year and see which id~year groupings
  # have ALL TRUE values in the n-days-per-month-recorded == n-days-per-month-expected 
  ungroup() %>%
  group_by(id, yr) %>%
  summarise(
    all_months_correct = all(all_days_inc)
  ) %>%
  filter(all_months_correct)

根据上图预期的输出匹配:

# A tibble: 8 x 3
# Groups:   id [5]
     id    yr all_months_correct
  <int> <dbl> <lgl>             
1     1  2011 TRUE              
2     2  2011 TRUE              
3     2  2012 TRUE              
4     3  2012 TRUE              
5     4  2011 TRUE              
6     4  2012 TRUE              
7     5  2011 TRUE              
8     5  2012 TRUE