子集数据以删除不完整月份的数据
Subset data to remove data with incomplete months
我有一个数据框 df
,我想将其分成 10-day
个间隔。我只想使用给定年份内具有完整月份的个人(例如,01-01-2011 - 01-31-2021)。我如何过滤数据以仅包含具有“完整”数据的数据?
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("15-06-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
int <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(new) %>%
group_split()
您可能可以按年或按月执行此操作 - 我在这里按月进行。基础数据 - 输入的修改版本:
library(lubridate)
library(tidyverse)
library(ggplot2)
# Build base dataframe
date <- seq.Date(dmy("15-06-2010"), dmy("20-12-2013"), by = "days")
id <- rep(seq(1, 5), each = length(date))
df_raw <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
id)
将一些值降低到 'break' 一两年,以便稍后验证代码
set.seed(1234)
drop_rows <- sapply(sample(1:nrow(df_raw), 3), function(i) {
return(i:(i+100))
}, simplify = FALSE) %>% unlist()
df <- df_raw[-c(drop_rows), ]
查看这些中断后的数据:
ggplot(df, aes(x = date, y = id, col = factor(id), group = id)) +
geom_point() +
scale_x_date(breaks = "2 months", date_labels = "%Y-%b") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_vline(xintercept = ymd(paste0(2011:2014, "-01-01")))
看起来有完整的年份:
- 编号 1: 2011
- id 2: 2011, 2012
- 编号 3: 2012
- id 4: 2011, 2012
- id 5: 2011, 2012
用 dplyr 分组识别那些跨度:
df %>%
# Assign a year and month value for simplicity
mutate(
yr = year(date),
mon = month(date),
) %>%
# Summarize for a grouping of id, year, and month to get group length
group_by(id, yr, mon) %>%
summarise(days_measured_n = n()) %>%
# Compare the actual days in each month to the days recorded
mutate(
actual_days_in_mon = lubridate::days_in_month(ymd(paste(yr, mon, "01", sep = "-"))),
all_days_inc = days_measured_n == actual_days_in_mon
) %>%
# Ungroup to base level, then regroup by id and year and see which id~year groupings
# have ALL TRUE values in the n-days-per-month-recorded == n-days-per-month-expected
ungroup() %>%
group_by(id, yr) %>%
summarise(
all_months_correct = all(all_days_inc)
) %>%
filter(all_months_correct)
根据上图预期的输出匹配:
# A tibble: 8 x 3
# Groups: id [5]
id yr all_months_correct
<int> <dbl> <lgl>
1 1 2011 TRUE
2 2 2011 TRUE
3 2 2012 TRUE
4 3 2012 TRUE
5 4 2011 TRUE
6 4 2012 TRUE
7 5 2011 TRUE
8 5 2012 TRUE
我有一个数据框 df
,我想将其分成 10-day
个间隔。我只想使用给定年份内具有完整月份的个人(例如,01-01-2011 - 01-31-2021)。我如何过滤数据以仅包含具有“完整”数据的数据?
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("15-06-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
int <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(new) %>%
group_split()
您可能可以按年或按月执行此操作 - 我在这里按月进行。基础数据 - 输入的修改版本:
library(lubridate)
library(tidyverse)
library(ggplot2)
# Build base dataframe
date <- seq.Date(dmy("15-06-2010"), dmy("20-12-2013"), by = "days")
id <- rep(seq(1, 5), each = length(date))
df_raw <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
id)
将一些值降低到 'break' 一两年,以便稍后验证代码
set.seed(1234)
drop_rows <- sapply(sample(1:nrow(df_raw), 3), function(i) {
return(i:(i+100))
}, simplify = FALSE) %>% unlist()
df <- df_raw[-c(drop_rows), ]
查看这些中断后的数据:
ggplot(df, aes(x = date, y = id, col = factor(id), group = id)) +
geom_point() +
scale_x_date(breaks = "2 months", date_labels = "%Y-%b") +
theme_classic() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_vline(xintercept = ymd(paste0(2011:2014, "-01-01")))
看起来有完整的年份:
- 编号 1: 2011
- id 2: 2011, 2012
- 编号 3: 2012
- id 4: 2011, 2012
- id 5: 2011, 2012
用 dplyr 分组识别那些跨度:
df %>%
# Assign a year and month value for simplicity
mutate(
yr = year(date),
mon = month(date),
) %>%
# Summarize for a grouping of id, year, and month to get group length
group_by(id, yr, mon) %>%
summarise(days_measured_n = n()) %>%
# Compare the actual days in each month to the days recorded
mutate(
actual_days_in_mon = lubridate::days_in_month(ymd(paste(yr, mon, "01", sep = "-"))),
all_days_inc = days_measured_n == actual_days_in_mon
) %>%
# Ungroup to base level, then regroup by id and year and see which id~year groupings
# have ALL TRUE values in the n-days-per-month-recorded == n-days-per-month-expected
ungroup() %>%
group_by(id, yr) %>%
summarise(
all_months_correct = all(all_days_inc)
) %>%
filter(all_months_correct)
根据上图预期的输出匹配:
# A tibble: 8 x 3
# Groups: id [5]
id yr all_months_correct
<int> <dbl> <lgl>
1 1 2011 TRUE
2 2 2011 TRUE
3 2 2012 TRUE
4 3 2012 TRUE
5 4 2011 TRUE
6 4 2012 TRUE
7 5 2011 TRUE
8 5 2012 TRUE