使用 R 基于多个条件过滤记录的优雅方式
Elegant way to filter records based on multiple criteria using R
我有一个如下所示的数据框
# Example data: 11 date strings (month/day/year) spread over three subjects.
test_df <- data.frame(
  subject_id = rep(c(1, 2, 3), times = c(3, 4, 4)),
  date_1 = c(
    "01/01/2003", "12/31/2007", "12/30/2008",
    "12/31/2005", "01/01/2007", "01/01/2013", "12/31/2008",
    "03/04/2006", "12/31/2009", "01/01/2015", "01/01/2009"
  )
)
我想做的是
1. 将每个受试者(subject)的日期按升序排列(即组内升序排列)。
2. 根据以下规则删除每个受试者的部分日期记录(年份无关紧要):
2a. 如果受试者的第一条记录是 1 月 1 日,则仅删除其 12 月 31 日的记录。例如:subject_id = 1
2b. 如果受试者的第一条记录是 12 月 31 日,则仅删除其 1 月 1 日的记录。例如:subject_id = 2
2c. 如果受试者的非第一条记录(即从第二条记录到最后一条记录)中同时包含 12 月 31 日和 1 月 1 日,则仅删除其 12 月 31 日的记录。例如:subject_id = 3
我正在尝试以下
# Asker's original attempt, kept as posted (it does not run as written).
sorted <- test_df %>% arrange(date_1,group_by = subject_id) # Question: does this sort the dates within each group?
test_df$month = month(test_df$date_1) # get the month
test_df$day = day(test_df$date_1) # get the day of the month
filter(test_df, month==12 and day == 31) # doesn't work here: `and` is not an R operator (elementwise AND is `&`)
你能帮我看看如何根据我的条件筛选出记录吗?
我希望我的输出如下所示
这不是我写过的最漂亮的代码,但它确实有效。这里依赖于各个过滤器按顺序依次执行;否则,第二个和第三个过滤器会把受试者 2 的记录全部删除。
# Parse the dates, then apply the three removal rules one after another.
# The filters are kept as separate steps on purpose: each one re-reads the
# group's first row (and, for the last rule, the remaining rows) after the
# previous filter has already removed its rows.
test_df %>%
  mutate(
    date_1 = as.Date(as.character(date_1), format = "%m/%d/%Y"),
    month = as.numeric(format(date_1, "%m")),
    day = as.numeric(format(date_1, "%d"))
  ) %>%
  group_by(subject_id) %>%
  arrange(date_1) %>%
  # Rule 2a: first record is Jan 1 -> drop the Dec 31 rows.
  filter(!(month[1] == 1 & day[1] == 1 & month == 12 & day == 31)) %>%
  # Rule 2b: first record is Dec 31 -> drop the Jan 1 rows.
  filter(!(month[1] == 12 & day[1] == 31 & month == 1 & day == 1)) %>%
  # Rule 2c: both Jan 1 and Dec 31 occur after the first record -> drop Dec 31.
  filter(
    !(sum(month[-1] == 1 & day[-1] == 1) > 0 &
      sum(month[-1] == 12 & day[-1] == 31) > 0 &
      month == 12 & day == 31)
  ) %>%
  ungroup() %>%
  arrange(subject_id, date_1)
subject_id date_1 month day
<dbl> <date> <dbl> <dbl>
1 1 2003-01-01 1 1
2 1 2008-12-30 12 30
3 2 2005-12-31 12 31
4 2 2008-12-31 12 31
5 3 2006-03-04 3 4
6 3 2009-01-01 1 1
7 3 2015-01-01 1 1
# Remember the input columns so the helper columns can be dropped at the end.
starting_names <- names(test_df)
test_df %>%
  mutate(date_1 = lubridate::mdy(date_1)) %>%
  group_by(subject_id) %>%
  # arrange() with no columns leaves the row order untouched. Sort by date
  # inside each subject so that "first record" really is the earliest one and
  # the result comes out in ascending date order per subject (for subject 3:
  # 2006-03-04, 2009-01-01, 2015-01-01).
  arrange(date_1, .by_group = TRUE) %>%
  mutate(
    # Month-day key, year ignored.
    without_year = format(date_1, "%m-%d"),
    first_date = first(without_year),
    # TRUE when both Jan 1 and Dec 31 occur after the subject's first record.
    has_both = all(c("01-01", "12-31") %in% tail(without_year, -1))
  ) %>%
  # Rule 2a: first record is Jan 1 -> drop the Dec 31 rows.
  filter(!(first_date == "01-01" & without_year == "12-31")) %>%
  # Rule 2b: first record is Dec 31 -> drop the Jan 1 rows.
  filter(!(first_date == "12-31" & without_year == "01-01")) %>%
  # Rule 2c: neither of the above, but both dates occur later -> drop Dec 31.
  filter(
    !(first_date != "01-01" & first_date != "12-31" &
      has_both & without_year == "12-31")
  ) %>%
  select(all_of(starting_names)) %>%
  ungroup()
给出:
# A tibble: 7 x 2
subject_id date_1
<dbl> <date>
1 1 2003-01-01
2 1 2008-12-30
3 2 2005-12-31
4 2 2008-12-31
5 3 2006-03-04
6 3 2015-01-01
7 3 2009-01-01
您也可以尝试一种以 base R 为主、辅以 lubridate 包的解决方案:
library(lubridate)
# Parse the month/day/year strings into Date values.
test_df[["date_1"]] <- mdy(test_df[["date_1"]])
# Helper key used by the per-subject filter: month number followed by day
# number, without zero padding (Jan 1 -> "11", Dec 31 -> "1231").
test_df[["cntrl"]] <- paste0(
  month(test_df[["date_1"]]),
  day(test_df[["date_1"]])
)
现在的思路是:按 subject_id 把数据框拆分成一个列表,然后用 lapply 对其中每个子数据框应用一个按你的条件进行过滤的函数:
# One data frame per subject (list names are the subject_id values), each
# sorted by date: the filtering step relies on the first row being the
# earliest record.
listed <- lapply(
  split(test_df, test_df$subject_id),
  function(piece) {
    piece[order(piece$date_1), ]
  }
)
# Apply the Jan 1 / Dec 31 removal rules to the rows of one subject.
#
# x: data frame holding a single subject's rows, already sorted by date, with
#    a character column `cntrl` built as paste0(month, day)
#    ("11" = Jan 1, "1231" = Dec 31).
# Returns x with the rows excluded by the matching rule removed; all columns
# are kept. If no rule matches, x is returned unchanged.
filtering <- function(x) {
  # An empty group has no first record; returning early also avoids the
  # "argument is of length zero" error from `if (logical(0))`.
  if (nrow(x) == 0) {
    return(x)
  }
  first_key <- x$cntrl[1]
  later_keys <- x$cntrl[-1]
  if (first_key == "11") {
    # First record is Jan 1: drop every Dec 31 row.
    x[x$cntrl != "1231", ]
  } else if (first_key == "1231") {
    # First record is Dec 31: drop every Jan 1 row.
    x[x$cntrl != "11", ]
  } else if ("11" %in% later_keys && "1231" %in% later_keys) {
    # Both dates occur after the first record: drop every Dec 31 row.
    x[x$cntrl != "1231", ]
  } else {
    x
  }
}
# Run the per-subject filter on every piece of the list.
listed <- lapply(listed, filtering)
# Stitch the pieces back together into a single data frame.
res <- do.call(rbind, listed)
# Drop the helper column by name rather than by position, so the right column
# is removed even if test_df carries additional columns.
res$cntrl <- NULL
# Reset the row names to the default 1..n; unlike 1:nrow(res), this is also
# safe when no rows are left (1:0 would yield c(1, 0)).
rownames(res) <- NULL
res
subject_id date_1
1 1 2003-01-01
2 1 2008-12-30
3 2 2005-12-31
4 2 2008-12-31
5 3 2006-03-04
6 3 2009-01-01
7 3 2015-01-01
我有一个如下所示的数据框
# Example data: 11 date strings (month/day/year) spread over three subjects.
test_df <- data.frame(
  subject_id = rep(c(1, 2, 3), times = c(3, 4, 4)),
  date_1 = c(
    "01/01/2003", "12/31/2007", "12/30/2008",
    "12/31/2005", "01/01/2007", "01/01/2013", "12/31/2008",
    "03/04/2006", "12/31/2009", "01/01/2015", "01/01/2009"
  )
)
我想做的是
1. 将每个受试者(subject)的日期按升序排列(即组内升序排列)。
2. 根据以下规则删除每个受试者的部分日期记录(年份无关紧要):
2a. 如果受试者的第一条记录是 1 月 1 日,则仅删除其 12 月 31 日的记录。例如:subject_id = 1
2b. 如果受试者的第一条记录是 12 月 31 日,则仅删除其 1 月 1 日的记录。例如:subject_id = 2
2c. 如果受试者的非第一条记录(即从第二条记录到最后一条记录)中同时包含 12 月 31 日和 1 月 1 日,则仅删除其 12 月 31 日的记录。例如:subject_id = 3
我正在尝试以下
# Asker's original attempt, kept as posted (it does not run as written).
sorted <- test_df %>% arrange(date_1,group_by = subject_id) # Question: does this sort the dates within each group?
test_df$month = month(test_df$date_1) # get the month
test_df$day = day(test_df$date_1) # get the day of the month
filter(test_df, month==12 and day == 31) # doesn't work here: `and` is not an R operator (elementwise AND is `&`)
你能帮我看看如何根据我的条件筛选出记录吗?
我希望我的输出如下所示
这不是我写过的最漂亮的代码,但它确实有效。这里依赖于各个过滤器按顺序依次执行;否则,第二个和第三个过滤器会把受试者 2 的记录全部删除。
# Parse the dates, then apply the three removal rules one after another.
# The filters are kept as separate steps on purpose: each one re-reads the
# group's first row (and, for the last rule, the remaining rows) after the
# previous filter has already removed its rows.
test_df %>%
  mutate(
    date_1 = as.Date(as.character(date_1), format = "%m/%d/%Y"),
    month = as.numeric(format(date_1, "%m")),
    day = as.numeric(format(date_1, "%d"))
  ) %>%
  group_by(subject_id) %>%
  arrange(date_1) %>%
  # Rule 2a: first record is Jan 1 -> drop the Dec 31 rows.
  filter(!(month[1] == 1 & day[1] == 1 & month == 12 & day == 31)) %>%
  # Rule 2b: first record is Dec 31 -> drop the Jan 1 rows.
  filter(!(month[1] == 12 & day[1] == 31 & month == 1 & day == 1)) %>%
  # Rule 2c: both Jan 1 and Dec 31 occur after the first record -> drop Dec 31.
  filter(
    !(sum(month[-1] == 1 & day[-1] == 1) > 0 &
      sum(month[-1] == 12 & day[-1] == 31) > 0 &
      month == 12 & day == 31)
  ) %>%
  ungroup() %>%
  arrange(subject_id, date_1)
subject_id date_1 month day
<dbl> <date> <dbl> <dbl>
1 1 2003-01-01 1 1
2 1 2008-12-30 12 30
3 2 2005-12-31 12 31
4 2 2008-12-31 12 31
5 3 2006-03-04 3 4
6 3 2009-01-01 1 1
7 3 2015-01-01 1 1
# Remember the input columns so the helper columns can be dropped at the end.
starting_names <- names(test_df)
test_df %>%
  mutate(date_1 = lubridate::mdy(date_1)) %>%
  group_by(subject_id) %>%
  # arrange() with no columns leaves the row order untouched. Sort by date
  # inside each subject so that "first record" really is the earliest one and
  # the result comes out in ascending date order per subject (for subject 3:
  # 2006-03-04, 2009-01-01, 2015-01-01).
  arrange(date_1, .by_group = TRUE) %>%
  mutate(
    # Month-day key, year ignored.
    without_year = format(date_1, "%m-%d"),
    first_date = first(without_year),
    # TRUE when both Jan 1 and Dec 31 occur after the subject's first record.
    has_both = all(c("01-01", "12-31") %in% tail(without_year, -1))
  ) %>%
  # Rule 2a: first record is Jan 1 -> drop the Dec 31 rows.
  filter(!(first_date == "01-01" & without_year == "12-31")) %>%
  # Rule 2b: first record is Dec 31 -> drop the Jan 1 rows.
  filter(!(first_date == "12-31" & without_year == "01-01")) %>%
  # Rule 2c: neither of the above, but both dates occur later -> drop Dec 31.
  filter(
    !(first_date != "01-01" & first_date != "12-31" &
      has_both & without_year == "12-31")
  ) %>%
  select(all_of(starting_names)) %>%
  ungroup()
给出:
# A tibble: 7 x 2
subject_id date_1
<dbl> <date>
1 1 2003-01-01
2 1 2008-12-30
3 2 2005-12-31
4 2 2008-12-31
5 3 2006-03-04
6 3 2015-01-01
7 3 2009-01-01
您也可以尝试一种以 base R 为主、辅以 lubridate 包的解决方案:
library(lubridate)
# Parse the month/day/year strings into Date values.
test_df[["date_1"]] <- mdy(test_df[["date_1"]])
# Helper key used by the per-subject filter: month number followed by day
# number, without zero padding (Jan 1 -> "11", Dec 31 -> "1231").
test_df[["cntrl"]] <- paste0(
  month(test_df[["date_1"]]),
  day(test_df[["date_1"]])
)
现在的思路是:按 subject_id 把数据框拆分成一个列表,然后用 lapply 对其中每个子数据框应用一个按你的条件进行过滤的函数:
# One data frame per subject (list names are the subject_id values), each
# sorted by date: the filtering step relies on the first row being the
# earliest record.
listed <- lapply(
  split(test_df, test_df$subject_id),
  function(piece) {
    piece[order(piece$date_1), ]
  }
)
# Apply the Jan 1 / Dec 31 removal rules to the rows of one subject.
#
# x: data frame holding a single subject's rows, already sorted by date, with
#    a character column `cntrl` built as paste0(month, day)
#    ("11" = Jan 1, "1231" = Dec 31).
# Returns x with the rows excluded by the matching rule removed; all columns
# are kept. If no rule matches, x is returned unchanged.
filtering <- function(x) {
  # An empty group has no first record; returning early also avoids the
  # "argument is of length zero" error from `if (logical(0))`.
  if (nrow(x) == 0) {
    return(x)
  }
  first_key <- x$cntrl[1]
  later_keys <- x$cntrl[-1]
  if (first_key == "11") {
    # First record is Jan 1: drop every Dec 31 row.
    x[x$cntrl != "1231", ]
  } else if (first_key == "1231") {
    # First record is Dec 31: drop every Jan 1 row.
    x[x$cntrl != "11", ]
  } else if ("11" %in% later_keys && "1231" %in% later_keys) {
    # Both dates occur after the first record: drop every Dec 31 row.
    x[x$cntrl != "1231", ]
  } else {
    x
  }
}
# Run the per-subject filter on every piece of the list.
listed <- lapply(listed, filtering)
# Stitch the pieces back together into a single data frame.
res <- do.call(rbind, listed)
# Drop the helper column by name rather than by position, so the right column
# is removed even if test_df carries additional columns.
res$cntrl <- NULL
# Reset the row names to the default 1..n; unlike 1:nrow(res), this is also
# safe when no rows are left (1:0 would yield c(1, 0)).
rownames(res) <- NULL
res
subject_id date_1
1 1 2003-01-01
2 1 2008-12-30
3 2 2005-12-31
4 2 2008-12-31
5 3 2006-03-04
6 3 2009-01-01
7 3 2015-01-01