在 R 中创建条件总和(基于日期)作为数据框的新列
Creating conditional sums (based on dates) as new column of data frame in R
我正在尝试在 R 中进行一些特征工程。假设我有以下数据框:
# Sample data: one row per hospital event, holding the patient id, the event
# date and the event type ("AnE" or "Inpatient").
events <- data.frame(
  patient = c("A", "A", "A", "A", "B", "B", "B"),
  date = as.Date(c(
    "2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
    "2017-12-12", "2017-12-12", "2018-02-01"
  )),
  type = c(
    "AnE", "Inpatient", "Inpatient", "Inpatient", "AnE", "AnE",
    "Inpatient"
  )
)
我现在想添加一个列,其中包含过去 30 天内来自同一患者的 "Inpatient" 事件的总和。
有没有直接的方法(不涉及 for 循环)?
根据您的数据集,我会先创建一些辅助变量,然后使用 data.table 的方法来计算。
首先,我为每一行添加了同一患者上一次事件的日期(date_t1)。然后,按患者以及"上一次事件日期是否在当前日期之前 30 天以内"进行分组,累计 "Inpatient" 在数据集中出现的次数。
library(data.table)
# Sample data as a data.table: one row per hospital event.
events <- data.table(
  patient = c("A", "A", "A", "A", "B", "B", "B"),
  date = as.Date(c(
    "2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
    "2017-12-12", "2017-12-12", "2018-02-01"
  )),
  type = c(
    "AnE", "Inpatient", "Inpatient", "Inpatient", "AnE", "AnE",
    "Inpatient"
  )
)

# Sort by date, then regroup so each patient's rows are contiguous and in
# chronological order.
events <- events[order(date), .SD, by = patient]

# Date of the previous event of the same patient (NA for the first event).
# shift() is data.table's lag operator; a bare lag() would resolve to
# stats::lag() when only data.table is attached, and that does not shift
# the values.
events[, date_t1 := shift(date), by = patient]

# Running count of "Inpatient" events, restarted per patient and per value of
# the flag "previous event is less than 30 days before this one".
# NOTE(review): this grouping is not a true rolling 30-day window -- all rows
# of a patient sharing the same flag value are accumulated together. Confirm
# on data with longer histories.
events[,
  timesInpatient := cumsum(type == "Inpatient"),
  by = .(patient, date_t1 > date - 30)
]
结果是这样的
   patient       date      type    date_t1 timesInpatient
1: B 2017-12-12 AnE <NA> 0
2: B 2017-12-12 AnE 2017-12-12 0
3: B 2018-02-01 Inpatient 2017-12-12 1
4: A 2017-12-15 AnE <NA> 0
5: A 2018-01-09 Inpatient 2017-12-15 1
6: A 2018-01-31 Inpatient 2018-01-09 2
7: A 2018-02-05 Inpatient 2018-01-31 3
这可能不如 data.table 方法简洁,但您可以使用 lubridate 包中的 interval() 和 %within%。
这是它们如何工作的示例:
# %within% is exported by lubridate, so the package has to be attached.
library(lubridate)

# creating a span (interval) object and a vector of dates
span <- lubridate::interval("2018-01-01", "2018-01-30")
dates <- as.Date(c("2018-01-01", "2018-01-30", "2018-01-03", "2018-02-01"))
dates %within% span
#> [1]  TRUE  TRUE  TRUE FALSE

# adding a vector indicating inpatient visits
inpatient_visit <- c(TRUE, FALSE, TRUE, FALSE)

# counting dates that both fall within the span and are inpatient visits
sum(dates %within% span & inpatient_visit)
#> [1] 2
然后您可以使用 split-apply-combine 方法(使用 split 和 purrr::map_df),为数据集中的每个患者重复此计数过程:
library(dplyr)
library(lubridate)
# Sample data: one row per hospital event, holding the patient id, the event
# date and the event type ("AnE" or "Inpatient").
events <- data.frame(
  patient = rep(c("A", "B"), times = c(4, 3)),
  date = as.Date(c(
    "2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
    "2017-12-12", "2017-12-12", "2018-02-01"
  )),
  type = c("AnE", rep("Inpatient", 3), "AnE", "AnE", "Inpatient")
)
#' Count inpatient events inside each row's look-back window
#'
#' For every row of `df`, counts how many rows of the same `df` are flagged
#' as inpatient and have a `date` that falls within that row's `span`.
#'
#' @param df A data frame with columns `date`, `inpatient` (logical) and
#'   `span` (one window per row, usable on the right-hand side of
#'   `%within%`). It is expected to hold the events of a single patient.
#' @return `df` with an added integer column `count`.
count_visits <- function(df) {
  # Iterate over row indices and subset `span` with `[` so each window keeps
  # its class; vapply() guarantees a plain integer vector rather than the
  # list column an untyped map() would produce.
  df$count <- vapply(
    seq_len(nrow(df)),
    function(i) sum(df$date %within% df$span[i] & df$inpatient),
    integer(1)
  )
  df
}
# Flag inpatient events, attach a 30-day look-back window ending on each
# event's date, then count per patient (split-apply-combine) and tidy up.
# map_df is namespace-qualified because purrr is not attached by the
# library() calls of this snippet.
events <- events %>%
  mutate(
    inpatient = type == "Inpatient",
    span = interval(date - days(30), date)
  ) %>%
  split(.$patient) %>%
  purrr::map_df(count_visits) %>%
  select(-inpatient, -span) %>%
  arrange(date)
events
patient date type count
1 B 2017-12-12 AnE 0
2 B 2017-12-12 AnE 0
3 A 2017-12-15 AnE 0
4 A 2018-01-09 Inpatient 1
5 A 2018-01-31 Inpatient 2
6 B 2018-02-01 Inpatient 1
7 A 2018-02-05 Inpatient 3
我正在尝试在 R 中进行一些特征工程。假设我有以下数据框:
# Sample data: one row per hospital event, holding the patient id, the event
# date and the event type ("AnE" or "Inpatient").
events <- data.frame(
  patient = c("A", "A", "A", "A", "B", "B", "B"),
  date = as.Date(c(
    "2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
    "2017-12-12", "2017-12-12", "2018-02-01"
  )),
  type = c(
    "AnE", "Inpatient", "Inpatient", "Inpatient", "AnE", "AnE",
    "Inpatient"
  )
)
我现在想添加一个列,其中包含过去 30 天内来自同一患者的 "Inpatient" 事件的总和。
有没有直接的方法(不涉及 for 循环)?
根据您的数据集,我会先创建一些辅助变量,然后使用 data.table 的方法来计算。
首先,我为每一行添加了同一患者上一次事件的日期(date_t1)。然后,按患者以及"上一次事件日期是否在当前日期之前 30 天以内"进行分组,累计 "Inpatient" 在数据集中出现的次数。
library(data.table)
# Sample data as a data.table: one row per hospital event.
events <- data.table(
  patient = c("A", "A", "A", "A", "B", "B", "B"),
  date = as.Date(c(
    "2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
    "2017-12-12", "2017-12-12", "2018-02-01"
  )),
  type = c(
    "AnE", "Inpatient", "Inpatient", "Inpatient", "AnE", "AnE",
    "Inpatient"
  )
)

# Sort by date, then regroup so each patient's rows are contiguous and in
# chronological order.
events <- events[order(date), .SD, by = patient]

# Date of the previous event of the same patient (NA for the first event).
# shift() is data.table's lag operator; a bare lag() would resolve to
# stats::lag() when only data.table is attached, and that does not shift
# the values.
events[, date_t1 := shift(date), by = patient]

# Running count of "Inpatient" events, restarted per patient and per value of
# the flag "previous event is less than 30 days before this one".
# NOTE(review): this grouping is not a true rolling 30-day window -- all rows
# of a patient sharing the same flag value are accumulated together. Confirm
# on data with longer histories.
events[,
  timesInpatient := cumsum(type == "Inpatient"),
  by = .(patient, date_t1 > date - 30)
]
结果是这样的
   patient       date      type    date_t1 timesInpatient
1: B 2017-12-12 AnE <NA> 0
2: B 2017-12-12 AnE 2017-12-12 0
3: B 2018-02-01 Inpatient 2017-12-12 1
4: A 2017-12-15 AnE <NA> 0
5: A 2018-01-09 Inpatient 2017-12-15 1
6: A 2018-01-31 Inpatient 2018-01-09 2
7: A 2018-02-05 Inpatient 2018-01-31 3
这可能不如 data.table 方法简洁,但您可以使用 lubridate 包中的 interval() 和 %within%。
这是它们如何工作的示例:
# %within% is exported by lubridate, so the package has to be attached.
library(lubridate)

# creating a span (interval) object and a vector of dates
span <- lubridate::interval("2018-01-01", "2018-01-30")
dates <- as.Date(c("2018-01-01", "2018-01-30", "2018-01-03", "2018-02-01"))
dates %within% span
#> [1]  TRUE  TRUE  TRUE FALSE

# adding a vector indicating inpatient visits
inpatient_visit <- c(TRUE, FALSE, TRUE, FALSE)

# counting dates that both fall within the span and are inpatient visits
sum(dates %within% span & inpatient_visit)
#> [1] 2
然后您可以使用 split-apply-combine 方法(使用 split 和 purrr::map_df),为数据集中的每个患者重复此计数过程:
library(dplyr)
library(lubridate)
# Sample data: one row per hospital event, holding the patient id, the event
# date and the event type ("AnE" or "Inpatient").
events <- data.frame(
  patient = rep(c("A", "B"), times = c(4, 3)),
  date = as.Date(c(
    "2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
    "2017-12-12", "2017-12-12", "2018-02-01"
  )),
  type = c("AnE", rep("Inpatient", 3), "AnE", "AnE", "Inpatient")
)
#' Count inpatient events inside each row's look-back window
#'
#' For every row of `df`, counts how many rows of the same `df` are flagged
#' as inpatient and have a `date` that falls within that row's `span`.
#'
#' @param df A data frame with columns `date`, `inpatient` (logical) and
#'   `span` (one window per row, usable on the right-hand side of
#'   `%within%`). It is expected to hold the events of a single patient.
#' @return `df` with an added integer column `count`.
count_visits <- function(df) {
  # Iterate over row indices and subset `span` with `[` so each window keeps
  # its class; vapply() guarantees a plain integer vector rather than the
  # list column an untyped map() would produce.
  df$count <- vapply(
    seq_len(nrow(df)),
    function(i) sum(df$date %within% df$span[i] & df$inpatient),
    integer(1)
  )
  df
}
# Flag inpatient events, attach a 30-day look-back window ending on each
# event's date, then count per patient (split-apply-combine) and tidy up.
# map_df is namespace-qualified because purrr is not attached by the
# library() calls of this snippet.
events <- events %>%
  mutate(
    inpatient = type == "Inpatient",
    span = interval(date - days(30), date)
  ) %>%
  split(.$patient) %>%
  purrr::map_df(count_visits) %>%
  select(-inpatient, -span) %>%
  arrange(date)
events
patient date type count
1 B 2017-12-12 AnE 0
2 B 2017-12-12 AnE 0
3 A 2017-12-15 AnE 0
4 A 2018-01-09 Inpatient 1
5 A 2018-01-31 Inpatient 2
6 B 2018-02-01 Inpatient 1
7 A 2018-02-05 Inpatient 3