r:根据时间重新排列数据帧
r: rearrangement of data frame based on time
我有一个大型药物使用数据库:
library(data.table)
# Example prescription records: one row per drug period (DrugStart to DrugEnd).
# IndexDate and CensorDate repeat on every row of the same ID.
df <- data.frame(
  ID = rep(c(1, 2, 3), times = c(4, 3, 2)),
  IndexDate = rep(
    c("2019-01-01", "2019-05-01", "2019-07-01"),
    times = c(4, 3, 2)
  ),
  CensorDate = rep(
    c("2019-06-30", "2019-07-30", "2019-12-31"),
    times = c(4, 3, 2)
  ),
  DrugStart = c(
    "2019-02-01", "2019-03-01", "2019-04-01", "2019-06-01",
    "2019-03-01", "2019-04-15", "2019-05-16",
    "2019-07-05", "2020-01-01"
  ),
  DrugEnd = c(
    "2019-02-15", "2019-04-15", "2019-04-30", "2019-06-05",
    "2019-03-15", "2019-05-15", "2019-05-30",
    "2019-07-15", "2020-01-15"
  ),
  Notes = c(
    "", "", "Overlap 15 days", "",
    "All days before IndexDate", "15 days before IndexDate", "",
    "", "15 days after CensorDate"
  )
)
df
ID IndexDate CensorDate DrugStart DrugEnd Notes
1 1 2019-01-01 2019-06-30 2019-02-01 2019-02-15
2 1 2019-01-01 2019-06-30 2019-03-01 2019-04-15
3 1 2019-01-01 2019-06-30 2019-04-01 2019-04-30 Overlap 15 days
4 1 2019-01-01 2019-06-30 2019-06-01 2019-06-05
5 2 2019-05-01 2019-07-30 2019-03-01 2019-03-15 All days before IndexDate
6 2 2019-05-01 2019-07-30 2019-04-15 2019-05-15 15 days before IndexDate
7 2 2019-05-01 2019-07-30 2019-05-16 2019-05-30
8 3 2019-07-01 2019-12-31 2019-07-05 2019-07-15
9 3 2019-07-01 2019-12-31 2020-01-01 2020-01-15 15 days after CensorDate
对于每个 ID,IndexDate 和 CensorDate 都是相同的。观察期为 IndexDate 至 CensorDate。
我想按照以下标准重新排列它:
- 按 ID 关联;
- 忽略 IndexDate 之前或 CensorDate 之后的天数;
- 重叠的时间段只计算一次;
df 是一个用药记录数据库。df 中的所有时间段(从 DrugStart 到 DrugEnd)都表示正在用药;未在 df 中出现、但位于观察期(IndexDate 至 CensorDate)内的时间段,则表示未用药。
- 药物使用标记为 2(使用)和 1(不使用);
IndexDate 定义为第 0 天(即每个 ID 的 TimeStart 都从 0 开始)。
我期望结果如下:
> df2 <- data.frame("ID" = c(1,1,1,1,1,1,1,2,2,3,3,3), "TimeStart" = c("0", "31", "46", "59", "120", "151", "156", "0", "30", "0", "4", "15"), "TimeEnd" = c("30", "45", "58", "119", "150", "155", "180", "29", "90", "3", "14", "183"), "DrugUse" = c("1", "2", "1", "2", "1", "2", "1", "2", "1", "1", "2", "1"))
> df2
ID TimeStart TimeEnd DrugUse
1 1 0 30 1
2 1 31 45 2
3 1 46 58 1
4 1 59 119 2
5 1 120 150 1
6 1 151 155 2
7 1 156 180 1
8 2 0 29 2
9 2 30 90 1
10 3 0 3 1
11 3 4 14 2
12 3 15 183 1
现在,我知道如何通过“DrugStart - IndexDate”和“DrugEnd - IndexDate”生成 TimeStart 和 TimeEnd,如下:
# Offsets of each drug period from IndexDate. Subtracting one Date from another
# gives a difftime, which is why the columns print as "31 days", "45 days", ...
df[["TimeStart"]] <- as.Date(df[["DrugStart"]], format = "%Y-%m-%d") -
  as.Date(df[["IndexDate"]], format = "%Y-%m-%d")
df[["TimeEnd"]] <- as.Date(df[["DrugEnd"]], format = "%Y-%m-%d") -
  as.Date(df[["IndexDate"]], format = "%Y-%m-%d")
df
ID IndexDate CensorDate DrugStart DrugEnd Notes_Drug.use.days TimeStart TimeEnd
1 1 2019-01-01 2019-06-30 2019-02-01 2019-02-15 15days 31 days 45 days
2 1 2019-01-01 2019-06-30 2019-03-01 2019-04-15 46days 59 days 104 days
3 1 2019-01-01 2019-06-30 2019-04-01 2019-04-30 Overlap 15days + 15days 90 days 119 days
4 1 2019-01-01 2019-06-30 2019-06-01 2019-06-05 5days 151 days 155 days
5 2 2019-05-01 2019-07-30 2019-03-01 2019-03-15 15days before IndexDate -61 days -47 days
6 2 2019-05-01 2019-07-30 2019-04-15 2019-05-15 15days before IndexDate+15days -16 days 14 days
7 2 2019-05-01 2019-07-30 2019-05-16 2019-05-30 15days 15 days 29 days
8 3 2019-07-01 2019-12-31 2019-07-05 2019-07-15 11days 4 days 14 days
9 3 2019-07-01 2019-12-31 2020-01-01 2020-01-15 15days after CensorDate 184 days 198 days
但我不知道如何处理重叠期和那些连续期,如下:
# Overlapped periods:
# Transform
ID TimeStart TimeEnd
2 1 59 days 104 days
3 1 90 days 119 days
# to
ID TimeStart TimeEnd
2 1 59 days 119 days
# And Continuous periods:
# Transform
ID TimeStart TimeEnd
6 2 -16 days 14 days
7 2 15 days 29 days
# To
ID TimeStart TimeEnd
6 2 0 days 29 days
另外,如何添加那些未用药的时间段(即 DrugUse = 1 的行)?例如以下这些行:
ID TimeStart TimeEnd DrugUse
1 1 0 30 1
3 1 46 58 1
5 1 120 150 1
7 1 156 180 1
9 2 30 90 1
10 3 0 3 1
12 3 15 183 1
有人帮我吗?非常感谢!
########################################### ########
更新:
谢谢 Bas 的回答!!我对 Bas 的回答做了一些小修改。以下代码可能是最终版本!!
library(data.table)
library(dplyr)
library(tidyr)
library(lubridate)

# Example prescription records: one row per drug period (DrugStart to DrugEnd).
# IndexDate and CensorDate repeat on every row of the same ID.
df <- data.frame(
  "ID" = c(1, 1, 1, 1, 2, 2, 2, 3, 3),
  "IndexDate" = c(
    "2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01",
    "2019-05-01", "2019-05-01", "2019-05-01",
    "2019-07-01", "2019-07-01"
  ),
  "CensorDate" = c(
    "2019-06-30", "2019-06-30", "2019-06-30", "2019-06-30",
    "2019-07-30", "2019-07-30", "2019-07-30",
    "2019-12-31", "2019-12-31"
  ),
  "DrugStart" = c(
    "2019-02-01", "2019-03-01", "2019-04-01", "2019-06-01",
    "2019-03-01", "2019-04-15", "2019-05-16",
    "2019-07-05", "2020-01-01"
  ),
  "DrugEnd" = c(
    "2019-02-15", "2019-04-15", "2019-04-30", "2019-06-05",
    "2019-03-15", "2019-05-15", "2019-05-30",
    "2019-07-15", "2020-01-15"
  ),
  "Notes" = c(
    "", "", "Overlap 15 days", "",
    "All days before IndexDate", "15 days before IndexDate", "",
    "", "15 days after CensorDate"
  )
)

# Switch to half-open periods [start, end): pushing both inclusive end dates
# forward by one day makes a prescription that ends on day d and one that
# starts on day d + 1 share a breakpoint, so they merge into a single period.
df$DrugEnd <- as.Date(df$DrugEnd, format = "%Y-%m-%d") + 1
df$CensorDate <- as.Date(df$CensorDate, format = "%Y-%m-%d") + 1

df2 <- df %>%
  mutate(across(IndexDate:DrugEnd, as.Date)) %>%
  group_by(ID) %>%
  # Cut each ID's observation window at every drug start/end date. Dates are
  # clamped to [IndexDate, CensorDate] instead of filtering rows out, so an ID
  # whose drug periods all lie outside the window still yields one "not using"
  # row rather than vanishing from the result.
  mutate(
    interval = list(int_diff(sort(unique(
      pmin(
        pmax(c(IndexDate, CensorDate, DrugStart, DrugEnd), first(IndexDate)),
        first(CensorDate)
      )
    ))))
  ) %>%
  # One row per (drug period, window piece) pair.
  unnest(interval) %>%
  # Half-open overlap test between this row's drug period and the piece.
  mutate(DrugUse = DrugStart < int_end(interval) & DrugEnd > int_start(interval)) %>%
  group_by(ID, interval) %>%
  # A piece is "use" (2) if any drug period overlaps it, otherwise "no use" (1).
  # na.rm = TRUE lets rows with missing drug dates count as "no overlap".
  summarise(
    IndexDate = first(IndexDate),
    DrugUse = if_else(any(DrugUse, na.rm = TRUE), 2, 1),
    .groups = "drop"
  ) %>%
  # Day offsets from IndexDate; the "- 1" converts the exclusive end back to an
  # inclusive last day.
  mutate(
    TimeStart = as.numeric(difftime(int_start(interval), IndexDate, units = "days")),
    TimeEnd = as.numeric(difftime(int_end(interval), IndexDate, units = "days")) - 1
  ) %>%
  # Collapse consecutive pieces that have the same DrugUse value within an ID.
  group_by(ID, data.table::rleid(DrugUse)) %>%
  summarise(
    TimeStart = min(TimeStart),
    TimeEnd = max(TimeEnd),
    DrugUse = first(DrugUse),
    .groups = "drop_last" # result stays grouped by ID, as before
  ) %>%
  select(ID, TimeStart, TimeEnd, DrugUse)
> df2
# A tibble: 12 x 4
# Groups: ID [3]
ID TimeStart TimeEnd DrugUse
<dbl> <dbl> <dbl> <dbl>
1 1 0 30 1
2 1 31 45 2
3 1 46 58 1
4 1 59 119 2
5 1 120 150 1
6 1 151 155 2
7 1 156 180 1
8 2 0 29 2
9 2 30 90 1
10 3 0 3 1
11 3 4 14 2
12 3 15 183 1
########################################### ########
第二次更新:
如果您的数据集太大(例如,超过一百万条记录),使用上述代码可能会非常慢。unnest() 之后的数据会变得特别大,这一步很慢。
在这种情况下,可以使用 split() 将数据拆分成若干份(每份最好不要超过 10000 条记录),通过循环语法(for(i in sequence){statement})逐份运行,然后使用 rbind() 合并结果。
祝你好运!
使用 dplyr、tidyr 和 lubridate,下面的代码可以得到接近但不完全符合预期的结果:
# Split each ID's timeline at every distinct date, flag the pieces covered by a
# drug period, then merge neighbouring pieces with the same flag.
df %>%
  mutate(across(IndexDate:DrugEnd, as.Date)) %>%
  filter(
    DrugStart <= CensorDate, # Neglect days before IndexDate or after CensorDate
    DrugEnd >= IndexDate
  ) %>%
  group_by(ID) %>%
  # Every distinct date of the ID becomes a breakpoint; int_diff() turns the
  # sorted breakpoints into consecutive intervals.
  mutate(
    interval = list(
      int_diff(sort(unique(c(IndexDate, CensorDate, DrugStart, DrugEnd))))
    )
  ) %>%
  unnest(interval) %>%
  # TRUE when this row's drug period overlaps the interval.
  mutate(
    DrugUse = DrugStart < int_end(interval) & DrugEnd > int_start(interval)
  ) %>%
  group_by(ID, interval) %>%
  # 2 when at least one drug period overlaps the interval, otherwise 1.
  summarise(
    IndexDate = first(IndexDate),
    CensorDate = first(CensorDate),
    DrugUse = if_else(sum(DrugUse) > 0, 2, 1)
  ) %>%
  ungroup() %>%
  # Keep only intervals lying inside the observation period.
  filter(
    int_end(interval) <= CensorDate,
    int_start(interval) >= IndexDate
  ) %>%
  # Both ends are measured from IndexDate, so one row's TimeEnd equals the
  # next row's TimeStart.
  mutate(
    TimeStart = as.numeric(
      difftime(int_start(interval), IndexDate, units = "days")
    ),
    TimeEnd = as.numeric(
      difftime(int_end(interval), IndexDate, units = "days")
    )
  ) %>%
  # Consecutive intervals with the same DrugUse share a run id and collapse.
  group_by(ID, data.table::rleid(DrugUse)) %>%
  summarise(
    TimeStart = min(TimeStart),
    TimeEnd = max(TimeEnd),
    DrugUse = first(DrugUse)
  ) %>%
  select(ID, TimeStart, TimeEnd, DrugUse)
这给出了
ID TimeStart TimeEnd DrugUse
<dbl> <dbl> <dbl> <dbl>
1 1 0 31 1
2 1 31 45 2
3 1 45 59 1
4 1 59 119 2
5 1 119 151 1
6 1 151 155 2
7 1 155 180 1
8 2 0 14 2
9 2 14 15 1
10 2 15 29 2
11 2 29 90 1
12 3 0 4 1
13 3 4 14 2
14 3 14 183 1
我有一个大型药物使用数据库:
library(data.table)
# Example prescription records: one row per drug period (DrugStart to DrugEnd).
# IndexDate and CensorDate repeat on every row of the same ID.
df <- data.frame(
  ID = rep(c(1, 2, 3), times = c(4, 3, 2)),
  IndexDate = rep(
    c("2019-01-01", "2019-05-01", "2019-07-01"),
    times = c(4, 3, 2)
  ),
  CensorDate = rep(
    c("2019-06-30", "2019-07-30", "2019-12-31"),
    times = c(4, 3, 2)
  ),
  DrugStart = c(
    "2019-02-01", "2019-03-01", "2019-04-01", "2019-06-01",
    "2019-03-01", "2019-04-15", "2019-05-16",
    "2019-07-05", "2020-01-01"
  ),
  DrugEnd = c(
    "2019-02-15", "2019-04-15", "2019-04-30", "2019-06-05",
    "2019-03-15", "2019-05-15", "2019-05-30",
    "2019-07-15", "2020-01-15"
  ),
  Notes = c(
    "", "", "Overlap 15 days", "",
    "All days before IndexDate", "15 days before IndexDate", "",
    "", "15 days after CensorDate"
  )
)
df
ID IndexDate CensorDate DrugStart DrugEnd Notes
1 1 2019-01-01 2019-06-30 2019-02-01 2019-02-15
2 1 2019-01-01 2019-06-30 2019-03-01 2019-04-15
3 1 2019-01-01 2019-06-30 2019-04-01 2019-04-30 Overlap 15 days
4 1 2019-01-01 2019-06-30 2019-06-01 2019-06-05
5 2 2019-05-01 2019-07-30 2019-03-01 2019-03-15 All days before IndexDate
6 2 2019-05-01 2019-07-30 2019-04-15 2019-05-15 15 days before IndexDate
7 2 2019-05-01 2019-07-30 2019-05-16 2019-05-30
8 3 2019-07-01 2019-12-31 2019-07-05 2019-07-15
9 3 2019-07-01 2019-12-31 2020-01-01 2020-01-15 15 days after CensorDate
对于每个 ID,IndexDate 和 CensorDate 都是相同的。观察期为 IndexDate 至 CensorDate。
我想按照以下标准重新排列它:
- 按 ID 关联;
- 忽略 IndexDate 之前或 CensorDate 之后的天数;
- 重叠的时间段只计算一次;
df 是一个用药记录数据库。df 中的所有时间段(从 DrugStart 到 DrugEnd)都表示正在用药;未在 df 中出现、但位于观察期(IndexDate 至 CensorDate)内的时间段,则表示未用药。
- 药物使用标记为 2(使用)和 1(不使用);
IndexDate 定义为第 0 天(即每个 ID 的 TimeStart 都从 0 开始)。
我期望结果如下:
> df2 <- data.frame("ID" = c(1,1,1,1,1,1,1,2,2,3,3,3), "TimeStart" = c("0", "31", "46", "59", "120", "151", "156", "0", "30", "0", "4", "15"), "TimeEnd" = c("30", "45", "58", "119", "150", "155", "180", "29", "90", "3", "14", "183"), "DrugUse" = c("1", "2", "1", "2", "1", "2", "1", "2", "1", "1", "2", "1"))
> df2
ID TimeStart TimeEnd DrugUse
1 1 0 30 1
2 1 31 45 2
3 1 46 58 1
4 1 59 119 2
5 1 120 150 1
6 1 151 155 2
7 1 156 180 1
8 2 0 29 2
9 2 30 90 1
10 3 0 3 1
11 3 4 14 2
12 3 15 183 1
现在,我知道如何通过“DrugStart - IndexDate”和“DrugEnd - IndexDate”生成 TimeStart 和 TimeEnd,如下:
# Offsets of each drug period from IndexDate. Subtracting one Date from another
# gives a difftime, which is why the columns print as "31 days", "45 days", ...
df[["TimeStart"]] <- as.Date(df[["DrugStart"]], format = "%Y-%m-%d") -
  as.Date(df[["IndexDate"]], format = "%Y-%m-%d")
df[["TimeEnd"]] <- as.Date(df[["DrugEnd"]], format = "%Y-%m-%d") -
  as.Date(df[["IndexDate"]], format = "%Y-%m-%d")
df
ID IndexDate CensorDate DrugStart DrugEnd Notes_Drug.use.days TimeStart TimeEnd
1 1 2019-01-01 2019-06-30 2019-02-01 2019-02-15 15days 31 days 45 days
2 1 2019-01-01 2019-06-30 2019-03-01 2019-04-15 46days 59 days 104 days
3 1 2019-01-01 2019-06-30 2019-04-01 2019-04-30 Overlap 15days + 15days 90 days 119 days
4 1 2019-01-01 2019-06-30 2019-06-01 2019-06-05 5days 151 days 155 days
5 2 2019-05-01 2019-07-30 2019-03-01 2019-03-15 15days before IndexDate -61 days -47 days
6 2 2019-05-01 2019-07-30 2019-04-15 2019-05-15 15days before IndexDate+15days -16 days 14 days
7 2 2019-05-01 2019-07-30 2019-05-16 2019-05-30 15days 15 days 29 days
8 3 2019-07-01 2019-12-31 2019-07-05 2019-07-15 11days 4 days 14 days
9 3 2019-07-01 2019-12-31 2020-01-01 2020-01-15 15days after CensorDate 184 days 198 days
但我不知道如何处理重叠期和那些连续期,如下:
# Overlapped periods:
# Transform
ID TimeStart TimeEnd
2 1 59 days 104 days
3 1 90 days 119 days
# to
ID TimeStart TimeEnd
2 1 59 days 119 days
# And Continuous periods:
# Transform
ID TimeStart TimeEnd
6 2 -16 days 14 days
7 2 15 days 29 days
# To
ID TimeStart TimeEnd
6 2 0 days 29 days
另外,如何添加那些未用药的时间段(即 DrugUse = 1 的行)?例如以下这些行:
ID TimeStart TimeEnd DrugUse
1 1 0 30 1
3 1 46 58 1
5 1 120 150 1
7 1 156 180 1
9 2 30 90 1
10 3 0 3 1
12 3 15 183 1
有人帮我吗?非常感谢!
########################################### ########
更新: 谢谢 Bas 的回答!!我对 Bas 的回答做了一些小修改。以下代码可能是最终版本!!
library(data.table)
library(dplyr)
library(tidyr)
library(lubridate)

# Example prescription records: one row per drug period (DrugStart to DrugEnd).
# IndexDate and CensorDate repeat on every row of the same ID.
df <- data.frame(
  "ID" = c(1, 1, 1, 1, 2, 2, 2, 3, 3),
  "IndexDate" = c(
    "2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01",
    "2019-05-01", "2019-05-01", "2019-05-01",
    "2019-07-01", "2019-07-01"
  ),
  "CensorDate" = c(
    "2019-06-30", "2019-06-30", "2019-06-30", "2019-06-30",
    "2019-07-30", "2019-07-30", "2019-07-30",
    "2019-12-31", "2019-12-31"
  ),
  "DrugStart" = c(
    "2019-02-01", "2019-03-01", "2019-04-01", "2019-06-01",
    "2019-03-01", "2019-04-15", "2019-05-16",
    "2019-07-05", "2020-01-01"
  ),
  "DrugEnd" = c(
    "2019-02-15", "2019-04-15", "2019-04-30", "2019-06-05",
    "2019-03-15", "2019-05-15", "2019-05-30",
    "2019-07-15", "2020-01-15"
  ),
  "Notes" = c(
    "", "", "Overlap 15 days", "",
    "All days before IndexDate", "15 days before IndexDate", "",
    "", "15 days after CensorDate"
  )
)

# Switch to half-open periods [start, end): pushing both inclusive end dates
# forward by one day makes a prescription that ends on day d and one that
# starts on day d + 1 share a breakpoint, so they merge into a single period.
df$DrugEnd <- as.Date(df$DrugEnd, format = "%Y-%m-%d") + 1
df$CensorDate <- as.Date(df$CensorDate, format = "%Y-%m-%d") + 1

df2 <- df %>%
  mutate(across(IndexDate:DrugEnd, as.Date)) %>%
  group_by(ID) %>%
  # Cut each ID's observation window at every drug start/end date. Dates are
  # clamped to [IndexDate, CensorDate] instead of filtering rows out, so an ID
  # whose drug periods all lie outside the window still yields one "not using"
  # row rather than vanishing from the result.
  mutate(
    interval = list(int_diff(sort(unique(
      pmin(
        pmax(c(IndexDate, CensorDate, DrugStart, DrugEnd), first(IndexDate)),
        first(CensorDate)
      )
    ))))
  ) %>%
  # One row per (drug period, window piece) pair.
  unnest(interval) %>%
  # Half-open overlap test between this row's drug period and the piece.
  mutate(DrugUse = DrugStart < int_end(interval) & DrugEnd > int_start(interval)) %>%
  group_by(ID, interval) %>%
  # A piece is "use" (2) if any drug period overlaps it, otherwise "no use" (1).
  # na.rm = TRUE lets rows with missing drug dates count as "no overlap".
  summarise(
    IndexDate = first(IndexDate),
    DrugUse = if_else(any(DrugUse, na.rm = TRUE), 2, 1),
    .groups = "drop"
  ) %>%
  # Day offsets from IndexDate; the "- 1" converts the exclusive end back to an
  # inclusive last day.
  mutate(
    TimeStart = as.numeric(difftime(int_start(interval), IndexDate, units = "days")),
    TimeEnd = as.numeric(difftime(int_end(interval), IndexDate, units = "days")) - 1
  ) %>%
  # Collapse consecutive pieces that have the same DrugUse value within an ID.
  group_by(ID, data.table::rleid(DrugUse)) %>%
  summarise(
    TimeStart = min(TimeStart),
    TimeEnd = max(TimeEnd),
    DrugUse = first(DrugUse),
    .groups = "drop_last" # result stays grouped by ID, as before
  ) %>%
  select(ID, TimeStart, TimeEnd, DrugUse)
> df2
# A tibble: 12 x 4
# Groups: ID [3]
ID TimeStart TimeEnd DrugUse
<dbl> <dbl> <dbl> <dbl>
1 1 0 30 1
2 1 31 45 2
3 1 46 58 1
4 1 59 119 2
5 1 120 150 1
6 1 151 155 2
7 1 156 180 1
8 2 0 29 2
9 2 30 90 1
10 3 0 3 1
11 3 4 14 2
12 3 15 183 1
########################################### ########
第二次更新:
如果您的数据集太大(例如,超过一百万条记录),使用上述代码可能会非常慢。unnest() 之后的数据会变得特别大,这一步很慢。
在这种情况下,可以使用 split() 将数据拆分成若干份(每份最好不要超过 10000 条记录),通过循环语法(for(i in sequence){statement})逐份运行,然后使用 rbind() 合并结果。
祝你好运!
使用 dplyr、tidyr 和 lubridate,下面的代码可以得到接近但不完全符合预期的结果:
# Split each ID's timeline at every distinct date, flag the pieces covered by a
# drug period, then merge neighbouring pieces with the same flag.
df %>%
  mutate(across(IndexDate:DrugEnd, as.Date)) %>%
  filter(
    DrugStart <= CensorDate, # Neglect days before IndexDate or after CensorDate
    DrugEnd >= IndexDate
  ) %>%
  group_by(ID) %>%
  # Every distinct date of the ID becomes a breakpoint; int_diff() turns the
  # sorted breakpoints into consecutive intervals.
  mutate(
    interval = list(
      int_diff(sort(unique(c(IndexDate, CensorDate, DrugStart, DrugEnd))))
    )
  ) %>%
  unnest(interval) %>%
  # TRUE when this row's drug period overlaps the interval.
  mutate(
    DrugUse = DrugStart < int_end(interval) & DrugEnd > int_start(interval)
  ) %>%
  group_by(ID, interval) %>%
  # 2 when at least one drug period overlaps the interval, otherwise 1.
  summarise(
    IndexDate = first(IndexDate),
    CensorDate = first(CensorDate),
    DrugUse = if_else(sum(DrugUse) > 0, 2, 1)
  ) %>%
  ungroup() %>%
  # Keep only intervals lying inside the observation period.
  filter(
    int_end(interval) <= CensorDate,
    int_start(interval) >= IndexDate
  ) %>%
  # Both ends are measured from IndexDate, so one row's TimeEnd equals the
  # next row's TimeStart.
  mutate(
    TimeStart = as.numeric(
      difftime(int_start(interval), IndexDate, units = "days")
    ),
    TimeEnd = as.numeric(
      difftime(int_end(interval), IndexDate, units = "days")
    )
  ) %>%
  # Consecutive intervals with the same DrugUse share a run id and collapse.
  group_by(ID, data.table::rleid(DrugUse)) %>%
  summarise(
    TimeStart = min(TimeStart),
    TimeEnd = max(TimeEnd),
    DrugUse = first(DrugUse)
  ) %>%
  select(ID, TimeStart, TimeEnd, DrugUse)
这给出了
ID TimeStart TimeEnd DrugUse
<dbl> <dbl> <dbl> <dbl>
1 1 0 31 1
2 1 31 45 2
3 1 45 59 1
4 1 59 119 2
5 1 119 151 1
6 1 151 155 2
7 1 155 180 1
8 2 0 14 2
9 2 14 15 1
10 2 15 29 2
11 2 29 90 1
12 3 0 4 1
13 3 4 14 2
14 3 14 183 1