按不同条件保留处于特定时间间隔内的行,并进行分组(group by)
Keep rows that are within specific interval for different conditions and grouped by
下面用一个可复现示例(reprex)来说明。
library(tidyverse)

# Reproducible toy data: 400 visits drawn at random between 2020-01-01 and
# 2021-01-01, spread over up to 13 patient ids, each visit being either
# "medical" or "veterinary".
set.seed(1337)

df <- tibble(
  # Dates are sampled with replacement, so the same patient can end up with
  # several visits on one calendar day.
  date_visit = sample(
    seq(as.Date("2020/01/01"), as.Date("2021/01/01"), by = "day"),
    400,
    # TRUE instead of T: T is an ordinary variable that can be reassigned.
    replace = TRUE
  ),
  patient_id = as.factor(
    paste("patient", sample(seq(1, 13), 400, replace = TRUE), sep = "_")
  ),
  type_of_visit = as.factor(
    sample(c("medical", "veterinary"), 400, replace = TRUE)
  )
)
我想创建一个数据框:如果某位患者在不到 24 小时内进行了 2 次不同的访问,则保留该患者的 patient_id(我猜需要按它分组)和访问类型。或者添加一个变量,在满足该条件时显示 True/False。
我尝试使用 patient_id 的左连接来处理 2 个不同的变量,但这需要太多的计算时间(我原来的 DF 比这长得多)
有人能指出我正确的方向吗?
谢谢
也许这会有所帮助 -
library(dplyr)
# One TRUE/FALSE per patient: TRUE when the patient has at least one calendar
# day on which more than one distinct type of visit was recorded.
df %>%
  group_by(patient_id, date_visit) %>%
  # Per patient-day: does this day contain more than one visit type?
  summarise(flag = length(unique(type_of_visit)) > 1L) %>%
  # The first summarise() drops the last grouping level (date_visit), so the
  # data is still grouped by patient_id here and any() is evaluated per patient.
  summarise(flag = any(flag))
# patient_id flag
# <fct> <lgl>
# 1 patient_1 TRUE
# 2 patient_10 FALSE
# 3 patient_11 TRUE
# 4 patient_12 FALSE
# 5 patient_13 FALSE
# 6 patient_2 FALSE
# 7 patient_3 FALSE
# 8 patient_4 FALSE
# 9 patient_5 TRUE
#10 patient_6 FALSE
#11 patient_7 TRUE
#12 patient_8 TRUE
#13 patient_9 TRUE
如果您想保留那些患者 ID 的所有行
# Keep every original visit row (all columns) for patients who, on at least
# one calendar day, had visits of two or more distinct types.
df %>%
  group_by(patient_id, date_visit) %>%
  # mutate() rather than summarise(): summarise() would collapse the data to
  # one row per patient-day and drop type_of_visit, so the visit rows would
  # not be kept.
  mutate(flag = n_distinct(type_of_visit) >= 2) %>%
  # Regroup by patient so any() looks across all of that patient's days.
  group_by(patient_id) %>%
  filter(any(flag)) %>%
  ungroup()
library(tidyverse)

# Reproducible toy data: 400 visits drawn at random between 2020-01-01 and
# 2021-01-01, spread over up to 13 patient ids, each visit being either
# "medical" or "veterinary".
set.seed(1337)

df <- tibble(
  # Dates are sampled with replacement, so the same patient can end up with
  # several visits on one calendar day.
  date_visit = sample(
    seq(as.Date("2020/01/01"), as.Date("2021/01/01"), by = "day"),
    400,
    # TRUE instead of T: T is an ordinary variable that can be reassigned.
    replace = TRUE
  ),
  patient_id = as.factor(
    paste("patient", sample(seq(1, 13), 400, replace = TRUE), sep = "_")
  ),
  type_of_visit = as.factor(
    sample(c("medical", "veterinary"), 400, replace = TRUE)
  )
)

# Print the tibble so the reprex shows what the data looks like.
df
#> # A tibble: 400 x 3
#> date_visit patient_id type_of_visit
#> <date> <fct> <fct>
#> 1 2020-05-26 patient_11 medical
#> 2 2020-08-29 patient_4 medical
#> 3 2020-02-18 patient_6 medical
#> 4 2020-07-28 patient_9 veterinary
#> 5 2020-05-31 patient_9 veterinary
#> 6 2020-07-29 patient_1 veterinary
#> 7 2020-12-21 patient_11 veterinary
#> 8 2020-07-06 patient_9 veterinary
#> 9 2020-04-10 patient_3 medical
#> 10 2020-11-08 patient_12 medical
#> # … with 390 more rows
# List the distinct (patient, visit type) pairs taken from patient-days that
# contain exactly two visits. Falling on the same calendar date is used as
# the stand-in for "less than 24h apart".
# NOTE(review): `== 2` keeps only days with exactly two visits; days with
# three or more visits are excluded, and the two visits are not required to
# be of different types. Confirm this matches the intended condition.
df %>%
  add_count(patient_id, date_visit, name = "visits_same_day") %>%
  filter(visits_same_day == 2) %>%
  distinct(patient_id, type_of_visit)
#> # A tibble: 15 x 2
#> patient_id type_of_visit
#> <fct> <fct>
#> 1 patient_9 veterinary
#> 2 patient_2 veterinary
#> 3 patient_11 medical
#> 4 patient_12 veterinary
#> 5 patient_2 medical
#> 6 patient_3 veterinary
#> 7 patient_5 veterinary
#> 8 patient_7 veterinary
#> 9 patient_6 veterinary
#> 10 patient_11 veterinary
#> 11 patient_9 medical
#> 12 patient_10 veterinary
#> 13 patient_5 medical
#> 14 patient_1 veterinary
#> 15 patient_3 medical
由 reprex 包 (v2.0.1) 于 2021-10-07 创建
使用data.table
library(data.table)
# setDT() converts df to a data.table by reference. The first [ ] computes,
# per patient-day, whether two or more distinct visit types occurred; the
# second [ ] reduces that to a single TRUE/FALSE flag per patient.
setDT(df)[
  , .(multi_type = uniqueN(type_of_visit) >= 2),
  by = .(patient_id, date_visit)
][
  , .(flag = any(multi_type)),
  by = patient_id
]
patient_id flag
1: patient_11 TRUE
2: patient_4 FALSE
3: patient_6 FALSE
4: patient_9 TRUE
5: patient_1 FALSE
6: patient_3 TRUE
7: patient_12 FALSE
8: patient_7 FALSE
9: patient_8 FALSE
10: patient_10 FALSE
11: patient_13 FALSE
12: patient_2 TRUE
13: patient_5 TRUE
下面用一个可复现示例(reprex)来说明。
library(tidyverse)

# Reproducible toy data: 400 visits drawn at random between 2020-01-01 and
# 2021-01-01, spread over up to 13 patient ids, each visit being either
# "medical" or "veterinary".
set.seed(1337)

df <- tibble(
  # Dates are sampled with replacement, so the same patient can end up with
  # several visits on one calendar day.
  date_visit = sample(
    seq(as.Date("2020/01/01"), as.Date("2021/01/01"), by = "day"),
    400,
    # TRUE instead of T: T is an ordinary variable that can be reassigned.
    replace = TRUE
  ),
  patient_id = as.factor(
    paste("patient", sample(seq(1, 13), 400, replace = TRUE), sep = "_")
  ),
  type_of_visit = as.factor(
    sample(c("medical", "veterinary"), 400, replace = TRUE)
  )
)
我想创建一个数据框:如果某位患者在不到 24 小时内进行了 2 次不同的访问,则保留该患者的 patient_id(我猜需要按它分组)和访问类型。或者添加一个变量,在满足该条件时显示 True/False。
我尝试使用 patient_id 的左连接来处理 2 个不同的变量,但这需要太多的计算时间(我原来的 DF 比这长得多)
有人能指出我正确的方向吗?
谢谢
也许这会有所帮助 -
library(dplyr)
# One TRUE/FALSE per patient: TRUE when the patient has at least one calendar
# day on which more than one distinct type of visit was recorded.
df %>%
  group_by(patient_id, date_visit) %>%
  # Per patient-day: does this day contain more than one visit type?
  summarise(flag = length(unique(type_of_visit)) > 1L) %>%
  # The first summarise() drops the last grouping level (date_visit), so the
  # data is still grouped by patient_id here and any() is evaluated per patient.
  summarise(flag = any(flag))
# patient_id flag
# <fct> <lgl>
# 1 patient_1 TRUE
# 2 patient_10 FALSE
# 3 patient_11 TRUE
# 4 patient_12 FALSE
# 5 patient_13 FALSE
# 6 patient_2 FALSE
# 7 patient_3 FALSE
# 8 patient_4 FALSE
# 9 patient_5 TRUE
#10 patient_6 FALSE
#11 patient_7 TRUE
#12 patient_8 TRUE
#13 patient_9 TRUE
如果您想保留那些患者 ID 的所有行
# Keep every original visit row (all columns) for patients who, on at least
# one calendar day, had visits of two or more distinct types.
df %>%
  group_by(patient_id, date_visit) %>%
  # mutate() rather than summarise(): summarise() would collapse the data to
  # one row per patient-day and drop type_of_visit, so the visit rows would
  # not be kept.
  mutate(flag = n_distinct(type_of_visit) >= 2) %>%
  # Regroup by patient so any() looks across all of that patient's days.
  group_by(patient_id) %>%
  filter(any(flag)) %>%
  ungroup()
library(tidyverse)

# Reproducible toy data: 400 visits drawn at random between 2020-01-01 and
# 2021-01-01, spread over up to 13 patient ids, each visit being either
# "medical" or "veterinary".
set.seed(1337)

df <- tibble(
  # Dates are sampled with replacement, so the same patient can end up with
  # several visits on one calendar day.
  date_visit = sample(
    seq(as.Date("2020/01/01"), as.Date("2021/01/01"), by = "day"),
    400,
    # TRUE instead of T: T is an ordinary variable that can be reassigned.
    replace = TRUE
  ),
  patient_id = as.factor(
    paste("patient", sample(seq(1, 13), 400, replace = TRUE), sep = "_")
  ),
  type_of_visit = as.factor(
    sample(c("medical", "veterinary"), 400, replace = TRUE)
  )
)

# Print the tibble so the reprex shows what the data looks like.
df
#> # A tibble: 400 x 3
#> date_visit patient_id type_of_visit
#> <date> <fct> <fct>
#> 1 2020-05-26 patient_11 medical
#> 2 2020-08-29 patient_4 medical
#> 3 2020-02-18 patient_6 medical
#> 4 2020-07-28 patient_9 veterinary
#> 5 2020-05-31 patient_9 veterinary
#> 6 2020-07-29 patient_1 veterinary
#> 7 2020-12-21 patient_11 veterinary
#> 8 2020-07-06 patient_9 veterinary
#> 9 2020-04-10 patient_3 medical
#> 10 2020-11-08 patient_12 medical
#> # … with 390 more rows
# List the distinct (patient, visit type) pairs taken from patient-days that
# contain exactly two visits. Falling on the same calendar date is used as
# the stand-in for "less than 24h apart".
# NOTE(review): `== 2` keeps only days with exactly two visits; days with
# three or more visits are excluded, and the two visits are not required to
# be of different types. Confirm this matches the intended condition.
df %>%
  add_count(patient_id, date_visit, name = "visits_same_day") %>%
  filter(visits_same_day == 2) %>%
  distinct(patient_id, type_of_visit)
#> # A tibble: 15 x 2
#> patient_id type_of_visit
#> <fct> <fct>
#> 1 patient_9 veterinary
#> 2 patient_2 veterinary
#> 3 patient_11 medical
#> 4 patient_12 veterinary
#> 5 patient_2 medical
#> 6 patient_3 veterinary
#> 7 patient_5 veterinary
#> 8 patient_7 veterinary
#> 9 patient_6 veterinary
#> 10 patient_11 veterinary
#> 11 patient_9 medical
#> 12 patient_10 veterinary
#> 13 patient_5 medical
#> 14 patient_1 veterinary
#> 15 patient_3 medical
由 reprex 包 (v2.0.1) 于 2021-10-07 创建
使用data.table
library(data.table)
# setDT() converts df to a data.table by reference. The first [ ] computes,
# per patient-day, whether two or more distinct visit types occurred; the
# second [ ] reduces that to a single TRUE/FALSE flag per patient.
setDT(df)[
  , .(multi_type = uniqueN(type_of_visit) >= 2),
  by = .(patient_id, date_visit)
][
  , .(flag = any(multi_type)),
  by = patient_id
]
patient_id flag
1: patient_11 TRUE
2: patient_4 FALSE
3: patient_6 FALSE
4: patient_9 TRUE
5: patient_1 FALSE
6: patient_3 TRUE
7: patient_12 FALSE
8: patient_7 FALSE
9: patient_8 FALSE
10: patient_10 FALSE
11: patient_13 FALSE
12: patient_2 TRUE
13: patient_5 TRUE