从 R 中的宽数据帧中检索连续事件的计数
Retrieving counts of consecutive events from wide dataframe in R
我正在 R 中处理一个宽格式数据集。它是用户生成的数据:每一行对应一个 UserID,
各日期列依次记录该用户在每一天发生的事件。下面是我所用数据类型的模拟示例:
# Mock activity log in wide format: one row per UserID, one column per day.
# Column names are the day labels; NA marks a day without a value.
df <- structure(
  list(
    UserID = c("hdyyu-192", "yeui-1893", "dnnd-1882", "nopr-738", "ieka-1728"),
    `05/06/2021` = c(
      "Activity B", "Activity A", "Activity B", "Activity C", "Activity D"
    ),
    `06/06/2021` = c(
      "Activity B", "Activity A", "Activity A", "Activity C", "Activity D"
    ),
    `07/06/2021` = c(
      "Activity D", "Activity A", NA_character_, "Activity C", "Activity D"
    ),
    `08/06/2021` = c(
      "Activity A", "Activity B", NA_character_, "Activity B", "Activity A"
    ),
    `09/06/2021` = c(
      "Activity A", "Activity B", NA_character_, "Activity B", "Activity C"
    ),
    # Only the first user has entries on the last two days.
    `10/06/2021` = c("Activity C", rep(NA_character_, 4L)),
    `11/06/2021` = c("Activity B", rep(NA_character_, 4L))
  ),
  # The class vector is set explicitly to the tibble classes.
  class = c("tbl_df", "tbl", "data.frame"),
  # Compact form for automatic row names 1..5.
  row.names = c(NA, -5L)
)

# Print the first rows (the example has five in total).
head(df)
# A tibble: 5 x 8
UserID `05/06/2021` `06/06/2021` `07/06/2021` `08/06/2021` `09/06/2021` `10/06/2021` `11/06/2021`
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 hdyyu-192 Activity B Activity B Activity D Activity A Activity A Activity C Activity B
2 yeui-1893 Activity A Activity A Activity A Activity B Activity B NA NA
3 dnnd-1882 Activity B Activity A NA NA NA NA NA
4 nopr-738 Activity C Activity C Activity C Activity B Activity B NA NA
5 ieka-1728 Activity D Activity D Activity D Activity A Activity C NA NA
我想做的是计算连续事件的频率;换句话说,antecedent/consequent 事件的频率。
这是所需的输出:-
#desired outcome
Antecedent | Consequent | Count
Activity A | Activity A | 3
Activity A | Activity B | 1
Activity A | Activity C | 2
........................................
Activity D | Activity A | 2
Activity D | Activity B | 0
Activity D | Activity C | 0
Activity D | Activity D | 2
有人可以在 R 中建议一种可以让我实现此输出的方法吗?非常感谢:)
我会使用字符串搜索。先用 expand.grid 生成所有可能的(前件,后件)活动组合,
再用 do.call(paste, ...) 把每个组合拼接成一个字符串;对 df(去掉 UserID 列)
做同样的拼接,使每个用户的一行变成一个字符串。然后用 stringr::str_count()
统计每个组合在每一行中出现的次数,各行之和就是该组合在 df 中出现的总次数。
这种做法还能保持您想要的输出顺序。
注意:正则表达式中需要使用前瞻(lookahead),才能把相互重叠的匹配也计算在内。
library(stringr)

# Candidate activity labels. Every ordered pair of these is counted, so a
# label that never occurs in `df` simply gets a count of 0.
activities <- paste0("Activity ", LETTERS[1:5])

# All ordered pairs: Var1 is the antecedent, Var2 the consequent.
activities_df <- expand.grid(activities, activities)

# One string per row of `df`: that row's activity columns (UserID dropped)
# joined by single spaces. Built once, outside the per-pair loop.
user_sequences <- do.call(paste, df[, -1])

# For each pair, written as "antecedent consequent", count its occurrences
# over all rows.
# - The lookahead (?=...) is zero-width, so overlapping matches are counted
#   (e.g. "A A A" contains two A -> A transitions).
# - \Q...\E quotes the pair, so regex metacharacters inside an activity
#   label are matched literally.
# - vapply() pins the result to exactly one integer per pair.
activities_df$count <- vapply(
  do.call(paste, activities_df),
  \(x) sum(str_count(user_sequences, paste0("(?=\\Q", x, "\\E)"))),
  integer(1)
)

head(activities_df)
#> Var1 Var2 count
#> 1 Activity A Activity A 3
#> 2 Activity B Activity A 1
#> 3 Activity C Activity A 0
#> 4 Activity D Activity A 2
#> 5 Activity E Activity A 0
#> 6 Activity A Activity B 1
注意:\(x) 是 R >= 4.1.0 中 function(x) 的简写;如果使用更早的 R 版本,请改写为 function(x)。
另一种可能的解决方案,使用 tidyverse:
library(tidyverse)

# Count antecedent -> consequent transitions between consecutive day columns.
# The two sides stay in separate columns throughout, so labels that contain
# a separator character such as "-" are handled correctly.
df %>%
  # Long format: one row per user and day, in the original column order.
  pivot_longer(-UserID, names_to = "day", values_to = "Consequent") %>%
  # Within each user, the previous row's activity is the antecedent.
  group_by(UserID) %>%
  mutate(Antecedent = lag(Consequent)) %>%
  ungroup() %>%
  # Drop each user's first day (no antecedent) and any pair with an NA side.
  drop_na(Antecedent, Consequent) %>%
  # One row per observed pair with its frequency.
  count(Antecedent, Consequent, name = "count") %>%
  arrange(Antecedent, Consequent)
#> # A tibble: 10 × 3
#> Antecedent Consequent count
#> <chr> <chr> <int>
#> 1 Activity A Activity A 3
#> 2 Activity A Activity B 1
#> 3 Activity A Activity C 2
#> 4 Activity B Activity A 1
#> 5 Activity B Activity B 3
#> 6 Activity B Activity D 1
#> 7 Activity C Activity B 2
#> 8 Activity C Activity C 2
#> 9 Activity D Activity A 2
#> 10 Activity D Activity D 2
如果还想把计数为零的组合也显示出来:
# Every activity label that occurs anywhere in the day columns (NA excluded).
# The zero-count rows are built from all ordered pairs of these labels.
all_activities <- unique(unlist(df[-1], use.names = FALSE))
all_activities <- all_activities[!is.na(all_activities)]

# Count antecedent -> consequent transitions and include unobserved pairs
# with a count of 0.
df %>%
  # Long format: one row per user and day, in the original column order.
  pivot_longer(-UserID, names_to = "day", values_to = "Consequent") %>%
  # Within each user, the previous row's activity is the antecedent.
  group_by(UserID) %>%
  mutate(Antecedent = lag(Consequent)) %>%
  ungroup() %>%
  # Drop each user's first day (no antecedent) and any pair with an NA side.
  drop_na(Antecedent, Consequent) %>%
  # Frequency of every observed pair.
  count(Antecedent, Consequent, name = "count") %>%
  # Add the pairs that never occur; their missing count becomes integer 0.
  complete(
    Antecedent = all_activities,
    Consequent = all_activities,
    fill = list(count = 0L)
  ) %>%
  arrange(Antecedent, Consequent)
#> # A tibble: 16 × 3
#> Antecedent Consequent count
#> <chr> <chr> <int>
#> 1 Activity A Activity A 3
#> 2 Activity A Activity B 1
#> 3 Activity A Activity C 2
#> 4 Activity A Activity D 0
#> 5 Activity B Activity A 1
#> 6 Activity B Activity B 3
#> 7 Activity B Activity C 0
#> 8 Activity B Activity D 1
#> 9 Activity C Activity A 0
#> 10 Activity C Activity B 2
#> 11 Activity C Activity C 2
#> 12 Activity C Activity D 0
#> 13 Activity D Activity A 2
#> 14 Activity D Activity B 0
#> 15 Activity D Activity C 0
#> 16 Activity D Activity D 2
使用 data.table:
library(data.table)

# Convert `df` to a data.table by reference.
setDT(df)

# Long format: one row per user and day; the day label goes to `Timestamp`.
DT <- melt(
  df,
  id.vars = "UserID",
  variable.name = "Timestamp",
  value.name = "Activity"
)

# Parse the day labels into Date values, overwriting `Timestamp` itself so
# that the ordering below is chronological. (Assigning to a differently
# spelled column would leave the sort on the raw labels.) as.character()
# makes the conversion explicit should the labels arrive as a factor.
DT[, Timestamp := as.Date(as.character(Timestamp), format = "%d/%m/%Y")]

# Sort by date, then pair each activity with the same user's previous one.
DT <- DT[order(Timestamp)][, Activity_prec := shift(Activity), by = .(UserID)]

# Count each (antecedent, consequent) pair, skipping pairs with an NA side.
DT[
  !is.na(Activity_prec) & !is.na(Activity),
  .(Count = .N),
  by = .(Activity_prec, Activity)
][order(Activity_prec, Activity)]
Activity_prec Activity Count
<char> <char> <int>
1: Activity A Activity A 3
2: Activity A Activity B 1
3: Activity A Activity C 2
4: Activity B Activity A 1
5: Activity B Activity B 3
6: Activity B Activity D 1
7: Activity C Activity B 2
8: Activity C Activity C 2
9: Activity D Activity A 2
10: Activity D Activity D 2
我正在 R 中处理一个宽格式数据集。它是用户生成的数据:每一行对应一个 UserID,
各日期列依次记录该用户在每一天发生的事件。下面是我所用数据类型的模拟示例:
# Mock activity log in wide format: one row per UserID, one column per day.
# Column names are the day labels; NA marks a day without a value.
df <- structure(
  list(
    UserID = c("hdyyu-192", "yeui-1893", "dnnd-1882", "nopr-738", "ieka-1728"),
    `05/06/2021` = c(
      "Activity B", "Activity A", "Activity B", "Activity C", "Activity D"
    ),
    `06/06/2021` = c(
      "Activity B", "Activity A", "Activity A", "Activity C", "Activity D"
    ),
    `07/06/2021` = c(
      "Activity D", "Activity A", NA_character_, "Activity C", "Activity D"
    ),
    `08/06/2021` = c(
      "Activity A", "Activity B", NA_character_, "Activity B", "Activity A"
    ),
    `09/06/2021` = c(
      "Activity A", "Activity B", NA_character_, "Activity B", "Activity C"
    ),
    # Only the first user has entries on the last two days.
    `10/06/2021` = c("Activity C", rep(NA_character_, 4L)),
    `11/06/2021` = c("Activity B", rep(NA_character_, 4L))
  ),
  # The class vector is set explicitly to the tibble classes.
  class = c("tbl_df", "tbl", "data.frame"),
  # Compact form for automatic row names 1..5.
  row.names = c(NA, -5L)
)

# Print the first rows (the example has five in total).
head(df)
# A tibble: 5 x 8
UserID `05/06/2021` `06/06/2021` `07/06/2021` `08/06/2021` `09/06/2021` `10/06/2021` `11/06/2021`
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 hdyyu-192 Activity B Activity B Activity D Activity A Activity A Activity C Activity B
2 yeui-1893 Activity A Activity A Activity A Activity B Activity B NA NA
3 dnnd-1882 Activity B Activity A NA NA NA NA NA
4 nopr-738 Activity C Activity C Activity C Activity B Activity B NA NA
5 ieka-1728 Activity D Activity D Activity D Activity A Activity C NA NA
我想做的是计算连续事件的频率;换句话说,antecedent/consequent 事件的频率。
这是所需的输出:-
#desired outcome
Antecedent | Consequent | Count
Activity A | Activity A | 3
Activity A | Activity B | 1
Activity A | Activity C | 2
........................................
Activity D | Activity A | 2
Activity D | Activity B | 0
Activity D | Activity C | 0
Activity D | Activity D | 2
有人可以在 R 中建议一种可以让我实现此输出的方法吗?非常感谢:)
我会使用字符串搜索。先用 expand.grid 生成所有可能的(前件,后件)活动组合,
再用 do.call(paste, ...) 把每个组合拼接成一个字符串;对 df(去掉 UserID 列)
做同样的拼接,使每个用户的一行变成一个字符串。然后用 stringr::str_count()
统计每个组合在每一行中出现的次数,各行之和就是该组合在 df 中出现的总次数。
这种做法还能保持您想要的输出顺序。
注意:正则表达式中需要使用前瞻(lookahead),才能把相互重叠的匹配也计算在内。
library(stringr)

# Candidate activity labels. Every ordered pair of these is counted, so a
# label that never occurs in `df` simply gets a count of 0.
activities <- paste0("Activity ", LETTERS[1:5])

# All ordered pairs: Var1 is the antecedent, Var2 the consequent.
activities_df <- expand.grid(activities, activities)

# One string per row of `df`: that row's activity columns (UserID dropped)
# joined by single spaces. Built once, outside the per-pair loop.
user_sequences <- do.call(paste, df[, -1])

# For each pair, written as "antecedent consequent", count its occurrences
# over all rows.
# - The lookahead (?=...) is zero-width, so overlapping matches are counted
#   (e.g. "A A A" contains two A -> A transitions).
# - \Q...\E quotes the pair, so regex metacharacters inside an activity
#   label are matched literally.
# - vapply() pins the result to exactly one integer per pair.
activities_df$count <- vapply(
  do.call(paste, activities_df),
  \(x) sum(str_count(user_sequences, paste0("(?=\\Q", x, "\\E)"))),
  integer(1)
)

head(activities_df)
#> Var1 Var2 count
#> 1 Activity A Activity A 3
#> 2 Activity B Activity A 1
#> 3 Activity C Activity A 0
#> 4 Activity D Activity A 2
#> 5 Activity E Activity A 0
#> 6 Activity A Activity B 1
注意:\(x) 是 R >= 4.1.0 中 function(x) 的简写;如果使用更早的 R 版本,请改写为 function(x)。
另一种可能的解决方案,使用 tidyverse:
library(tidyverse)

# Count antecedent -> consequent transitions between consecutive day columns.
# The two sides stay in separate columns throughout, so labels that contain
# a separator character such as "-" are handled correctly.
df %>%
  # Long format: one row per user and day, in the original column order.
  pivot_longer(-UserID, names_to = "day", values_to = "Consequent") %>%
  # Within each user, the previous row's activity is the antecedent.
  group_by(UserID) %>%
  mutate(Antecedent = lag(Consequent)) %>%
  ungroup() %>%
  # Drop each user's first day (no antecedent) and any pair with an NA side.
  drop_na(Antecedent, Consequent) %>%
  # One row per observed pair with its frequency.
  count(Antecedent, Consequent, name = "count") %>%
  arrange(Antecedent, Consequent)
#> # A tibble: 10 × 3
#> Antecedent Consequent count
#> <chr> <chr> <int>
#> 1 Activity A Activity A 3
#> 2 Activity A Activity B 1
#> 3 Activity A Activity C 2
#> 4 Activity B Activity A 1
#> 5 Activity B Activity B 3
#> 6 Activity B Activity D 1
#> 7 Activity C Activity B 2
#> 8 Activity C Activity C 2
#> 9 Activity D Activity A 2
#> 10 Activity D Activity D 2
如果还想把计数为零的组合也显示出来:
# Every activity label that occurs anywhere in the day columns (NA excluded).
# The zero-count rows are built from all ordered pairs of these labels.
all_activities <- unique(unlist(df[-1], use.names = FALSE))
all_activities <- all_activities[!is.na(all_activities)]

# Count antecedent -> consequent transitions and include unobserved pairs
# with a count of 0.
df %>%
  # Long format: one row per user and day, in the original column order.
  pivot_longer(-UserID, names_to = "day", values_to = "Consequent") %>%
  # Within each user, the previous row's activity is the antecedent.
  group_by(UserID) %>%
  mutate(Antecedent = lag(Consequent)) %>%
  ungroup() %>%
  # Drop each user's first day (no antecedent) and any pair with an NA side.
  drop_na(Antecedent, Consequent) %>%
  # Frequency of every observed pair.
  count(Antecedent, Consequent, name = "count") %>%
  # Add the pairs that never occur; their missing count becomes integer 0.
  complete(
    Antecedent = all_activities,
    Consequent = all_activities,
    fill = list(count = 0L)
  ) %>%
  arrange(Antecedent, Consequent)
#> # A tibble: 16 × 3
#> Antecedent Consequent count
#> <chr> <chr> <int>
#> 1 Activity A Activity A 3
#> 2 Activity A Activity B 1
#> 3 Activity A Activity C 2
#> 4 Activity A Activity D 0
#> 5 Activity B Activity A 1
#> 6 Activity B Activity B 3
#> 7 Activity B Activity C 0
#> 8 Activity B Activity D 1
#> 9 Activity C Activity A 0
#> 10 Activity C Activity B 2
#> 11 Activity C Activity C 2
#> 12 Activity C Activity D 0
#> 13 Activity D Activity A 2
#> 14 Activity D Activity B 0
#> 15 Activity D Activity C 0
#> 16 Activity D Activity D 2
使用 data.table:
library(data.table)

# Convert `df` to a data.table by reference.
setDT(df)

# Long format: one row per user and day; the day label goes to `Timestamp`.
DT <- melt(
  df,
  id.vars = "UserID",
  variable.name = "Timestamp",
  value.name = "Activity"
)

# Parse the day labels into Date values, overwriting `Timestamp` itself so
# that the ordering below is chronological. (Assigning to a differently
# spelled column would leave the sort on the raw labels.) as.character()
# makes the conversion explicit should the labels arrive as a factor.
DT[, Timestamp := as.Date(as.character(Timestamp), format = "%d/%m/%Y")]

# Sort by date, then pair each activity with the same user's previous one.
DT <- DT[order(Timestamp)][, Activity_prec := shift(Activity), by = .(UserID)]

# Count each (antecedent, consequent) pair, skipping pairs with an NA side.
DT[
  !is.na(Activity_prec) & !is.na(Activity),
  .(Count = .N),
  by = .(Activity_prec, Activity)
][order(Activity_prec, Activity)]
Activity_prec Activity Count
<char> <char> <int>
1: Activity A Activity A 3
2: Activity A Activity B 1
3: Activity A Activity C 2
4: Activity B Activity A 1
5: Activity B Activity B 3
6: Activity B Activity D 1
7: Activity C Activity B 2
8: Activity C Activity C 2
9: Activity D Activity A 2
10: Activity D Activity D 2