R - 按键查找连续值的最大数量
R - Finding max number of consecutive values by key
我有一个发送给用户的消息数据集,有些成功了,有些失败了:
> df.messages <- data.frame(date = c("2018-01-01 12:00","2018-01-01 12:00","2018-01-01 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-04 12:00","2018-01-04 12:00","2018-01-04 12:00"), id = c(1,2,3,1,2,3,1,2,3,1,2,3), status = c("S","S","S","S","S","F","S","F","F","F","F","S"))
> df.messages
date id status
1 2018-01-01 12:00 1 S
2 2018-01-01 12:00 2 S
3 2018-01-01 12:00 3 S
4 2018-01-02 12:00 1 S
5 2018-01-02 12:00 2 S
6 2018-01-02 12:00 3 F
7 2018-01-03 12:00 1 S
8 2018-01-03 12:00 2 F
9 2018-01-03 12:00 3 F
10 2018-01-04 12:00 1 F
11 2018-01-04 12:00 2 F
12 2018-01-04 12:00 3 S
注意事项如下:
- 每天发送一条消息,共四天
- id 1:先成功(S)三次,然后失败(F)一次
- id 2 成功两次,然后失败两次
- id 3 成功一次,然后失败两次,然后成功
我想把用户分成四组
- 总是成功的人
- 失败的,后来成功的
- 那些成功,然后失败,再也没有成功的人
- 那些总是失败的
然后理解
- 组 2 中的用户在再次成功之前失败的最大次数
- 组 2 中的用户在再次成功之前失败的最长时间
- 第 3 组用户失败的最大次数
- 第 3 组用户失败的最长时间
理想的输出是
id group num_f_messages date_f_messages
1 1 3 1 1
2 2 3 2 2
3 3 2 2 2
我知道我需要使用 rle() 和 diff(),但问题变得越来越复杂,而且我以前没有做过这类分析,所以不知道该如何下手。
我有 9MM 行,所以我试图用 data.table 来完成这个,但欢迎任何解决方案。
编辑:
我正在尝试将此功能扩展到更大的数据集。例如,当 id 3 的消息序列是 "S,F,F,S,F,F,F,S" 时,结果需要反映出最后一个 S 之前最长的连续失败次数,即 3 个 F。
你可以试试这个:
require(plyr); require(dplyr)
df.messages %>%
group_by(id) %>%
summarise(group = ifelse(sum(status == "S") == n(), 1,
ifelse(sum(status == "F") == n(), 4,
ifelse(n_distinct(status) > 1 &
status[1] == "S" & status[n()] == "S", 2, 3))),
num_f_messages = sum(status == "F"),
date_f_messages = n_distinct(date[status == "F"]))
给你:
# A tibble: 3 x 4
id group num_f_messages date_f_messages
<dbl> <dbl> <int> <int>
1 1 3 1 1
2 2 3 2 2
3 3 2 2 2
这是一个 data.table 解决方案。
library(data.table)
library(magrittr)
df.messages <- data.frame(date = c("2018-01-01 12:00","2018-01-01 12:00","2018-01-01 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-04 12:00","2018-01-04 12:00","2018-01-04 12:00"), id = c(1,2,3,1,2,3,1,2,3,1,2,3), status = c("S","S","S","S","S","F","S","F","F","F","F","S"))
df.messages$status <- as.character(df.messages$status)
setDT(df.messages)
ans <- df.messages[,
.(
by_rle = paste0(rle(status)$value, collapse = ""),
num_f_message = sum(status == "F"),
date_f_message = length(unique(date[status == "F"]))
),
by = id] %>%
# define groups and remove the by_rle columns
.[by_rle == "S", group := 1] %>%
.[by_rle == c("SFS"), group := 2] %>%
.[by_rle == c("SF"), group := 3] %>%
.[by_rle == "F", group := 4] %>%
.[, by_rle := NULL] %>%
setcolorder(c("id", "group", "num_f_message", "date_f_message"))
# id group num_f_message date_f_message
# 1: 1 3 1 1
# 2: 2 3 2 2
# 3: 3 2 2 2
我有一个发送给用户的消息数据集,有些成功了,有些失败了:
> df.messages <- data.frame(date = c("2018-01-01 12:00","2018-01-01 12:00","2018-01-01 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-04 12:00","2018-01-04 12:00","2018-01-04 12:00"), id = c(1,2,3,1,2,3,1,2,3,1,2,3), status = c("S","S","S","S","S","F","S","F","F","F","F","S"))
> df.messages
date id status
1 2018-01-01 12:00 1 S
2 2018-01-01 12:00 2 S
3 2018-01-01 12:00 3 S
4 2018-01-02 12:00 1 S
5 2018-01-02 12:00 2 S
6 2018-01-02 12:00 3 F
7 2018-01-03 12:00 1 S
8 2018-01-03 12:00 2 F
9 2018-01-03 12:00 3 F
10 2018-01-04 12:00 1 F
11 2018-01-04 12:00 2 F
12 2018-01-04 12:00 3 S
注意事项如下:
- 每天发送一条消息,共四天
- id 1:先成功(S)三次,然后失败(F)一次
- id 2 成功两次,然后失败两次
- id 3 成功一次,然后失败两次,然后成功
我想把用户分成四组
- 总是成功的人
- 失败的,后来成功的
- 那些成功,然后失败,再也没有成功的人
- 那些总是失败的
然后理解
- 组 2 中的用户在再次成功之前失败的最大次数
- 组 2 中的用户在再次成功之前失败的最长时间
- 第 3 组用户失败的最大次数
- 第 3 组用户失败的最长时间
理想的输出是
id group num_f_messages date_f_messages
1 1 3 1 1
2 2 3 2 2
3 3 2 2 2
我知道我需要使用 rle() 和 diff(),但问题变得越来越复杂,而且我以前没有做过这类分析,所以不知道该如何下手。
我有 9MM 行,所以我试图用 data.table 来完成这个,但欢迎任何解决方案。
编辑:
我正在尝试将此功能扩展到更大的数据集。例如,当 id 3 的消息序列是 "S,F,F,S,F,F,F,S" 时,结果需要反映出最后一个 S 之前最长的连续失败次数,即 3 个 F。
你可以试试这个:
require(plyr); require(dplyr)
df.messages %>%
group_by(id) %>%
summarise(group = ifelse(sum(status == "S") == n(), 1,
ifelse(sum(status == "F") == n(), 4,
ifelse(n_distinct(status) > 1 &
status[1] == "S" & status[n()] == "S", 2, 3))),
num_f_messages = sum(status == "F"),
date_f_messages = n_distinct(date[status == "F"]))
给你:
# A tibble: 3 x 4
id group num_f_messages date_f_messages
<dbl> <dbl> <int> <int>
1 1 3 1 1
2 2 3 2 2
3 3 2 2 2
这是一个 data.table 解决方案。
library(data.table)
library(magrittr)
df.messages <- data.frame(date = c("2018-01-01 12:00","2018-01-01 12:00","2018-01-01 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-02 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-03 12:00","2018-01-04 12:00","2018-01-04 12:00","2018-01-04 12:00"), id = c(1,2,3,1,2,3,1,2,3,1,2,3), status = c("S","S","S","S","S","F","S","F","F","F","F","S"))
df.messages$status <- as.character(df.messages$status)
setDT(df.messages)
ans <- df.messages[,
.(
by_rle = paste0(rle(status)$value, collapse = ""),
num_f_message = sum(status == "F"),
date_f_message = length(unique(date[status == "F"]))
),
by = id] %>%
# define groups and remove the by_rle columns
.[by_rle == "S", group := 1] %>%
.[by_rle == c("SFS"), group := 2] %>%
.[by_rle == c("SF"), group := 3] %>%
.[by_rle == "F", group := 4] %>%
.[, by_rle := NULL] %>%
setcolorder(c("id", "group", "num_f_message", "date_f_message"))
# id group num_f_message date_f_message
# 1: 1 3 1 1
# 2: 2 3 2 2
# 3: 3 2 2 2