根据连续分数的不同标准创建组变量
Create a group variable based on different criteria of consecutive scores
我有一个数据集,其中只包含受试者 ID 和各个时间点的分数。有没有办法根据这些分数创建一个分组变量?例如:如果某个受试者有连续 6 次分数为 1 或 2,就把他归入“a”组;如果连续 4 次分数为 3,就归入“b”组;如果连续 6 次分数为 4 或更高,就归入“c”组。
这是一个示例数据集:
id score1 score2 score3 score4 score5 score6 score7 score8 group
101 2 2 2 2 1 2 2 1 a
102 4 4 3 3 3 3 4 4 b
103 4 5 5 5 5 6 5 5 c
这是上面 table 没有“组”列的 R 代码
# Example data from the question (dput output, without the `group` column).
# Bound to `df1` because the apply() and pivot_longer() snippets further down
# read the data under that name.
df1 <- structure(
  list(
    id = c(101, 102, 103),
    score1 = c(2, 4, 4),
    score2 = c(2, 4, 5),
    score3 = c(2, 3, 5),
    score4 = c(2, 3, 5),
    score5 = c(1, 3, 5),
    score6 = c(2, 3, 6),
    score7 = c(2, 4, 5),
    score8 = c(1, 4, 5)
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
如有任何想法,我们将不胜感激!非常感谢:)
用 apply(MARGIN = 1)逐行遍历数据中的数值列:把取值为 1 或 2 的分数替换为 1,把大于等于 4 的分数替换为 4,然后对替换后的行向量调用 rle(run-length encoding,游程编码),取出其中的 'values' 和 'lengths',按照 OP 帖子中给出的条件构造逻辑表达式;满足条件时返回相应的组标签。
library(dplyr)
# Label every subject (row of df1) with a group based on runs of scores:
#   "a": >= 6 consecutive scores of 1 or 2
#   "b": >= 4 consecutive scores of 3
#   "c": >= 6 consecutive scores of 4 or more
# Subjects with no qualifying run get NA.
df1$group <- apply(df1[-1], 1, function(x) {
  # Collapse scores into bands so that a run may mix values of one band:
  # 1-2 -> 1, 3 stays 3, >= 4 -> 4.
  x <- case_when(x %in% 1:2 ~ 1, x >= 4 ~ 4, TRUE ~ x)
  v1 <- rle(x)
  # One candidate label per run; NA for runs that satisfy no rule.
  hits <- case_when(
    v1$values == 1 & v1$lengths >= 6 ~ "a",
    v1$values == 3 & v1$lengths >= 4 ~ "b",
    v1$values == 4 & v1$lengths >= 6 ~ "c"
  )
  hits <- hits[!is.na(hits)]
  # Always return exactly one string. A zero-length or multi-element result
  # would make apply() return a list and turn `group` into a list column.
  if (length(hits) > 0) hits[[1]] else NA_character_
})
df1$group
#[1] "a" "b" "c"
或使用tidyverse
library(data.table)
library(tidyr)
# Classify each subject by its runs of banded scores and attach the label to
# the original wide table. Subjects with no qualifying run get NA.
df1 %>%
  # One row per (id, time point); the score columns become name/value pairs.
  pivot_longer(cols = -id) %>%
  # Band the scores: 1-2 -> 1, 3 stays 3, >= 4 -> 4.
  mutate(newvalue = case_when(
    value %in% 1:2 ~ 1,
    value >= 4 ~ 4,
    TRUE ~ value
  )) %>%
  # `grp` numbers each run of identical banded values. It is computed over the
  # whole column, but counting by (id, grp) keeps runs from spanning subjects.
  # `n` is therefore the length of the run each row belongs to.
  add_count(id, grp = data.table::rleid(newvalue)) %>%
  group_by(id) %>%
  summarise(
    # dplyr::first() is spelled out: with data.table attached after dplyr the
    # bare name resolves to data.table::first(), which returns a zero-length
    # vector (not NA) when no row qualifies.
    group = dplyr::first(na.omit(case_when(
      newvalue == 1 & n >= 6 ~ "a",
      newvalue == 3 & n >= 4 ~ "b",
      newvalue == 4 & n >= 6 ~ "c"
    ))),
    .groups = "drop"
  ) %>%
  # Explicit key: avoids the "Joining with `by = ...`" message and any
  # accidental join on additional shared column names.
  left_join(df1, ., by = "id")
-输出
# A tibble: 3 x 10
# id score1 score2 score3 score4 score5 score6 score7 score8 group
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 101 2 2 2 2 1 2 2 1 a
#2 102 4 4 3 3 3 3 4 4 b
#3 103 4 5 5 5 5 6 5 5 c
所有解决方案的核心都是 rle() 函数,其余部分如何组织可以自行决定。
library(tidyverse, quietly = TRUE)
# Return one group label per subject in `score_df` (in `id` order):
#   "a": >= 6 consecutive scores in the low band (recoded to 1)
#   "b": >= 4 consecutive scores of 3
#   "c": >= 6 consecutive scores of 4 or more (recoded to 4)
#   NA : no qualifying run
score_df %>%
  pivot_longer(score1:score8) %>%
  # Band the scores so a run may mix values of one band.
  # NOTE(review): `value <= 2` also captures scores below 1; the question only
  # mentions 1 and 2 - confirm no such scores occur.
  mutate(value = case_when(
    value <= 2 ~ 1,
    value >= 4 ~ 4,
    TRUE ~ value
  )) %>%
  group_by(id) %>%
  group_map(~ {
    r <- rle(.x$value)
    # Every rule checks the length of runs of its own band. Testing
    # `max(r$values) == 1` instead would require *all* scores to be low and
    # miss e.g. six low scores followed by a 3.
    case_when(
      any(r$lengths[which(r$values == 1)] >= 6) ~ "a",
      any(r$lengths[which(r$values == 3)] >= 4) ~ "b",
      any(r$lengths[which(r$values == 4)] >= 6) ~ "c",
      TRUE ~ NA_character_
    )
  }) %>%
  unlist()
#> [1] "a" "b" "c"
使用基础 R,你可以这样做:
# One regular expression per group, matched against each subject's scores
# pasted into a single string (e.g. "22221221").
# NOTE(review): this encoding assumes every score is a single digit; a score
# of 10 or more would be read as several digits - confirm against the data.
pat <- c(a = "[12]{6}", b = "3{4}", c = "[4-9]{6}")
score_strings <- do.call(paste0, df[-1])
# Start from NA so subjects matching no pattern stay unlabelled.
# max.col() would break the all-FALSE tie at random by default.
group <- rep(NA_character_, length(score_strings))
# Walk the patterns in reverse so that the earliest pattern wins when a
# subject matches more than one.
for (g in rev(names(pat))) {
  group[grepl(pat[[g]], score_strings)] <- g
}
cbind(df, group = group)
id score1 score2 score3 score4 score5 score6 score7 score8 group
1 101 2 2 2 2 1 2 2 1 a
2 102 4 4 3 3 3 3 4 4 b
3 103 4 5 5 5 5 6 5 5 c
我有一个数据集,其中只包含受试者 ID 和各个时间点的分数。有没有办法根据这些分数创建一个分组变量?例如:如果某个受试者有连续 6 次分数为 1 或 2,就把他归入“a”组;如果连续 4 次分数为 3,就归入“b”组;如果连续 6 次分数为 4 或更高,就归入“c”组。
这是一个示例数据集:
id score1 score2 score3 score4 score5 score6 score7 score8 group
101 2 2 2 2 1 2 2 1 a
102 4 4 3 3 3 3 4 4 b
103 4 5 5 5 5 6 5 5 c
这是上面 table 没有“组”列的 R 代码
# Example data from the question (dput output, without the `group` column).
# Bound to `df1` because the apply() and pivot_longer() snippets further down
# read the data under that name.
df1 <- structure(
  list(
    id = c(101, 102, 103),
    score1 = c(2, 4, 4),
    score2 = c(2, 4, 5),
    score3 = c(2, 3, 5),
    score4 = c(2, 3, 5),
    score5 = c(1, 3, 5),
    score6 = c(2, 3, 6),
    score7 = c(2, 4, 5),
    score8 = c(1, 4, 5)
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
如有任何想法,我们将不胜感激!非常感谢:)
用 apply(MARGIN = 1)逐行遍历数据中的数值列:把取值为 1 或 2 的分数替换为 1,把大于等于 4 的分数替换为 4,然后对替换后的行向量调用 rle(run-length encoding,游程编码),取出其中的 'values' 和 'lengths',按照 OP 帖子中给出的条件构造逻辑表达式;满足条件时返回相应的组标签。
library(dplyr)
# Label every subject (row of df1) with a group based on runs of scores:
#   "a": >= 6 consecutive scores of 1 or 2
#   "b": >= 4 consecutive scores of 3
#   "c": >= 6 consecutive scores of 4 or more
# Subjects with no qualifying run get NA.
df1$group <- apply(df1[-1], 1, function(x) {
  # Collapse scores into bands so that a run may mix values of one band:
  # 1-2 -> 1, 3 stays 3, >= 4 -> 4.
  x <- case_when(x %in% 1:2 ~ 1, x >= 4 ~ 4, TRUE ~ x)
  v1 <- rle(x)
  # One candidate label per run; NA for runs that satisfy no rule.
  hits <- case_when(
    v1$values == 1 & v1$lengths >= 6 ~ "a",
    v1$values == 3 & v1$lengths >= 4 ~ "b",
    v1$values == 4 & v1$lengths >= 6 ~ "c"
  )
  hits <- hits[!is.na(hits)]
  # Always return exactly one string. A zero-length or multi-element result
  # would make apply() return a list and turn `group` into a list column.
  if (length(hits) > 0) hits[[1]] else NA_character_
})
df1$group
#[1] "a" "b" "c"
或使用tidyverse
library(data.table)
library(tidyr)
# Classify each subject by its runs of banded scores and attach the label to
# the original wide table. Subjects with no qualifying run get NA.
df1 %>%
  # One row per (id, time point); the score columns become name/value pairs.
  pivot_longer(cols = -id) %>%
  # Band the scores: 1-2 -> 1, 3 stays 3, >= 4 -> 4.
  mutate(newvalue = case_when(
    value %in% 1:2 ~ 1,
    value >= 4 ~ 4,
    TRUE ~ value
  )) %>%
  # `grp` numbers each run of identical banded values. It is computed over the
  # whole column, but counting by (id, grp) keeps runs from spanning subjects.
  # `n` is therefore the length of the run each row belongs to.
  add_count(id, grp = data.table::rleid(newvalue)) %>%
  group_by(id) %>%
  summarise(
    # dplyr::first() is spelled out: with data.table attached after dplyr the
    # bare name resolves to data.table::first(), which returns a zero-length
    # vector (not NA) when no row qualifies.
    group = dplyr::first(na.omit(case_when(
      newvalue == 1 & n >= 6 ~ "a",
      newvalue == 3 & n >= 4 ~ "b",
      newvalue == 4 & n >= 6 ~ "c"
    ))),
    .groups = "drop"
  ) %>%
  # Explicit key: avoids the "Joining with `by = ...`" message and any
  # accidental join on additional shared column names.
  left_join(df1, ., by = "id")
-输出
# A tibble: 3 x 10
# id score1 score2 score3 score4 score5 score6 score7 score8 group
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 101 2 2 2 2 1 2 2 1 a
#2 102 4 4 3 3 3 3 4 4 b
#3 103 4 5 5 5 5 6 5 5 c
所有解决方案的核心都是 rle() 函数,其余部分如何组织可以自行决定。
library(tidyverse, quietly = TRUE)
# Return one group label per subject in `score_df` (in `id` order):
#   "a": >= 6 consecutive scores in the low band (recoded to 1)
#   "b": >= 4 consecutive scores of 3
#   "c": >= 6 consecutive scores of 4 or more (recoded to 4)
#   NA : no qualifying run
score_df %>%
  pivot_longer(score1:score8) %>%
  # Band the scores so a run may mix values of one band.
  # NOTE(review): `value <= 2` also captures scores below 1; the question only
  # mentions 1 and 2 - confirm no such scores occur.
  mutate(value = case_when(
    value <= 2 ~ 1,
    value >= 4 ~ 4,
    TRUE ~ value
  )) %>%
  group_by(id) %>%
  group_map(~ {
    r <- rle(.x$value)
    # Every rule checks the length of runs of its own band. Testing
    # `max(r$values) == 1` instead would require *all* scores to be low and
    # miss e.g. six low scores followed by a 3.
    case_when(
      any(r$lengths[which(r$values == 1)] >= 6) ~ "a",
      any(r$lengths[which(r$values == 3)] >= 4) ~ "b",
      any(r$lengths[which(r$values == 4)] >= 6) ~ "c",
      TRUE ~ NA_character_
    )
  }) %>%
  unlist()
#> [1] "a" "b" "c"
使用基础 R,你可以这样做:
# One regular expression per group, matched against each subject's scores
# pasted into a single string (e.g. "22221221").
# NOTE(review): this encoding assumes every score is a single digit; a score
# of 10 or more would be read as several digits - confirm against the data.
pat <- c(a = "[12]{6}", b = "3{4}", c = "[4-9]{6}")
score_strings <- do.call(paste0, df[-1])
# Start from NA so subjects matching no pattern stay unlabelled.
# max.col() would break the all-FALSE tie at random by default.
group <- rep(NA_character_, length(score_strings))
# Walk the patterns in reverse so that the earliest pattern wins when a
# subject matches more than one.
for (g in rev(names(pat))) {
  group[grepl(pat[[g]], score_strings)] <- g
}
cbind(df, group = group)
id score1 score2 score3 score4 score5 score6 score7 score8 group
1 101 2 2 2 2 1 2 2 1 a
2 102 4 4 3 3 3 3 4 4 b
3 103 4 5 5 5 5 6 5 5 c