根据连续分数的不同标准创建组变量

Create a group variable based on different criteria of consecutive scores

我有一个数据集,其中仅包含受试者 ID 和不同时间点的分数。有没有办法根据这些分数创建一个分组变量?例如,如果某个受试者连续 6 次得分为 1 或 2,就将其归入“a”组;如果连续 4 次得分为 3,就归入“b”组;如果连续 6 次得分为 4 或更高,就归入“c”组。

这是一个示例数据集:

id  score1  score2  score3  score4  score5  score6  score7  score8 group
101 2       2       2      2        1       2       2       1      a
102 4       4       3      3        3       3       4       4      b
103 4       5       5      5        5       6       5       5      c

以下是生成上表(不含“group”列)的 R 代码:

# Sample data: one row per subject id, eight scores per subject.
# Built as a tibble-classed data frame without needing the tibble package.
structure(
  list(
    id     = c(101, 102, 103),
    score1 = c(2, 4, 4),
    score2 = c(2, 4, 5),
    score3 = c(2, 3, 5),
    score4 = c(2, 3, 5),
    score5 = c(1, 3, 5),
    score6 = c(2, 3, 6),
    score7 = c(2, 4, 5),
    score8 = c(1, 4, 5)
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)

如有任何想法,我们将不胜感激!非常感谢:)

使用 apply(MARGIN = 1)逐行遍历数据中的数值列:将取值 1 到 2 替换为 1,大于等于 4 的替换为 4;然后对替换后的行值调用 rle(run-length encoding,游程编码),提取 'values' 和 'lengths';再按照 OP 帖子中给出的条件构造逻辑表达式,条件满足时返回对应的组值。

library(dplyr)
# Assign each row of df1 to a group based on runs of consecutive scores:
#   "a" - at least 6 consecutive scores of 1 or 2
#   "b" - at least 4 consecutive scores of 3
#   "c" - at least 6 consecutive scores of 4 or higher
# The first column of df1 (the id) is excluded from the scores.
df1$group <- apply(df1[-1], 1, function(x) {
  # Collapse scores into classes so that a run mixing 1s and 2s (or mixing
  # values >= 4) is seen by rle() as one run of a single value.
  score_class <- case_when(x %in% 1:2 ~ 1, x >= 4 ~ 4, TRUE ~ x)
  runs <- rle(score_class)
  labels <- case_when(
    runs$values == 1 & runs$lengths >= 6 ~ "a",
    runs$values == 3 & runs$lengths >= 4 ~ "b",
    runs$values == 4 & runs$lengths >= 6 ~ "c"
  )
  # Return exactly one value per row: the first qualifying run's label, or NA
  # when no run qualifies. Returning zero or several values would make apply()
  # produce a ragged list instead of a character vector.
  labels[!is.na(labels)][1]
})
df1$group
#[1] "a" "b" "c"

或者使用 tidyverse(游程编号借助 data.table 的 rleid()):

library(data.table)
library(tidyr)
# Reshape to one row per (id, time point), collapse scores into classes
# (1-2 -> 1, >= 4 -> 4, everything else unchanged), measure the length of each
# run of equal consecutive classes within an id, and label every id with the
# first criterion met by one of its runs (NA when no run qualifies).
df1 %>%
  pivot_longer(cols = -id) %>%
  mutate(newvalue = case_when(
    value %in% 1:2 ~ 1,
    value >= 4 ~ 4,
    TRUE ~ value
  )) %>%
  # rleid() numbers runs over the whole column; counting by id as well keeps a
  # run from being shared between two adjacent ids.
  add_count(id, grp = data.table::rleid(newvalue)) %>%
  mutate(label = case_when(
    newvalue == 1 & n >= 6 ~ "a",
    newvalue == 3 & n >= 4 ~ "b",
    newvalue == 4 & n >= 6 ~ "c"
  )) %>%
  group_by(id) %>%
  # `[1]` yields NA when nothing qualifies, so each id keeps exactly one row
  # regardless of which package's first() is on the search path (data.table is
  # attached after dplyr here and masks it).
  summarise(group = label[!is.na(label)][1], .groups = "drop") %>%
  left_join(df1, ., by = "id")

-输出

# A tibble: 3 x 10
#     id score1 score2 score3 score4 score5 score6 score7 score8 group
#  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <chr>
#1   101      2      2      2      2      1      2      2      1 a    
#2   102      4      4      3      3      3      3      4      4 b    
#3   103      4      5      5      5      5      6      5      5 c    

所有解决方案的核心都是 rle() 函数,至于围绕它的其余处理方式,则可以自行选择。

library(tidyverse, quietly = TRUE)
# Classify each id from its runs of consecutive scores and return the labels
# as a character vector, one element per id in group order:
#   "a" - at least 6 consecutive scores of 1 or 2
#   "b" - at least 4 consecutive scores of 3
#   "c" - at least 6 consecutive scores of 4 or higher
#   NA  - none of the above
score_df %>%
  pivot_longer(score1:score8) %>%
  # Collapse scores into classes so mixed 1s/2s (or mixed values >= 4) form a
  # single run for rle().
  mutate(value = case_when(
    value <= 2 ~ 1,
    value >= 4 ~ 4,
    TRUE ~ value
  )) %>%
  group_by(id) %>%
  group_map(~ {
    runs <- rle(.x$value)
    # Every criterion is tested on run lengths. Checking only the maximum
    # value for "a" would miss a qualifying run of 1s/2s that is followed by
    # higher scores. which() drops NA comparisons from the index.
    case_when(
      any(runs$lengths[which(runs$values == 1)] >= 6) ~ "a",
      any(runs$lengths[which(runs$values == 3)] >= 4) ~ "b",
      any(runs$lengths[which(runs$values == 4)] >= 6) ~ "c",
      TRUE ~ NA_character_
    )
  }) %>%
  unlist()
#> [1] "a" "b" "c"

使用基础 R,你可以这样做:

# One regular expression per group, matched against each row's scores pasted
# into a single string (e.g. "22221221").
# NOTE(review): this encoding assumes every score is a single digit; a score
# of 10 or more would be read as several scores - confirm against the data.
pat <- c(a = "[12]{6}", b = "3{4}", c = "[4-9]{6}")

score_strings <- do.call(paste0, df[-1])

# Start from NA so rows matching no pattern stay unclassified. max.col() is
# avoided because its default ties.method = "random" assigns an arbitrary
# group to rows with no match (or several matches).
group <- rep(NA_character_, length(score_strings))
# Apply patterns in reverse so that, when a row matches more than one, the
# pattern listed first in `pat` wins.
for (group_name in rev(names(pat))) {
  group[grepl(pat[[group_name]], score_strings)] <- group_name
}

cbind(df, group = group)

   id score1 score2 score3 score4 score5 score6 score7 score8 group
1 101      2      2      2      2      1      2      2      1     a
2 102      4      4      3      3      3      3      4      4     b
3 103      4      5      5      5      5      6      5      5     c