r 按拆分规则重新编码

r recode by a splitting rule

我有一个学生数据集,其中包含学生编号(person)、问题编号(question,共 5 个问题)以及学生每次访问某个问题的序号(sequence)。我想创建一个变量,用来区分学生在看完所有问题之后究竟是从哪里开始复习的。

这是一个示例数据集:

# Example data: one row per visit to a question.
#   person   - student id (11 rows for student 1, then 17 rows for student 2)
#   question - id of the question being visited (1 to 5)
#   sequence - running count of that student's visits to that question
data <- data.frame(
  person = rep(c(1, 2), times = c(11, 17)),
  question = c(
    1, 2, 2, 3, 3, 3, 4, 3, 5, 1, 2,
    1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 4, 3, 4, 4, 5, 4, 5
  ),
  sequence = c(
    1, 1, 2, 1, 2, 3, 1, 4, 1, 2, 3,
    1, 2, 3, 1, 1, 1, 2, 3, 1, 2, 4, 2, 5, 6, 3, 7, 4
  )
)

# Print the data frame.
data
   person question sequence
1       1        1        1
2       1        2        1
3       1        2        2
4       1        3        1
5       1        3        2
6       1        3        3
7       1        4        1
8       1        3        4
9       1        5        1
10      1        1        2
11      1        2        3
12      2        1        1
13      2        1        2
14      2        1        3
15      2        2        1
16      2        3        1
17      2        4        1
18      2        4        2
19      2        4        3
20      2        5        1
21      2        5        2
22      2        4        4
23      2        3        2
24      2        4        5
25      2        4        6
26      2        5        3
27      2        4        7
28      2        5        4

sequence 变量用序号记录对同一问题的每一次访问。一般来说,学生可以在看完所有问题之前就重访某个问题。但是,attempt 变量只有在学生已经看过全部 5 个问题之后才应记为 review,在此之前的访问都记为 initial。加上这个新变量后,我想得到如下的数据集。

> data
   person question sequence attempt
1       1        1        1 initial
2       1        2        1 initial
3       1        2        2 initial
4       1        3        1 initial
5       1        3        2 initial
6       1        3        3 initial
7       1        4        1 initial
8       1        3        4 initial
9       1        5        1 initial
10      1        1        2  review
11      1        2        3  review
12      2        1        1 initial
13      2        1        2 initial
14      2        1        3 initial
15      2        2        1 initial
16      2        3        1 initial
17      2        4        1 initial
18      2        4        2 initial
19      2        4        3 initial
20      2        5        1 initial
21      2        5        2 initial
22      2        4        4  review
23      2        3        2  review
24      2        4        5  review
25      2        4        6  review
26      2        5        3  review
27      2        4        7  review
28      2        5        4  review

有什么想法吗? 谢谢!

一种方法是找到复习开始的位置(即看到第五个问题之后的下一个条目),以及 sequence 为 2 的位置,参见 v1 和 v2。然后,按 person 取子集并对每个子集循环;由于此时已经知道复习从哪里开始,就可以补全 attempt 变量中缺失的条目。

# Label every row "initial" or "review".
#
# For each person the review starts at the first row that
#   (v1) comes after that person has seen every question at least once, and
#   (v2) opens a new visit, i.e. its question differs from the previous row's.
# Rows are assumed to be in visiting order within each person.
#
# A test based only on "previous question is 5 and sequence is 2" is not
# enough: it flags row 21, where person 2 is still on question 5, so the start
# of the review is derived from the questions themselves.

# Number of distinct questions that must be seen before a review can begin.
n.questions <- length(unique(data$question))

# Per person: number of distinct questions seen BEFORE the current row.
seen.before <- ave(
  data$question, data$person,
  FUN = function(q) c(0, cumsum(!duplicated(q))[-length(q)])
)
v1 <- seen.before >= n.questions

# Per person: TRUE where the question changes from the previous row
# (the first row of a person is never a change).
v2 <- ave(
  data$question, data$person,
  FUN = function(q) c(0, diff(q) != 0)
) == 1

# Keep only the first qualifying row of each person as the review marker.
starts <- v1 & v2
first.start <- starts &
  ave(as.numeric(starts), data$person, FUN = cumsum) == 1
data$attempt <- ifelse(first.start, "review", NA_character_)

persons <- unique(data$person)

persons.list <- vector(mode = "list", length = length(persons))

for (i in seq_along(persons)) {
  person.i <- subset(data, person == persons[i])
  n <- which(person.i$attempt == "review")
  # A person who never reaches the review phase has no marker; guard so that
  # the range below is only built when a start row exists.
  if (length(n) > 0) {
    person.i$attempt[n[1]:nrow(person.i)] <- "review"
  }
  person.i$attempt[is.na(person.i$attempt)] <- "initial"

  persons.list[[i]] <- person.i
}

do.call(rbind, persons.list)
#    person question sequence attempt
# 1       1        1        1 initial
# 2       1        2        1 initial
# 3       1        2        2 initial
# 4       1        3        1 initial
# 5       1        3        2 initial
# 6       1        3        3 initial
# 7       1        4        1 initial
# 8       1        3        4 initial
# 9       1        5        1 initial
# 10      1        1        2  review
# 11      1        2        3  review
# 12      2        1        1 initial
# 13      2        1        2 initial
# 14      2        1        3 initial
# 15      2        2        1 initial
# 16      2        3        1 initial
# 17      2        4        1 initial
# 18      2        4        2 initial
# 19      2        4        3 initial
# 20      2        5        1 initial
# 21      2        5        2 initial
# 22      2        4        4  review
# 23      2        3        2  review
# 24      2        4        5  review
# 25      2        4        6  review
# 26      2        5        3  review
# 27      2        4        7  review
# 28      2        5        4  review

或者,您也可以使用 lapply:

# Same fill-forward step written with lapply(): for every person, everything
# from the row marked "review" in data$attempt to that person's last row
# becomes "review", and the remaining (NA) rows become "initial".
# Reads `data` (with the attempt markers) and `persons` defined earlier.
do.call(rbind,
        lapply(persons, function(x) {
          person.x <- subset(data, person == x)
          n <- which(person.x$attempt == "review")
          # Only fill when a marker exists; starting at n[1] (already
          # "review") instead of n + 1 keeps the range inside the rows even
          # when the review begins on the person's last row.
          if (length(n) > 0) {
            person.x$attempt[n[1]:nrow(person.x)] <- "review"
          }
          person.x$attempt[is.na(person.x$attempt)] <- "initial"
          person.x
        }))

多么具有挑战性的问题。花了将近2个小时才找到解决方案。

试试这个

library(dplyr)

# Running count of distinct values.
#
# var: a vector.
# Returns an integer vector of the same length as `var`; element i is the
# number of distinct values among var[1], ..., var[i]. Empty input gives
# integer(0).
dist_cum <- function(var) {
  # Each first occurrence adds one to the count; repeats add nothing.
  cumsum(!duplicated(var))
}

# Label each row "initial" or "review" and print the result.
# Helper columns (all dropped at the end):
#   var0 - number of distinct questions in the whole data set
#   var1 - per person, running count of distinct questions seen so far
#   var2 - per person, id of the current run of consecutive rows on the
#          same question
#   var3 - 0 while the row is a first visit (sequence == 1) or not every
#          question has been seen yet, otherwise 1
#   var4 - minimum of var3 within the run, so a run that contains a first
#          visit stays "initial" as a whole
data %>%
  mutate(var0 = n_distinct(question)) %>%
  group_by(person) %>%
  mutate(
    var1 = dist_cum(question),
    var2 = cumsum(c(1, diff(question) != 0))
  ) %>%
  ungroup() %>%
  mutate(var3 = if_else(sequence == 1 | var1 < var0, 0, 1)) %>%
  group_by(person, var2) %>%
  mutate(var4 = min(var3)) %>%
  ungroup() %>%
  # The new column is named `attempt`, as asked for in the question.
  mutate(attempt = if_else(var4 == 0, "initial", "review")) %>%
  select(-starts_with("var")) %>%
  as.data.frame()

# Result:
#
#    person question sequence attempt
# 1       1        1        1 initial
# 2       1        2        1 initial
# 3       1        2        2 initial
# 4       1        3        1 initial
# 5       1        3        2 initial
# 6       1        3        3 initial
# 7       1        4        1 initial
# 8       1        3        4 initial
# 9       1        5        1 initial
# 10      1        1        2  review
# 11      1        2        3  review
# 12      2        1        1 initial
# 13      2        1        2 initial
# 14      2        1        3 initial
# 15      2        2        1 initial
# 16      2        3        1 initial
# 17      2        4        1 initial
# 18      2        4        2 initial
# 19      2        4        3 initial
# 20      2        5        1 initial
# 21      2        5        2 initial
# 22      2        4        4  review
# 23      2        3        2  review
# 24      2        4        5  review
# 25      2        4        6  review
# 26      2        5        3  review
# 27      2        4        7  review
# 28      2        5        4  review

dist_cum 是计算滚动(累计)不同值个数的函数(出处见 Source)。var0…var4 是辅助变量。