查找序列中的最后一期并生成递增/递减计数变量

Find last sequence episode and increment count variable

我的数据是这样的(person-period 文件),其中 hldid 是唯一标识符,variable 表示时间,paid 是我们关心的虚拟(0/1)变量。

   hldid variable paid
1      1        1    0
2      1        2    0
3      1        3    0
4      1        4    1
5      1        5    1
6      1        6    0
7      1        7    1
8      1        8    1
9      1        9    0
10     1       10    0
11     2        1    0
12     2        2    0
13     2        3    1
14     2        4    1
15     2        5    0
16     2        6    1
17     2        7    0
18     2        8    0
19     2        9    0
20     2       10    0

我想实现的是:

  hldid variable  paid  last wwork2
 1     1        1     0     0      0
 2     1        2     0     0      0
 3     1        3     0     0      0
 4     1        4     1     0      0
 5     1        5     1     0      0
 6     1        6     0     0     -2
 7     1        7     1     0     -1
 8     1        8     1     1      0
 9     1        9     0     0      1
10     1       10     0     0      2
11     2        1     0     0      0
12     2        2     0     0      0
13     2        3     1     0      0
14     2        4     1     0     -2
15     2        5     0     0     -1
16     2        6     1     1      0
17     2        7     0     0      1
18     2        8     0     0      2
19     2        9     0     0      0
20     2       10     0     0      0

我想创建这样的变量:(1) 为每个 hldid 找到 paid 的最后一期;(2) 以该最后一期为 0,对它之前的 2 期递减计数(-2、-1),对它之后的 2 期递增计数(1、2)。

到目前为止,这就是我所做的。

  1. 查找 paid 的最后一期

这里比较复杂的是 paid 不是一个连续的序列。例如 hldid == 1 在第 6 期停止 paid,在第 7 期重新开始,最后一期是第 8 期。

所以我的想法是先取出所有 paid == 1 的记录,在每个 hldid 内对这些期依次计数,然后再合并回原数据。但是,我不确定这是不是最有效的策略。

# Keep only the paid periods, then number them 0, 1, 2, ... within each
# household: `work` stays 0 on the first paid row of a household and grows
# by one on each following paid row of the same household.
ddw <- dta %>% filter(paid == 1)
ddw$work <- 0

# seq_len(n)[-1] is empty when there are fewer than two rows, whereas 2:n
# would count backwards (2, 1) and index rows that do not exist.
for (i in seq_len(nrow(ddw))[-1]) {
  # Every remaining row already has paid == 1 (see the filter above), so
  # only the "same household as the previous row" check is needed.
  if (ddw$hldid[i] == ddw$hldid[i - 1]) {
    ddw$work[i] <- ddw$work[i - 1] + 1
  }
}

# Full outer join back onto the complete data: rows of `dta` with no match
# in `ddw` (the unpaid periods) are kept and get NA in `work`.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
ddf <- merge(dta, ddw, by = c('hldid', 'variable', 'paid'), all = TRUE)

然后,我找到最后一期

# Within each household, copy `variable` into `end_work` on the row whose
# `work` counter is the household maximum (the last paid period) and put 0
# elsewhere. Rows where `work` is NA produce NA here.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
ddw2 <- ddf %>%
  group_by(hldid) %>%
  mutate(end_work = ifelse(work == max(work, na.rm = TRUE), variable, 0))

最后我创建了一个虚拟变量来标示最后一个 paid 期的位置

# Turn missing `end_work` values into 0 so the comparison below never sees NA.
ddw2$end_work <- replace(ddw2$end_work, is.na(ddw2$end_work), 0)

# `wwork` is 1 on the row whose period equals the recorded end period,
# and 0 on every other row.
ddw2 <- ddw2 %>%
  group_by(hldid) %>%
  mutate(wwork = ifelse(end_work == variable, 1, 0))
  2. Increment/decrement

现在,我不知道如何在最后一期之前和之后进行递减/递增计数。到目前为止,我只能想出这个:

# Flag the row marked by `wwork == 1` and its immediate neighbours with 1.
# This only marks a +/-1 window; it does not produce signed offsets.
df <- ddw2
df$wwork2 <- 0

n_rows <- nrow(df)

# seq_len(n)[-1] is empty when there are fewer than two rows, whereas 2:n
# would count backwards (2, 1).
for (i in seq_len(n_rows)[-1]) {
  # `&&` is the scalar operator meant for `if` conditions.
  if (df$hldid[i] == df$hldid[i - 1] && df$wwork[i] == 1) {
    df$wwork2[i - 1] <- 1
    df$wwork2[i] <- 1
    # When the flagged row is the last row there is no row i + 1; writing
    # to it would make the column longer than the data frame.
    if (i < n_rows) {
      df$wwork2[i + 1] <- 1
    }
  }
}

数据

# Example person-period data: two households (hldid 1 and 2), ten periods
# each (`variable` runs 1..10), with a 0/1 `paid` indicator per period.
# All three columns are stored as doubles.
dta <- data.frame(
  hldid = rep(c(1, 2), each = 10),
  variable = rep(as.numeric(1:10), times = 2),
  paid = c(
    0, 0, 0, 1, 1, 0, 1, 1, 0, 0, # hldid 1
    0, 0, 1, 1, 0, 1, 0, 0, 0, 0  # hldid 2
  )
)

library(dplyr)

使用 dplyr,按 hldid 分组,然后将 end_work 定义为 variable 与 paid==1 时 variable 的最大值之间的差值,再把绝对值大于 2 的值替换为 0...

library(dplyr)
# For each household:
#   last     - 1 on the latest period with paid == 1, otherwise 0
#   end_work - signed distance (in `variable` units) from that period,
#              kept only within +/-2 and set to 0 everywhere else
# A household with no paid period gets 0 in both columns; taking max() of
# an empty selection would otherwise return -Inf with a warning.
dta2 <- dta %>%
  group_by(hldid) %>%
  mutate(
    # Temporary per-household helper, computed once and dropped below.
    # which() skips NA in `paid` when selecting the paid periods.
    last_paid = if (any(paid == 1, na.rm = TRUE)) {
      max(variable[which(paid == 1)])
    } else {
      NA_real_
    },
    last = as.numeric(!is.na(last_paid) & variable == last_paid),
    end_work = variable - last_paid,
    end_work = replace(end_work, is.na(end_work) | abs(end_work) > 2, 0),
    last_paid = NULL
  ) %>%
  # Drop the grouping so later verbs on dta2 are not silently per-household.
  ungroup()

dta2
   hldid variable  paid  last end_work
   <dbl>    <dbl> <dbl> <dbl>    <dbl>
 1     1        1     0     0        0
 2     1        2     0     0        0
 3     1        3     0     0        0
 4     1        4     1     0        0
 5     1        5     1     0        0
 6     1        6     0     0       -2
 7     1        7     1     0       -1
 8     1        8     1     1        0
 9     1        9     0     0        1
10     1       10     0     0        2
11     2        1     0     0        0
12     2        2     0     0        0
13     2        3     1     0        0
14     2        4     1     0       -2
15     2        5     0     0       -1
16     2        6     1     1        0
17     2        7     0     0        1
18     2        8     0     0        2
19     2        9     0     0        0
20     2       10     0     0        0

每个 hldid 最后一个工作期的编号可以这样汇总:

# One row per household: the largest `variable` among rows with paid == 1.
end_w <- summarise(
  group_by(dta, hldid),
  end_episode = max(variable[paid == 1])
)

end_w
  hldid end_episode
  <dbl>       <dbl>
1     1           8
2     2           6

我们可以试试data.table

library(data.table)
# setDT() converts `dta` to a data.table in place and `:=` adds the two
# columns by reference, so the result lives in `dta` itself.
#   last   - 1 on the row of the latest paid == 1 within the household
#   wwork2 - signed row offset from that row, kept only within +/-2
# Offsets are counted in row positions, so rows are assumed to be ordered
# by period within each household.
setDT(dta)[, c('last', 'wwork2') := {
  pos <- seq_len(.N)
  paid_pos <- which(paid == 1)
  if (length(paid_pos) > 0L) {
    offset <- pos - max(paid_pos)
    # Multiplying by the logical mask zeroes every offset outside +/-2.
    .(as.integer(offset == 0L), offset * (abs(offset) <= 2L))
  } else {
    # No paid period in this household: flag nothing. Without this branch
    # the first row would be treated as the last paid period.
    .(rep(0L, .N), rep(0L, .N))
  }
}, by = hldid]

dta
#     hldid variable paid last wwork2
# 1:     1        1    0    0      0
# 2:     1        2    0    0      0
# 3:     1        3    0    0      0
# 4:     1        4    1    0      0
# 5:     1        5    1    0      0
# 6:     1        6    0    0     -2
# 7:     1        7    1    0     -1
# 8:     1        8    1    1      0
# 9:     1        9    0    0      1
#10:     1       10    0    0      2
#11:     2        1    0    0      0
#12:     2        2    0    0      0
#13:     2        3    1    0      0
#14:     2        4    1    0     -2
#15:     2        5    0    0     -1
#16:     2        6    1    1      0
#17:     2        7    0    0      1
#18:     2        8    0    0      2
#19:     2        9    0    0      0
#20:     2       10    0    0      0