如何按顺序按组对值求和

How to sum values by groups in sequence

我有一个数据框,在 duration 列中包含持续时间值,在 gaze_focus 列中包含组值。

df1
   duration gaze_focus
29    1.011  periphery
31    1.590     center
33    1.582     center
35    0.571  periphery
37    0.561     center
39    2.136     center
41    0.181  periphery
43    1.475     center
45    0.177  periphery
47    0.940  periphery
49    2.102     center

我想对直接相邻且组值相同的行的 duration 求和,以获得如下结果:

df2
  duration gaze_focus
1    1.011  periphery
2    3.172     center
3    0.571  periphery
4    2.697     center
5    0.181  periphery
6    1.475     center
7    1.117  periphery
8    2.102     center

我知道分组求和等数学运算可以使用例如 aggregate 或 tapply 来完成,但我不知道如何按连续的小块分组对值求和。感谢您的帮助!

可重现的数据:

# Reproducible example data in dput()-style form: 11 rows with a numeric
# `duration` column and a character `gaze_focus` column.
# The row names are the odd integers 29..49 rather than 1..n.
df1 <- structure(list(duration = c(1.011, 1.59, 1.582, 0.571, 0.561, 
2.136, 0.181, 1.475, 0.177, 0.94, 2.102), gaze_focus = c("periphery", 
"center", "center", "periphery", "center", "center", "periphery", 
"center", "periphery", "periphery", "center")), row.names = c(29L, 
31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L), class = "data.frame")

一个dplyr选项可以是:

library(dplyr)

# Sum `duration` within each run of consecutive identical `gaze_focus` values.
df1 %>%
  # rle() gives the length of every run; repeating each run's index by its
  # length yields one run id per row, so non-adjacent repeats of the same
  # value end up in different groups.
  group_by(
    gaze_focus,
    rleid = with(rle(gaze_focus), rep(seq_along(lengths), lengths))
  ) %>%
  # across() replaces the superseded summarise_all(); .groups = "drop" returns
  # an ungrouped tibble instead of one that is still grouped by gaze_focus.
  summarise(across(everything(), sum), .groups = "drop") %>%
  # The summary is sorted by gaze_focus first; re-sort by run id to restore
  # the original order of the runs.
  arrange(rleid)

  gaze_focus rleid duration
  <chr>      <int>    <dbl>
1 periphery      1    1.01 
2 center         2    3.17 
3 periphery      3    0.571
4 center         4    2.70 
5 periphery      5    0.181
6 center         6    1.48 
7 periphery      7    1.12 
8 center         8    2.10 

这个有用吗:

> # Merge runs of consecutive identical gaze_focus values in place: each
> # duration is carried forward into the next row of the same run, and the
> # row it came from is set to NA so it can be dropped afterwards.
> # seq_len() gives an empty sequence for data frames with fewer than two
> # rows, where 1:(nrow(df1) - 1) would count down (1, 0) and fail.
> for (i in seq_len(max(nrow(df1) - 1L, 0L))) {
+   if (df1$gaze_focus[i] == df1$gaze_focus[i + 1L]) {
+     # Accumulate the running total of the run into the following row.
+     df1$duration[i + 1L] <- df1$duration[i] + df1$duration[i + 1L]
+     # Blank out the row that has just been merged.
+     df1[i, ] <- NA
+   }
+ }
> df1
   duration gaze_focus
29    1.011  periphery
31       NA       <NA>
33    3.172     center
35    0.571  periphery
37       NA       <NA>
39    2.697     center
41    0.181  periphery
43    1.475     center
45       NA       <NA>
47    1.117  periphery
49    2.102     center
> df2 <- df1 %>% na.omit()
> df2
   duration gaze_focus
29    1.011  periphery
33    3.172     center
35    0.571  periphery
39    2.697     center
41    0.181  periphery
43    1.475     center
47    1.117  periphery
49    2.102     center
> rownames(df2) <- NULL

我们可以使用 data.table 中的 rleid 将每个连续的值创建为单独的组。

library(data.table)

# Label every run of consecutive identical gaze_focus values with rleid() and
# total the durations per run. gaze_focus is constant within a run, so it can
# serve as a second grouping key to carry the label into the result.
# Note: setDT() converts df1 to a data.table by reference.
setDT(df1)[, .(duration = sum(duration)),
           by = .(rleid = rleid(gaze_focus), gaze_focus)]

#   rleid gaze_focus duration
#1:     1  periphery    1.011
#2:     2     center    3.172
#3:     3  periphery    0.571
#4:     4     center    2.697
#5:     5  periphery    0.181
#6:     6     center    1.475
#7:     7  periphery    1.117
#8:     8     center    2.102

一个结合 dplyr 与 data.table 中 rleid 的选项:

library(dplyr)
library(data.table)
# Sum `duration` over each run of consecutive identical `gaze_focus` values.
df1 %>%
  # One id per run of identical adjacent values (rleid() is from data.table).
  mutate(run_id = rleid(gaze_focus)) %>%
  group_by(run_id, gaze_focus) %>%
  summarise(
    duration = sum(duration, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # The run id was only needed for grouping; keep the label and the total.
  select(gaze_focus, duration)

输出:

# A tibble: 8 x 2
#  gaze_focus duration
#  <chr>         <dbl>
#1 periphery     1.01 
#2 center        3.17 
#3 periphery     0.571
#4 center        2.70 
#5 periphery     0.181
#6 center        1.48 
#7 periphery     1.12 
#8 center        2.10