如何按顺序按组对值求和

Question

我有一个数据框，在 duration 列中包含持续时间值，在 gaze_focus 列中包含组值。

df1
   duration gaze_focus
29    1.011  periphery
31    1.590     center
33    1.582     center
35    0.571  periphery
37    0.561     center
39    2.136     center
41    0.181  periphery
43    1.475     center
45    0.177  periphery
47    0.940  periphery
49    2.102     center

我想对直接相邻且组值相同的行的 duration 求和，以获得如下结果：

df2
  duration gaze_focus
1    1.011  periphery
2    3.172     center
3    0.571  periphery
4    2.697     center
5    0.181  periphery
6    1.475     center
7    1.117  periphery
8    2.102     center

我知道分组求和之类的运算可以使用 aggregate 或 tapply 等函数完成，但我不知道如何按这种连续的小块分组对值求和。感谢您的帮助！

可重现的数据：

# Reproducible example data: `duration` holds the values to be summed and
# `gaze_focus` holds the group label of each row. The row names are the
# non-consecutive integers 29, 31, ..., 49.
df1 <- data.frame(
  duration = c(
    1.011, 1.590, 1.582, 0.571, 0.561, 2.136,
    0.181, 1.475, 0.177, 0.940, 2.102
  ),
  gaze_focus = c(
    "periphery", "center", "center", "periphery", "center", "center",
    "periphery", "center", "periphery", "periphery", "center"
  ),
  row.names = c(29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L),
  stringsAsFactors = FALSE
)

Answer 1

使用 dplyr 的一种方案：

library(dplyr)

# Sum `duration` over runs of directly adjacent rows sharing a `gaze_focus`.
# rle() yields the length of each run; repeating the run index by its length
# gives every row the id of the run it belongs to.
df1 %>%
  group_by(
    gaze_focus,
    rleid = with(rle(gaze_focus), rep(seq_along(lengths), lengths))
  ) %>%
  # Explicit summarise() replaces the superseded summarise_all();
  # .groups = "drop" returns an ungrouped tibble.
  summarise(duration = sum(duration), .groups = "drop") %>%
  # summarise() orders rows by the grouping keys, so restore the run order.
  arrange(rleid)

  gaze_focus rleid duration
  <chr>      <int>    <dbl>
1 periphery      1    1.01 
2 center         2    3.17 
3 periphery      3    0.571
4 center         4    2.70 
5 periphery      5    0.181
6 center         6    1.48 
7 periphery      7    1.12 
8 center         8    2.10

Answer 2

这个有用吗：

> # Walk over adjacent row pairs (i - 1, i). When both rows share the same
> # gaze_focus, fold the earlier duration into the later row and blank the
> # earlier row, so the last row of each run ends up holding the run total.
> # seq_len(n)[-1] is empty for 0 or 1 rows, unlike 1:(n - 1), which would
> # iterate over c(1, 0) for a single row.
> for (i in seq_len(nrow(df1))[-1]) {
+   if (df1$gaze_focus[i - 1] == df1$gaze_focus[i]) {
+     df1$duration[i] <- df1$duration[i - 1] + df1$duration[i]
+     # Mark the absorbed row as NA so it can be dropped afterwards.
+     df1[i - 1, ] <- NA
+   }
+ }
> df1
   duration gaze_focus
29    1.011  periphery
31       NA       <NA>
33    3.172     center
35    0.571  periphery
37       NA       <NA>
39    2.697     center
41    0.181  periphery
43    1.475     center
45       NA       <NA>
47    1.117  periphery
49    2.102     center
> df2 <- na.omit(df1)  # drop the rows blanked to NA; base R, no pipe package needed
> df2
   duration gaze_focus
29    1.011  periphery
33    3.172     center
35    0.571  periphery
39    2.697     center
41    0.181  periphery
43    1.475     center
47    1.117  periphery
49    2.102     center
> rownames(df2) <- NULL  # reset row names to the default 1..n sequence

Answer 3

我们可以使用 data.table 中的 rleid，将每一段连续相同的值划分为单独的组。

library(data.table)

# setDT() converts df1 to a data.table by reference. Rows are then grouped by
# rleid(gaze_focus), which assigns one id per run of consecutive equal values;
# each run keeps its label and the sum of its durations.
setDT(df1)[
  ,
  list(
    gaze_focus = gaze_focus[1L],
    duration = sum(duration)
  ),
  by = rleid(gaze_focus)
]

#   rleid gaze_focus duration
#1:     1  periphery    1.011
#2:     2     center    3.172
#3:     3  periphery    0.571
#4:     4     center    2.697
#5:     5  periphery    0.181
#6:     6     center    1.475
#7:     7  periphery    1.117
#8:     8     center    2.102

Answer 4

使用 dplyr 和 rleid 的方案

library(dplyr)
library(data.table)
# Group by run id (one id per stretch of consecutive equal gaze_focus values)
# together with the label itself, total the durations per run, then drop the
# helper id column from the result.
df1 %>%
  group_by(run_id = rleid(gaze_focus), gaze_focus) %>%
  summarise(
    duration = sum(duration, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  select(-run_id)

输出：

# A tibble: 8 x 2
#  gaze_focus duration
#  <chr>         <dbl>
#1 periphery     1.01 
#2 center        3.17 
#3 periphery     0.571
#4 center        2.70 
#5 periphery     0.181
#6 center        1.48 
#7 periphery     1.12 
#8 center        2.10

如何按顺序按组对值求和

How to sum values by groups in sequence

grouping

r

sum