如何按顺序按组对值求和
How to sum values by groups in sequence
我有一个数据框,在 duration
列中包含持续时间值,在 gaze_focus
列中包含组值。
df1
duration gaze_focus
29 1.011 periphery
31 1.590 center
33 1.582 center
35 0.571 periphery
37 0.561 center
39 2.136 center
41 0.181 periphery
43 1.475 center
45 0.177 periphery
47 0.940 periphery
49 2.102 center
我想计算 直接相邻的相同组值 的总和以获得此结果:
df2
duration gaze_focus
1 1.011 periphery
2 3.172 center
3 0.571 periphery
4 2.697 center
5 0.181 periphery
6 1.475 center
7 1.117 periphery
8 2.102 center
我知道分组求和这类运算可以用例如 aggregate
或 tapply
来完成,但我不知道如何只对相邻的小段分组求和。感谢您的帮助!
可重现的数据:
# Reproducible example data: one duration per fixation and the gaze_focus
# label it belongs to. Row names are the original (odd) row indices 29..49.
df1 <- data.frame(
  duration = c(
    1.011, 1.59, 1.582, 0.571, 0.561, 2.136,
    0.181, 1.475, 0.177, 0.94, 2.102
  ),
  gaze_focus = c(
    "periphery", "center", "center", "periphery", "center", "center",
    "periphery", "center", "periphery", "periphery", "center"
  ),
  row.names = c(29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L),
  # keep gaze_focus as character on every R version
  stringsAsFactors = FALSE
)
一个dplyr
选项可以是:
# Sum duration over each run of directly adjacent, identical gaze_focus values.
# The run id is built from rle(): every stretch of consecutive equal labels
# gets its own integer, so equal labels that are not adjacent stay separate.
df1 %>%
  group_by(
    gaze_focus,
    rleid = with(rle(gaze_focus), rep(seq_along(lengths), lengths))
  ) %>%
  # across(everything(), sum) replaces the superseded summarise_all(sum);
  # .groups = "drop" returns an ungrouped tibble rather than one that is
  # still grouped by gaze_focus.
  summarise(across(everything(), sum), .groups = "drop") %>%
  # put the runs back into their original sequence
  arrange(rleid)
gaze_focus rleid duration
<chr> <int> <dbl>
1 periphery 1 1.01
2 center 2 3.17
3 periphery 3 0.571
4 center 4 2.70
5 periphery 5 0.181
6 center 6 1.48
7 periphery 7 1.12
8 center 8 2.10
这样可以吗:
> # Fold each row into the next one while both carry the same gaze_focus:
> # the running total moves forward and the consumed row is blanked to NA.
> # seq_len() (rather than 1:(n - 1)) makes the loop a no-op for data frames
> # with fewer than two rows instead of indexing past the data.
> for (i in seq_len(max(nrow(df1) - 1L, 0L))) {
+   if (df1$gaze_focus[i] == df1$gaze_focus[i + 1]) {
+     # carry the accumulated duration into the next row of the same run
+     df1$duration[i + 1] <- df1$duration[i] + df1$duration[i + 1]
+     # mark the consumed row so it can be dropped afterwards
+     df1[i, ] <- NA
+   }
+ }
> df1
duration gaze_focus
29 1.011 periphery
31 NA <NA>
33 3.172 center
35 0.571 periphery
37 NA <NA>
39 2.697 center
41 0.181 periphery
43 1.475 center
45 NA <NA>
47 1.117 periphery
49 2.102 center
> # Keep only the rows that were not blanked to NA by the loop.
> df2 <- na.omit(df1)
> df2
duration gaze_focus
29 1.011 periphery
33 3.172 center
35 0.571 periphery
39 2.697 center
41 0.181 periphery
43 1.475 center
47 1.117 periphery
49 2.102 center
> # Discard the leftover original row labels.
> row.names(df2) <- NULL
我们可以使用 data.table
中的 rleid
把每一段连续的相同值划分为单独的一组。
library(data.table)

# rleid() gives every run of consecutive identical gaze_focus values its own
# id; grouping on that id together with the label sums one run at a time.
# Note that setDT() turns df1 itself into a data.table.
setDT(df1)[
  ,
  .(duration = sum(duration)),
  by = .(rleid = rleid(gaze_focus), gaze_focus)
]
# rleid gaze_focus duration
#1: 1 periphery 1.011
#2: 2 center 3.172
#3: 3 periphery 0.571
#4: 4 center 2.697
#5: 5 periphery 0.181
#6: 6 center 1.475
#7: 7 periphery 1.117
#8: 8 center 2.102
带有 dplyr
和 rleid
的选项
library(dplyr)
library(data.table)

# Tag each run of consecutive identical gaze_focus values with its own id,
# sum duration within every run, then keep only the label and the total.
df1 %>%
  mutate(grp = rleid(gaze_focus)) %>%
  group_by(grp, gaze_focus) %>%
  summarise(duration = sum(duration, na.rm = TRUE), .groups = "drop") %>%
  select(gaze_focus, duration)
-输出
# A tibble: 8 x 2
# gaze_focus duration
# <chr> <dbl>
#1 periphery 1.01
#2 center 3.17
#3 periphery 0.571
#4 center 2.70
#5 periphery 0.181
#6 center 1.48
#7 periphery 1.12
#8 center 2.10
我有一个数据框,在 duration
列中包含持续时间值,在 gaze_focus
列中包含组值。
df1
duration gaze_focus
29 1.011 periphery
31 1.590 center
33 1.582 center
35 0.571 periphery
37 0.561 center
39 2.136 center
41 0.181 periphery
43 1.475 center
45 0.177 periphery
47 0.940 periphery
49 2.102 center
我想计算 直接相邻的相同组值 的总和以获得此结果:
df2
duration gaze_focus
1 1.011 periphery
2 3.172 center
3 0.571 periphery
4 2.697 center
5 0.181 periphery
6 1.475 center
7 1.117 periphery
8 2.102 center
我知道分组求和这类运算可以用例如 aggregate
或 tapply
来完成,但我不知道如何只对相邻的小段分组求和。感谢您的帮助!
可重现的数据:
# Reproducible example data: one duration per fixation and the gaze_focus
# label it belongs to. Row names are the original (odd) row indices 29..49.
df1 <- data.frame(
  duration = c(
    1.011, 1.59, 1.582, 0.571, 0.561, 2.136,
    0.181, 1.475, 0.177, 0.94, 2.102
  ),
  gaze_focus = c(
    "periphery", "center", "center", "periphery", "center", "center",
    "periphery", "center", "periphery", "periphery", "center"
  ),
  row.names = c(29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L),
  # keep gaze_focus as character on every R version
  stringsAsFactors = FALSE
)
一个dplyr
选项可以是:
# Sum duration over each run of directly adjacent, identical gaze_focus values.
# The run id is built from rle(): every stretch of consecutive equal labels
# gets its own integer, so equal labels that are not adjacent stay separate.
df1 %>%
  group_by(
    gaze_focus,
    rleid = with(rle(gaze_focus), rep(seq_along(lengths), lengths))
  ) %>%
  # across(everything(), sum) replaces the superseded summarise_all(sum);
  # .groups = "drop" returns an ungrouped tibble rather than one that is
  # still grouped by gaze_focus.
  summarise(across(everything(), sum), .groups = "drop") %>%
  # put the runs back into their original sequence
  arrange(rleid)
gaze_focus rleid duration
<chr> <int> <dbl>
1 periphery 1 1.01
2 center 2 3.17
3 periphery 3 0.571
4 center 4 2.70
5 periphery 5 0.181
6 center 6 1.48
7 periphery 7 1.12
8 center 8 2.10
这样可以吗:
> # Fold each row into the next one while both carry the same gaze_focus:
> # the running total moves forward and the consumed row is blanked to NA.
> # seq_len() (rather than 1:(n - 1)) makes the loop a no-op for data frames
> # with fewer than two rows instead of indexing past the data.
> for (i in seq_len(max(nrow(df1) - 1L, 0L))) {
+   if (df1$gaze_focus[i] == df1$gaze_focus[i + 1]) {
+     # carry the accumulated duration into the next row of the same run
+     df1$duration[i + 1] <- df1$duration[i] + df1$duration[i + 1]
+     # mark the consumed row so it can be dropped afterwards
+     df1[i, ] <- NA
+   }
+ }
> df1
duration gaze_focus
29 1.011 periphery
31 NA <NA>
33 3.172 center
35 0.571 periphery
37 NA <NA>
39 2.697 center
41 0.181 periphery
43 1.475 center
45 NA <NA>
47 1.117 periphery
49 2.102 center
> # Keep only the rows that were not blanked to NA by the loop.
> df2 <- na.omit(df1)
> df2
duration gaze_focus
29 1.011 periphery
33 3.172 center
35 0.571 periphery
39 2.697 center
41 0.181 periphery
43 1.475 center
47 1.117 periphery
49 2.102 center
> # Discard the leftover original row labels.
> row.names(df2) <- NULL
我们可以使用 data.table
中的 rleid
把每一段连续的相同值划分为单独的一组。
library(data.table)

# rleid() gives every run of consecutive identical gaze_focus values its own
# id; grouping on that id together with the label sums one run at a time.
# Note that setDT() turns df1 itself into a data.table.
setDT(df1)[
  ,
  .(duration = sum(duration)),
  by = .(rleid = rleid(gaze_focus), gaze_focus)
]
# rleid gaze_focus duration
#1: 1 periphery 1.011
#2: 2 center 3.172
#3: 3 periphery 0.571
#4: 4 center 2.697
#5: 5 periphery 0.181
#6: 6 center 1.475
#7: 7 periphery 1.117
#8: 8 center 2.102
带有 dplyr
和 rleid
的选项
library(dplyr)
library(data.table)

# Tag each run of consecutive identical gaze_focus values with its own id,
# sum duration within every run, then keep only the label and the total.
df1 %>%
  mutate(grp = rleid(gaze_focus)) %>%
  group_by(grp, gaze_focus) %>%
  summarise(duration = sum(duration, na.rm = TRUE), .groups = "drop") %>%
  select(gaze_focus, duration)
-输出
# A tibble: 8 x 2
# gaze_focus duration
# <chr> <dbl>
#1 periphery 1.01
#2 center 3.17
#3 periphery 0.571
#4 center 2.70
#5 periphery 0.181
#6 center 1.48
#7 periphery 1.12
#8 center 2.10