使用 dplyr 进行多变量汇总
Multiple variable summary with dplyr
我有以下DF
head(sample_data)
article value date
1 A 21920 2015
2 I 615 2017
3 B 1414 2018
4 D 102 2018
5 I 1096 2015
6 A 2577 2021
完整数据集
dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C",
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B",
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A",
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G",
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577,
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386,
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227,
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805,
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015",
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017",
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018",
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021",
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019",
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020",
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019",
"2016")), row.names = c(NA, -50L), class = "data.frame")
我正在尝试使用 dplyr 来获得类似这样的东西:
# Console transcript from the question (a leading `+` is R's continuation prompt).
# Sums `value` for every article/date combination into `weight`.
sample_data %>%
+ group_by(article, date) %>%
+ summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups: article [9]
article date weight
<chr> <chr> <dbl>
1 A 2015 55572
2 A 2016 33948
3 A 2017 11716
4 A 2019 8668
5 A 2020 5386
6 A 2021 2577
7 B 2015 16119
8 B 2018 1414
9 B 2019 8137
10 B 2020 1174
# ... with 19 more rows
但是,我想再添加一列,表示每年每篇文章的权重占当年总量(A 到 I 所有文章之和)的比例。每一年中所有文章的比例之和应为 1。
我尝试了下面的代码。我怀疑问题出在我直接使用了 value 列,导致每一行原始值都被输出,因此结果中出现了全部 50 行。我该如何汇总,才能得到与上面相同的表格,只是多出一列比例?
# Attempt from the question (console transcript; a leading `+` is R's continuation prompt).
# `value` is not aggregated inside summarise(), so one row per original observation
# is returned, and `prop` is each observation's share of its own article/date total
# rather than the article's share of the yearly total.
sample_data %>%
+ group_by(article, date) %>%
+ summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups: article, date [29]
article date weight prop
<chr> <chr> <dbl> <dbl>
1 A 2015 55572 0.394
2 A 2015 55572 0.265
3 A 2015 55572 0.341
4 A 2016 33948 0.455
5 A 2016 33948 0.545
6 A 2017 11716 1
7 A 2019 8668 0.845
8 A 2019 8668 0.155
9 A 2020 5386 1
10 A 2021 2577 1
# ... with 40 more rows
在最初的 summarise 之后,每年每篇文章各有一行。接下来要计算每篇文章占当年总量的比例,因此需要只按年份(date)再次 group_by,最后用 mutate 得到每篇文章的比例。
library(dplyr)
# Share of each article in the yearly total.
# Step 1: collapse to one row per (article, date) holding the summed value.
# Step 2: regroup by date only, so sum(weight) is the total across all
#         articles of that year and the proportions add up to 1 per year.
sample_data %>%
  group_by(article, date) %>%
  # The grouping is replaced by the next group_by(), so drop it here.
  summarise(weight = sum(value), .groups = "drop") %>%
  group_by(date) %>%
  mutate(prop = weight / sum(weight)) %>%
  # Return an ungrouped tibble so later verbs are not silently applied per year.
  ungroup()
#> # A tibble: 29 x 4
#> article date weight prop
#> <chr> <chr> <dbl> <dbl>
#> 1 A 2015 55572 0.661
#> 2 A 2016 33948 0.876
#> 3 A 2017 11716 0.632
#> 4 A 2019 8668 0.491
#> 5 A 2020 5386 0.799
#> 6 A 2021 2577 0.628
#> 7 B 2015 16119 0.192
#> 8 B 2018 1414 0.622
#> 9 B 2019 8137 0.461
#> 10 B 2020 1174 0.174
#> # ... with 19 more rows
由 reprex 包 (v2.0.1) 于 2022-02-19 创建
另一种做法是只按年份分组,在一次 summarise 中直接计算每篇文章的合计值占当年总量的比例:
library(dplyr)
library(tibble)
library(tidyr)
# Alternative: group by year only and derive every article's share inside a
# single summarise() call.
sample_data %>%
  group_by(date) %>%
  summarise(
    # tapply() sums `value` per article within the year; dividing by the
    # year's total turns the sums into shares. enframe() converts the named
    # result into a two-column tibble (article, prop).
    out = enframe(
      tapply(value, article, sum) / sum(value),
      name = "article",
      value = "prop"
    ),
    .groups = "drop"
  ) %>%
  # `out` is a data-frame column; unpack() spreads it into regular columns.
  unpack(out)
# A tibble: 29 × 3
date article prop
<chr> <chr> <dbl>
1 2015 A 0.661
2 2015 B 0.192
3 2015 D 0.0923
4 2015 E 0.00665
5 2015 F 0.0263
6 2015 H 0.00515
7 2015 I 0.0164
8 2016 A 0.876
9 2016 C 0.0853
10 2016 E 0.0123
# … with 19 more rows
我有以下DF
head(sample_data)
article value date
1 A 21920 2015
2 I 615 2017
3 B 1414 2018
4 D 102 2018
5 I 1096 2015
6 A 2577 2021
完整数据集
dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C",
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B",
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A",
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G",
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577,
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386,
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227,
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805,
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015",
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017",
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018",
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021",
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019",
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020",
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019",
"2016")), row.names = c(NA, -50L), class = "data.frame")
我正在尝试使用 dplyr 来获得类似这样的东西:
# Console transcript from the question (a leading `+` is R's continuation prompt).
# Sums `value` for every article/date combination into `weight`.
sample_data %>%
+ group_by(article, date) %>%
+ summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups: article [9]
article date weight
<chr> <chr> <dbl>
1 A 2015 55572
2 A 2016 33948
3 A 2017 11716
4 A 2019 8668
5 A 2020 5386
6 A 2021 2577
7 B 2015 16119
8 B 2018 1414
9 B 2019 8137
10 B 2020 1174
# ... with 19 more rows
但是,我想再添加一列,表示每年每篇文章的权重占当年总量(A 到 I 所有文章之和)的比例。每一年中所有文章的比例之和应为 1。
我尝试了下面的代码。我怀疑问题出在我直接使用了 value 列,导致每一行原始值都被输出,因此结果中出现了全部 50 行。我该如何汇总,才能得到与上面相同的表格,只是多出一列比例?
# Attempt from the question (console transcript; a leading `+` is R's continuation prompt).
# `value` is not aggregated inside summarise(), so one row per original observation
# is returned, and `prop` is each observation's share of its own article/date total
# rather than the article's share of the yearly total.
sample_data %>%
+ group_by(article, date) %>%
+ summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups: article, date [29]
article date weight prop
<chr> <chr> <dbl> <dbl>
1 A 2015 55572 0.394
2 A 2015 55572 0.265
3 A 2015 55572 0.341
4 A 2016 33948 0.455
5 A 2016 33948 0.545
6 A 2017 11716 1
7 A 2019 8668 0.845
8 A 2019 8668 0.155
9 A 2020 5386 1
10 A 2021 2577 1
# ... with 40 more rows
在最初的 summarise 之后,每年每篇文章各有一行。接下来要计算每篇文章占当年总量的比例,因此需要只按年份(date)再次 group_by,最后用 mutate 得到每篇文章的比例。
library(dplyr)
# Share of each article in the yearly total.
# Step 1: collapse to one row per (article, date) holding the summed value.
# Step 2: regroup by date only, so sum(weight) is the total across all
#         articles of that year and the proportions add up to 1 per year.
sample_data %>%
  group_by(article, date) %>%
  # The grouping is replaced by the next group_by(), so drop it here.
  summarise(weight = sum(value), .groups = "drop") %>%
  group_by(date) %>%
  mutate(prop = weight / sum(weight)) %>%
  # Return an ungrouped tibble so later verbs are not silently applied per year.
  ungroup()
#> # A tibble: 29 x 4
#> article date weight prop
#> <chr> <chr> <dbl> <dbl>
#> 1 A 2015 55572 0.661
#> 2 A 2016 33948 0.876
#> 3 A 2017 11716 0.632
#> 4 A 2019 8668 0.491
#> 5 A 2020 5386 0.799
#> 6 A 2021 2577 0.628
#> 7 B 2015 16119 0.192
#> 8 B 2018 1414 0.622
#> 9 B 2019 8137 0.461
#> 10 B 2020 1174 0.174
#> # ... with 19 more rows
由 reprex 包 (v2.0.1) 于 2022-02-19 创建
另一种做法是只按年份分组,在一次 summarise 中直接计算每篇文章的合计值占当年总量的比例:
library(dplyr)
library(tibble)
library(tidyr)
# Alternative: group by year only and derive every article's share inside a
# single summarise() call.
sample_data %>%
  group_by(date) %>%
  summarise(
    # tapply() sums `value` per article within the year; dividing by the
    # year's total turns the sums into shares. enframe() converts the named
    # result into a two-column tibble (article, prop).
    out = enframe(
      tapply(value, article, sum) / sum(value),
      name = "article",
      value = "prop"
    ),
    .groups = "drop"
  ) %>%
  # `out` is a data-frame column; unpack() spreads it into regular columns.
  unpack(out)
# A tibble: 29 × 3
date article prop
<chr> <chr> <dbl>
1 2015 A 0.661
2 2015 B 0.192
3 2015 D 0.0923
4 2015 E 0.00665
5 2015 F 0.0263
6 2015 H 0.00515
7 2015 I 0.0164
8 2016 A 0.876
9 2016 C 0.0853
10 2016 E 0.0123
# … with 19 more rows