使用 dplyr 进行多变量汇总

Question

我有以下DF

head(sample_data)
  article value date
1       A 21920 2015
2       I   615 2017
3       B  1414 2018
4       D   102 2018
5       I  1096 2015
6       A  2577 2021

完整数据集

dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C", 
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B", 
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A", 
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G", 
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577, 
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386, 
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227, 
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805, 
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015", 
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017", 
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018", 
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021", 
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019", 
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020", 
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019", 
"2016")), row.names = c(NA, -50L), class = "data.frame")

我正在尝试使用 dplyr 来获得类似这样的东西：

sample_data %>%
+   group_by(article, date) %>% 
+   summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups:   article [9]
   article date  weight
   <chr>   <chr>  <dbl>
 1 A       2015   55572
 2 A       2016   33948
 3 A       2017   11716
 4 A       2019    8668
 5 A       2020    5386
 6 A       2021    2577
 7 B       2015   16119
 8 B       2018    1414
 9 B       2019    8137
10 B       2020    1174
# ... with 19 more rows

但是，我想添加另一列，其中包含每年每篇文章的权重占总数（A:I 的总和）的比例。所有文章比例的总和应为每年 1。

我尝试了下面的代码。我怀疑发生这种情况是因为我使用“值”导致所有值都被打印出来，因此出现了所有情况。我如何总结这一点，使其看起来像上面添加的列？

sample_data %>%
+   group_by(article, date) %>% 
+   summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups:   article, date [29]
   article date  weight  prop
   <chr>   <chr>  <dbl> <dbl>
 1 A       2015   55572 0.394
 2 A       2015   55572 0.265
 3 A       2015   55572 0.341
 4 A       2016   33948 0.455
 5 A       2016   33948 0.545
 6 A       2017   11716 1    
 7 A       2019    8668 0.845
 8 A       2019    8668 0.155
 9 A       2020    5386 1    
10 A       2021    2577 1    
# ... with 40 more rows

Answer 1

在最初的 summarize 之后，您每年每篇文章都有一个条目。然后你想知道每篇文章对每年的总贡献是多少，所以你需要再次group_by只使用年份，最后mutate得到每篇文章的比例。

library(dplyr)

sample_data %>%
   group_by(article, date) %>% 
   summarise(weight = sum(value), .groups = "keep") %>%
   group_by(date) %>%
   mutate(prop = weight / sum(weight))
#> # A tibble: 29 x 4
#> # Groups:   date [7]
#>    article date  weight  prop
#>    <chr>   <chr>  <dbl> <dbl>
#>  1 A       2015   55572 0.661
#>  2 A       2016   33948 0.876
#>  3 A       2017   11716 0.632
#>  4 A       2019    8668 0.491
#>  5 A       2020    5386 0.799
#>  6 A       2021    2577 0.628
#>  7 B       2015   16119 0.192
#>  8 B       2018    1414 0.622
#>  9 B       2019    8137 0.461
#> 10 B       2020    1174 0.174
#> # ... with 19 more rows

^{由 reprex package (v2.0.1)}

于 2022-02-19 创建

Answer 2

一个选项也是在第一次汇总中按总和进行分组

library(dplyr)
library(tibble)
library(tidyr)
sample_data %>%
  group_by(date) %>%
  summarise(out = enframe(tapply(value, article, sum)/sum(value), 
   name = 'article', value = 'prop'), .groups = 'drop') %>% 
  unpack(out)
# A tibble: 29 × 3
   date  article    prop
   <chr> <chr>     <dbl>
 1 2015  A       0.661  
 2 2015  B       0.192  
 3 2015  D       0.0923 
 4 2015  E       0.00665
 5 2015  F       0.0263 
 6 2015  H       0.00515
 7 2015  I       0.0164 
 8 2016  A       0.876  
 9 2016  C       0.0853 
10 2016  E       0.0123 
# … with 19 more rows

使用 dplyr 进行多变量汇总

Multiple variable summary with dplyr

r

dplyr

tidyverse