在R中以长格式(相对于基组)按组计算跨行的增长率

Calculating growth rates across rows by groups in the long format (relative to a base group) in R

我想计算长格式数据集中各行的增长率(这里的组是“国家”和“年份”)。由于增长率应该与“基线”情景下的相同值(即相同的“国家”和相同的“年份”)相关(而不是相对于上一行),我通过将数据格式更改为宽格式,如下:

df <- spread(df, scenario, value) %>% 
    mutate(NDC_growth=((NDC/Baseline)-1)*100,
       `Partial BCA_growth`=((`Partial BCA`/Baseline)-1)*100,
       BCA_growth=((BCA/Baseline)-1)*100,
       `Full BCA_growth`=((`Full BCA`/Baseline)-1)*100 ) 

有没有办法在长格式中做到这一点?

这是数据:

    df<- structure(list(country = c("CAN", "CAN", "CAN", "CAN", "CAN", 
"CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", 
"CAN", "NCOA", "NCOA", "NCOA", "NCOA", "NCOA"), year = c("2020", 
"2020", "2020", "2020", "2020", "2025", "2025", "2025", "2025", 
"2025", "2030", "2030", "2030", "2030", "2030", "2020", "2020", 
"2020", "2020", "2020"), scenario = c("Baseline", "BCA", "Full BCA", 
"NDC", "Partial BCA", "Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", 
"Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", "Baseline", 
"BCA", "Full BCA", "NDC", "Partial BCA"), value = c(50527.8708215592, 
50487.4619290311, 50485.0924261504, 50489.4453487844, 50486.1975947164, 
55845.9708589775, 55070.2745559464, 55133.107605613, 55153.4525662034, 
55065.0036253937, 61463.2383809614, 59893.8712077455, 59971.8726308887, 
59936.72156767, 59875.7762254252, 338418.917408225, 338420.617142445, 
338428.007621131, 338419.514027857, 338427.263672463)), row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

你可以这样做:

df %>% 
  group_by(country, year) %>% 
  mutate(growth = (value / value[scenario == "Baseline"] - 1) * 100)
#> # A tibble: 20 x 5
#> # Groups:   country, year [4]
#>    country year  scenario      value    growth
#>    <chr>   <chr> <chr>         <dbl>     <dbl>
#>  1 CAN     2020  Baseline     50528.  0       
#>  2 CAN     2020  BCA          50487. -0.0800  
#>  3 CAN     2020  Full BCA     50485. -0.0847  
#>  4 CAN     2020  NDC          50489. -0.0760  
#>  5 CAN     2020  Partial BCA  50486. -0.0825  
#>  6 CAN     2025  Baseline     55846.  0       
#>  7 CAN     2025  BCA          55070. -1.39    
#>  8 CAN     2025  Full BCA     55133. -1.28    
#>  9 CAN     2025  NDC          55153. -1.24    
#> 10 CAN     2025  Partial BCA  55065. -1.40    
#> 11 CAN     2030  Baseline     61463.  0       
#> 12 CAN     2030  BCA          59894. -2.55    
#> 13 CAN     2030  Full BCA     59972. -2.43    
#> 14 CAN     2030  NDC          59937. -2.48    
#> 15 CAN     2030  Partial BCA  59876. -2.58    
#> 16 NCOA    2020  Baseline    338419.  0       
#> 17 NCOA    2020  BCA         338421.  0.000502
#> 18 NCOA    2020  Full BCA    338428.  0.00269 
#> 19 NCOA    2020  NDC         338420.  0.000176
#> 20 NCOA    2020  Partial BCA 338427.  0.00247 

要根据每个组中的第一个使用进行计算,嗯,first。如果 Baseline 始终是每个组中的第一个,则这是有效的。

df<- structure(list(
  country = c("CAN", "CAN", "CAN", "CAN", "CAN", 
              "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", 
              "CAN", "NCOA", "NCOA", "NCOA", "NCOA", "NCOA"), 
  year = c("2020", "2020", "2020", "2020", "2020", "2025", "2025", 
           "2025", "2025", "2025", "2030", "2030", "2030", "2030", "2030", 
           "2020", "2020", "2020", "2020", "2020"), 
  scenario = c("Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", 
               "Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", 
               "Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", "Baseline", 
               "BCA", "Full BCA", "NDC", "Partial BCA"), 
  value = c(50527.8708215592, 50487.4619290311, 50485.0924261504, 
            50489.4453487844, 50486.1975947164, 55845.9708589775, 55070.2745559464, 
            55133.107605613, 55153.4525662034, 55065.0036253937, 61463.2383809614, 
            59893.8712077455, 59971.8726308887, 59936.72156767, 59875.7762254252, 
            338418.917408225, 338420.617142445, 338428.007621131, 338419.514027857, 
            338427.263672463)), row.names = c(NA, -20L), 
  class = c("tbl_df", "tbl", "data.frame"))

suppressPackageStartupMessages(library(dplyr))

df %>%
  group_by(country, year) %>%
  mutate(growth = (value/first(value) - 1)*100)
#> # A tibble: 20 × 5
#> # Groups:   country, year [4]
#>    country year  scenario      value    growth
#>    <chr>   <chr> <chr>         <dbl>     <dbl>
#>  1 CAN     2020  Baseline     50528.  0       
#>  2 CAN     2020  BCA          50487. -0.0800  
#>  3 CAN     2020  Full BCA     50485. -0.0847  
#>  4 CAN     2020  NDC          50489. -0.0760  
#>  5 CAN     2020  Partial BCA  50486. -0.0825  
#>  6 CAN     2025  Baseline     55846.  0       
#>  7 CAN     2025  BCA          55070. -1.39    
#>  8 CAN     2025  Full BCA     55133. -1.28    
#>  9 CAN     2025  NDC          55153. -1.24    
#> 10 CAN     2025  Partial BCA  55065. -1.40    
#> 11 CAN     2030  Baseline     61463.  0       
#> 12 CAN     2030  BCA          59894. -2.55    
#> 13 CAN     2030  Full BCA     59972. -2.43    
#> 14 CAN     2030  NDC          59937. -2.48    
#> 15 CAN     2030  Partial BCA  59876. -2.58    
#> 16 NCOA    2020  Baseline    338419.  0       
#> 17 NCOA    2020  BCA         338421.  0.000502
#> 18 NCOA    2020  Full BCA    338428.  0.00269 
#> 19 NCOA    2020  NDC         338420.  0.000176
#> 20 NCOA    2020  Partial BCA 338427.  0.00247

reprex package (v2.0.1)

于 2022-05-08 创建