使用 dplyr/tidyr 将数据转换为宽格式,自动对变量排序,并对展开后的列求和

using dplyr to widen data, automatically order vars and sum widened columns

我正在尝试实现一个之前已经完成(并且得到过帮助)的功能,但现在想在结果的基础上再添加几列:在每个年份的那一组列之后加一列该年的小计,并在最后加一列总计。另外,有两列(CARCINOGEN 和 CLEAN_AIR)在转换过程中被丢掉了,我希望把它们保留下来并同样汇总成合计值。如果需要更多信息,请告诉我。提前致谢。

    # Sample data: one row per state / reporting year / county / release category.
    # Each categorical column is a factor whose level order is given explicitly,
    # because later sorting follows the level order rather than alphabetical order.
    np <- data.frame(
      STATE_ABBR = factor(
        c("KY", "CA", "KY", "KY", "CA", "NM", "NM", "KY", "KY"),
        levels = c("CA", "KY", "NM")
      ),
      REPORTING_YEAR = factor(
        c("2005", "1990", "2005", "2005", "2000", "1990", "2000", "1990", "2000"),
        levels = c("1990", "2000", "2005")
      ),
      COUNTY_NAME = factor(
        c("ORANGE", "FRESNO", "CARROLL", "CARROLL", "ORANGE",
          "CARROLL", "FRESNO", "TAYLOR", "JEFFERSON"),
        levels = c("FRESNO", "ORANGE", "CARROLL", "JEFFERSON", "TAYLOR")
      ),
      CATEGORY = factor(
        c("AIR", "OnSite_LAND", "AIR", "OnSite_LAND", "OnSite_LAND",
          "AIR", "POTW METALS", "POTW METALS", "OnSite_LAND"),
        levels = c("AIR", "OnSite_LAND", "POTW METALS")
      ),
      `SUM(REL_EST_AMT_SUM)` = c(14000, 23149, 5617000, 123, 23, 250, 1300, 0, 5),
      CARCINOGEN = factor(
        c("Y", "N", "N", "N", "N", "Y", "N", "N", "N"),
        levels = c("N", "Y")
      ),
      CLEAN_AIR = factor(
        c("Y", "N", "Y", "Y", "Y", "Y", "Y", "N", "Y"),
        levels = c("N", "Y")
      ),
      # Keep the non-syntactic column name `SUM(REL_EST_AMT_SUM)` as written.
      check.names = FALSE,
      row.names = c("62993", "49717", "63290", "00005", "19700",
                    "332510", "323501", "67491", "62086")
    )
      
    # Sort by state, then reporting year, then county.
    NP <- arrange(
      .data = np,
      STATE_ABBR, REPORTING_YEAR, COUNTY_NAME
    )


NP
           STATE_ABBR REPORTING_YEAR COUNTY_NAME    CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
    49717          CA           1990      FRESNO OnSite_LAND                23149          N         N
    19700          CA           2000      ORANGE OnSite_LAND                   23          N         Y
    67491          KY           1990      TAYLOR POTW METALS                    0          N         N
    62086          KY           2000   JEFFERSON OnSite_LAND                    5          N         Y
    62993          KY           2005      ORANGE         AIR                14000          Y         Y
    63290          KY           2005     CARROLL         AIR              5617000          N         Y
    00005          KY           2005     CARROLL OnSite_LAND                  123          N         Y
    332510         NM           1990     CARROLL         AIR                  250          Y         Y
    323501         NM           2000      FRESNO POTW METALS                 1300          N         Y
    # Widen to one row per state/county with one column per year + category
    # combination; duplicate cells are summed and missing combinations become 0.
    NP <- NP %>%
      pivot_wider(
        id_cols = c(STATE_ABBR, COUNTY_NAME),
        names_from = c(REPORTING_YEAR, CATEGORY),
        values_from = `SUM(REL_EST_AMT_SUM)`,
        values_fill = 0L,
        values_fn = sum
      )


NP
    # A tibble: 8 x 9
    STATE_ABBR  COUNTY_NAME  `1990_OnSite_LAN~`  `1990_POTW METAL~`  `1990_AIR`    "Total_1990"   2000_OnSite_LAN~`  `2000_POTW METAL~`  "total_2000"  `2005_AIR`  `2005_OnSite_LAN   "total_2005"   CARCINOGEN   CLEAN_AIR
      <fct>      <fct>                   <dbl>             <dbl>         <dbl>                             <dbl>                 <dbl>                     <dbl>              <dbl>                          0           0
     CA         FRESNO                  23149                 0             0            23149                0                     0             0           0                  0               0           0           1
     CA         ORANGE                      0                 0             0                0               23                     0            23           0                  0               0           0           0
     KY         TAYLOR                      0                 0             0                0                0                     0             0           0                  0               0           0           1
     KY         JEFFERSON                   0                 0             0                0                5                     0             5           0                  0               0           1           1
     KY         ORANGE                      0                 0             0                0                0                     0             0       14000                  0           14000           0           1
     KY         CARROLL                     0                 0             0                0                0                     0             0     5617000                123         5617123           0           2
     NM         CARROLL                     0                 0           250              250                0                     0             0           0                  0               0           1           1
     NM         FRESNO                      0                 0             0                0                0                  1300          1300           0                  0               0           0           1

鉴于您还有另外两列(CARCINOGEN 和 CLEAN_AIR)需要汇总,我认为下面这种方法效果更好(并且可能更容易理解)。

library(dplyr)
library(tidyr)

# Build the per-year subtotal columns.
#
# value: long-format data with STATE_ABBR, COUNTY_NAME, REPORTING_YEAR and
#        `SUM(REL_EST_AMT_SUM)` columns.
# Returns one row per STATE_ABBR / COUNTY_NAME with one `total_<year>` column
# for every reporting year; years with no data for a county are filled with 0.
make_yearly_total_cols <- function(value) {
  # Sum the amounts within each state / county / year. The constant CATEGORY
  # label supplies the "total" part of the widened column names.
  yearly_totals <- summarise(
    group_by(value, STATE_ABBR, COUNTY_NAME, REPORTING_YEAR),
    CATEGORY = "total",
    across(`SUM(REL_EST_AMT_SUM)`, sum),
    .groups = "drop"
  )

  pivot_wider(
    yearly_totals,
    names_from = c(CATEGORY, REPORTING_YEAR),
    values_from = `SUM(REL_EST_AMT_SUM)`,
    values_fill = 0L
  )
}

# Build the grand-total columns for each state / county.
#
# value: long-format data with STATE_ABBR, COUNTY_NAME, CARCINOGEN, CLEAN_AIR
#        and `SUM(REL_EST_AMT_SUM)` columns; CARCINOGEN and CLEAN_AIR hold
#        "Y" / "N" flags (factor or character).
# Returns one row per STATE_ABBR / COUNTY_NAME with, in this order:
#   CARCINOGEN  - number of rows flagged "Y"
#   CLEAN_AIR   - number of rows flagged "Y"
#   grand_total - sum of `SUM(REL_EST_AMT_SUM)` over all years and categories
make_grand_total_cols <- function(value) {
  value %>%
    group_by(STATE_ABBR, COUNTY_NAME) %>%
    summarise(
      # Compare against the label "Y" directly. Indexing a named lookup vector
      # with a factor would use the factor's integer codes, which only gives
      # the right answer when the levels happen to be exactly c("N", "Y").
      across(c(CARCINOGEN, CLEAN_AIR), ~ sum(.x == "Y")),
      # Computed after the flag counts so the column order stays
      # CARCINOGEN, CLEAN_AIR, grand_total.
      grand_total = sum(`SUM(REL_EST_AMT_SUM)`),
      .groups = "drop"
    )
}

# Build the per-category columns.
#
# value: long-format data with STATE_ABBR, COUNTY_NAME, CATEGORY,
#        REPORTING_YEAR and `SUM(REL_EST_AMT_SUM)` columns.
# Returns one row per STATE_ABBR / COUNTY_NAME with one `<category>_<year>`
# column per combination present in the data; duplicate cells are summed and
# missing combinations are filled with 0.
make_category_cols <- function(value) {
  pivot_wider(
    value,
    id_cols = c(STATE_ABBR, COUNTY_NAME),
    names_from = c(CATEGORY, REPORTING_YEAR),
    values_from = `SUM(REL_EST_AMT_SUM)`,
    values_fill = 0L,
    values_fn = sum
  )
}

# Combine the three wide tables (category columns, yearly totals, grand totals)
# on state + county, then lay the columns out year by year with the overall
# totals at the end.
list(
  make_category_cols(NP),
  make_yearly_total_cols(NP),
  make_grand_total_cols(NP)
) %>%
  # Fold the list left to right: ((categories + yearly) + grand).
  Reduce(
    f = function(joined, nxt) {
      left_join(joined, nxt, by = c("STATE_ABBR", "COUNTY_NAME"))
    }
  ) %>%
  select(
    STATE_ABBR, COUNTY_NAME,
    ends_with("1990"),
    ends_with("2000"),
    ends_with("2005"),
    grand_total, CARCINOGEN, CLEAN_AIR
  )

输出

# A tibble: 8 x 15
  STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 AIR_2005 OnSite_LAND_2005 total_2005 grand_total CARCINOGEN CLEAN_AIR
  <fct>      <fct>                  <dbl>              <dbl>    <dbl>      <dbl>            <dbl>              <dbl>      <dbl>    <dbl>            <dbl>      <dbl>       <dbl>      <int>     <int>
1 CA         FRESNO                 23149                  0        0      23149                0                  0          0        0                0          0       23149          0         0
2 CA         ORANGE                     0                  0        0          0               23                  0         23        0                0          0          23          0         1
3 KY         TAYLOR                     0                  0        0          0                0                  0          0        0                0          0           0          0         0
4 KY         JEFFERSON                  0                  0        0          0                5                  0          5        0                0          0           5          0         1
5 KY         ORANGE                     0                  0        0          0                0                  0          0    14000                0      14000       14000          1         1
6 KY         CARROLL                    0                  0        0          0                0                  0          0  5617000              123    5617123     5617123          0         2
7 NM         CARROLL                    0                  0      250        250                0                  0          0        0                0          0         250          1         1
8 NM         FRESNO                     0                  0        0          0                0               1300       1300        0                0          0        1300          0         1

如果您想详细了解旧方法背后的逻辑,请参阅下文。

这是一个tidyverse解决方案

library(dplyr)
library(tidyr)

# Older approach: append one "total" row per state / county / year to the long
# data, then widen everything in a single pivot_wider() call.
NP %>% 
  # rows_insert() casts the inserted rows to the column types of NP
  # (dplyr >= 1.1.0). "total" is not a level of the CATEGORY factor, so
  # CATEGORY is stored as plain text before the insert.
  mutate(CATEGORY = as.character(CATEGORY)) %>% 
  rows_insert(
    # `(.)` is the data coming down the outer pipe; the parentheses stop
    # magrittr from treating `. %>% ...` as a function definition.
    (.) %>% 
      group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
      summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum)), 
    by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
  ) %>% 
  # Group rows by year while keeping categories in order of first appearance,
  # so that "total" comes last within each year.
  arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY))) %>% 
  pivot_wider(
    id_cols = c(STATE_ABBR, COUNTY_NAME), 
    names_from = c(CATEGORY, REPORTING_YEAR), 
    values_from = `SUM(REL_EST_AMT_SUM)`, 
    values_fn = sum, 
    values_fill = 0L
  ) %>% 
  # The grand total is the sum of the yearly total_* columns.
  mutate(grand_total = rowSums(across(starts_with("total"))))

输出

`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
# A tibble: 8 x 13
  STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 OnSite_LAND_2005 AIR_2005 total_2005 grand_total
  <fct>      <fct>                  <dbl>              <dbl>    <dbl>      <dbl>            <dbl>              <dbl>      <dbl>            <dbl>    <dbl>      <dbl>       <dbl>
1 CA         FRESNO                 23149                  0        0      23149                0                  0          0                0        0          0       23149
2 KY         TAYLOR                     0                  0        0          0                0                  0          0                0        0          0           0
3 NM         CARROLL                    0                  0      250        250                0                  0          0                0        0          0         250
4 CA         ORANGE                     0                  0        0          0               23                  0         23                0        0          0          23
5 KY         JEFFERSON                  0                  0        0          0                5                  0          5                0        0          0           5
6 NM         FRESNO                     0                  0        0          0                0               1300       1300                0        0          0        1300
7 KY         CARROLL                    0                  0        0          0                0                  0          0              123  5617000    5617123     5617123
8 KY         ORANGE                     0                  0        0          0                0                  0          0                0    14000      14000       14000

一些解释

先看这个

NP %>% 
  rows_insert(
    SOME_DATA, 
    by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
  )

它把 SOME_DATA 中的行追加到 NP 的末尾(行数随之增加)。by 中列出的 STATE_ABBR、COUNTY_NAME、REPORTING_YEAR 和 CATEGORY 组合起来作为键,用来识别每一行;默认情况下 rows_insert() 要求待插入行的键在 NP 中尚不存在。例如,可以试试下面的代码

library(dplyr)

# Toy data: three rows; `b` is missing (NA) in the third row.
data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data
# Key is `a`: a = 4 is a new key, so the row is appended; `c` is not supplied and becomes NA.
data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
# Key is the (a, b) pair: (2, "x") is not present yet, so the row is appended.
data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))

你应该得到

> data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
> data
  a    b   c
1 1    a 0.5
2 2    b 1.5
3 3 <NA> 2.5
> data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
  a    b   c
1 1    a 0.5
2 2    b 1.5
3 3 <NA> 2.5
4 4    z  NA
> data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
  a    b    c
1 1    a  0.5
2 2    b  1.5
3 3 <NA>  2.5
4 2    x 99.0

那么,SOME_DATA 到底是什么?它就是下面这条管道计算出来的汇总结果(每个州、县、年份各一行合计)

(.) %>% 
  group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
  summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))

在管道中,.可以有多种含义。请参阅下面我从文档中复制的解释

Usage lhs %>% rhs

Using the dot for secondary purposes Often, some attribute or property of lhs is desired in the rhs call in addition to the value of lhs itself, e.g. the number of rows or columns. It is perfectly valid to use the dot placeholder several times in the rhs call, but by design the behavior is slightly different when using it inside nested function calls. In particular, if the placeholder is only used in a nested function call, lhs will also be placed as the first argument! The reason for this is that in most use-cases this produces the most readable code. For example, iris %>% subset(1:nrow(.) %% 2 == 0) is equivalent to iris %>% subset(., 1:nrow(.) %% 2 == 0) but slightly more compact. It is possible to overrule this behavior by enclosing the rhs in braces. For example, 1:10 %>% {c(min(.), max(.))} is equivalent to c(min(1:10), max(1:10))

Using the dot-place holder as lhs When the dot is used as lhs, the result will be a functional sequence, i.e. a function which applies the entire chain of right-hand sides in turn to its input. See the examples.

所以,这里的点从外层管道的 lhs 中取得 NP。然而,以点开头的 `. %>% ...` 也会被当作一个内部函数序列(functional sequence)的开始,而且后一种解释的优先级高于前一种。因此,我们把点放在圆括号中(写成 `(.)`),以避免它被错误地当作函数序列的开始。

(.) %>%
  group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
  summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))

这一步之后

# Append one "total" row per state / county / year to the long data.
NP %>% 
  # rows_insert() casts the inserted rows to the column types of NP
  # (dplyr >= 1.1.0). "total" is not a level of the CATEGORY factor, so
  # CATEGORY is stored as plain text before the insert.
  mutate(CATEGORY = as.character(CATEGORY)) %>% 
  rows_insert(
    (.) %>% 
      group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
      summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), 
    by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
  )

数据变成了这个样子

       STATE_ABBR REPORTING_YEAR COUNTY_NAME    CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717          CA           1990      FRESNO OnSite_LAND                23149          N         N
19700          CA           2000      ORANGE OnSite_LAND                   23          N         Y
67491          KY           1990      TAYLOR POTW METALS                    0          N         N
62086          KY           2000   JEFFERSON OnSite_LAND                    5          N         Y
62993          KY           2005      ORANGE         AIR                14000          Y         Y
63290          KY           2005     CARROLL         AIR              5617000          N         Y
00005          KY           2005     CARROLL OnSite_LAND                  123          N         Y
332510         NM           1990     CARROLL         AIR                  250          Y         Y
323501         NM           2000      FRESNO POTW METALS                 1300          N         Y
...10          CA           1990      FRESNO       total                23149       <NA>      <NA>
...11          CA           2000      ORANGE       total                   23       <NA>      <NA>
...12          KY           2005      ORANGE       total                14000       <NA>      <NA>
...13          KY           2005     CARROLL       total              5617123       <NA>      <NA>
...14          KY           2000   JEFFERSON       total                    5       <NA>      <NA>
...15          KY           1990      TAYLOR       total                    0       <NA>      <NA>
...16          NM           2000      FRESNO       total                 1300       <NA>      <NA>
...17          NM           1990     CARROLL       total                  250       <NA>      <NA>

然后我们需要重新排列数据,以便将具有相同 REPORTING_YEAR 的行组合在一起。我们还希望保持 CATEGORY 列的顺序不变。这就是我们使用 factor(CATEGORY, unique(CATEGORY)) 的原因。重新排列后,

   STATE_ABBR REPORTING_YEAR COUNTY_NAME    CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
1          CA           1990      FRESNO OnSite_LAND                23149          N         N
2          KY           1990      TAYLOR POTW METALS                    0          N         N
3          NM           1990     CARROLL         AIR                  250          Y         Y
4          CA           1990      FRESNO       total                23149       <NA>      <NA>
5          KY           1990      TAYLOR       total                    0       <NA>      <NA>
6          NM           1990     CARROLL       total                  250       <NA>      <NA>
7          CA           2000      ORANGE OnSite_LAND                   23          N         Y
8          KY           2000   JEFFERSON OnSite_LAND                    5          N         Y
9          NM           2000      FRESNO POTW METALS                 1300          N         Y
10         CA           2000      ORANGE       total                   23       <NA>      <NA>
11         KY           2000   JEFFERSON       total                    5       <NA>      <NA>
12         NM           2000      FRESNO       total                 1300       <NA>      <NA>
13         KY           2005     CARROLL OnSite_LAND                  123          N         Y
14         KY           2005      ORANGE         AIR                14000          Y         Y
15         KY           2005     CARROLL         AIR              5617000          N         Y
16         KY           2005      ORANGE       total                14000       <NA>      <NA>
17         KY           2005     CARROLL       total              5617123       <NA>      <NA>

然后我们pivot_wider()得到结果。这就是逻辑。请注意,如果您想摆脱此消息

`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with 
`.groups` argument)

就做这样的事

# Same insert-and-sort steps, with the summarise() regrouping message silenced.
NP %>% 
  # rows_insert() casts the inserted rows to the column types of NP
  # (dplyr >= 1.1.0). "total" is not a level of the CATEGORY factor, so
  # CATEGORY is stored as plain text before the insert.
  mutate(CATEGORY = as.character(CATEGORY)) %>% 
  rows_insert(
    (.) %>% 
      group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
      summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), # drop the groups
    by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
  ) %>%
  # Group rows by year; categories keep their order of first appearance.
  arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY)))

但是,我无法重现您遇到的其他错误。