使用 dplyr 扩展数据,自动排序 vars 和 sum 扩展列
using dplyr to widen data, automatically order vars and sum widened columns
我正在尝试执行一些我已经 completed/gotten 帮助过的功能,但我想使用我的结果添加更多列。我想在每年的集合结束时做一个总计,最后做一个总计。并且有两列被删除,我想保留为总计。
如果需要更多信息,请告诉我。提前致谢。
np<-structure(list(STATE_ABBR = structure(c(2L, 1L, 2L,2L, 1L, 3L, 3L,2L, 2L),
.Label = c("CA", "KY", "NM"), class = "factor"),
REPORTING_YEAR = structure(c(3L,1L, 3L,3L, 2L, 1L, 2L, 1L, 2L),
.Label = c("1990", "2000", "2005"), class = "factor"),
COUNTY_NAME = structure(c(2L,1L, 3L,3L, 2L, 3L, 1L, 5L, 4L),
.Label = c("FRESNO", "ORANGE", "CARROLL","JEFFERSON", "TAYLOR"), class = "factor"),
CATEGORY = structure(c(1L, 2L, 1L,2L, 2L, 1L, 3L, 3L, 2L),
.Label = c("AIR","OnSite_LAND", "POTW METALS"), class = "factor"),
`SUM(REL_EST_AMT_SUM)` = c(14000,23149, 5617000,123, 23, 250, 1300, 0, 5),
CARCINOGEN = structure(c(2L,1L, 1L,1L, 1L, 2L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
CLEAN_AIR = structure(c(2L, 1L, 2L,2L, 2L, 2L, 2L, 1L, 2L), .Label = c("N","Y"), class = "factor")),
row.names = c("62993", "49717","63290","00005", "19700", "332510", "323501", "67491", "62086"), class = "data.frame")
NP<-arrange(np,STATE_ABBR,REPORTING_YEAR,COUNTY_NAME)
NP
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717 CA 1990 FRESNO OnSite_LAND 23149 N N
19700 CA 2000 ORANGE OnSite_LAND 23 N Y
67491 KY 1990 TAYLOR POTW METALS 0 N N
62086 KY 2000 JEFFERSON OnSite_LAND 5 N Y
62993 KY 2005 ORANGE AIR 14000 Y Y
63290 KY 2005 CARROLL AIR 5617000 N Y
00005 KY 2005 CARROLL OnSite_LAND 123 N Y
332510 NM 1990 CARROLL AIR 250 Y Y
323501 NM 2000 FRESNO POTW METALS 1300 N Y
NP<-pivot_wider(NP,
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(REPORTING_YEAR,CATEGORY),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L)
NP
# A tibble: 8 x 9
STATE_ABBR COUNTY_NAME `1990_OnSite_LAN~` `1990_POTW METAL~` `1990_AIR` "Total_1990" 2000_OnSite_LAN~` `2000_POTW METAL~` "total_2000" `2005_AIR` `2005_OnSite_LAN "total_2005" CARCINOGEN CLEAN_AIR
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 0 0
CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 0 1
CA ORANGE 0 0 0 0 23 0 23 0 0 0 0 0
KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0 1
KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 1 1
KY ORANGE 0 0 0 0 0 0 0 14000 0 14000 0 1
KY CARROLL 0 0 0 0 0 0 0 5617000 123 5617123 0 2
NM CARROLL 0 0 250 250 0 0 0 0 0 0 1 1
NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 0 1
鉴于您还有两列,我认为这种方法效果更好(并且可能更容易理解)。
library(dplyr)
library(tidyr)
make_yearly_total_cols <-
. %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(
CATEGORY = "total",
across(`SUM(REL_EST_AMT_SUM)`, sum),
.groups = "drop"
) %>%
pivot_wider(
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fill = 0L
)
make_grand_total_cols <-
. %>%
group_by(STATE_ABBR, COUNTY_NAME) %>%
summarise(
CATEGORY = "grand_total",
across(`SUM(REL_EST_AMT_SUM)`, sum),
across(c(CARCINOGEN, CLEAN_AIR), ~sum(c("N" = 0L, "Y" = 1L)[.])),
.groups = "drop"
) %>%
pivot_wider(
names_from = CATEGORY,
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fill = 0L
)
make_category_cols <-
. %>%
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L
)
make_category_cols(NP) %>%
left_join(make_yearly_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>%
left_join(make_grand_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>%
select(
STATE_ABBR, COUNTY_NAME,
ends_with("1990"), ends_with("2000"), ends_with("2005"),
grand_total, CARCINOGEN, CLEAN_AIR
)
输出
# A tibble: 8 x 15
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 AIR_2005 OnSite_LAND_2005 total_2005 grand_total CARCINOGEN CLEAN_AIR
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 23149 0 0
2 CA ORANGE 0 0 0 0 23 0 23 0 0 0 23 0 1
3 KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0 0 0
4 KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 5 0 1
5 KY ORANGE 0 0 0 0 0 0 0 14000 0 14000 14000 1 1
6 KY CARROLL 0 0 0 0 0 0 0 5617000 123 5617123 5617123 0 2
7 NM CARROLL 0 0 250 250 0 0 0 0 0 0 250 1 1
8 NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 1300 0 1
如果您想详细了解旧方法背后的逻辑,请参阅下文。
旧
这是一个tidyverse
解决方案
library(dplyr)
library(tidyr)
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum)),
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY))) %>%
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L
) %>%
mutate(grand_total = rowSums(across(starts_with("total"))))
输出
`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
# A tibble: 8 x 13
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 OnSite_LAND_2005 AIR_2005 total_2005 grand_total
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 23149
2 KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0
3 NM CARROLL 0 0 250 250 0 0 0 0 0 0 250
4 CA ORANGE 0 0 0 0 23 0 23 0 0 0 23
5 KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 5
6 NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 1300
7 KY CARROLL 0 0 0 0 0 0 0 123 5617000 5617123 5617123
8 KY ORANGE 0 0 0 0 0 0 0 0 14000 14000 14000
一些解释
先看这个
NP %>%
rows_insert(
SOME_DATA,
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)
它通过扩展行将 SOME_DATA 插入 NP。此外,它使用 STATE_ABBR、COUNTY_NAME、REPORTING_YEAR 和 CATEGORY 的组合来识别要插入的正确位置。例如,尝试这样的事情
library(dplyr)
data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data
data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
你应该得到
> data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
> data
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
> data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
4 4 z NA
> data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
4 2 x 99.0
那么,问题是什么是SOME_DATA?好吧,这是我们从这个管道中得到的总结
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))
在管道中,.
可以有多种含义。请参阅下面我从文档中复制的解释
Usage lhs %>% rhs
Using the dot for secondary purposes
Often, some attribute or property of lhs is desired in the rhs call in addition to the value of lhs itself, e.g. the number of rows or columns. It is perfectly valid to use the dot placeholder several times in the rhs call, but by design the behavior is slightly different when using it inside nested function calls. In particular, if the placeholder is only used in a nested function call, lhs will also be placed as the first argument! The reason for this is that in most use-cases this produces the most readable code. For example, iris %>% subset(1:nrow(.) %% 2 == 0) is equivalent to iris %>% subset(., 1:nrow(.) %% 2 == 0) but slightly more compact. It is possible to overrule this behavior by enclosing the rhs in braces. For example, 1:10 %>% {c(min(.), max(.))} is equivalent to c(min(1:10), max(1:10))
Using the dot-place holder as lhs
When the dot is used as lhs, the result will be a functional sequence, i.e. a function which applies the entire chain of right-hand sides in turn to its input. See the examples.
所以,这里的点从外层流水线的lhs中捕获NP
。然而,它也可以被认为是内部功能序列的开始。后一种用法比前一种用法具有更高的优先级。因此,我们将圆点放在大括号中,以避免它被错误地认为是功能序列的开始。
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))
这一步之后
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"),
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)
数据变成了这个样子
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717 CA 1990 FRESNO OnSite_LAND 23149 N N
19700 CA 2000 ORANGE OnSite_LAND 23 N Y
67491 KY 1990 TAYLOR POTW METALS 0 N N
62086 KY 2000 JEFFERSON OnSite_LAND 5 N Y
62993 KY 2005 ORANGE AIR 14000 Y Y
63290 KY 2005 CARROLL AIR 5617000 N Y
00005 KY 2005 CARROLL OnSite_LAND 123 N Y
332510 NM 1990 CARROLL AIR 250 Y Y
323501 NM 2000 FRESNO POTW METALS 1300 N Y
...10 CA 1990 FRESNO total 23149 <NA> <NA>
...11 CA 2000 ORANGE total 23 <NA> <NA>
...12 KY 2005 ORANGE total 14000 <NA> <NA>
...13 KY 2005 CARROLL total 5617123 <NA> <NA>
...14 KY 2000 JEFFERSON total 5 <NA> <NA>
...15 KY 1990 TAYLOR total 0 <NA> <NA>
...16 NM 2000 FRESNO total 1300 <NA> <NA>
...17 NM 1990 CARROLL total 250 <NA> <NA>
然后我们需要重新排列数据,以便将具有相同 REPORTING_YEAR 的行组合在一起。我们还希望保持 CATEGORY 列的顺序不变。这就是我们使用 factor(CATEGORY, unique(CATEGORY))
的原因。重新排列后,
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
1 CA 1990 FRESNO OnSite_LAND 23149 N N
2 KY 1990 TAYLOR POTW METALS 0 N N
3 NM 1990 CARROLL AIR 250 Y Y
4 CA 1990 FRESNO total 23149 <NA> <NA>
5 KY 1990 TAYLOR total 0 <NA> <NA>
6 NM 1990 CARROLL total 250 <NA> <NA>
7 CA 2000 ORANGE OnSite_LAND 23 N Y
8 KY 2000 JEFFERSON OnSite_LAND 5 N Y
9 NM 2000 FRESNO POTW METALS 1300 N Y
10 CA 2000 ORANGE total 23 <NA> <NA>
11 KY 2000 JEFFERSON total 5 <NA> <NA>
12 NM 2000 FRESNO total 1300 <NA> <NA>
13 KY 2005 CARROLL OnSite_LAND 123 N Y
14 KY 2005 ORANGE AIR 14000 Y Y
15 KY 2005 CARROLL AIR 5617000 N Y
16 KY 2005 ORANGE total 14000 <NA> <NA>
17 KY 2005 CARROLL total 5617123 <NA> <NA>
然后我们pivot_wider()
得到结果。这就是逻辑。请注意,如果您想摆脱此消息
`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with
`.groups` argument)
就做这样的事
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), # drop the groups
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY)))
但是,我无法重现您遇到的其他错误。
我正在尝试执行一些我已经 completed/gotten 帮助过的功能,但我想使用我的结果添加更多列。我想在每年的集合结束时做一个总计,最后做一个总计。并且有两列被删除,我想保留为总计。 如果需要更多信息,请告诉我。提前致谢。
np<-structure(list(STATE_ABBR = structure(c(2L, 1L, 2L,2L, 1L, 3L, 3L,2L, 2L),
.Label = c("CA", "KY", "NM"), class = "factor"),
REPORTING_YEAR = structure(c(3L,1L, 3L,3L, 2L, 1L, 2L, 1L, 2L),
.Label = c("1990", "2000", "2005"), class = "factor"),
COUNTY_NAME = structure(c(2L,1L, 3L,3L, 2L, 3L, 1L, 5L, 4L),
.Label = c("FRESNO", "ORANGE", "CARROLL","JEFFERSON", "TAYLOR"), class = "factor"),
CATEGORY = structure(c(1L, 2L, 1L,2L, 2L, 1L, 3L, 3L, 2L),
.Label = c("AIR","OnSite_LAND", "POTW METALS"), class = "factor"),
`SUM(REL_EST_AMT_SUM)` = c(14000,23149, 5617000,123, 23, 250, 1300, 0, 5),
CARCINOGEN = structure(c(2L,1L, 1L,1L, 1L, 2L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
CLEAN_AIR = structure(c(2L, 1L, 2L,2L, 2L, 2L, 2L, 1L, 2L), .Label = c("N","Y"), class = "factor")),
row.names = c("62993", "49717","63290","00005", "19700", "332510", "323501", "67491", "62086"), class = "data.frame")
NP<-arrange(np,STATE_ABBR,REPORTING_YEAR,COUNTY_NAME)
NP
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717 CA 1990 FRESNO OnSite_LAND 23149 N N
19700 CA 2000 ORANGE OnSite_LAND 23 N Y
67491 KY 1990 TAYLOR POTW METALS 0 N N
62086 KY 2000 JEFFERSON OnSite_LAND 5 N Y
62993 KY 2005 ORANGE AIR 14000 Y Y
63290 KY 2005 CARROLL AIR 5617000 N Y
00005 KY 2005 CARROLL OnSite_LAND 123 N Y
332510 NM 1990 CARROLL AIR 250 Y Y
323501 NM 2000 FRESNO POTW METALS 1300 N Y
NP<-pivot_wider(NP,
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(REPORTING_YEAR,CATEGORY),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L)
NP
# A tibble: 8 x 9
STATE_ABBR COUNTY_NAME `1990_OnSite_LAN~` `1990_POTW METAL~` `1990_AIR` "Total_1990" 2000_OnSite_LAN~` `2000_POTW METAL~` "total_2000" `2005_AIR` `2005_OnSite_LAN "total_2005" CARCINOGEN CLEAN_AIR
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 0 0
CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 0 1
CA ORANGE 0 0 0 0 23 0 23 0 0 0 0 0
KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0 1
KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 1 1
KY ORANGE 0 0 0 0 0 0 0 14000 0 14000 0 1
KY CARROLL 0 0 0 0 0 0 0 5617000 123 5617123 0 2
NM CARROLL 0 0 250 250 0 0 0 0 0 0 1 1
NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 0 1
鉴于您还有两列,我认为这种方法效果更好(并且可能更容易理解)。
library(dplyr)
library(tidyr)
make_yearly_total_cols <-
. %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(
CATEGORY = "total",
across(`SUM(REL_EST_AMT_SUM)`, sum),
.groups = "drop"
) %>%
pivot_wider(
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fill = 0L
)
make_grand_total_cols <-
. %>%
group_by(STATE_ABBR, COUNTY_NAME) %>%
summarise(
CATEGORY = "grand_total",
across(`SUM(REL_EST_AMT_SUM)`, sum),
across(c(CARCINOGEN, CLEAN_AIR), ~sum(c("N" = 0L, "Y" = 1L)[.])),
.groups = "drop"
) %>%
pivot_wider(
names_from = CATEGORY,
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fill = 0L
)
make_category_cols <-
. %>%
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L
)
make_category_cols(NP) %>%
left_join(make_yearly_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>%
left_join(make_grand_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>%
select(
STATE_ABBR, COUNTY_NAME,
ends_with("1990"), ends_with("2000"), ends_with("2005"),
grand_total, CARCINOGEN, CLEAN_AIR
)
输出
# A tibble: 8 x 15
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 AIR_2005 OnSite_LAND_2005 total_2005 grand_total CARCINOGEN CLEAN_AIR
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 23149 0 0
2 CA ORANGE 0 0 0 0 23 0 23 0 0 0 23 0 1
3 KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0 0 0
4 KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 5 0 1
5 KY ORANGE 0 0 0 0 0 0 0 14000 0 14000 14000 1 1
6 KY CARROLL 0 0 0 0 0 0 0 5617000 123 5617123 5617123 0 2
7 NM CARROLL 0 0 250 250 0 0 0 0 0 0 250 1 1
8 NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 1300 0 1
如果您想详细了解旧方法背后的逻辑,请参阅下文。
旧
这是一个tidyverse
解决方案
library(dplyr)
library(tidyr)
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum)),
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY))) %>%
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L
) %>%
mutate(grand_total = rowSums(across(starts_with("total"))))
输出
`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
# A tibble: 8 x 13
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 OnSite_LAND_2005 AIR_2005 total_2005 grand_total
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 23149
2 KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0
3 NM CARROLL 0 0 250 250 0 0 0 0 0 0 250
4 CA ORANGE 0 0 0 0 23 0 23 0 0 0 23
5 KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 5
6 NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 1300
7 KY CARROLL 0 0 0 0 0 0 0 123 5617000 5617123 5617123
8 KY ORANGE 0 0 0 0 0 0 0 0 14000 14000 14000
一些解释
先看这个
NP %>%
rows_insert(
SOME_DATA,
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)
它通过扩展行将 SOME_DATA 插入 NP。此外,它使用 STATE_ABBR、COUNTY_NAME、REPORTING_YEAR 和 CATEGORY 的组合来识别要插入的正确位置。例如,尝试这样的事情
library(dplyr)
data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data
data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
你应该得到
> data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
> data
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
> data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
4 4 z NA
> data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
4 2 x 99.0
那么,问题是什么是SOME_DATA?好吧,这是我们从这个管道中得到的总结
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))
在管道中,.
可以有多种含义。请参阅下面我从文档中复制的解释
Usage lhs %>% rhs
Using the dot for secondary purposes Often, some attribute or property of lhs is desired in the rhs call in addition to the value of lhs itself, e.g. the number of rows or columns. It is perfectly valid to use the dot placeholder several times in the rhs call, but by design the behavior is slightly different when using it inside nested function calls. In particular, if the placeholder is only used in a nested function call, lhs will also be placed as the first argument! The reason for this is that in most use-cases this produces the most readable code. For example, iris %>% subset(1:nrow(.) %% 2 == 0) is equivalent to iris %>% subset(., 1:nrow(.) %% 2 == 0) but slightly more compact. It is possible to overrule this behavior by enclosing the rhs in braces. For example, 1:10 %>% {c(min(.), max(.))} is equivalent to c(min(1:10), max(1:10))
Using the dot-place holder as lhs When the dot is used as lhs, the result will be a functional sequence, i.e. a function which applies the entire chain of right-hand sides in turn to its input. See the examples.
所以,这里的点从外层流水线的lhs中捕获NP
。然而,它也可以被认为是内部功能序列的开始。后一种用法比前一种用法具有更高的优先级。因此,我们将圆点放在大括号中,以避免它被错误地认为是功能序列的开始。
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))
这一步之后
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"),
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)
数据变成了这个样子
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717 CA 1990 FRESNO OnSite_LAND 23149 N N
19700 CA 2000 ORANGE OnSite_LAND 23 N Y
67491 KY 1990 TAYLOR POTW METALS 0 N N
62086 KY 2000 JEFFERSON OnSite_LAND 5 N Y
62993 KY 2005 ORANGE AIR 14000 Y Y
63290 KY 2005 CARROLL AIR 5617000 N Y
00005 KY 2005 CARROLL OnSite_LAND 123 N Y
332510 NM 1990 CARROLL AIR 250 Y Y
323501 NM 2000 FRESNO POTW METALS 1300 N Y
...10 CA 1990 FRESNO total 23149 <NA> <NA>
...11 CA 2000 ORANGE total 23 <NA> <NA>
...12 KY 2005 ORANGE total 14000 <NA> <NA>
...13 KY 2005 CARROLL total 5617123 <NA> <NA>
...14 KY 2000 JEFFERSON total 5 <NA> <NA>
...15 KY 1990 TAYLOR total 0 <NA> <NA>
...16 NM 2000 FRESNO total 1300 <NA> <NA>
...17 NM 1990 CARROLL total 250 <NA> <NA>
然后我们需要重新排列数据,以便将具有相同 REPORTING_YEAR 的行组合在一起。我们还希望保持 CATEGORY 列的顺序不变。这就是我们使用 factor(CATEGORY, unique(CATEGORY))
的原因。重新排列后,
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
1 CA 1990 FRESNO OnSite_LAND 23149 N N
2 KY 1990 TAYLOR POTW METALS 0 N N
3 NM 1990 CARROLL AIR 250 Y Y
4 CA 1990 FRESNO total 23149 <NA> <NA>
5 KY 1990 TAYLOR total 0 <NA> <NA>
6 NM 1990 CARROLL total 250 <NA> <NA>
7 CA 2000 ORANGE OnSite_LAND 23 N Y
8 KY 2000 JEFFERSON OnSite_LAND 5 N Y
9 NM 2000 FRESNO POTW METALS 1300 N Y
10 CA 2000 ORANGE total 23 <NA> <NA>
11 KY 2000 JEFFERSON total 5 <NA> <NA>
12 NM 2000 FRESNO total 1300 <NA> <NA>
13 KY 2005 CARROLL OnSite_LAND 123 N Y
14 KY 2005 ORANGE AIR 14000 Y Y
15 KY 2005 CARROLL AIR 5617000 N Y
16 KY 2005 ORANGE total 14000 <NA> <NA>
17 KY 2005 CARROLL total 5617123 <NA> <NA>
然后我们pivot_wider()
得到结果。这就是逻辑。请注意,如果您想摆脱此消息
`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
就做这样的事
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), # drop the groups
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY)))
但是,我无法重现您遇到的其他错误。