为因子的每个水平附加一行总和

Appending a row of sums for each level of a factor

我想像这样为每个 Reg 添加一行总和

   Reg   Res    Pop
1      Total 1000915
2    A Urban 500414
3    A Rural 500501

4     Total  999938
5    B Urban 499922
6    B Rural 500016

7      Total 1000912
8    C Urban 501638
9    C Rural 499274

10     Total  999629
11    D Urban 499804
12    D Rural 499825

13     Total 1000303
14   E Urban 499917
15   E Rural 500386

MWE 如下:

# Reproducible example data: five regions (A-E), each with one Urban and one
# Rural row, and a Poisson-distributed population count per row.
set.seed(12345)  # seed before drawing so the simulated counts are repeatable
Pop <- rpois(n = 10, lambda = 500000)
Reg <- rep(LETTERS[seq_len(5)], each = 2)
Res <- rep(c("Urban", "Rural"), times = 5)
df <- data.frame(Reg = Reg, Res = Res, Pop = Pop)

df
   Reg   Res    Pop
1    A Urban 500414
2    A Rural 500501
3    B Urban 499922
4    B Rural 500016
5    C Urban 501638
6    C Rural 499274
7    D Urban 499804
8    D Rural 499825
9    E Urban 499917
10   E Rural 500386

# Total population per region: one summary row for each level of Reg.
summarise(group_by(df, Reg), Total = sum(Pop))
# A tibble: 5 x 2
     Reg   Total
  <fctr>   <int>
1      A 1000915
2      B  999938
3      C 1000912
4      D  999629
5      E 1000303

已编辑

我想要 dplyr 和 data.table 两种解决方案。

# Split df by region and, for each piece, put a one-row "Total" data frame on
# top of that region's detail rows. The result is a list named by region.
lapply(split(df, df$Reg), function(grp) {
  total_row <- data.frame(Reg = grp$Reg[1],
                          Res = "Total",
                          Pop = sum(grp$Pop))
  rbind(total_row, grp)
})
$A
  Reg   Res     Pop
1   A Total 1000915
2   A Urban  500414
3   A Rural  500501

$B
  Reg   Res    Pop
1   B Total 999938
3   B Urban 499922
4   B Rural 500016

$C
  Reg   Res     Pop
1   C Total 1000912
5   C Urban  501638
6   C Rural  499274

$D
  Reg   Res    Pop
1   D Total 999629
7   D Urban 499804
8   D Rural 499825

$E
   Reg   Res     Pop
1    E Total 1000303
9    E Urban  499917
10   E Rural  500386

如果需要,您可以使用 do.call(rbind, ...) 将整个内容转换为 data.frame

您可以在汇总结果中添加一个额外的 Res 列,然后用 bind_rows 将其与原始数据框合并:

# Per-region totals, labelled 'Total' in the Res column.
region_totals <- summarise(group_by(df, Reg), Pop = sum(Pop), Res = 'Total')
# Stack the totals above the original rows, then sort by Reg so that each
# total ends up directly above its own region's detail rows.
arrange(bind_rows(region_totals, df), Reg)

# A tibble: 15 x 3
#     Reg     Pop   Res
#   <chr>   <int> <chr>
# 1     A 1000915 Total
# 2     A  500414 Urban
# 3     A  500501 Rural
# 4     B  999938 Total
# 5     B  499922 Urban
# 6     B  500016 Rural
# 7     C 1000912 Total
# 8     C  501638 Urban
# 9     C  499274 Rural
#10     D  999629 Total
#11     D  499804 Urban
#12     D  499825 Rural
#13     E 1000303 Total
#14     E  499917 Urban
#15     E  500386 Rural

一个对应的data.table解法:

# Convert df to a data.table by reference; dt refers to the same object.
dt <- setDT(df)
# Stack one 'Total' row per region on top of the detail rows (columns are
# matched by name because the two tables order them differently), then sort
# by Reg so each total sits directly above its own region's rows. Without the
# final ordering, all five totals would precede all ten detail rows.
rbindlist(
  list(dt[, .(Pop = sum(Pop), Res = 'Total'), by = Reg], dt),
  use.names = TRUE
)[order(Reg)]

堆叠和重新排列将起作用:

library(dplyr)

# Example data: five regions, each with an Urban and a Rural population count.
Reg <- rep(LETTERS[1:5], each = 2)
Res <- rep(c("Urban", "Rural"), times = 5)
set.seed(12345)
Pop <- rpois(n = 10, lambda = 500000)
df <- data.frame(Reg, Res, Pop, stringsAsFactors = FALSE)


# One "Total" row per region.
sums <- df %>%
  group_by(Reg) %>%
  summarise(Pop = sum(Pop)) %>%
  mutate(Res = "Total")

# Stack the totals above the detail rows and sort by Reg only. Sorting by Res
# as well would order it alphabetically and put "Total" between "Rural" and
# "Urban" instead of at the top of each region.
df_sums <- bind_rows(sums, df) %>%
  arrange(Reg) %>%
  select(Reg, Res, Pop)

您的数据:

# Example data: five regions, each with an Urban and a Rural population count.
Reg <- rep(LETTERS[1:5], each = 2)
Res <- rep(c("Urban", "Rural"), times = 5)
set.seed(12345)
Pop <- rpois(n = 10, lambda = 500000)
df  <- data.frame(Reg, Res, Pop)

# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the pipeline below fail with a less helpful message.
library(dplyr)
# df1 holds one row per region with its total population in column Total.
df1 <-
  df %>%
  group_by(Reg) %>%
  summarise(Total = sum(Pop))

我的解决方案(注:我已将前面管道的结果赋值给 df1):

# Append one "Total" row per region (taken from df1) below the detail rows.
df <- rbind(df, data.frame(Reg = df1$Reg, Res = "Total", Pop = df1$Total))

# Sort by region in descending order, then reverse the whole row order. The
# regions end up ascending and the appended "Total" row, which was last within
# its region, becomes the first row of that region.
df <- df[order(as.character(df$Reg), decreasing = TRUE), ]
# rev(seq_len(n)) is empty for a zero-row frame, whereas seq(n, 1) would
# produce c(0, 1).
df <- df[rev(seq_len(nrow(df))), ]

结果:

# Print without row names; FALSE is spelled out because F can be reassigned.
print(df, row.names = FALSE)
 Reg   Res     Pop
   A Total 1000915
   A Rural  500501
   A Urban  500414
   B Total  999938
   B Rural  500016
   B Urban  499922
   C Total 1000912
   C Rural  499274
   C Urban  501638
   D Total  999629
   D Rural  499825
   D Urban  499804
   E Total 1000303
   E Rural  500386
   E Urban  499917

如果要在不更改数据类型的情况下在组之间使用换行符打印它们:

# Print each region's rows as its own table, followed by a blank line.
for (g in unique(df$Reg)) {
  # FALSE is spelled out because F can be reassigned.
  print(df[df$Reg == g, ], row.names = FALSE)
  cat("\n")
}
 Reg   Res     Pop
   A Total 1000915
   A Rural  500501
   A Urban  500414

 Reg   Res    Pop
   B Total 999938
   B Rural 500016
   B Urban 499922

 Reg   Res     Pop
   C Total 1000912
   C Rural  499274
   C Urban  501638

 Reg   Res    Pop
   D Total 999629
   D Rural 499825
   D Urban 499804

 Reg   Res     Pop
   E Total 1000303
   E Rural  500386
   E Urban  499917

您还要求提供 data.table 解决方案。它与上面的做法相同,只是改为像下面这样创建 df1:

# Build df1 with data.table instead of dplyr.
dt  <- as.data.table(df)
# Group by the Reg column itself and name the aggregate Total, so the result
# has the Reg and Total columns that the later df1$Reg / df1$Total rely on.
# An unnamed sum(Pop) would come back as V1, and by = dt$Reg would not yield
# a column called Reg.
df1 <- dt[, .(Total = sum(Pop)), by = Reg]

我们可以使用 dplyr 和 purrr。这类似于 d.b 的方法,但 map_dfr 的输出本身就是一个数据框,因此不需要再把列表转换为数据框。请注意,我使用 data_frame 函数来构建 df,因为这个分析不需要因子。df2 是最终输出。

library(dplyr)
library(purrr)

# tibble() is the current name for the deprecated data_frame(); like
# data_frame(), it keeps Reg and Res as character columns rather than factors.
df <- tibble(Reg, Res, Pop)

# Split by region, append a "Total" row below each region's detail rows, and
# let map_dfr() row-bind the pieces back into a single data frame.
df2 <- df %>%
  split(.$Reg) %>%
  map_dfr(~ bind_rows(
    .x,
    tibble(Reg = .x$Reg[1], Res = "Total", Pop = sum(.x$Pop))
  ))

df2
# A tibble: 15 x 3
     Reg   Res     Pop
   <chr> <chr>   <int>
 1     A Urban  500414
 2     A Rural  500501
 3     A Total 1000915
 4     B Urban  499922
 5     B Rural  500016
 6     B Total  999938
 7     C Urban  501638
 8     C Rural  499274
 9     C Total 1000912
10     D Urban  499804
11     D Rural  499825
12     D Total  999629
13     E Urban  499917
14     E Rural  500386
15     E Total 1000303

data.table 包的开发版本 1.10.5(安装说明参见此处)新增了三个函数,用于计算不同分组层级上的聚合,可以在这里使用。

请注意,OP 的预期结果包含从 1 到 15 的连续行号,这表明 OP 期望的首选结果是一个 data.frame 或 data.table,而不是列表。不过,我们将在下面展示 data.table 也可以以对眼睛友好的方式打印。

rollup()

使用新的 rollup() 函数并按 Reg 排序:

library(data.table)   # development version 1.10.5 as of 2015-09-10
# Convert df to a data.table by reference so rollup() can be applied to it.
setDT(df)
# rollup() aggregates Pop at every level of the (Reg, Res) hierarchy: per
# Reg/Res pair, per Reg, and overall. Subtotal rows carry NA in the columns
# that were rolled up. The result is then sorted by Reg.
pop_rollup <- rollup(df, j = list(Pop = sum(Pop)), by = c("Reg", "Res"))
pop_rollup[order(Reg)]

我们确实得到了

    Reg   Res     Pop
 1:   A Urban  500414
 2:   A Rural  500501
 3:   A    NA 1000915
 4:   B Urban  499922
 5:   B Rural  500016
 6:   B    NA  999938
 7:   C Urban  501638
 8:   C Rural  499274
 9:   C    NA 1000912
10:   D Urban  499804
11:   D Rural  499825
12:   D    NA  999629
13:   E Urban  499917
14:   E Rural  500386
15:   E    NA 1000303
16:  NA    NA 5001697

各自的总数用NA表示(包括总计)。如果我们想更好地重现预期结果,可以删除总计,将 NA 替换为 Total:

# Same rollup as before, sorted by Reg.
pop_rollup <- rollup(df, j = list(Pop = sum(Pop)), by = c("Reg", "Res"))[order(Reg)]
# Label every subtotal row (NA in Res) as "Total"; := updates by reference.
pop_rollup[is.na(Res), Res := "Total"]
# Drop the grand-total row, which is the only one with NA in Reg.
pop_rollup[!is.na(Reg)]
    Reg   Res     Pop
 1:   A Urban  500414
 2:   A Rural  500501
 3:   A Total 1000915
 4:   B Urban  499922
 5:   B Rural  500016
 6:   B Total  999938
 7:   C Urban  501638
 8:   C Rural  499274
 9:   C Total 1000912
10:   D Urban  499804
11:   D Rural  499825
12:   D Total  999629
13:   E Urban  499917
14:   E Rural  500386
15:   E Total 1000303

请注意,Total 行出现在明细行的下方,这与 OP 的预期结果不完全一致。

groupingsets()

使用groupingsets()功能,可以对聚合进行非常详细的控制:

# Aggregate only at the two requested levels: per Reg (the totals) and per
# Reg/Res pair (the detail rows). No grand total is computed.
pop_sets <- groupingsets(df,
                         j = list(Pop = sum(Pop)),
                         by = c("Reg", "Res"),
                         sets = list("Reg", c("Reg", "Res")))[order(Reg)]
# Label the per-region rows (NA in Res) as "Total"; := updates by reference.
pop_sets[is.na(Res), Res := "Total"]
# The empty [] forces the updated table to be printed.
pop_sets[]
    Reg   Res     Pop
 1:   A Total 1000915
 2:   A Urban  500414
 3:   A Rural  500501
 4:   B Total  999938
 5:   B Urban  499922
 6:   B Rural  500016
 7:   C Total 1000912
 8:   C Urban  501638
 9:   C Rural  499274
10:   D Total  999629
11:   D Urban  499804
12:   D Rural  499825
13:   E Total 1000303
14:   E Urban  499917
15:   E Rural  500386

现在,Total 行显示在详细信息行上方,根本没有创建总计。

可美观打印的“经典”data.table 解决方案

到目前为止,已有两位回答者发布了“经典”的 data.table 解决方案。

两者都可以更简洁地重写为

# Per-region "Total" rows stacked above the detail rows; ordering by Reg then
# brings each total directly above its own region's rows.
rbind(
  df[, list(Res = "Total", Pop = sum(Pop)), by = "Reg"],
  df
)[order(Reg)]

结果可以使用以下代码以“对眼睛友好”的方式打印,组与组之间有空行:
# Build the combined table (totals + detail rows), then walk through it one
# region at a time: print that region's rows without row names and follow
# them with a blank line. Reg is a grouping column and therefore not part of
# .SD, so it is put back in front before printing.
rbind(
  df[, list(Res = "Total", Pop = sum(Pop)), by = "Reg"],
  df
)[order(Reg), {
  print(data.table(Reg, .SD), row.names = FALSE)
  cat("\n")
}, by = "Reg"]
 Reg   Res     Pop
   A Total 1000915
   A Urban  500414
   A Rural  500501

 Reg   Res    Pop
   B Total 999938
   B Urban 499922
   B Rural 500016

 Reg   Res     Pop
   C Total 1000912
   C Urban  501638
   C Rural  499274

 Reg   Res    Pop
   D Total 999629
   D Urban 499804
   D Rural 499825

 Reg   Res     Pop
   E Total 1000303
   E Urban  499917
   E Rural  500386