查找R中数据框中每3列的平均值

Question

我想在一个包含 60 列的数据框中计算每 3 列的平均值，这样新的数据框大约会有 20 列。样本数据如下。此外，如果能以字符串数组的形式传入新变量（新列）的名称，那就更好了。

# Sample data as produced by dput(): a data.frame with 20 rows (product names
# are stored as row names) and 9 numeric columns named by year, 1961-1969.
# The value is not assigned here; assign it to a name before running the
# answers below (they refer to it as df, mydf and DF respectively).
structure(list(`1961` = c(0, 0, 0, 0, 0, 0, 0, 0, 4.633, 54.247, 
0, 0, 0, 0, 0, 0, 0, 31.036, 3.18, 19.862), `1962` = c(0, 0, 
0, 0, 0, 0, 0, 0, 4.168, 63.587, 0, 0, 0, 0, 0, 0, 0, 28.169, 
2.913, 17.273), `1963` = c(0, 0, 0, 0, 0, 0, 0, 0, 3.284, 56.888, 
0, 0, 0, 0, 0, 0, 0, 26.667, 2.653, 16.586), `1964` = c(0, 0, 
0, 0, 0, 0, 0, 0, 2.689, 48.722, 0, 0, 0, 0, 0, 0, 0, 25.483, 
3.873, 15.708), `1965` = c(0, 0, 0, 0, 0, 0, 0, 0, 3.304, 33.838, 
0, 0, 0, 0, 0, 0, 0, 28.164, 3.927, 11.147), `1966` = c(0, 0, 
0, 0, 0, 0, 0, 0, 2.871, 26.695, 0, 0, 0, 0, 0, 0, 0, 28.962, 
4.434, 14.056), `1967` = c(0, 0, 0, 0, 0, 0, 0, 0, 2.752, 36.246, 
0, 0, 0, 0, 0, 0, 0, 30.877, 4.739, 14.765), `1968` = c(0, 0, 
0, 0, 0, 0, 0, 0, 3.537, 33.368, 0, 0, 0, 0, 0, 0, 0, 25.628, 
5.445, 14.372), `1969` = c(0, 0, 0, 0, 0, 0, 0, 0, 2.484, 35.711, 
0, 0, 0, 0, 0, 0, 0, 27.123, 5.286, 15.527)), row.names = c("Almonds, with shell", 
"Anise, badian, fennel, coriander", "Apples", "Apricots", "Areca nuts", 
"Asparagus", "Avocados", "Bananas", "Barley", "Bastfibres, other", 
"Beans, dry", "Beans, green", "Berries nes", "Broad beans, horse beans, dry", 
"Buckwheat", "Cabbages and other brassicas", "Carrots and turnips", 
"Cashew nuts, with shell", "Cassava", "Castor oil seed"), class = "data.frame")

Answer 1

我们可以使用 tidyverse 将数据从宽到长旋转，然后按 3 年序列分组。

library(dplyr)
library(tidyr)
library(tibble)

# Average every run of three consecutive years, separately for each product.
# Requires that every column name of `df` is a year as.numeric() can parse.
df.averaged <- df %>%
  # Keep the product names (row names) as an ordinary column
  rownames_to_column("product") %>%
  # Wide -> long: one row per product per year
  pivot_longer(-product, names_to = "year", values_to = "value") %>%
  # Label each year with the first year of its 3-year group. Groups are
  # counted from the earliest year present, so the series may start in any
  # year (the previous formula only lined up for series starting like 1961).
  mutate(year = as.numeric(year),
         year.group = min(year) + 3 * ((year - min(year)) %/% 3)) %>%
  # Average within each product / year group
  group_by(product, year.group) %>%
  summarize(value = mean(value)) %>%
  # Drop the leftover grouping so later verbs are not silently grouped
  ungroup() %>%
  # Long -> wide again: one column per year group, named by its first year
  pivot_wider(names_from = year.group, values_from = value)

编辑：如果需要的是移动平均（滚动平均），可以使用 RcppRoll 包。

library(RcppRoll)
# 3-year moving average per product. The window is right-aligned, so the
# first two years of every product are NA (fill = NA).
df.moving.window <- df %>%
  # Keep the product names (row names) as an ordinary column
  rownames_to_column("product") %>%
  # Wide -> long: one row per product per year
  pivot_longer(-product, names_to = "year", values_to = "value") %>%
  # Order chronologically within each product; compare years as numbers so
  # the order does not depend on string sorting of the column names
  arrange(product, as.numeric(year)) %>%
  # Rolling mean over the current year and the two preceding ones
  group_by(product) %>%
  mutate(value = roll_mean(value, n = 3, align = "right", fill = NA)) %>%
  # Drop the grouping before reshaping so the result is a plain table
  ungroup() %>%
  # Long -> wide again: one column per year
  pivot_wider(names_from = year, values_from = value)

Answer 2

或者，这可以使用 base R 解决：

# Centre-indexed variant: `mid` walks over the middle column of each triple
# (2, 5, 8, ...) and the mean is taken over mid - 1, mid, mid + 1.
sapply(
  seq(2, ncol(mydf), by = 3),
  function(mid) rowMeans(mydf[, (mid - 1):(mid + 1)])
)

或

# Start-indexed variant: j is the first column of each group of three.
# The end index is clamped to the last column, so a shorter trailing group is
# averaged over the columns that exist instead of failing with
# "undefined columns selected"; drop = FALSE keeps a one-column group
# two-dimensional, which rowMeans() requires. When ncol(mydf) is a multiple
# of 3 the result is the same matrix as before.
sapply(seq(1, ncol(mydf), 3), function(j) {
  rowMeans(mydf[, j:min(j + 2, ncol(mydf)), drop = FALSE])
})

                                      [,1]      [,2]      [,3]
Almonds, with shell               0.000000  0.000000  0.000000
Anise, badian, fennel, coriander  0.000000  0.000000  0.000000
Apples                            0.000000  0.000000  0.000000
Apricots                          0.000000  0.000000  0.000000
Areca nuts                        0.000000  0.000000  0.000000
Asparagus                         0.000000  0.000000  0.000000
Avocados                          0.000000  0.000000  0.000000
Bananas                           0.000000  0.000000  0.000000
Barley                            4.028333  2.954667  2.924333
Bastfibres, other                58.240667 36.418333 35.108333
Beans, dry                        0.000000  0.000000  0.000000
Beans, green                      0.000000  0.000000  0.000000
Berries nes                       0.000000  0.000000  0.000000
Broad beans, horse beans, dry     0.000000  0.000000  0.000000
Buckwheat                         0.000000  0.000000  0.000000
Cabbages and other brassicas      0.000000  0.000000  0.000000
Carrots and turnips               0.000000  0.000000  0.000000
Cashew nuts, with shell          28.624000 27.536333 27.876000
Cassava                           2.915333  4.078000  5.156667
Castor oil seed                  17.907000 13.637000 14.888000

这样做的好处是聚合仅基于列的位置，而不依赖列名。这与 Answer 1 的做法相反，后者要求列名能够被强制转换为连续的整数序列，即年份序列。

不过，上面的代码返回的是一个矩阵，而不是 data.frame。此外，提问者还希望能以字符串数组的形式传入新变量的名称。

library(magrittr)   # piping used to improve readability
# Names for the resulting columns, one per group of three source columns
new_cols <- c("Mean_A", "Mean_B", "Mean_C")
# Start from the first column index of every group, average each group
# row-wise, then turn the resulting matrix into a named data.frame.
seq(1, ncol(mydf), by = 3) %>%
  sapply(function(first) rowMeans(mydf[, first:(first + 2)])) %>%
  as.data.frame() %>%
  set_names(new_cols)

                                    Mean_A    Mean_B    Mean_C
Almonds, with shell               0.000000  0.000000  0.000000
Anise, badian, fennel, coriander  0.000000  0.000000  0.000000
Apples                            0.000000  0.000000  0.000000
Apricots                          0.000000  0.000000  0.000000
Areca nuts                        0.000000  0.000000  0.000000
Asparagus                         0.000000  0.000000  0.000000
Avocados                          0.000000  0.000000  0.000000
Bananas                           0.000000  0.000000  0.000000
Barley                            4.028333  2.954667  2.924333
Bastfibres, other                58.240667 36.418333 35.108333
Beans, dry                        0.000000  0.000000  0.000000
Beans, green                      0.000000  0.000000  0.000000
Berries nes                       0.000000  0.000000  0.000000
Broad beans, horse beans, dry     0.000000  0.000000  0.000000
Buckwheat                         0.000000  0.000000  0.000000
Cabbages and other brassicas      0.000000  0.000000  0.000000
Carrots and turnips               0.000000  0.000000  0.000000
Cashew nuts, with shell          28.624000 27.536333 27.876000
Cassava                           2.915333  4.078000  5.156667
Castor oil seed                  17.907000 13.637000 14.888000

顺便说一句：重新考虑数据结构

数据集看起来更像矩阵而不是 data.frame，即所有列都是相同的数据类型。否则，就无法通过跨列的方式进行聚合。也许，数据应该被视为矩阵，我们可以从矩阵运算中受益，比如 rowMeans().

矩阵的行和列也可以命名：

library(magrittr)
# Names for the resulting columns, one per group of three source columns
new_cols <- c("Mean_A", "Mean_B", "Mean_C")
# Same aggregation as before, but the result stays a matrix; only its
# column names are set.
seq(1, ncol(mydf), by = 3) %>%
  sapply(function(first) rowMeans(mydf[, first:(first + 2)])) %>%
  set_colnames(new_cols)

                                    Mean_A    Mean_B    Mean_C
Almonds, with shell               0.000000  0.000000  0.000000
Anise, badian, fennel, coriander  0.000000  0.000000  0.000000
Apples                            0.000000  0.000000  0.000000
Apricots                          0.000000  0.000000  0.000000
Areca nuts                        0.000000  0.000000  0.000000
Asparagus                         0.000000  0.000000  0.000000
Avocados                          0.000000  0.000000  0.000000
Bananas                           0.000000  0.000000  0.000000
Barley                            4.028333  2.954667  2.924333
Bastfibres, other                58.240667 36.418333 35.108333
Beans, dry                        0.000000  0.000000  0.000000
Beans, green                      0.000000  0.000000  0.000000
Berries nes                       0.000000  0.000000  0.000000
Broad beans, horse beans, dry     0.000000  0.000000  0.000000
Buckwheat                         0.000000  0.000000  0.000000
Cabbages and other brassicas      0.000000  0.000000  0.000000
Carrots and turnips               0.000000  0.000000  0.000000
Cashew nuts, with shell          28.624000 27.536333 27.876000
Cassava                           2.915333  4.078000  5.156667
Castor oil seed                  17.907000 13.637000 14.888000

打印输出看起来类似于 data.frame 解决方案，但底层数据结构现在是一个矩阵。

或者，数据可以在重塑（reshape）之后以长格式存储（这正是 Answer 1 中调用 gather() 所做的事情）。这样，列名就变成了数据本身，可以像普通数据一样进行操作。

Answer 3

这是另一种比较稳健的方法：

# Number of columns averaged into each new column
n <- 3
# First column index of every group of n columns
i <- seq(1, length(DF), n)

# Start from the row names so they survive as an ordinary column
DF2 <- data.frame(nut = rownames(DF))
# lapply() returns one vector of row means per group; the list is assigned
# to the new columns NewCol1, NewCol2, ...
DF2[, paste0("NewCol", seq_along(i))] <- lapply(i, function(j) {
  # The group width follows n (it was hard-coded as j + 2 before). min()
  # clamps the last group to the final column, and drop = FALSE keeps a
  # one-column group as a data.frame, which rowMeans() requires.
  rowMeans(DF[, j:min(j + n - 1, length(DF)), drop = FALSE])
})

DF2

                                nut   NewCol1   NewCol2   NewCol3
1               Almonds, with shell  0.000000  0.000000  0.000000
2  Anise, badian, fennel, coriander  0.000000  0.000000  0.000000
3                            Apples  0.000000  0.000000  0.000000
4                          Apricots  0.000000  0.000000  0.000000
5                        Areca nuts  0.000000  0.000000  0.000000
6                         Asparagus  0.000000  0.000000  0.000000
7                          Avocados  0.000000  0.000000  0.000000
8                           Bananas  0.000000  0.000000  0.000000
9                            Barley  4.028333  2.954667  2.924333
10                Bastfibres, other 58.240667 36.418333 35.108333
11                       Beans, dry  0.000000  0.000000  0.000000
12                     Beans, green  0.000000  0.000000  0.000000
13                      Berries nes  0.000000  0.000000  0.000000
14    Broad beans, horse beans, dry  0.000000  0.000000  0.000000
15                        Buckwheat  0.000000  0.000000  0.000000
16     Cabbages and other brassicas  0.000000  0.000000  0.000000
17              Carrots and turnips  0.000000  0.000000  0.000000
18          Cashew nuts, with shell 28.624000 27.536333 27.876000
19                          Cassava  2.915333  4.078000  5.156667
20                  Castor oil seed 17.907000 13.637000 14.888000

需要指出的是，输出是一个 data.frame。lapply() 函数返回一个列表，随后该列表的各个元素被赋值给 DF2 中的新列。

最重要的是 j:min(..., length(DF)) 中的 min()：它把每组的结束列限制在数据框的最后一列之内，因此无论最后一组有 2 列还是 3 列，代码都能正常运行。

Answer 4

有趣的问题！这里用 purrr 和 hablar 再尝试一种做法：先为每 3 列创建一个整数索引向量并组成列表，然后对每组 3 列计算行均值并合并成新的 df，最后从原始 df 复制行名。

代码

library(tidyverse)
library(hablar)
library(magrittr)

# Build one integer vector of column positions per group of three columns
# (1:3, 4:6, ...). Every vector has exactly three entries, so this assumes
# ncol(df) is a multiple of 3; otherwise the last vector runs past the
# final column.
l <- map(seq(1, ncol(df), 3), ~seq(.x, .x + 2))

# For each index vector, keep a single column `mean` computed by hablar's
# row_mean_() over those columns (presumably the row-wise mean - confirm
# against the hablar documentation), column-bind the per-group results, and
# copy the row names over from the original df.
map_dfc(l, ~df %>% transmute(mean = row_mean_(.x))) %>% 
  set_rownames(rownames(df))

结果

                                      mean     mean1     mean2
Almonds, with shell               0.000000  0.000000  0.000000
Anise, badian, fennel, coriander  0.000000  0.000000  0.000000
Apples                            0.000000  0.000000  0.000000
Apricots                          0.000000  0.000000  0.000000
Areca nuts                        0.000000  0.000000  0.000000
Asparagus                         0.000000  0.000000  0.000000
Avocados                          0.000000  0.000000  0.000000
Bananas                           0.000000  0.000000  0.000000
Barley                            4.028333  2.954667  2.924333
Bastfibres, other                58.240667 36.418333 35.108333
Beans, dry                        0.000000  0.000000  0.000000
Beans, green                      0.000000  0.000000  0.000000
Berries nes                       0.000000  0.000000  0.000000
Broad beans, horse beans, dry     0.000000  0.000000  0.000000
Buckwheat                         0.000000  0.000000  0.000000
Cabbages and other brassicas      0.000000  0.000000  0.000000
Carrots and turnips               0.000000  0.000000  0.000000
Cashew nuts, with shell          28.624000 27.536333 27.876000
Cassava                           2.915333  4.078000  5.156667
Castor oil seed                  17.907000 13.637000 14.888000

查找R中数据框中每3列的平均值

Finding average for every 3 columns in dataframe in R

r

mean

multiple-columns

顺便说一句：重新考虑数据结构