r - 按第一行分组计算平均值和总和

r - Calculate mean and sum values grouped by the first row

我有一个数据框,它的第一行给出了每一列所属的组。我想按这些组分别计算所有 x 列的平均值和所有 y 列的总和。

The data frame to be calculated

下面的链接就是我想要的结果。 The expected result

这是数据。

# Example input. Every column is a factor; the first row carries the group
# label of each value column ("A", "B" or "C") and the remaining rows carry
# the yearly values.
dt <- data.frame(
  year = factor(c("group", "1980", "1981", "1982", "1985"),
                levels = c("1980", "1981", "1982", "1985", "group")),
  x1 = factor(c("A", "1", "4", "2", "1"), levels = c("1", "2", "4", "A")),
  y1 = factor(c("A", "1", "5", "3", "3"), levels = c("1", "3", "5", "A")),
  x2 = factor(c("A", "2", "6", "5", "4"),
              levels = c("2", "4", "5", "6", "A")),
  y2 = factor(c("A", "3", "7", "7", "5"), levels = c("3", "5", "7", "A")),
  x3 = factor(c("B", "4", "8", "6", "4"), levels = c("4", "6", "8", "B")),
  y3 = factor(c("B", "3", "6", "5", "3"), levels = c("3", "5", "6", "B")),
  x4 = factor(c("C", "2", "5", "4", "5"), levels = c("2", "4", "5", "C")),
  y4 = factor(c("C", "3", "4", "5", "6"),
              levels = c("3", "4", "5", "6", "C")),
  x5 = factor(c("C", "4", "3", "6", "7"),
              levels = c("3", "4", "6", "7", "C")),
  y5 = factor(c("C", "5", "2", "8", "5"), levels = c("2", "5", "8", "C"))
)

预期结果,

# Target output: one row per year, with the mean of the x columns and the sum
# of the y columns for each group (A, B, C).
result_expected <- data.frame(
  year = c(1980L, 1981L, 1982L, 1985L),
  A_x_mean = c(1.5, 5, 3.5, 2.5),
  A_y_sum = c(4L, 12L, 10L, 8L),
  B_x_mean = c(4L, 8L, 6L, 4L),
  B_y_sum = c(3L, 6L, 5L, 3L),
  C_x_mean = c(3L, 4L, 5L, 6L),
  C_y_sum = c(8L, 6L, 13L, 11L)
)

我在 Google 和 Stack Overflow 上用关键词搜索过,但没有找到合适的答案。我目前的想法是先从第一行得到唯一的组 A、B、C。

# Attach the tidyverse. library() stops with an error if the package is
# missing, whereas require() only returns FALSE and lets the script carry on.
library(tidyverse)

# Collect every distinct cell value of `dt`, sorted. gather() is called
# without a column selection, so it stacks all columns (including `year`) and
# all rows; the result therefore holds the years and the numbers as well as
# the group labels found in the first row.
group_variables <- dt %>%
  gather(key, value) %>%
  distinct(value) %>%
  arrange(value)

然后通过 for 循环遍历 group_variables 中的各行:
# Loop over the distinct values. Iterating over the data frame itself would
# walk its columns rather than its rows, so the `value` column is used.
for (i in group_variables$value) {
  # ...... (per-group computation goes here)
}

或者我可以通过 tidyr 中的 gather/spread 以及 dplyr 的方法来改变数据框的结构,就像下面的代码一样,

# Sketch: summarise a long-format data frame `dt_new` (not defined in this
# snippet) per group. `na.rm` must be the logical constant TRUE.
dt_new %>%
  group_by(group) %>%
  summarise(mean = mean(x, na.rm = TRUE),
            sum = sum(x, na.rm = TRUE))

一种方法是先把因子(factor)转换为字符,然后把第一行用作列名(并删除第一行)。然后我使用 dplyr/tidyr 做了一些数据操作:先按年份和字母把数据转换成长格式,求和与求均值之后,再把数据转换回宽格式。

# Example input. Every column is a factor; the first row carries the group
# label of each value column ("A", "B" or "C") and the remaining rows carry
# the yearly values.
dt <- data.frame(
  year = factor(c("group", "1980", "1981", "1982", "1985"),
                levels = c("1980", "1981", "1982", "1985", "group")),
  x1 = factor(c("A", "1", "4", "2", "1"), levels = c("1", "2", "4", "A")),
  y1 = factor(c("A", "1", "5", "3", "3"), levels = c("1", "3", "5", "A")),
  x2 = factor(c("A", "2", "6", "5", "4"),
              levels = c("2", "4", "5", "6", "A")),
  y2 = factor(c("A", "3", "7", "7", "5"), levels = c("3", "5", "7", "A")),
  x3 = factor(c("B", "4", "8", "6", "4"), levels = c("4", "6", "8", "B")),
  y3 = factor(c("B", "3", "6", "5", "3"), levels = c("3", "5", "6", "B")),
  x4 = factor(c("C", "2", "5", "4", "5"), levels = c("2", "4", "5", "C")),
  y4 = factor(c("C", "3", "4", "5", "6"),
              levels = c("3", "4", "5", "6", "C")),
  x5 = factor(c("C", "4", "3", "6", "7"),
              levels = c("3", "4", "6", "7", "C")),
  y5 = factor(c("C", "5", "2", "8", "5"), levels = c("2", "5", "8", "C"))
)

# Convert the factor columns to character so that row 1 yields the labels
# themselves rather than integer codes. Both sides of the assignment are
# restricted to the factor columns, so the replacement always lines up with
# the columns being replaced, even if some columns are not factors.
is_fct <- vapply(dt, is.factor, logical(1))
dt[is_fct] <- lapply(dt[is_fct], as.character)

# Promote the first row to the header, passed as a plain character vector.
# The resulting names are duplicated on purpose: "A", "B" and "C" each label
# several columns.
colnames(dt) <- unlist(dt[1, ], use.names = FALSE)

# Drop the former header row; only the yearly values remain.
dt2 <- dt[-1, ]

library(tidyverse)

# Reshape `dt2` to long form, then compute the mean and the sum of the values
# per group letter and year, and spread the result back to one row per year.
#
# NOTE: the header of `dt2` holds only the group letters, so the x/y
# distinction of the original columns is gone at this point; the mean and the
# sum below are both taken over all columns of a group together.
dt3 <- dt2 %>%
  # Pivot everything except the year column (named `group` after the header
  # swap) instead of hard-coding the group letters.
  pivot_longer(cols = -group, names_to = "letters") %>%
  # Depending on the tidyr version, pivoting duplicated column names may add
  # a `.copy` helper column; drop it only if it is present.
  select(-matches("^\\.copy$")) %>%
  mutate(value = as.numeric(value)) %>%
  group_by(letters, group) %>%
  summarize(meanval = mean(value),
            sumval = sum(value)) %>%
  ungroup() %>%
  pivot_wider(names_from = letters,
              values_from = c(meanval, sumval))

首先我们需要取出有组的第一行,把数据框变长,将x1,x2,x3简化为x等,然后把组放回去:

# Group label of every value column, keyed by the column name: taken from
# row 1 of `dt`, skipping the first (`year`) column. vapply() guarantees a
# named character vector with exactly one label per column.
group_var <- vapply(dt[1, -1], as.character, character(1))

# Long format: one row per year and original column.
#  - value: the cell converted to a number (via character, since the input
#           columns are factors)
#  - group: the label looked up from `group_var` by the original column name
#  - name:  reduced to its first character, i.e. "x" or "y" for x1, y1, ...
# The arguments of mutate() are evaluated in order, so `group` is looked up
# before `name` is shortened.
mat <- dt[-1, ] %>%
  pivot_longer(-year) %>%
  mutate(
    value = as.numeric(as.character(value)),
    group = as.character(group_var[as.character(name)]),
    name = substr(name, 1, 1)
  )

mat
# A tibble: 40 x 4
   year  name  value group
   <fct> <chr> <dbl> <chr>
 1 1980  x         1 A    
 2 1980  y         1 A    
 3 1980  x         2 A    
 4 1980  y         3 A    
 5 1980  x         4 B    
 6 1980  y         3 B    
 7 1980  x         2 C    
 8 1980  y         3 C    
 9 1980  x         4 C    
10 1980  y         5 C   

现在剩下的就是按年份、名称和组进行分组,并对每一部分应用各自的函数,所以我们定义一个函数:

#' Summarise values per group, name and year, one output column per label
#'
#' Groups `DF` by `group`, `name` and `year`, applies the summary function to
#' the remaining columns, builds a label of the form `<group>_<name>_<func>`
#' and spreads the summarised `value` into one column per label.
#'
#' @param DF A data frame with the columns `group`, `name`, `year` and
#'   `value`.
#' @param func The summary function, handed to `summarise_all()` and also
#'   pasted into the label; the calls in this file pass its name as a string
#'   such as "mean" or "sum".
#' @return A data frame with `year` plus one column per label.
func <- function(DF, func) {
  by_keys <- group_by(DF, group, name, year)
  summarised <- summarise_all(by_keys, func)
  labelled <- mutate(summarised, label = paste(group, name, func, sep = "_"))
  # Keep only what pivot_wider() needs: the id, the value and the new header.
  slim <- select(ungroup(labelled), year, value, label)
  pivot_wider(slim, values_from = value, names_from = label)
}

我们将它应用于两部分数据:

# Summarise the x columns with the mean and the y columns with the sum.
x_means <- func(mat %>% filter(name == "x"), "mean")
y_sums <- func(mat %>% filter(name == "y"), "sum")

# cbind() pairs rows purely by position, so first make sure both parts list
# the same years in the same order; then drop the second copy of `year` so
# that the combined result has a single `year` column.
stopifnot(identical(x_means$year, y_sums$year))
cbind(x_means, select(y_sums, -year))

year A_x_mean B_x_mean C_x_mean A_y_sum B_y_sum C_y_sum
1 1980      1.5        4        3       4       3       8
2 1981      5.0        8        4      12       6       6
3 1982      3.5        6        5      10       5      13
4 1985      2.5        4        6       8       3      11