如何重塑数据结构

How to reshape the data structure

这可能是一个简单的问题,但我在“类似问题”中没有找到解决方案——尽管肯定有人问过。不管怎样,如果我的问题在别处已有答案,请告诉我。

现在,我的问题。我有一个看起来像这样的数据:

但我想这样构造它:

换句话说,我想要国家-年份结构:每个国家、每个年份只保留一行,不再出现同一国家同一年份的重复观测。任何建议都会很棒!

尝试:

library(tidyverse)
# Collapse repeated country-year rows: one row per
# (country_coo, country, year) holding the summed Air_pollution.
new <- df %>%
  group_by(country_coo, country, year) %>%
  summarise(
    # TRUE instead of T: T is an ordinary variable that can be reassigned.
    Air_pollution = sum(Air_pollution, na.rm = TRUE),
    # Spell out the default so no message is printed; the result stays
    # grouped by country_coo and country.
    .groups = "drop_last"
  )

输出:

# A tibble: 6 x 4
# Groups:   country_coo, country [4]
  country_coo country  year Air_pollution
        <dbl> <chr>   <dbl>         <dbl>
1          22 A        2000             6
2          22 A        2001             1
3          44 B        2000             2
4          66 C        2000            10
5          88 D        2000             7
6          88 D        2001            15

使用了一些数据:

# Data
# Toy panel in which some country-year combinations occur more than once.
df <- data.frame(
  country_coo = rep(c(22, 44, 66, 88), times = c(3, 2, 1, 4)),
  country = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 4)),
  year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000, 2001, 2001, 2001),
  Air_pollution = c(5, 1, 1, 1, 1, 10, 7, 5, 5, 5),
  stringsAsFactors = FALSE
)

我们可以使用 base R 中的 aggregate(不需要包)

# Sum Air_pollution for every distinct country_code/country/year combination.
# The grouping columns are written out instead of using the `.` shorthand.
aggregate(Air_pollution ~ country_code + country + year, data = df, FUN = sum)

-输出

#   country_code country year Air_pollution
#1           22       A 2000             6
#2           44       B 2000             2
#3           66       C 2000            10
#4           88       D 2000             7
#5           22       A 2001             2
#6           88       D 2001            15

dplyr

library(dplyr)
# One row per (country_code, country, year) with the summed Air_pollution;
# the grouping is dropped from the result.
summarise(
  group_by(df, country_code, country, year),
  Air_pollution = sum(Air_pollution),
  .groups = "drop"
)

-输出

# A tibble: 6 x 4
#  country_code country  year Air_pollution
#         <dbl> <chr>   <dbl>         <dbl>
#1           22 A        2000             6
#2           22 A        2001             2
#3           44 B        2000             2
#4           66 C        2000            10
#5           88 D        2000             7
#6           88 D        2001            15

数据

# Example input: ten rows, with several repeated country-year pairs.
df <- data.frame(
  country_code = rep(c(22, 44, 66, 88), times = c(3, 2, 1, 4)),
  country = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 4)),
  year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000, 2001, 2001, 2001),
  Air_pollution = c(5, 1, 2, 1, 1, 10, 7, 5, 5, 5),
  stringsAsFactors = FALSE
)

一个data.table选项

# setDT() converts df to a data.table by reference; the grouping columns
# are listed explicitly instead of as a column range.
> setDT(df)[, .(Air_pollution = sum(Air_pollution)), by = .(country_code, country, year)]
   country_code country year Air_pollution
1:           22       A 2000             6
2:           22       A 2001             2
3:           44       B 2000             2
4:           66       C 2000            10
5:           88       D 2000             7
6:           88       D 2001            15

数据

# dput() prints an R expression that recreates df; the structure(...) call
# that follows is that printed output.
> dput(df)
structure(list(country_code = c(22, 22, 22, 44, 44, 66, 88, 88,
88, 88), country = c("A", "A", "A", "B", "B", "C", "D", "D", 
"D", "D"), year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000,
2001, 2001, 2001), Air_pollution = c(5, 1, 2, 1, 1, 10, 7, 5,
5, 5)), class = "data.frame", row.names = c(NA, -10L))