如何重塑数据结构
How to reshape the data structure
这可能是一个简单的问题,但我没有在“类似问题”中找到解决方案——虽然肯定有人问过。不管怎样,如果我的问题在其他地方已有答案,请告诉我。
现在,我的问题。我有一个看起来像这样的数据:
但我想这样构造它:
换句话说,我想要国家-年份结构:每个国家、每一年只保留一行,不再出现同一国家同一年份的重复观测。任何建议都会很棒!
尝试:
library(tidyverse)

# Collapse duplicate country-year rows by summing Air_pollution within each
# (country_coo, country, year) combination; NA values are skipped.
# `.groups = "drop_last"` spells out the regrouping explicitly, so the result
# stays grouped by country_coo and country (as the printed output shows).
new <- df %>%
  group_by(country_coo, country, year) %>%
  summarise(
    Air_pollution = sum(Air_pollution, na.rm = TRUE),
    .groups = "drop_last"
  )
输出:
# A tibble: 6 x 4
# Groups: country_coo, country [4]
country_coo country year Air_pollution
<dbl> <chr> <dbl> <dbl>
1 22 A 2000 6
2 22 A 2001 1
3 44 B 2000 2
4 66 C 2000 10
5 88 D 2000 7
6 88 D 2001 15
使用了一些数据:
# Data: example input with repeated (country_coo, country, year) rows.
df <- data.frame(
  country_coo = rep(c(22, 44, 66, 88), times = c(3, 2, 1, 4)),
  country = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 4)),
  year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000, 2001, 2001, 2001),
  Air_pollution = c(5, 1, 1, 1, 1, 10, 7, 5, 5, 5),
  stringsAsFactors = FALSE
)
我们可以使用 base R 中的 aggregate(不需要额外的包):
# Base R, no packages: `.` stands for every column other than Air_pollution,
# so the values are summed within each combination of the remaining columns.
aggregate(
  Air_pollution ~ .,
  data = df,
  FUN = sum
)
-输出
# country_code country year Air_pollution
#1 22 A 2000 6
#2 44 B 2000 2
#3 66 C 2000 10
#4 88 D 2000 7
#5 22 A 2001 2
#6 88 D 2001 15
或 dplyr
library(dplyr)

# One row per country_code/country/year with Air_pollution summed;
# `.groups = "drop"` returns an ungrouped result.
df %>%
  group_by(country_code, country, year) %>%
  summarise(
    Air_pollution = sum(Air_pollution),
    .groups = "drop"
  )
-输出
# A tibble: 6 x 4
# country_code country year Air_pollution
# <dbl> <chr> <dbl> <dbl>
#1 22 A 2000 6
#2 22 A 2001 2
#3 44 B 2000 2
#4 66 C 2000 10
#5 88 D 2000 7
#6 88 D 2001 15
数据
# Example input with repeated (country_code, country, year) rows.
df <- data.frame(
  country_code = rep(c(22, 44, 66, 88), times = c(3, 2, 1, 4)),
  country = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 4)),
  year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000, 2001, 2001, 2001),
  Air_pollution = c(5, 1, 2, 1, 1, 10, 7, 5, 5, 5),
  stringsAsFactors = FALSE
)
一个 data.table 方案:
library(data.table)

# setDT() converts df to a data.table by reference, so df itself is modified.
# `by = country_code:year` groups by the consecutive columns from country_code
# through year (country_code, country, year); Air_pollution is summed per group.
setDT(df)[
  , .(Air_pollution = sum(Air_pollution)),
  by = country_code:year
]
country_code country year Air_pollution
1: 22 A 2000 6
2: 22 A 2001 2
3: 44 B 2000 2
4: 66 C 2000 10
5: 88 D 2000 7
6: 88 D 2001 15
数据
# Example data in the form printed by dput(df), assigned to df so the snippet
# can be pasted and run directly.
df <- structure(list(country_code = c(22, 22, 22, 44, 44, 66, 88, 88,
88, 88), country = c("A", "A", "A", "B", "B", "C", "D", "D",
"D", "D"), year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000,
2001, 2001, 2001), Air_pollution = c(5, 1, 2, 1, 1, 10, 7, 5,
5, 5)), class = "data.frame", row.names = c(NA, -10L))
这可能是一个简单的问题,但我没有在“类似问题”中找到解决方案——虽然肯定有人问过。不管怎样,如果我的问题在其他地方已有答案,请告诉我。
现在,我的问题。我有一个看起来像这样的数据:
但我想这样构造它:
换句话说,我想要国家-年份结构:每个国家、每一年只保留一行,不再出现同一国家同一年份的重复观测。任何建议都会很棒!
尝试:
library(tidyverse)

# Collapse duplicate country-year rows by summing Air_pollution within each
# (country_coo, country, year) combination; NA values are skipped.
# `.groups = "drop_last"` spells out the regrouping explicitly, so the result
# stays grouped by country_coo and country (as the printed output shows).
new <- df %>%
  group_by(country_coo, country, year) %>%
  summarise(
    Air_pollution = sum(Air_pollution, na.rm = TRUE),
    .groups = "drop_last"
  )
输出:
# A tibble: 6 x 4
# Groups: country_coo, country [4]
country_coo country year Air_pollution
<dbl> <chr> <dbl> <dbl>
1 22 A 2000 6
2 22 A 2001 1
3 44 B 2000 2
4 66 C 2000 10
5 88 D 2000 7
6 88 D 2001 15
使用了一些数据:
# Data: example input with repeated (country_coo, country, year) rows.
df <- data.frame(
  country_coo = rep(c(22, 44, 66, 88), times = c(3, 2, 1, 4)),
  country = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 4)),
  year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000, 2001, 2001, 2001),
  Air_pollution = c(5, 1, 1, 1, 1, 10, 7, 5, 5, 5),
  stringsAsFactors = FALSE
)
我们可以使用 base R 中的 aggregate(不需要额外的包):
# Base R, no packages: `.` stands for every column other than Air_pollution,
# so the values are summed within each combination of the remaining columns.
aggregate(
  Air_pollution ~ .,
  data = df,
  FUN = sum
)
-输出
# country_code country year Air_pollution
#1 22 A 2000 6
#2 44 B 2000 2
#3 66 C 2000 10
#4 88 D 2000 7
#5 22 A 2001 2
#6 88 D 2001 15
或 dplyr
library(dplyr)

# One row per country_code/country/year with Air_pollution summed;
# `.groups = "drop"` returns an ungrouped result.
df %>%
  group_by(country_code, country, year) %>%
  summarise(
    Air_pollution = sum(Air_pollution),
    .groups = "drop"
  )
-输出
# A tibble: 6 x 4
# country_code country year Air_pollution
# <dbl> <chr> <dbl> <dbl>
#1 22 A 2000 6
#2 22 A 2001 2
#3 44 B 2000 2
#4 66 C 2000 10
#5 88 D 2000 7
#6 88 D 2001 15
数据
# Example input with repeated (country_code, country, year) rows.
df <- data.frame(
  country_code = rep(c(22, 44, 66, 88), times = c(3, 2, 1, 4)),
  country = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 4)),
  year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000, 2001, 2001, 2001),
  Air_pollution = c(5, 1, 2, 1, 1, 10, 7, 5, 5, 5),
  stringsAsFactors = FALSE
)
一个 data.table 方案:
library(data.table)

# setDT() converts df to a data.table by reference, so df itself is modified.
# `by = country_code:year` groups by the consecutive columns from country_code
# through year (country_code, country, year); Air_pollution is summed per group.
setDT(df)[
  , .(Air_pollution = sum(Air_pollution)),
  by = country_code:year
]
country_code country year Air_pollution
1: 22 A 2000 6
2: 22 A 2001 2
3: 44 B 2000 2
4: 66 C 2000 10
5: 88 D 2000 7
6: 88 D 2001 15
数据
# Example data in the form printed by dput(df), assigned to df so the snippet
# can be pasted and run directly.
df <- structure(list(country_code = c(22, 22, 22, 44, 44, 66, 88, 88,
88, 88), country = c("A", "A", "A", "B", "B", "C", "D", "D",
"D", "D"), year = c(2000, 2000, 2001, 2000, 2000, 2000, 2000,
2001, 2001, 2001), Air_pollution = c(5, 1, 2, 1, 1, 10, 7, 5,
5, 5)), class = "data.frame", row.names = c(NA, -10L))