使用 R 折叠不同列中具有不同值的重复行
Collapse duplicated rows with different values in different columns using R
我有一个包含 500 个观测值的数据框,但在示例中只展示了 3 个。这些行是重复项:除 ID 列(同一个人会重复出现)之外,其他列的取值各不相同。下面给出了数据框当前的样子 (df) 以及处理后期望得到的样子 (df_new)。这可以做到吗?数据框只有 10 个变量,所以我不担心把它们“翻倍”('doubling')。变量的取值为 a、b、c、d、0、''。不过,我在示例表中把它们写得更一般化。
# Example input: every ID appears on two rows, and the remaining columns
# differ between the two rows of an ID.
df <- data.frame(
  ID = rep(c("1", "2", "3"), each = 2),
  Year = c(
    "smaller year.1", "bigger year.1",
    "bigger year.2", "smaller year.2",
    "same year.3", "same year.3"
  ),
  V1 = letters[1:6],
  V2 = letters[7:12],
  Vn = paste0("n", 1:6)
)
# Desired result: one row per ID. The two year labels go into
# Year_smaller / Year_bigger, and each other column is split into a pair
# (first row of the ID, second row of the ID).
df_new <- data.frame(
  ID = c("1", "2", "3"),
  Year_smaller = c("smaller year.1", "smaller year.2", "same year.3"),
  Year_bigger = c("bigger year.1", "bigger year.2", "same year.3"),
  V1 = letters[c(1, 3, 5)],
  V1.1 = letters[c(2, 4, 6)],
  V2 = letters[c(7, 9, 11)],
  V2.1 = letters[c(8, 10, 12)],
  Vn = paste0("n", c(1, 3, 5)),
  Vn.1 = paste0("n", c(2, 4, 6))
)
这是我不太优雅的解法,也许能给你一些思路。
library(tidyverse)
# Step 1: build one row per ID with its two year labels side by side.
# The first word of Year ("smaller" / "bigger") becomes the new column name.
# Rows whose Year starts with "same" are removed before pivoting, and the
# row for ID "3" is then appended by hand.
year_df <- df %>%
  filter(!str_detect(Year, "^same")) %>%
  select(ID, Year) %>%
  mutate(year_group = map_chr(str_split(Year, " "), 1L)) %>%
  pivot_wider(names_from = year_group, values_from = Year) %>%
  add_row(ID = "3", smaller = "same year.3", bigger = "same year.3")

# Step 2: stack every non-year column into name/value pairs, derive a suffix
# from the last character of the column name and of the value, spread back
# out to one row per ID, and attach the year columns right after ID.
df_new <- df %>%
  select(-Year) %>%
  pivot_longer(cols = -ID) %>%
  mutate(
    # `group` is computed first: it reads `name` before the "_x" part is removed.
    group = paste0(str_sub(name, -1), str_sub(value, -1)),
    name = str_remove(name, "_[a-z]")
  ) %>%
  pivot_wider(names_from = c(name, group), values_from = value) %>%
  left_join(year_df, by = "ID") %>%
  relocate(smaller, bigger, .after = ID)

df_new
根据修改后的要求编辑了数据。由于在字母表中 b 出现在 s 之前,因此 bigger_year 显示在 smaller_year 之前;不过在实际数据中,年份会被正确排序。如果您确实想把这样的字符串按降序排列,请使用 `sort(Year, decreasing = TRUE)` 而不是 `sort(Year)`(`sort(desc(Year))` 返回的是数值排序键,而不是原来的字符串)。
# Example input: every ID appears on two rows, and the remaining columns
# differ between the two rows of an ID.
df <- data.frame(
  ID = rep(c("1", "2", "3"), each = 2),
  Year = c(
    "smaller year.1", "bigger year.1",
    "bigger year.2", "smaller year.2",
    "same year.3", "same year.3"
  ),
  V1 = letters[1:6],
  V2 = letters[7:12],
  Vn = paste0("n", 1:6)
)
library(tidyverse)
# Within each ID, sort the Year labels on their own (the other columns keep
# their row order), number the rows, and spread every column from Year to Vn
# into one column per row number (Year1, Year2, V11, V12, ...).
df %>%
  group_by(ID) %>%
  mutate(
    Year = sort(Year),
    rid = row_number()
  ) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Vn,
    names_sep = ""
  )
#> # A tibble: 3 x 9
#> # Groups: ID [3]
#> ID Year1 Year2 V11 V12 V21 V22 Vn1 Vn2
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 bigger year.1 smaller year.1 a b g h n1 n2
#> 2 2 bigger year.2 smaller year.2 c d i j n3 n4
#> 3 3 same year.3 same year.3 e f k l n5 n6
由 reprex 包 (v2.0.0) 于 2021-06-19 创建
library(tidyverse)
# Number the rows inside each ID (in input order) and spread every column
# from Year to Variable_n into one column per row number.
df %>%
  group_by(ID) %>%
  mutate(rid = seq_len(n())) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Variable_n,
    names_sep = ""
  )
# A tibble: 3 x 9
# Groups: ID [3]
ID Year1 Year2 Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1 Variable_n2
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 smaller year.1 bigger year.1 va11 va12 vb11 vb12 vn11 vn12
2 2 bigger year.2 smaller year.2 va21 va22 vb21 vb22 vn21 vn22
3 3 same year.3 same year.3 va31 va32 vb31 vb32 vn31 vn32
你是这个意思吗?
# Order all rows by Year descending (the sort is global, not per ID), then
# number the rows inside each ID and spread Year..Variable_n into one column
# per row number.
df %>%
  arrange(desc(Year)) %>%
  group_by(ID) %>%
  mutate(rid = row_number()) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Variable_n,
    names_sep = ""
  )
# A tibble: 3 x 9
# Groups: ID [3]
ID Year1 Year2 Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1 Variable_n2
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2 smaller year.2 bigger year.2 va22 va21 vb22 vb21 vn22 vn21
2 1 smaller year.1 bigger year.1 va11 va12 vb11 vb12 vn11 vn12
3 3 same year.3 same year.3 va31 va32 vb31 vb32 vn31 vn32
这是一个 data.table 方案:
library(data.table)
# Names of all columns containing "Variable"; they are cast together with Year.
cols <- names(df)[grepl("Variable", names(df))]
# setDT() converts df to a data.table by reference; rowid(ID) numbers the rows
# within each ID and supplies the suffix of the widened columns.
dcast(
  setDT(df),
  ID ~ rowid(ID),
  value.var = c("Year", cols)
)
# ID Year_1 Year_2 Variable_a_1 Variable_a_2 Variable_b_1
#1: 1 smaller year.1 bigger year.1 va11 va12 vb11
#2: 2 bigger year.2 smaller year.2 va21 va22 vb21
#3: 3 same year.3 same year.3 va31 va32 vb31
# Variable_b_2 Variable_n_1 Variable_n_2
#1: vb12 vn11 vn12
#2: vb22 vn21 vn22
#3: vb32 vn31 vn32
我们也可以采用以下解决方案:
library(dplyr)
library(tidyr)
library(purrr)
# Split the data by ID, widen each piece to a single row, and stack the
# resulting rows back together.
# The former trailing `select(starts_with("Year"))` was removed: it kept only
# Year_smaller / Year_bigger and dropped ID and the Variable_* columns.
df %>%
  group_split(ID) %>%
  map_dfr(~ .x %>%
    mutate(id = row_number()) %>%
    pivot_wider(
      names_from = id,
      values_from = c(Year, Variable_a, Variable_b, Variable_n),
      names_sep = ""
    ) %>%
    # NOTE: rows are numbered in input order, so Year1 is the year of the
    # first row of the ID, not necessarily the smaller one.
    rename(
      Year_smaller = Year1,
      Year_bigger = Year2
    ))
# A tibble: 3 x 9
ID Year_smaller Year_bigger Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 smaller year.1 bigger year.1 va11 va12 vb11 vb12 vn11
2 2 bigger year.2 smaller year.2 va21 va22 vb21 vb22 vn21
3 3 same year.3 same year.3 va31 va32 vb31 vb32 vn31
# ... with 1 more variable: Variable_n2 <chr>
我有一个包含 500 个观测值的数据框,但在示例中只展示了 3 个。这些行是重复项:除 ID 列(同一个人会重复出现)之外,其他列的取值各不相同。下面给出了数据框当前的样子 (df) 以及处理后期望得到的样子 (df_new)。这可以做到吗?数据框只有 10 个变量,所以我不担心把它们“翻倍”('doubling')。变量的取值为 a、b、c、d、0、''。不过,我在示例表中把它们写得更一般化。
# Example input: every ID appears on two rows, and the remaining columns
# differ between the two rows of an ID.
df <- data.frame(
  ID = rep(c("1", "2", "3"), each = 2),
  Year = c(
    "smaller year.1", "bigger year.1",
    "bigger year.2", "smaller year.2",
    "same year.3", "same year.3"
  ),
  V1 = letters[1:6],
  V2 = letters[7:12],
  Vn = paste0("n", 1:6)
)
# Desired result: one row per ID. The two year labels go into
# Year_smaller / Year_bigger, and each other column is split into a pair
# (first row of the ID, second row of the ID).
df_new <- data.frame(
  ID = c("1", "2", "3"),
  Year_smaller = c("smaller year.1", "smaller year.2", "same year.3"),
  Year_bigger = c("bigger year.1", "bigger year.2", "same year.3"),
  V1 = letters[c(1, 3, 5)],
  V1.1 = letters[c(2, 4, 6)],
  V2 = letters[c(7, 9, 11)],
  V2.1 = letters[c(8, 10, 12)],
  Vn = paste0("n", c(1, 3, 5)),
  Vn.1 = paste0("n", c(2, 4, 6))
)
这是我不太优雅的解法,也许能给你一些思路。
library(tidyverse)
# Step 1: build one row per ID with its two year labels side by side.
# The first word of Year ("smaller" / "bigger") becomes the new column name.
# Rows whose Year starts with "same" are removed before pivoting, and the
# row for ID "3" is then appended by hand.
year_df <- df %>%
  filter(!str_detect(Year, "^same")) %>%
  select(ID, Year) %>%
  mutate(year_group = map_chr(str_split(Year, " "), 1L)) %>%
  pivot_wider(names_from = year_group, values_from = Year) %>%
  add_row(ID = "3", smaller = "same year.3", bigger = "same year.3")

# Step 2: stack every non-year column into name/value pairs, derive a suffix
# from the last character of the column name and of the value, spread back
# out to one row per ID, and attach the year columns right after ID.
df_new <- df %>%
  select(-Year) %>%
  pivot_longer(cols = -ID) %>%
  mutate(
    # `group` is computed first: it reads `name` before the "_x" part is removed.
    group = paste0(str_sub(name, -1), str_sub(value, -1)),
    name = str_remove(name, "_[a-z]")
  ) %>%
  pivot_wider(names_from = c(name, group), values_from = value) %>%
  left_join(year_df, by = "ID") %>%
  relocate(smaller, bigger, .after = ID)

df_new
根据修改后的要求编辑了数据。由于在字母表中 b 出现在 s 之前,因此 bigger_year 显示在 smaller_year 之前;不过在实际数据中,年份会被正确排序。如果您确实想把这样的字符串按降序排列,请使用 `sort(Year, decreasing = TRUE)` 而不是 `sort(Year)`(`sort(desc(Year))` 返回的是数值排序键,而不是原来的字符串)。
# Example input: every ID appears on two rows, and the remaining columns
# differ between the two rows of an ID.
df <- data.frame(
  ID = rep(c("1", "2", "3"), each = 2),
  Year = c(
    "smaller year.1", "bigger year.1",
    "bigger year.2", "smaller year.2",
    "same year.3", "same year.3"
  ),
  V1 = letters[1:6],
  V2 = letters[7:12],
  Vn = paste0("n", 1:6)
)
library(tidyverse)
# Within each ID, sort the Year labels on their own (the other columns keep
# their row order), number the rows, and spread every column from Year to Vn
# into one column per row number (Year1, Year2, V11, V12, ...).
df %>%
  group_by(ID) %>%
  mutate(
    Year = sort(Year),
    rid = row_number()
  ) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Vn,
    names_sep = ""
  )
#> # A tibble: 3 x 9
#> # Groups: ID [3]
#> ID Year1 Year2 V11 V12 V21 V22 Vn1 Vn2
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 bigger year.1 smaller year.1 a b g h n1 n2
#> 2 2 bigger year.2 smaller year.2 c d i j n3 n4
#> 3 3 same year.3 same year.3 e f k l n5 n6
由 reprex 包 (v2.0.0) 于 2021-06-19 创建
library(tidyverse)
# Number the rows inside each ID (in input order) and spread every column
# from Year to Variable_n into one column per row number.
df %>%
  group_by(ID) %>%
  mutate(rid = seq_len(n())) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Variable_n,
    names_sep = ""
  )
# A tibble: 3 x 9
# Groups: ID [3]
ID Year1 Year2 Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1 Variable_n2
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 smaller year.1 bigger year.1 va11 va12 vb11 vb12 vn11 vn12
2 2 bigger year.2 smaller year.2 va21 va22 vb21 vb22 vn21 vn22
3 3 same year.3 same year.3 va31 va32 vb31 vb32 vn31 vn32
你是这个意思吗?
# Order all rows by Year descending (the sort is global, not per ID), then
# number the rows inside each ID and spread Year..Variable_n into one column
# per row number.
df %>%
  arrange(desc(Year)) %>%
  group_by(ID) %>%
  mutate(rid = row_number()) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Variable_n,
    names_sep = ""
  )
# A tibble: 3 x 9
# Groups: ID [3]
ID Year1 Year2 Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1 Variable_n2
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2 smaller year.2 bigger year.2 va22 va21 vb22 vb21 vn22 vn21
2 1 smaller year.1 bigger year.1 va11 va12 vb11 vb12 vn11 vn12
3 3 same year.3 same year.3 va31 va32 vb31 vb32 vn31 vn32
这是一个 data.table 方案:
library(data.table)
# Names of all columns containing "Variable"; they are cast together with Year.
cols <- names(df)[grepl("Variable", names(df))]
# setDT() converts df to a data.table by reference; rowid(ID) numbers the rows
# within each ID and supplies the suffix of the widened columns.
dcast(
  setDT(df),
  ID ~ rowid(ID),
  value.var = c("Year", cols)
)
# ID Year_1 Year_2 Variable_a_1 Variable_a_2 Variable_b_1
#1: 1 smaller year.1 bigger year.1 va11 va12 vb11
#2: 2 bigger year.2 smaller year.2 va21 va22 vb21
#3: 3 same year.3 same year.3 va31 va32 vb31
# Variable_b_2 Variable_n_1 Variable_n_2
#1: vb12 vn11 vn12
#2: vb22 vn21 vn22
#3: vb32 vn31 vn32
我们也可以采用以下解决方案:
library(dplyr)
library(tidyr)
library(purrr)
# Split the data by ID, widen each piece to a single row, and stack the
# resulting rows back together.
# The former trailing `select(starts_with("Year"))` was removed: it kept only
# Year_smaller / Year_bigger and dropped ID and the Variable_* columns.
df %>%
  group_split(ID) %>%
  map_dfr(~ .x %>%
    mutate(id = row_number()) %>%
    pivot_wider(
      names_from = id,
      values_from = c(Year, Variable_a, Variable_b, Variable_n),
      names_sep = ""
    ) %>%
    # NOTE: rows are numbered in input order, so Year1 is the year of the
    # first row of the ID, not necessarily the smaller one.
    rename(
      Year_smaller = Year1,
      Year_bigger = Year2
    ))
# A tibble: 3 x 9
ID Year_smaller Year_bigger Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 smaller year.1 bigger year.1 va11 va12 vb11 vb12 vn11
2 2 bigger year.2 smaller year.2 va21 va22 vb21 vb22 vn21
3 3 same year.3 same year.3 va31 va32 vb31 vb32 vn31
# ... with 1 more variable: Variable_n2 <chr>