使用 R 折叠不同列中具有不同值的重复行

Collapse duplicated rows with different values in different columns using R

我有一个包含 500 个观测值的数据框,示例中只展示了其中 3 个。这些行是重复记录:同一个人(ID 相同)出现多次,但其余各列的值各不相同。下面给出原始数据框 (df) 以及我希望处理后得到的结果 (df_new)。这可以实现吗?数据框只有 10 个变量,所以我不介意把这些列“翻倍”('doubling')。实际数据中变量的取值为 a、b、c、d、0、'',不过在示例中我使用了更一般化的值。

# Example input: every ID occurs twice, and the two rows of an ID differ in
# all remaining columns.
df <- data.frame(
  ID = as.character(rep(1:3, each = 2)),
  Year = c(
    "smaller year.1", "bigger year.1",
    "bigger year.2", "smaller year.2",
    "same year.3", "same year.3"
  ),
  V1 = letters[1:6],
  V2 = letters[7:12],
  Vn = paste0("n", 1:6)
)


# Desired result: one row per ID, with every original column spread into a
# pair of columns (value from the first row, value from the second row).
df_new <- data.frame(
  ID = as.character(1:3),
  Year_smaller = c("smaller year.1", "smaller year.2", "same year.3"),
  Year_bigger = c("bigger year.1", "bigger year.2", "same year.3"),
  V1 = letters[c(1, 3, 5)],
  V1.1 = letters[c(2, 4, 6)],
  V2 = letters[c(7, 9, 11)],
  V2.1 = letters[c(8, 10, 12)],
  Vn = paste0("n", c(1, 3, 5)),
  Vn.1 = paste0("n", c(2, 4, 6))
)

下面是我的一个不太优雅的解法,也许能给你一些思路。

library(tidyverse)

# Year lookup table: one row per ID with a `smaller` and a `bigger` column,
# the column names being the first word of each Year string.
year_df <- df %>%
  select(ID, Year) %>%
  # Rows whose Year starts with "same" are dropped here and re-added by hand
  # at the end of this pipeline.
  filter(!str_detect(Year, "^same")) %>%
  # Everything before the first space becomes the pivot key.
  mutate(year_group = str_extract(Year, "^[^ ]*")) %>%
  pivot_wider(names_from = year_group, values_from = Year) %>%
  # Hard-coded for the example data: ID 3 carries the same year in both rows.
  add_row(ID = "3", smaller = "same year.3", bigger = "same year.3")

# Spread the remaining columns to one row per ID, then attach the year columns.
df_new <- df %>%
  select(-Year) %>%
  pivot_longer(cols = -ID) %>%
  mutate(
    # Suffix = last character of the column name + last character of the value.
    group = paste0(str_sub(name, start = -1), str_sub(value, start = -1)),
    # NOTE(review): this removal has no effect on names such as V1/V2/Vn;
    # presumably written for names like Variable_a -- confirm.
    name = str_remove(name, pattern = "_[a-z]")
  ) %>%
  pivot_wider(names_from = c(name, group), values_from = value) %>%
  left_join(year_df, by = "ID") %>%
  relocate(smaller, bigger, .after = ID)

df_new

已根据修改后的需求更新数据和答案。由于按字母顺序 b 排在 s 之前,bigger_year 会显示在 smaller_year 之前;不过在实际数据中,年份会被正确排序。如果您确实想按降序排列这类字符串,请使用 sort(Year, decreasing = TRUE) 代替 sort(Year)(注意:sort(desc(Year)) 返回的是数值排名,而不是原来的字符串)。

# Example input: two rows per ID; the rows of an ID differ in every other
# column.
df <- data.frame(
  ID = as.character(rep(1:3, each = 2)),
  Year = c(
    "smaller year.1", "bigger year.1",
    "bigger year.2", "smaller year.2",
    "same year.3", "same year.3"
  ),
  V1 = letters[1:6],
  V2 = letters[7:12],
  Vn = paste0("n", 1:6)
)


library(tidyverse)

# Collapse each ID to a single row; column suffixes are the row number
# within the ID.
df %>%
  group_by(ID) %>%
  # Year is sorted on its own within each ID (the other columns keep their
  # original row order); rid numbers the rows of each ID.
  mutate(
    Year = sort(Year),
    rid = row_number()
  ) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Vn,
    names_sep = ""
  )

#> # A tibble: 3 x 9
#> # Groups:   ID [3]
#>   ID    Year1         Year2          V11   V12   V21   V22   Vn1   Vn2  
#>   <chr> <chr>         <chr>          <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1     bigger year.1 smaller year.1 a     b     g     h     n1    n2   
#> 2 2     bigger year.2 smaller year.2 c     d     i     j     n3    n4   
#> 3 3     same year.3   same year.3    e     f     k     l     n5    n6

由 reprex 包 (v2.0.0) 于 2021-06-19 创建

library(tidyverse)

# Collapse each ID to a single row, keeping the rows in their original order;
# column suffixes are the row number within the ID.
df %>%
  group_by(ID) %>%
  mutate(rid = row_number()) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Variable_n,
    names_sep = ""
  )

# A tibble: 3 x 9
# Groups:   ID [3]
  ID    Year1          Year2          Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1 Variable_n2
  <chr> <chr>          <chr>          <chr>       <chr>       <chr>       <chr>       <chr>       <chr>      
1 1     smaller year.1 bigger year.1  va11        va12        vb11        vb12        vn11        vn12       
2 2     bigger year.2  smaller year.2 va21        va22        vb21        vb22        vn21        vn22       
3 3     same year.3    same year.3    va31        va32        vb31        vb32        vn31        vn32 

你是这个意思吗?


# Same reshaping, but with the rows ordered by descending Year before they
# are numbered within each ID.
df %>%
  group_by(ID) %>%
  arrange(desc(Year)) %>%
  mutate(rid = row_number()) %>%
  pivot_wider(
    id_cols = ID,
    names_from = rid,
    values_from = Year:Variable_n,
    names_sep = ""
  )

# A tibble: 3 x 9
# Groups:   ID [3]
  ID    Year1          Year2         Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1 Variable_n2
  <chr> <chr>          <chr>         <chr>       <chr>       <chr>       <chr>       <chr>       <chr>      
1 2     smaller year.2 bigger year.2 va22        va21        vb22        vb21        vn22        vn21       
2 1     smaller year.1 bigger year.1 va11        va12        vb11        vb12        vn11        vn12       
3 3     same year.3    same year.3   va31        va32        vb31        vb32        vn31        vn32

下面是一个使用 data.table 的方案:

library(data.table)

# Value columns: every column whose name contains "Variable".
cols <- names(df)[grepl("Variable", names(df))]
# setDT() converts df to a data.table by reference; rowid(ID) numbers the rows
# within each ID and supplies the suffix of the widened columns.
dcast(
  setDT(df),
  ID ~ rowid(ID),
  value.var = c("Year", cols)
)


#   ID         Year_1         Year_2 Variable_a_1 Variable_a_2 Variable_b_1
#1:  1 smaller year.1  bigger year.1         va11         va12         vb11
#2:  2  bigger year.2 smaller year.2         va21         va22         vb21
#3:  3    same year.3    same year.3         va31         va32         vb31

#   Variable_b_2 Variable_n_1 Variable_n_2
#1:         vb12         vn11         vn12
#2:         vb22         vn21         vn22
#3:         vb32         vn31         vn32

我们也可以采用以下解决方案:

library(dplyr)
library(tidyr)
library(purrr)

# Widen each ID separately, then stack the one-row results back together.
df %>%
  group_split(ID) %>%
  map_dfr(~ .x %>%
            # Row position within the ID becomes the column suffix.
            mutate(id = row_number()) %>%
            pivot_wider(names_from = id,
                        values_from = c(Year, Variable_a, Variable_b, Variable_n),
                        names_sep = "") %>%
            # Labels follow row position (first row / second row), not the
            # ordering of the Year values themselves.
            rename(Year_smaller = Year1,
                   Year_bigger = Year2)) %>%
  # Keep every column; selecting only starts_with("Year") would drop ID and
  # all Variable_* columns from the result.
  select(ID, starts_with("Year"), everything())

# A tibble: 3 x 9
  ID    Year_smaller   Year_bigger    Variable_a1 Variable_a2 Variable_b1 Variable_b2 Variable_n1
  <chr> <chr>          <chr>          <chr>       <chr>       <chr>       <chr>       <chr>      
1 1     smaller year.1 bigger year.1  va11        va12        vb11        vb12        vn11       
2 2     bigger year.2  smaller year.2 va21        va22        vb21        vb22        vn21       
3 3     same year.3    same year.3    va31        va32        vb31        vb32        vn31       
# ... with 1 more variable: Variable_n2 <chr>