如果它们之间的所有值都是 NA，则连接 R 中数据帧的两列？

Question

我有一个如下所示的数据框：

> sample
# A tibble: 6 x 10
  Level_1 Level_2 Level_3 Level_4 Level_5 Level_6 Level_7 Level_8 Level_9 Supplier 
    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <lgl>   <chr>    
1       1       2       3       4       8      NA      NA      NA NA      orioles  
2       1       2       3       4       9      13      NA      NA NA      nationals
3       1       2       3       5      10      14      16      18 NA      dodgers  
4       1       2       3       5      10      14      17      19 NA      cardinals
5       1       2       3       6      11      NA      NA      NA NA      giants   
6       1       2       3       7      12      15      NA      NA NA      padres

我想做的是将 Supplier 列与任何 Level 列连接起来，前提是它们之间的所有值都是 NA。我考虑的另一种方式是，如果级别列右侧的列是 NA，则将该列与供应商列连接起来。

我在想一个for循环，但我还没有想出如何实现这个逻辑。我在想的逻辑是这样的：

# Pseudocode only -- this is NOT runnable R (`if is.na(level n + 1)` does not
# parse). It sketches the intended rule for each Level column in turn.
for (level in levels) {
   # If the next Level column (n + 1) is NA ...
   if is.na(level n + 1) {
     # ... glue the level value and the supplier name together.
     paste0(level, Supplier)
     }
   # Otherwise keep the level value unchanged.
   else {
    level}
   }

我也可以像这样进行大量 mutate 调用，但它看起来超级重复且没有必要：

# One hand-written rule per column: when Level_6 is missing, Level_5 becomes
# "<Supplier><br><Level_5>"; otherwise Level_5 is kept as it is.
sample %>%
  mutate(
    Level_5 = ifelse(
      test = is.na(Level_6),
      yes = paste(Supplier, Level_5, sep = "<br>"),
      no = Level_5
    )
  )

这是数据的输入：

# Reproducible definition of the sample data printed above (looks like dput()
# output): a 6-row tibble with numeric Level_1..Level_8, an all-NA logical
# Level_9 and a character Supplier column.
# The expression is not assigned here; bind it to a name before running the
# snippets that refer to it.
structure(list(Level_1 = c(1, 1, 1, 1, 1, 1), Level_2 = c(2, 
2, 2, 2, 2, 2), Level_3 = c(3, 3, 3, 3, 3, 3), Level_4 = c(4, 
4, 5, 5, 6, 7), Level_5 = c(8, 9, 10, 10, 11, 12), Level_6 = c(NA, 
13, 14, 14, NA, 15), Level_7 = c(NA, NA, 16, 17, NA, NA), Level_8 = c(NA, 
NA, 18, 19, NA, NA), Level_9 = c(NA, NA, NA, NA, NA, NA), Supplier = c("orioles", 
"nationals", "dodgers", "cardinals", "giants", "padres")), row.names = c(NA, 
-6L), class = c("tbl_df", "tbl", "data.frame"))

Answer 1

老实说，我不是 100% 确定您想要的输出。使用 dplyr 和 tidyr:

library(tidyr)
library(dplyr)

# Reshape to one row per (Supplier, level), prefix the deepest non-missing
# level of every supplier with the supplier name, then reshape back to wide.
# Columns that hold only NA disappear, because their rows are dropped before
# pivoting back. The result is still grouped by Supplier.
sample %>%
  pivot_longer(
    cols = starts_with("Level_"),
    names_prefix = "Level_",
    names_to = "level"
  ) %>%
  # Drop only missing level values; an NA in another column must not remove
  # the row.
  drop_na(value) %>%
  group_by(Supplier) %>%
  mutate(
    # `level` is character ("1", "2", ...). Compare numerically: max() on
    # text sorts "9" above "10" and would pick the wrong column once there
    # are ten or more levels.
    level_num = as.integer(level),
    new_val = ifelse(
      level_num == max(level_num),
      paste0(Supplier, "<br>", value),
      value
    )
  ) %>%
  select(-value, -level_num) %>%
  pivot_wider(
    names_from = level,
    names_prefix = "Level_",
    values_from = new_val
  )

returns

# A tibble: 6 x 9
# Groups:   Supplier [6]
  Supplier  Level_1 Level_2 Level_3 Level_4 Level_5      Level_6         Level_7 Level_8        
  <chr>     <chr>   <chr>   <chr>   <chr>   <chr>        <chr>           <chr>   <chr>          
1 orioles   1       2       3       4       orioles<br>8 NA              NA      NA             
2 nationals 1       2       3       4       9            nationals<br>13 NA      NA             
3 dodgers   1       2       3       5       10           14              16      dodgers<br>18  
4 cardinals 1       2       3       5       10           14              17      cardinals<br>19
5 giants    1       2       3       6       giants<br>11 NA              NA      NA             
6 padres    1       2       3       7       12           padres<br>15    NA      NA

我丢失了 Level_9 列，因为它只包含 NA。您可以轻松地再次添加它。

Answer 2

另一种方法：

library(tidyr)
library(dplyr)

# Fill every missing Level cell with the supplier name, then blank out all
# but the first occurrence of each value within a row, so the supplier ends
# up only in the first previously-empty Level column.
# Caveat: duplicated() works on values, so a level value that repeats within
# a row is blanked as well.
df %>%
  mutate(across(contains("Level"), ~ coalesce(as.character(.), Supplier))) %>%
  select(-Supplier) %>%
  # Row id so the long format can be grouped and pivoted back per row.
  mutate(ID = row_number()) %>%
  pivot_longer(cols = -ID) %>%
  group_by(ID) %>%
  mutate(value = replace(value, duplicated(value), NA_character_)) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  ungroup() %>%
  select(-ID)
# A tibble: 6 x 9
  Level_1 Level_2 Level_3 Level_4 Level_5 Level_6 Level_7   Level_8 Level_9  
  <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>     <chr>   <chr>    
1 1       2       3       4       8       orioles NA        NA      NA       
2 1       2       3       4       9       13      nationals NA      NA       
3 1       2       3       5       10      14      16        18      dodgers  
4 1       2       3       5       10      14      17        19      cardinals
5 1       2       3       6       11      giants  NA        NA      NA       
6 1       2       3       7       12      15      padres    NA      NA

Answer 3

非常有趣的问题。这是我没有 pivot_longer 的方法：

library(dplyr)
library(stringr) # str_c() below comes from stringr, not dplyr

# Work on the Level_* columns only: if Supplier were included, it would
# always be the last non-NA entry of each row.
df1 <- as.matrix(select(df, starts_with("Level_")))

# TRUE where a level is present
ind <- !is.na(df1)

# tapply() silently drops rows that have no non-NA level, which would leave
# `values` shorter than the data; fail early with a clear message instead.
stopifnot(all(rowSums(ind) > 0))

# Last non-NA level value of each row
values <- as.vector(tapply(df1[ind], row(df1)[ind], tail, 1))

# Attach it as a helper column
df2 <- cbind(df, values)

# Prefix the cell equal to that last value with the supplier name and turn
# every other cell into text. NA cells match neither condition and stay NA.
df2 %>%
  mutate(
    across(
      Level_1:Level_9,
      ~ case_when(
        . == values ~ str_c(Supplier, .),
        . != values ~ as.character(.)
      )
    )
  ) %>%
  select(-values)

输出：

  Level_1 Level_2 Level_3 Level_4  Level_5     Level_6 Level_7     Level_8 Level_9  Supplier
1       1       2       3       4 orioles8        <NA>    <NA>        <NA>    <NA>   orioles
2       1       2       3       4        9 nationals13    <NA>        <NA>    <NA> nationals
3       1       2       3       5       10          14      16   dodgers18    <NA>   dodgers
4       1       2       3       5       10          14      17 cardinals19    <NA> cardinals
5       1       2       3       6 giants11        <NA>    <NA>        <NA>    <NA>    giants
6       1       2       3       7       12    padres15    <NA>        <NA>    <NA>    padres

Answer 4

最终更新：我意识到，之前试图找出每一行的最大值并把它替换为所需的连接字符串，这种做法是错误的。因此我想出了另一个解决方案：只替换每行最后一个非 NA 值（它不一定是该行的最大值），因为这些值不一定都是数字。下面是我的最终解决方案：

library(dplyr)
library(stringr)
library(purrr)

# Row by row: find the last non-NA value among the first nine columns and
# replace its last occurrence in the row with "<Supplier>_<value>".
# Column 10 is assumed to be Supplier (hard-coded position).
df %>%
  pmap_dfr(function(...) {
    cells <- c(...)
    level_cells <- cells[-10]
    present <- level_cells[!is.na(level_cells)]
    last_value <- present[length(present)]
    # Matching is by value, so take the right-most position that equals it.
    hits <- which(cells == last_value)
    replace(cells, hits[length(hits)], str_c(..10, last_value, sep = "_"))
  })

# A tibble: 6 x 10
  Level_1 Level_2 Level_3 Level_4 Level_5   Level_6      Level_7 Level_8      Level_9 Supplier 
  <chr>   <chr>   <chr>   <chr>   <chr>     <chr>        <chr>   <chr>        <chr>   <chr>    
1 1       2       3       4       orioles_8 NA           NA      NA           NA      orioles  
2 1       2       3       4       9         nationals_13 NA      NA           NA      nationals
3 1       2       3       5       10        14           16      dodgers_18   NA      dodgers  
4 1       2       3       5       10        14           17      cardinals_19 NA      cardinals
5 1       2       3       6       giants_11 NA           NA      NA           NA      giants   
6 1       2       3       7       12        padres_15    NA      NA           NA      padres

Answer 5

c_across 和 across 的组合方法

library(tidyverse)
# Step 1 (row-wise): store in `dummy` the position of the last non-NA Level
# column of each row.
# Step 2 (column-wise): in the Level_<k> column whose number k equals
# `dummy`, prefix the value with "<Supplier>_". The `dummy` helper column
# stays in the result.
df %>%
  rowwise() %>%
  mutate(dummy = max(which(!is.na(c_across(starts_with("Level")))))) %>%
  ungroup() %>%
  mutate(
    across(
      starts_with("Level_"),
      ~ ifelse(
        as.numeric(str_remove(cur_column(), "Level_")) == dummy,
        paste(Supplier, ., sep = "_"),
        .
      )
    )
  )

# A tibble: 6 x 11
  Level_1 Level_2 Level_3 Level_4 Level_5   Level_6      Level_7 Level_8      Level_9 Supplier  dummy
    <dbl>   <dbl>   <dbl>   <dbl> <chr>     <chr>          <dbl> <chr>        <lgl>   <chr>     <int>
1       1       2       3       4 orioles_8 NA                NA NA           NA      orioles       5
2       1       2       3       4 9         nationals_13      NA NA           NA      nationals     6
3       1       2       3       5 10        14                16 dodgers_18   NA      dodgers       8
4       1       2       3       5 10        14                17 cardinals_19 NA      cardinals     8
5       1       2       3       6 giants_11 NA                NA NA           NA      giants        5
6       1       2       3       7 12        padres_15         NA NA           NA      padres        6

结合我朋友Anoushiravan的上面使用的which策略可以简化为：

在purrr::pmap_dfr里面做这些
- 把列数保存到临时变量 n 中
- 把所需的索引（最后一个非 NA 的 Level 列的位置）保存到临时变量 i 中
- 创建一个长度为 n 的临时逻辑向量 tmp，第 i 个位置为 TRUE，其余位置为 FALSE
- 使用 replace（借助 tmp）把第 i 个元素替换为所需的值

# Row by row: prefix the last non-NA level with "<Supplier>_".
# The supplier is taken to be the last element of each row.
df %>%
  pmap_dfr(function(...) {
    vals <- c(...)
    # Number of columns, taken from the row itself rather than the global df.
    n <- length(vals)
    present <- which(!is.na(vals[-n]))
    # No level present in this row: nothing to tag, return it unchanged
    # (max() of an empty set would be -Inf and break the indexing below).
    if (length(present) == 0) {
      return(vals)
    }
    i <- max(present)
    # Logical mask that is TRUE only at position i; TRUE/FALSE are used
    # because T and F are ordinary variables that can be reassigned.
    tmp <- rep(FALSE, n)
    tmp[i] <- TRUE
    replace(vals, tmp, paste(vals[n], vals[i], sep = "_"))
  })

# A tibble: 6 x 10
  Level_1 Level_2 Level_3 Level_4 Level_5   Level_6      Level_7 Level_8      Level_9 Supplier 
  <chr>   <chr>   <chr>   <chr>   <chr>     <chr>        <chr>   <chr>        <chr>   <chr>    
1 1       2       3       4       orioles_8 NA           NA      NA           NA      orioles  
2 1       2       3       4       9         nationals_13 NA      NA           NA      nationals
3 1       2       3       5       10        14           16      dodgers_18   NA      dodgers  
4 1       2       3       5       10        14           17      cardinals_19 NA      cardinals
5 1       2       3       6       giants_11 NA           NA      NA           NA      giants   
6 1       2       3       7       12        padres_15    NA      NA           NA      padres

如果它们之间的所有值都是 NA，则连接 R 中数据帧的两列？

Concatenate two columns of a dataframe in R if all values between them are NA?

r

dataframe

na