在 R 中的多个数据框中创建新列

Creating new columns in multiple dataframes in R

我在 R 中处理大量数据帧,每个数据帧的列数各不相同。我想统一这些数据集,使它们都具有相同的列,并且新添加的列用 NA 填充。我写了一个循环,但不确定如何更新实际的数据帧。

# Sample data: 20 random normals laid out as 10 x 2, 4 x 5 and 5 x 4.
first_df <- data.frame(matrix(rnorm(20), nrow = 10))
second_df <- data.frame(matrix(rnorm(20), nrow = 4))
third_df <- data.frame(matrix(rnorm(20), nrow = 5))

library(tidyverse)

# Column count of every object whose name contains "_df", sorted ascending and
# reduced to the first (narrowest) and last (widest) entry.
min_max <- enframe(map_dbl(mget(ls(pattern = "_df")), ncol)) %>%
  arrange(value) %>%
  slice(1, n())

min_max

# A tibble: 2 x 2
#  name      value
#  <chr>     <dbl>
#1 first_df      2
#2 second_df     5

# Columns that the widest data frame has and the narrowest one lacks.
diff <- setdiff(
  colnames(get(min_max$name[[2]])),
  colnames(get(min_max$name[[1]]))
)

for (col_name in diff) {
  # Visit every object whose name contains "_df".
  for (df_index in seq_along(ls(pattern = "_df"))) {
    # `data` is only a copy of the data frame looked up by name.
    data <- get(ls(pattern = "_df")[[df_index]])

    # Nothing to add when the column is already there.
    if (col_name %in% names(data)) {
      next
    }

    data[[col_name]] <- NA
    # I don't know how to update the real datasets
    # get(ls(pattern = "_df")[df_index]) <- data
  }
}

我很快查了一下,解决方案是 assign() 函数。

下面是使用 assign() 的可复现示例(reprex)。不过我也了解到,把这些数据帧收集到一个列表中会更方便;我认为这样之后就可以按名称修改列表中的各个元素。

# Example data: 20 random values reshaped into 2, 5 and 4 columns (X1, X2, ...).
first_df <- data.frame(matrix(rnorm(20), nrow = 10))
second_df <- data.frame(matrix(rnorm(20), nrow = 4))
third_df <- data.frame(matrix(rnorm(20), nrow = 5))

library(tidyverse)

# Look the candidates up once instead of calling ls() on every iteration.
# The name pattern alone is not enough: any other object whose name contains
# "_df" (for example a list of data frames) would break ncol() and the column
# assignment below, so keep only real data frames.
df_list <- mget(ls(pattern = "_df")) %>%
  keep(is.data.frame)

# Narrowest (row 1) and widest (row 2) data frame by column count.
min_max <- df_list %>%
  map_dbl(ncol) %>%
  enframe() %>%
  arrange(value) %>%
  slice(1, n())

min_max

# Columns present in the widest data frame but absent from the narrowest one.
diff <- setdiff(
  names(df_list[[min_max$name[2]]]),
  names(df_list[[min_max$name[1]]])
)

for (df_name in names(df_list)) {
  data <- df_list[[df_name]]

  # All columns this data frame still lacks, handled in one step.
  missing_cols <- setdiff(diff, names(data))

  if (length(missing_cols) > 0) {
    data[missing_cols] <- NA
    # `get(name) <- value` is not valid R; assign() stores the modified copy
    # back under the original variable name.
    assign(df_name, data)
  }
}

这是一个绕过循环的替代方案:它使用 dplyr::bind_rows() 把所有数据帧合并成一个包含全部列的大数据帧,并在缺少值的地方用 NA 填充。

# Example data: 20 random values reshaped into 2, 5 and 4 columns (X1, X2, ...).
first_df <- data.frame(matrix(rnorm(20), nrow = 10))
second_df <- data.frame(matrix(rnorm(20), nrow = 4))
third_df <- data.frame(matrix(rnorm(20), nrow = 5))

library(tidyverse)

# Keep only real data frames: the name pattern "_df" also matches
# `new_df_list` (created below) once this script has been run in the session,
# and a list must not be fed into the stacking step.
df_list <- mget(ls(pattern = "_df")) %>%
  keep(is.data.frame)
df_names <- names(df_list)

# Stack all data frames into one table. bind_rows() matches columns by name
# and fills the gaps with NA; `id` records which data frame a row came from.
combined <- bind_rows(df_list, .id = "id")

# Cut the stacked table back into one data frame per original name. Filtering
# by name (rather than splitting by group and renaming positionally) keeps the
# name-to-data pairing explicit and still returns a data frame for an input
# that had zero rows.
new_df_list <- df_names %>%
  set_names() %>%
  map(function(df_name) {
    combined %>%
      dplyr::filter(id == .env$df_name) %>% # rows of this data frame only
      dplyr::select(-id)                    # remove the id column
  })

# save each df back to global environment
# (invisible() stops list2env() from printing the environment at top level)
invisible(list2env(new_df_list, globalenv()))

# check
first_df