在 R 中合并具有相同列名的多个数据集,同时为每列附加后缀以标识其数据集
Merge Multiple Datasets with Identical Column Names in R While Attaching a Suffix to Each Column to Identify its Dataset
我正在尝试合并 3 个数据集,它们来自同一项纵向研究。各轮调查(wave)中的变量名称完全相同。我希望在合并时为每个变量名附加一个后缀,以标明它所属的数据集。我已经用一种比较迂回的方式做到了这一点,但我认为应该有更简单的方法。以下是我目前的做法,其中使用了一些虚构的数据来说明。
## step 1: create three dfs with identical variable names
# Each data frame has an ID column (1..10) and two columns of values drawn
# from 1..5 with replacement. No seed is set here, so the sampled values
# differ from run to run.
df1 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df2 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df3 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
## step 2: function to rename all columns in a dataset
#' Append a dataset suffix to every column name
#'
#' @param data A data frame whose columns should be renamed.
#' @param nn Suffix identifying the dataset (e.g. "W1"); it is joined to
#'   each column name with an underscore.
#' @return `data` with every column (the ID column included) renamed to
#'   `<name>_<nn>`. As a side effect the renamed data is printed with
#'   `glimpse()`.
colsRename <- function(data, nn) {
  # rename_with() hands the whole vector of selected column names to the
  # function, so the suffix can be pasted on directly.
  renamed <- rename_with(data, ~ paste0(.x, "_", nn))
  glimpse(renamed)
  renamed
}
## suffix the columns of each wave with its label
w1 <- colsRename(df1, nn = "W1")
w2 <- colsRename(df2, nn = "W2")
w3 <- colsRename(df3, nn = "W3")
# inner-join the three waves on their (suffixed) ID columns; the key column
# keeps the name it has in the first wave
dfMerge <- merge(w1, w2, by.x = "ID_W1", by.y = "ID_W2", all = FALSE) |>
  merge(y = w3, by.x = "ID_W1", by.y = "ID_W3", all = FALSE)
glimpse(dfMerge)
合并(joined/merged)后的数据框最终输出如下所示:
ID_W1 V1_W1 v2_W1 V1_W2 v2_W2 V1_W3 v2_W3
1 1 3 4 1 5 5 1
2 2 4 2 5 4 5 2
3 3 4 3 2 4 1 3
4 4 5 1 1 2 5 1
5 5 5 1 3 1 5 1
6 6 4 4 3 4 3 5
7 7 2 5 3 2 3 2
8 8 1 1 2 1 2 2
9 9 5 3 2 2 1 3
10 10 5 2 5 3 4 5
您可以将数据框放入一个命名列表中,重命名各数据框的列,然后使用 `reduce` 将它们合并:
library(dplyr)
library(purrr)
# Suffix every non-ID column with the name of its list element (the wave
# label), then inner-join the renamed data frames pairwise on ID.
list(W1 = df1, W2 = df2, W3 = df3) %>%
  imap(
    \(wave_df, wave) rename_with(wave_df, \(nm) paste0(nm, "_", wave), -ID)
  ) %>%
  reduce(\(left, right) inner_join(left, right, by = "ID"))
# ID V1_W1 v2_W1 V1_W2 v2_W2 V1_W3 v2_W3
#1 1 1 1 5 2 5 5
#2 2 3 2 4 4 3 3
#3 3 4 3 5 4 1 2
#4 4 1 4 2 3 4 5
#5 5 3 5 1 1 1 5
#6 6 5 5 1 2 1 3
#7 7 4 3 3 1 3 2
#8 8 2 1 1 2 4 2
#9 9 5 2 5 4 1 2
#10 10 1 5 1 5 3 4
如果数据框更多,且您不想逐个列出它们,可以使用 `mget` 从全局环境中收集所有数据框。
# Collect every object in the current environment whose name is "df"
# followed only by digits. The backslash must be doubled inside an R string
# literal ("\d" alone is an unrecognized escape and does not parse), and the
# pattern is anchored so names such as "mydf1" or "df1_old" are not picked up.
# The list names are the object names, so the suffixes are "_df1", "_df2", ...
mget(ls(pattern = "^df\\d+$")) %>%
  imap(function(x, y) x %>% rename_with(~ paste(., y, sep = "_"), -ID)) %>%
  reduce(inner_join, by = "ID")
在 base R 中可以这样做:
# Names of the data frames to combine, looked up in the current environment.
d_nm <- c("df1", "df2", "df3")
# Name each list element with the suffix its columns will receive
# ("_W1", "_W2", ...); seq_along() stays correct for any length of d_nm.
L <- setNames(mget(d_nm), paste0("_W", seq_along(d_nm)))
# Suffix every column except the key, then merge pairwise. The key is
# selected by name and passed to merge() explicitly, so the result does not
# depend on ID being the first column.
Reduce(
  \(x, y) merge(x, y, by = "ID"),
  Map(\(x, y) {
    not_key <- names(x) != "ID"
    names(x)[not_key] <- paste0(names(x)[not_key], y)
    x
  }, L, names(L))
)
# ID V1_W1 v2_W1 V1_W2 v2_W2 V1_W3 v2_W3
# 1 1 1 1 5 3 2 2
# 2 2 5 5 5 2 3 2
# 3 3 1 4 5 4 1 2
# 4 4 1 2 4 4 5 5
# 5 5 2 2 2 2 2 1
# 6 6 4 3 4 5 2 1
# 7 7 2 1 3 4 2 4
# 8 8 2 1 2 5 4 5
# 9 9 1 3 1 4 3 2
# 10 10 4 4 2 2 5 1
数据:
# Fixed seed so the example data is reproducible.
set.seed(42)
# Three data frames with identical column names: an ID column (1..10) and
# two columns of values drawn from 1..5 with replacement.
df1 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df2 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df3 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
我正在尝试合并 3 个数据集,它们来自同一项纵向研究。各轮调查(wave)中的变量名称完全相同。我希望在合并时为每个变量名附加一个后缀,以标明它所属的数据集。我已经用一种比较迂回的方式做到了这一点,但我认为应该有更简单的方法。以下是我目前的做法,其中使用了一些虚构的数据来说明。
## step 1: create three dfs with identical variable names
# Each data frame has an ID column (1..10) and two columns of values drawn
# from 1..5 with replacement. No seed is set here, so the sampled values
# differ from run to run.
df1 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df2 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df3 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
## step 2: function to rename all columns in a dataset
#' Append a dataset suffix to every column name
#'
#' @param data A data frame whose columns should be renamed.
#' @param nn Suffix identifying the dataset (e.g. "W1"); it is joined to
#'   each column name with an underscore.
#' @return `data` with every column (the ID column included) renamed to
#'   `<name>_<nn>`. As a side effect the renamed data is printed with
#'   `glimpse()`.
colsRename <- function(data, nn) {
  # rename_with() hands the whole vector of selected column names to the
  # function, so the suffix can be pasted on directly.
  renamed <- rename_with(data, ~ paste0(.x, "_", nn))
  glimpse(renamed)
  renamed
}
## suffix the columns of each wave with its label
w1 <- colsRename(df1, nn = "W1")
w2 <- colsRename(df2, nn = "W2")
w3 <- colsRename(df3, nn = "W3")
# inner-join the three waves on their (suffixed) ID columns; the key column
# keeps the name it has in the first wave
dfMerge <- merge(w1, w2, by.x = "ID_W1", by.y = "ID_W2", all = FALSE) |>
  merge(y = w3, by.x = "ID_W1", by.y = "ID_W3", all = FALSE)
glimpse(dfMerge)
合并(joined/merged)后的数据框最终输出如下所示:
ID_W1 V1_W1 v2_W1 V1_W2 v2_W2 V1_W3 v2_W3
1 1 3 4 1 5 5 1
2 2 4 2 5 4 5 2
3 3 4 3 2 4 1 3
4 4 5 1 1 2 5 1
5 5 5 1 3 1 5 1
6 6 4 4 3 4 3 5
7 7 2 5 3 2 3 2
8 8 1 1 2 1 2 2
9 9 5 3 2 2 1 3
10 10 5 2 5 3 4 5
您可以将数据框放入一个命名列表中,重命名各数据框的列,然后使用 `reduce` 将它们合并:
library(dplyr)
library(purrr)
# Suffix every non-ID column with the name of its list element (the wave
# label), then inner-join the renamed data frames pairwise on ID.
list(W1 = df1, W2 = df2, W3 = df3) %>%
  imap(
    \(wave_df, wave) rename_with(wave_df, \(nm) paste0(nm, "_", wave), -ID)
  ) %>%
  reduce(\(left, right) inner_join(left, right, by = "ID"))
# ID V1_W1 v2_W1 V1_W2 v2_W2 V1_W3 v2_W3
#1 1 1 1 5 2 5 5
#2 2 3 2 4 4 3 3
#3 3 4 3 5 4 1 2
#4 4 1 4 2 3 4 5
#5 5 3 5 1 1 1 5
#6 6 5 5 1 2 1 3
#7 7 4 3 3 1 3 2
#8 8 2 1 1 2 4 2
#9 9 5 2 5 4 1 2
#10 10 1 5 1 5 3 4
如果数据框更多,且您不想逐个列出它们,可以使用 `mget`。
# Collect every object in the current environment whose name is "df"
# followed only by digits. The backslash must be doubled inside an R string
# literal ("\d" alone is an unrecognized escape and does not parse), and the
# pattern is anchored so names such as "mydf1" or "df1_old" are not picked up.
# The list names are the object names, so the suffixes are "_df1", "_df2", ...
mget(ls(pattern = "^df\\d+$")) %>%
  imap(function(x, y) x %>% rename_with(~ paste(., y, sep = "_"), -ID)) %>%
  reduce(inner_join, by = "ID")
在 base R 中可以这样做:
# Names of the data frames to combine, looked up in the current environment.
d_nm <- c("df1", "df2", "df3")
# Name each list element with the suffix its columns will receive
# ("_W1", "_W2", ...); seq_along() stays correct for any length of d_nm.
L <- setNames(mget(d_nm), paste0("_W", seq_along(d_nm)))
# Suffix every column except the key, then merge pairwise. The key is
# selected by name and passed to merge() explicitly, so the result does not
# depend on ID being the first column.
Reduce(
  \(x, y) merge(x, y, by = "ID"),
  Map(\(x, y) {
    not_key <- names(x) != "ID"
    names(x)[not_key] <- paste0(names(x)[not_key], y)
    x
  }, L, names(L))
)
# ID V1_W1 v2_W1 V1_W2 v2_W2 V1_W3 v2_W3
# 1 1 1 1 5 3 2 2
# 2 2 5 5 5 2 3 2
# 3 3 1 4 5 4 1 2
# 4 4 1 2 4 4 5 5
# 5 5 2 2 2 2 2 1
# 6 6 4 3 4 5 2 1
# 7 7 2 1 3 4 2 4
# 8 8 2 1 2 5 4 5
# 9 9 1 3 1 4 3 2
# 10 10 4 4 2 2 5 1
数据:
# Fixed seed so the example data is reproducible.
set.seed(42)
# Three data frames with identical column names: an ID column (1..10) and
# two columns of values drawn from 1..5 with replacement.
df1 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df2 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)
df3 <- data.frame(
  ID = 1:10,
  V1 = sample(x = 1:5, size = 10, replace = TRUE),
  v2 = sample(x = 1:5, size = 10, replace = TRUE)
)