比较两个相似元数据的名称和选项
comparing the names and options of two similar meta da
我有两个相似的元数据大约。 1000 条记录和 500 多列。我想检查两个数据框之间的一致性。现在我想创建一个新的数据框,它将在第一行显示 df1 的所有列名,在第二行显示 df2 的列名,并分别在第 3 列和第 4 列显示它们的选项。然后改变新列以显示 TRUE
或 FALSE
如果列名及其选项匹配。
基本上我必须检查 df1 中的列名称是否与 df 2 完全一致,以及 df1 所有列中的选项是否与 df2 完全匹配
df1 <- data.frame(ID =c("DEV2962","KTN2252","ANA2719","ITI2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
city=c("del","mum","nav","pun","bang","chen","triv","vish","del","mum","bang","vish","bhop","kol","noi","gurg"),
Name= c("dev,akash","singh,rahul","abbas,salman","lal,ram","singh,nkunj","garg,prabal","ali,sanu","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
gender =c("m","f","m","f","m","m","m","m","m","m","m","f","m","f","m","m"))
df2 <- data.frame(ID =c("DEV2962","KTN2152","ANA2719","ITs2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
city=c("del","MUm"," nav","pun","bang","chen"," ddgy ","vish","del","mum","bang","vish","bhol","nhus","huay","gurg"),
Name= c("dev","singh,rahul","abbas,salman","lal,ram","singh,nkunj","huna,ghalak","khan,fhalt","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
gender =c("m","f","m","f","m","m","m","male","m","male","m","f","m","f","m","m"))
df <- select(df1,matches("^[A-Z]"))
comp <- do.call(rbind, lapply(df[, 4:ncol(df)], function(option) as.data.frame(table(option))))
comp$variable <- gsub("[.](.*)","", rownames(comp))
rownames(comp) <- NULL
comp <- comp[, c(3,1,2)]
comp <- comp[order(-comp$Freq), ]
我无法附加文件,但我已经在下方显示了所需输出的图片。
输出应该类似于 。
- 使用
pivot_longer
将两个数据帧转换为长格式
cbind
他们 df_result
mutate
如果列匹配 ,则使用 ifelse
语句
library(dplyr)
library(tidyr)
df1_long <- df1 %>%
pivot_longer(
cols = everything(),
names_to = "Names_df1",
values_to = "options_df1"
) %>%
arrange(Names_df1)
df2_long <- df2 %>%
pivot_longer(
cols = everything(),
names_to = "Names_df2",
values_to = "options_df2"
) %>%
arrange(Names_df2)
df_result <- cbind(df1_long, df2_long) %>%
mutate(Names_matching = ifelse(Names_df1==Names_df2, TRUE, FALSE),
Options_matching = ifelse(options_df1==options_df2, TRUE, FALSE)) %>%
arrange(factor(Names_df1, levels = c("ID", "city", "Name", "gender")))
输出:
Names_df1 options_df1 Names_df2 options_df2 Names_matching Options_matching
1 ID DEV2962 ID DEV2962 TRUE TRUE
2 ID KTN2252 ID KTN2152 TRUE FALSE
3 ID ANA2719 ID ANA2719 TRUE TRUE
4 ID ITI2624 ID ITs2624 TRUE FALSE
5 ID DEV2698 ID DEV2698 TRUE TRUE
6 ID HRT2921 ID HRT2921 TRUE TRUE
7 ID <NA> ID <NA> TRUE NA
8 ID KTN2624 ID KTN2624 TRUE TRUE
9 ID ANA2548 ID ANA2548 TRUE TRUE
10 ID ITI2535 ID ITI2535 TRUE TRUE
11 ID DEV2732 ID DEV2732 TRUE TRUE
12 ID HRT2837 ID HRT2837 TRUE TRUE
13 ID ERV2951 ID ERV2951 TRUE TRUE
14 ID KTN2542 ID KTN2542 TRUE TRUE
15 ID ANA2813 ID ANA2813 TRUE TRUE
16 ID ITI2210 ID ITI2210 TRUE TRUE
17 city del city del TRUE TRUE
18 city mum city MUm TRUE FALSE
19 city nav city nav TRUE FALSE
20 city pun city pun TRUE TRUE
21 city bang city bang TRUE TRUE
22 city chen city chen TRUE TRUE
23 city triv city ddgy TRUE FALSE
24 city vish city vish TRUE TRUE
25 city del city del TRUE TRUE
26 city mum city mum TRUE TRUE
27 city bang city bang TRUE TRUE
28 city vish city vish TRUE TRUE
29 city bhop city bhol TRUE FALSE
30 city kol city nhus TRUE FALSE
31 city noi city huay TRUE FALSE
32 city gurg city gurg TRUE TRUE
33 Name dev,akash Name dev TRUE FALSE
34 Name singh,rahul Name singh,rahul TRUE TRUE
35 Name abbas,salman Name abbas,salman TRUE TRUE
36 Name lal,ram Name lal,ram TRUE TRUE
37 Name singh,nkunj Name singh,nkunj TRUE TRUE
38 Name garg,prabal Name huna,ghalak TRUE FALSE
39 Name ali,sanu Name khan,fhalt TRUE FALSE
40 Name singh,kunal Name singh,kunal TRUE TRUE
41 Name tomar,lakhan Name tomar,lakhan TRUE TRUE
42 Name thakur,praveen Name thakur,praveen TRUE TRUE
43 Name ali,sarman Name ali,sarman TRUE TRUE
44 Name khan,zuber Name khan,zuber TRUE TRUE
45 Name singh,giriraj Name singh,giriraj TRUE TRUE
46 Name sharma,lokesh Name sharma,lokesh TRUE TRUE
47 Name pawar,pooja Name pawar,pooja TRUE TRUE
48 Name sharma,nikita Name sharma,nikita TRUE TRUE
49 gender m gender m TRUE TRUE
50 gender f gender f TRUE TRUE
51 gender m gender m TRUE TRUE
52 gender f gender f TRUE TRUE
53 gender m gender m TRUE TRUE
54 gender m gender m TRUE TRUE
55 gender m gender m TRUE TRUE
56 gender m gender male TRUE FALSE
57 gender m gender m TRUE TRUE
58 gender m gender male TRUE FALSE
59 gender m gender m TRUE TRUE
60 gender f gender f TRUE TRUE
61 gender m gender m TRUE TRUE
62 gender f gender f TRUE TRUE
63 gender m gender m TRUE TRUE
64 gender m gender m TRUE TRUE
类似于 Tarjae 的 tidyverse 选项。
library(dplyr)
library(tidyr)
one <- pivot_longer(df1, everything()) %>%
rename(names_df1 = name, options_df1 = value)
two <- pivot_longer(df2, everything()) %>%
rename(names_df2 = name, options_df2 = value)
one %>%
bind_cols(two) %>%
mutate(names_matching = names_df1 == names_df2,
options_matching = options_df1 == options_df2) %>%
arrange(names_df1, names_df2)
# # A tibble: 64 x 6
# names_df1 options_df1 names_df2 options_df2 names_matching options_matching
# <chr> <chr> <chr> <chr> <lgl> <lgl>
# 1 city del city "del" TRUE TRUE
# 2 city mum city "MUm" TRUE FALSE
# 3 city nav city " nav" TRUE FALSE
# 4 city pun city "pun" TRUE TRUE
# 5 city bang city "bang" TRUE TRUE
# 6 city chen city "chen" TRUE TRUE
# 7 city triv city " ddgy " TRUE FALSE
# 8 city vish city "vish" TRUE TRUE
# 9 city del city "del" TRUE TRUE
我有两个相似的元数据大约。 1000 条记录和 500 多列。我想检查两个数据框之间的一致性。现在我想创建一个新的数据框,它将在第一行显示 df1 的所有列名,在第二行显示 df2 的列名,并分别在第 3 列和第 4 列显示它们的选项。然后改变新列以显示 TRUE
或 FALSE
如果列名及其选项匹配。
基本上我必须检查 df1 中的列名称是否与 df 2 完全一致,以及 df1 所有列中的选项是否与 df2 完全匹配
df1 <- data.frame(ID =c("DEV2962","KTN2252","ANA2719","ITI2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
city=c("del","mum","nav","pun","bang","chen","triv","vish","del","mum","bang","vish","bhop","kol","noi","gurg"),
Name= c("dev,akash","singh,rahul","abbas,salman","lal,ram","singh,nkunj","garg,prabal","ali,sanu","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
gender =c("m","f","m","f","m","m","m","m","m","m","m","f","m","f","m","m"))
df2 <- data.frame(ID =c("DEV2962","KTN2152","ANA2719","ITs2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
city=c("del","MUm"," nav","pun","bang","chen"," ddgy ","vish","del","mum","bang","vish","bhol","nhus","huay","gurg"),
Name= c("dev","singh,rahul","abbas,salman","lal,ram","singh,nkunj","huna,ghalak","khan,fhalt","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
gender =c("m","f","m","f","m","m","m","male","m","male","m","f","m","f","m","m"))
df <- select(df1,matches("^[A-Z]"))
comp <- do.call(rbind, lapply(df[, 4:ncol(df)], function(option) as.data.frame(table(option))))
comp$variable <- gsub("[.](.*)","", rownames(comp))
rownames(comp) <- NULL
comp <- comp[, c(3,1,2)]
comp <- comp[order(-comp$Freq), ]
我无法附加文件,但我已经在下方显示了所需输出的图片。
输出应该类似于 。
- 使用
pivot_longer
将两个数据帧转换为长格式
cbind
他们df_result
mutate
如果列匹配 ,则使用
ifelse
语句
library(dplyr)
library(tidyr)
df1_long <- df1 %>%
pivot_longer(
cols = everything(),
names_to = "Names_df1",
values_to = "options_df1"
) %>%
arrange(Names_df1)
df2_long <- df2 %>%
pivot_longer(
cols = everything(),
names_to = "Names_df2",
values_to = "options_df2"
) %>%
arrange(Names_df2)
df_result <- cbind(df1_long, df2_long) %>%
mutate(Names_matching = ifelse(Names_df1==Names_df2, TRUE, FALSE),
Options_matching = ifelse(options_df1==options_df2, TRUE, FALSE)) %>%
arrange(factor(Names_df1, levels = c("ID", "city", "Name", "gender")))
输出:
Names_df1 options_df1 Names_df2 options_df2 Names_matching Options_matching
1 ID DEV2962 ID DEV2962 TRUE TRUE
2 ID KTN2252 ID KTN2152 TRUE FALSE
3 ID ANA2719 ID ANA2719 TRUE TRUE
4 ID ITI2624 ID ITs2624 TRUE FALSE
5 ID DEV2698 ID DEV2698 TRUE TRUE
6 ID HRT2921 ID HRT2921 TRUE TRUE
7 ID <NA> ID <NA> TRUE NA
8 ID KTN2624 ID KTN2624 TRUE TRUE
9 ID ANA2548 ID ANA2548 TRUE TRUE
10 ID ITI2535 ID ITI2535 TRUE TRUE
11 ID DEV2732 ID DEV2732 TRUE TRUE
12 ID HRT2837 ID HRT2837 TRUE TRUE
13 ID ERV2951 ID ERV2951 TRUE TRUE
14 ID KTN2542 ID KTN2542 TRUE TRUE
15 ID ANA2813 ID ANA2813 TRUE TRUE
16 ID ITI2210 ID ITI2210 TRUE TRUE
17 city del city del TRUE TRUE
18 city mum city MUm TRUE FALSE
19 city nav city nav TRUE FALSE
20 city pun city pun TRUE TRUE
21 city bang city bang TRUE TRUE
22 city chen city chen TRUE TRUE
23 city triv city ddgy TRUE FALSE
24 city vish city vish TRUE TRUE
25 city del city del TRUE TRUE
26 city mum city mum TRUE TRUE
27 city bang city bang TRUE TRUE
28 city vish city vish TRUE TRUE
29 city bhop city bhol TRUE FALSE
30 city kol city nhus TRUE FALSE
31 city noi city huay TRUE FALSE
32 city gurg city gurg TRUE TRUE
33 Name dev,akash Name dev TRUE FALSE
34 Name singh,rahul Name singh,rahul TRUE TRUE
35 Name abbas,salman Name abbas,salman TRUE TRUE
36 Name lal,ram Name lal,ram TRUE TRUE
37 Name singh,nkunj Name singh,nkunj TRUE TRUE
38 Name garg,prabal Name huna,ghalak TRUE FALSE
39 Name ali,sanu Name khan,fhalt TRUE FALSE
40 Name singh,kunal Name singh,kunal TRUE TRUE
41 Name tomar,lakhan Name tomar,lakhan TRUE TRUE
42 Name thakur,praveen Name thakur,praveen TRUE TRUE
43 Name ali,sarman Name ali,sarman TRUE TRUE
44 Name khan,zuber Name khan,zuber TRUE TRUE
45 Name singh,giriraj Name singh,giriraj TRUE TRUE
46 Name sharma,lokesh Name sharma,lokesh TRUE TRUE
47 Name pawar,pooja Name pawar,pooja TRUE TRUE
48 Name sharma,nikita Name sharma,nikita TRUE TRUE
49 gender m gender m TRUE TRUE
50 gender f gender f TRUE TRUE
51 gender m gender m TRUE TRUE
52 gender f gender f TRUE TRUE
53 gender m gender m TRUE TRUE
54 gender m gender m TRUE TRUE
55 gender m gender m TRUE TRUE
56 gender m gender male TRUE FALSE
57 gender m gender m TRUE TRUE
58 gender m gender male TRUE FALSE
59 gender m gender m TRUE TRUE
60 gender f gender f TRUE TRUE
61 gender m gender m TRUE TRUE
62 gender f gender f TRUE TRUE
63 gender m gender m TRUE TRUE
64 gender m gender m TRUE TRUE
类似于 Tarjae 的 tidyverse 选项。
library(dplyr)
library(tidyr)
one <- pivot_longer(df1, everything()) %>%
rename(names_df1 = name, options_df1 = value)
two <- pivot_longer(df2, everything()) %>%
rename(names_df2 = name, options_df2 = value)
one %>%
bind_cols(two) %>%
mutate(names_matching = names_df1 == names_df2,
options_matching = options_df1 == options_df2) %>%
arrange(names_df1, names_df2)
# # A tibble: 64 x 6
# names_df1 options_df1 names_df2 options_df2 names_matching options_matching
# <chr> <chr> <chr> <chr> <lgl> <lgl>
# 1 city del city "del" TRUE TRUE
# 2 city mum city "MUm" TRUE FALSE
# 3 city nav city " nav" TRUE FALSE
# 4 city pun city "pun" TRUE TRUE
# 5 city bang city "bang" TRUE TRUE
# 6 city chen city "chen" TRUE TRUE
# 7 city triv city " ddgy " TRUE FALSE
# 8 city vish city "vish" TRUE TRUE
# 9 city del city "del" TRUE TRUE