比较两个相似元数据的名称和选项

comparing the names and options of two similar meta da

我有两个相似的元数据大约。 1000 条记录和 500 多列。我想检查两个数据框之间的一致性。现在我想创建一个新的数据框,它将在第一行显示 df1 的所有列名,在第二行显示 df2 的列名,并分别在第 3 列和第 4 列显示它们的选项。然后改变新列以显示 TRUEFALSE 如果列名及其选项匹配。

基本上我必须检查 df1 中的列名称是否与 df 2 完全一致,以及 df1 所有列中的选项是否与 df2 完全匹配

df1 <- data.frame(ID =c("DEV2962","KTN2252","ANA2719","ITI2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
                  city=c("del","mum","nav","pun","bang","chen","triv","vish","del","mum","bang","vish","bhop","kol","noi","gurg"),
                  Name= c("dev,akash","singh,rahul","abbas,salman","lal,ram","singh,nkunj","garg,prabal","ali,sanu","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
                  gender =c("m","f","m","f","m","m","m","m","m","m","m","f","m","f","m","m"))

df2 <- data.frame(ID =c("DEV2962","KTN2152","ANA2719","ITs2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
                  city=c("del","MUm"," nav","pun","bang","chen"," ddgy ","vish","del","mum","bang","vish","bhol","nhus","huay","gurg"),
                  Name= c("dev","singh,rahul","abbas,salman","lal,ram","singh,nkunj","huna,ghalak","khan,fhalt","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
                  gender =c("m","f","m","f","m","m","m","male","m","male","m","f","m","f","m","m"))


df <- select(df1,matches("^[A-Z]"))
comp <- do.call(rbind, lapply(df[, 4:ncol(df)], function(option) as.data.frame(table(option)))) 
comp$variable <- gsub("[.](.*)","", rownames(comp)) 
rownames(comp) <- NULL
comp <- comp[, c(3,1,2)] 
comp <- comp[order(-comp$Freq), ]

我无法附加文件,但我已经在下方显示了所需输出的图片。

输出应该类似于 。

  1. 使用 pivot_longer
  2. 将两个数据帧转换为长格式
  3. cbind 他们 df_result
  4. mutate 如果列匹配
  5. ,则使用 ifelse 语句
library(dplyr)
library(tidyr)

df1_long <- df1 %>% 
  pivot_longer(
    cols = everything(),
    names_to = "Names_df1",
    values_to = "options_df1"
  ) %>%   
  arrange(Names_df1)

df2_long <- df2 %>% 
  pivot_longer(
    cols = everything(),
    names_to = "Names_df2",
    values_to = "options_df2"
  ) %>%   
  arrange(Names_df2)

df_result <- cbind(df1_long, df2_long) %>% 
  mutate(Names_matching = ifelse(Names_df1==Names_df2, TRUE, FALSE),
         Options_matching = ifelse(options_df1==options_df2, TRUE, FALSE)) %>% 
  arrange(factor(Names_df1, levels = c("ID", "city", "Name", "gender")))

输出:

Names_df1    options_df1 Names_df2    options_df2 Names_matching Options_matching
1         ID        DEV2962        ID        DEV2962           TRUE             TRUE
2         ID        KTN2252        ID        KTN2152           TRUE            FALSE
3         ID        ANA2719        ID        ANA2719           TRUE             TRUE
4         ID        ITI2624        ID        ITs2624           TRUE            FALSE
5         ID        DEV2698        ID        DEV2698           TRUE             TRUE
6         ID        HRT2921        ID        HRT2921           TRUE             TRUE
7         ID           <NA>        ID           <NA>           TRUE               NA
8         ID        KTN2624        ID        KTN2624           TRUE             TRUE
9         ID        ANA2548        ID        ANA2548           TRUE             TRUE
10        ID        ITI2535        ID        ITI2535           TRUE             TRUE
11        ID        DEV2732        ID        DEV2732           TRUE             TRUE
12        ID        HRT2837        ID        HRT2837           TRUE             TRUE
13        ID        ERV2951        ID        ERV2951           TRUE             TRUE
14        ID        KTN2542        ID        KTN2542           TRUE             TRUE
15        ID        ANA2813        ID        ANA2813           TRUE             TRUE
16        ID        ITI2210        ID        ITI2210           TRUE             TRUE
17      city            del      city            del           TRUE             TRUE
18      city            mum      city            MUm           TRUE            FALSE
19      city            nav      city            nav           TRUE            FALSE
20      city            pun      city            pun           TRUE             TRUE
21      city           bang      city           bang           TRUE             TRUE
22      city           chen      city           chen           TRUE             TRUE
23      city           triv      city          ddgy            TRUE            FALSE
24      city           vish      city           vish           TRUE             TRUE
25      city            del      city            del           TRUE             TRUE
26      city            mum      city            mum           TRUE             TRUE
27      city           bang      city           bang           TRUE             TRUE
28      city           vish      city           vish           TRUE             TRUE
29      city           bhop      city           bhol           TRUE            FALSE
30      city            kol      city           nhus           TRUE            FALSE
31      city            noi      city           huay           TRUE            FALSE
32      city           gurg      city           gurg           TRUE             TRUE
33      Name      dev,akash      Name            dev           TRUE            FALSE
34      Name    singh,rahul      Name    singh,rahul           TRUE             TRUE
35      Name   abbas,salman      Name   abbas,salman           TRUE             TRUE
36      Name        lal,ram      Name        lal,ram           TRUE             TRUE
37      Name    singh,nkunj      Name    singh,nkunj           TRUE             TRUE
38      Name    garg,prabal      Name    huna,ghalak           TRUE            FALSE
39      Name       ali,sanu      Name     khan,fhalt           TRUE            FALSE
40      Name    singh,kunal      Name    singh,kunal           TRUE             TRUE
41      Name   tomar,lakhan      Name   tomar,lakhan           TRUE             TRUE
42      Name thakur,praveen      Name thakur,praveen           TRUE             TRUE
43      Name     ali,sarman      Name     ali,sarman           TRUE             TRUE
44      Name     khan,zuber      Name     khan,zuber           TRUE             TRUE
45      Name  singh,giriraj      Name  singh,giriraj           TRUE             TRUE
46      Name  sharma,lokesh      Name  sharma,lokesh           TRUE             TRUE
47      Name    pawar,pooja      Name    pawar,pooja           TRUE             TRUE
48      Name  sharma,nikita      Name  sharma,nikita           TRUE             TRUE
49    gender              m    gender              m           TRUE             TRUE
50    gender              f    gender              f           TRUE             TRUE
51    gender              m    gender              m           TRUE             TRUE
52    gender              f    gender              f           TRUE             TRUE
53    gender              m    gender              m           TRUE             TRUE
54    gender              m    gender              m           TRUE             TRUE
55    gender              m    gender              m           TRUE             TRUE
56    gender              m    gender           male           TRUE            FALSE
57    gender              m    gender              m           TRUE             TRUE
58    gender              m    gender           male           TRUE            FALSE
59    gender              m    gender              m           TRUE             TRUE
60    gender              f    gender              f           TRUE             TRUE
61    gender              m    gender              m           TRUE             TRUE
62    gender              f    gender              f           TRUE             TRUE
63    gender              m    gender              m           TRUE             TRUE
64    gender              m    gender              m           TRUE             TRUE

类似于 Tarjae 的 tidyverse 选项。

library(dplyr)
library(tidyr)

one <-  pivot_longer(df1, everything()) %>%
  rename(names_df1 = name, options_df1 = value)

two <-  pivot_longer(df2, everything()) %>%
  rename(names_df2 = name, options_df2 = value)

one %>%
  bind_cols(two) %>%
  mutate(names_matching = names_df1 == names_df2,
         options_matching = options_df1 == options_df2) %>%
  arrange(names_df1, names_df2)

# # A tibble: 64 x 6
# names_df1 options_df1 names_df2 options_df2 names_matching options_matching
#   <chr>     <chr>       <chr>      <chr>      <lgl>          <lgl>           
# 1 city      del         city      "del"       TRUE           TRUE            
# 2 city      mum         city      "MUm"       TRUE           FALSE           
# 3 city      nav         city      " nav"      TRUE           FALSE           
# 4 city      pun         city      "pun"       TRUE           TRUE            
# 5 city      bang        city      "bang"      TRUE           TRUE            
# 6 city      chen        city      "chen"      TRUE           TRUE            
# 7 city      triv        city      " ddgy "    TRUE           FALSE           
# 8 city      vish        city      "vish"      TRUE           TRUE            
# 9 city      del         city      "del"       TRUE           TRUE