如何保留原始列以便在 r 中两个数据库的 full_join() 中进行比较
How do you retain original column for comparison in full_join() of two databases in r
我正在尝试比较两个数据框以确定其中一列中的差异。为此,我使用了 tidyverse 中的 full_join()
,但我无法弄清楚如何保留数据框的原点,因此我可以理解差异。
#Two databases that differ in Charge for the name and date or also have a entirely unique rows
df1 <- tibble(Name = c("JANE,DOE", "JANE,DOE", "JIM,DOE", "JANE,BUCK", "JIM,BUCK", "JIM,BUCK"),
Date = c("1/1/21", "1/10/21", "2/1/21", "1/2/21", "2/2/21", "2/8/21"),
Charge = c(-500, -500, -450, 0, -450, 0))
df2 <- tibble(Name = c("JANE,DOE", "JANE,DOE", "JIM,DOE", "JANE,BUCK", "JIM,BUCK", "JIM,BUCK", "JIM,BUCK"),
Date = c("1/2/21", "1/10/21", "2/1/21", "1/2/21", "2/2/21", "2/8/21", "2/10/21"),
Charge = c(-500, -500, -450, -500, -500, -500, -50))
我试过将它们组合起来然后识别不同的行
Audit <- full_join(df1,df2)
Audit <- Audit %>% distinct() %>% arrange(Name, Date)
但我的输出不允许我对比名称和日期唯一的行的费用。
Name Date Charge
<chr> <chr> <dbl>
JANE,BUCK 1/2/21 0 #df2
JANE,BUCK 1/2/21 -500 #df1
JANE,DOE 1/1/21 -500 #df1
JANE,DOE 1/10/21 -500 #df1 & df2
JANE,DOE 1/2/21 -500 #df2
JIM,BUCK 2/10/21 -50 #df2
JIM,BUCK 2/2/21 -450 #df1
JIM,BUCK 2/2/21 -500 #df2
JIM,BUCK 2/8/21 0 #df1
JIM,BUCK 2/8/21 -500 #df2
JIM,DOE 2/1/21 -450 #df1
我最终想要实现的是审计生成这样的输出
Name Date Charge ChargeDiff
<chr> <chr> <dbl> <dbl>
JANE,BUCK 1/2/21 0 -500 #difference in Charge when name and date are same, but Charge differs
JANE,DOE 1/1/21 -500 0 #unique df1 is 0 because we know it is valid
JANE,DOE 1/2/21 -500 -500 #unique in df2 is -500 because it is missing
JIM,BUCK 2/10/21 -50 -50 #unique in df2 is -50 because it is missing
JIM,BUCK 2/2/21 -450 50 # df1-df2 on 2/2 is -450-(-500)
JIM,BUCK 2/8/21 0 500 # df1-df2
JIM,DOE 2/1/21 -450 #df1 #unique in df1
我在创建 ChangeDiff 列时遇到了一些问题。是否有一个不同的连接选项,它允许我仅将来自 df2 的唯一行添加到 Audit,但保留来自 df2 的唯一费用值在与名称和日期对齐的单独列中。
你和 full_join()
差不多了
使用 by()
参数允许您控制连接,以便您可以将两组 Charge
包含在单独的列中。使用 dplyr::mutate()
and/or case_when()
生成您的 ChargeDiff
列。
library(tibble)
library(dplyr)
tib <-
df1 %>%
full_join(df2, by = c("Name" = "Name", "Date" = "Date")) %>%
mutate(ChargDiff = case_when(is.na(Charge.x) | is.na(Charge.y) ~ NA_real_,
TRUE ~ Charge.x - Charge.y))
tib
#> # A tibble: 8 x 5
#> Name Date Charge.x Charge.y ChargDiff
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 JANE,DOE 1/1/21 -500 NA NA
#> 2 JANE,DOE 1/10/21 -500 -500 0
#> 3 JIM,DOE 2/1/21 -450 -450 0
#> 4 JANE,BUCK 1/2/21 0 -500 500
#> 5 JIM,BUCK 2/2/21 -450 -500 50
#> 6 JIM,BUCK 2/8/21 0 -500 500
#> 7 JANE,DOE 1/2/21 NA -500 NA
#> 8 JIM,BUCK 2/10/21 NA -50 NA
由 reprex package (v1.0.0)
于 2021-03-24 创建
这看起来不太漂亮 -- 编码方面;但它会变得“漂亮”table。
table 将显示在 RStudio 的查看器窗格中,并且与查看器一样宽。
require(htmlTable)
require(compareDF)
# adds color
emph <- function(xCd){
# create empty matrix
css.cell = matrix(nrow = dim(xCd)[1],
ncol = dim(xCd)[2] + 1) # another column will be added for origin
# for + add #003b70; for - add #b21e28
where <- which(xCd == "+", # origin in the first table
arr.ind = T) # get dim loc
css.cell[where] <- "font-weight:bold;color:#003b70;padding-left:1em;padding-right:1em;"
where2 <- which(xCd == "-", # origin in the second table
arr.ind = T)
css.cell[where2] <- "font-weight:bold;color:#b21e28;padding-left:1em;padding-right:1em;"
where3 <- which(css.cell == NA,
arr.ind = T)
css.cell[where3] <- "padding-left:1em;padding-right:1em;"
# add padding for every where else; this sets a minimum width
return(css.cell)
} # end udf
创建table
# creates table
fmtStyle <- function(frame1, frame2){
# create the comparison
comp <- compare_df(frame1, frame2,
keep_unchanged_rows = T) # create the comparison
# don't include rowname & chng_type columns-- 3 through the rest of the columns
# keeps both frames' rows even if they're the same data; don't keep in this table
compMat <- as.matrix(comp$comparison_table_diff[c(rownames(unique(comp[[1]]))),
3:ncol(comp$comparison_table_diff)])
css <- emph(compMat) # collect colors
# collect the joined data
# don't include rowname & chng_type columns-- 3 through the rest of the columns
# keeps both frames' rows even if they're the same data; don't keep in this table
mat <- as.matrix(comp$comparison_table_ts2char[c(rownames(unique(comp[[1]]))),
3:ncol(comp$comparison_table_ts2char)])
rownames(mat) <- NULL # remove the rownames (they aren't meaningful)
# make the data character type; the date column being an exception
mat[,3] <- txtRound(mat[,3],0) # make all values character data
# add a column identifying origin
Origins <- ifelse(comp$comparison_table_diff[c(rownames(unique(comp[[1]]))),
]$chng_type == "+", "df1",
ifelse(comp$comparison_table_diff[c(rownames(unique(comp[[1]]))),
]$chng_type == "-", "df2",
"df1 & df2"))
# add origins to the matrix
mat <- cbind(mat, Origins)
# make it pretty
soPretty <- mat %>%
addHtmlTableStyle(spacer.celltype = "double_cell",
css.cell = css,
css.header = "font-weight: normal",
pos.caption = "bottom") %>%
htmlTable(cgroup = list(c("Comparison Audit")),
n.cgroup = list(c(ncol(mat))), # span the name cgroup words over all the columns
caption = paste0("<i>Note</i>: Blue font indicates an origin that only exists in the first table;",
" where a red font indicates that the information is unique to the second table."))
# paste is used in the caption just make the caption readable in the script file
return(soPretty)
} # end table creation function
要使用函数,这样调用
# using it
(showMe <- fmtStyle(df1,df2))
要查看table后面的HTML或者修改它可以这样看脚本
as.character(showMe)
我正在尝试比较两个数据框以确定其中一列中的差异。为此,我使用了 tidyverse 中的 full_join()
,但我无法弄清楚如何保留数据框的原点,因此我可以理解差异。
#Two databases that differ in Charge for the name and date or also have a entirely unique rows
df1 <- tibble(Name = c("JANE,DOE", "JANE,DOE", "JIM,DOE", "JANE,BUCK", "JIM,BUCK", "JIM,BUCK"),
Date = c("1/1/21", "1/10/21", "2/1/21", "1/2/21", "2/2/21", "2/8/21"),
Charge = c(-500, -500, -450, 0, -450, 0))
df2 <- tibble(Name = c("JANE,DOE", "JANE,DOE", "JIM,DOE", "JANE,BUCK", "JIM,BUCK", "JIM,BUCK", "JIM,BUCK"),
Date = c("1/2/21", "1/10/21", "2/1/21", "1/2/21", "2/2/21", "2/8/21", "2/10/21"),
Charge = c(-500, -500, -450, -500, -500, -500, -50))
我试过将它们组合起来然后识别不同的行
Audit <- full_join(df1,df2)
Audit <- Audit %>% distinct() %>% arrange(Name, Date)
但我的输出不允许我对比名称和日期唯一的行的费用。
Name Date Charge
<chr> <chr> <dbl>
JANE,BUCK 1/2/21 0 #df2
JANE,BUCK 1/2/21 -500 #df1
JANE,DOE 1/1/21 -500 #df1
JANE,DOE 1/10/21 -500 #df1 & df2
JANE,DOE 1/2/21 -500 #df2
JIM,BUCK 2/10/21 -50 #df2
JIM,BUCK 2/2/21 -450 #df1
JIM,BUCK 2/2/21 -500 #df2
JIM,BUCK 2/8/21 0 #df1
JIM,BUCK 2/8/21 -500 #df2
JIM,DOE 2/1/21 -450 #df1
我最终想要实现的是审计生成这样的输出
Name Date Charge ChargeDiff
<chr> <chr> <dbl> <dbl>
JANE,BUCK 1/2/21 0 -500 #difference in Charge when name and date are same, but Charge differs
JANE,DOE 1/1/21 -500 0 #unique df1 is 0 because we know it is valid
JANE,DOE 1/2/21 -500 -500 #unique in df2 is -500 because it is missing
JIM,BUCK 2/10/21 -50 -50 #unique in df2 is -50 because it is missing
JIM,BUCK 2/2/21 -450 50 # df1-df2 on 2/2 is -450-(-500)
JIM,BUCK 2/8/21 0 500 # df1-df2
JIM,DOE 2/1/21 -450 #df1 #unique in df1
我在创建 ChangeDiff 列时遇到了一些问题。是否有一个不同的连接选项,它允许我仅将来自 df2 的唯一行添加到 Audit,但保留来自 df2 的唯一费用值在与名称和日期对齐的单独列中。
你和 full_join()
差不多了
使用 by()
参数允许您控制连接,以便您可以将两组 Charge
包含在单独的列中。使用 dplyr::mutate()
and/or case_when()
生成您的 ChargeDiff
列。
library(tibble)
library(dplyr)
tib <-
df1 %>%
full_join(df2, by = c("Name" = "Name", "Date" = "Date")) %>%
mutate(ChargDiff = case_when(is.na(Charge.x) | is.na(Charge.y) ~ NA_real_,
TRUE ~ Charge.x - Charge.y))
tib
#> # A tibble: 8 x 5
#> Name Date Charge.x Charge.y ChargDiff
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 JANE,DOE 1/1/21 -500 NA NA
#> 2 JANE,DOE 1/10/21 -500 -500 0
#> 3 JIM,DOE 2/1/21 -450 -450 0
#> 4 JANE,BUCK 1/2/21 0 -500 500
#> 5 JIM,BUCK 2/2/21 -450 -500 50
#> 6 JIM,BUCK 2/8/21 0 -500 500
#> 7 JANE,DOE 1/2/21 NA -500 NA
#> 8 JIM,BUCK 2/10/21 NA -50 NA
由 reprex package (v1.0.0)
于 2021-03-24 创建这看起来不太漂亮 -- 编码方面;但它会变得“漂亮”table。 table 将显示在 RStudio 的查看器窗格中,并且与查看器一样宽。
require(htmlTable)
require(compareDF)
# adds color
emph <- function(xCd){
# create empty matrix
css.cell = matrix(nrow = dim(xCd)[1],
ncol = dim(xCd)[2] + 1) # another column will be added for origin
# for + add #003b70; for - add #b21e28
where <- which(xCd == "+", # origin in the first table
arr.ind = T) # get dim loc
css.cell[where] <- "font-weight:bold;color:#003b70;padding-left:1em;padding-right:1em;"
where2 <- which(xCd == "-", # origin in the second table
arr.ind = T)
css.cell[where2] <- "font-weight:bold;color:#b21e28;padding-left:1em;padding-right:1em;"
where3 <- which(css.cell == NA,
arr.ind = T)
css.cell[where3] <- "padding-left:1em;padding-right:1em;"
# add padding for every where else; this sets a minimum width
return(css.cell)
} # end udf
创建table
# creates table
fmtStyle <- function(frame1, frame2){
# create the comparison
comp <- compare_df(frame1, frame2,
keep_unchanged_rows = T) # create the comparison
# don't include rowname & chng_type columns-- 3 through the rest of the columns
# keeps both frames' rows even if they're the same data; don't keep in this table
compMat <- as.matrix(comp$comparison_table_diff[c(rownames(unique(comp[[1]]))),
3:ncol(comp$comparison_table_diff)])
css <- emph(compMat) # collect colors
# collect the joined data
# don't include rowname & chng_type columns-- 3 through the rest of the columns
# keeps both frames' rows even if they're the same data; don't keep in this table
mat <- as.matrix(comp$comparison_table_ts2char[c(rownames(unique(comp[[1]]))),
3:ncol(comp$comparison_table_ts2char)])
rownames(mat) <- NULL # remove the rownames (they aren't meaningful)
# make the data character type; the date column being an exception
mat[,3] <- txtRound(mat[,3],0) # make all values character data
# add a column identifying origin
Origins <- ifelse(comp$comparison_table_diff[c(rownames(unique(comp[[1]]))),
]$chng_type == "+", "df1",
ifelse(comp$comparison_table_diff[c(rownames(unique(comp[[1]]))),
]$chng_type == "-", "df2",
"df1 & df2"))
# add origins to the matrix
mat <- cbind(mat, Origins)
# make it pretty
soPretty <- mat %>%
addHtmlTableStyle(spacer.celltype = "double_cell",
css.cell = css,
css.header = "font-weight: normal",
pos.caption = "bottom") %>%
htmlTable(cgroup = list(c("Comparison Audit")),
n.cgroup = list(c(ncol(mat))), # span the name cgroup words over all the columns
caption = paste0("<i>Note</i>: Blue font indicates an origin that only exists in the first table;",
" where a red font indicates that the information is unique to the second table."))
# paste is used in the caption just make the caption readable in the script file
return(soPretty)
} # end table creation function
要使用函数,这样调用
# using it
(showMe <- fmtStyle(df1,df2))
要查看table后面的HTML或者修改它可以这样看脚本
as.character(showMe)