使用 R 或 Python 将数据框列中的字符串与另一个数据框列中的字符串匹配
Matching strings in a column of a data frame with the strings in a column of another data frame using R or Python
我正在尝试将一个数据框的一列中的字符串与另一个数据框的一列中的字符串进行匹配,并映射相应的值。两个数据框的行数不同
# Lookup table: one row per test, with the test name and its code.
# (The closing parenthesis of the `name` vector was missing, which made the
# snippet unparseable.)
df1 <- data.frame(
  name = c(
    "(CKMB)Creatinine Kinase Muscle & Brain",
    "24 Hours Urine for Sodium",
    "Antistreptolysin O Titer",
    "Blood group O"
  ),
  lonic_code = c("27816-8-O", "27816-8-B", "1869-7", "33914-3")
)
# Components to search for inside df1$name.
df2 <- data.frame(Testcomponents = c("creatinine", "blood", "potassium"))
预期输出:
Test Components lonic_code
creatinine 27816-8-O
blood 1869-7
potassium NA
这是一个可能的解决方案。可能不是最漂亮的,很想看看其他解决方案。
# Lookup table of test names and codes, plus the components to search for.
df1 <- data.frame(
  name = c(
    "(CKMB)Creatinine Kinase Muscle & Brain",
    "24 Hours Urine for Sodium",
    "Antistreptolysin O Titer",
    "Blood group O"
  ),
  lonic_code = c("27816-8-O", "27816-8-B", "1869-7", "33914-3")
)
df2 <- data.frame(Testcomponents = c("creatinine", "blood", "potassium"))

# For every component, find the df1 rows whose name contains it
# (case-insensitive), then pull the lonic_code values at those rows.
# A component without any match yields a zero-length element.
result <- lapply(
  sapply(df2$Testcomponents, function(pattern) {
    hits <- sapply(df1$name, function(candidate) {
      grepl(pattern, candidate, ignore.case = TRUE)
    })
    which(hits)
  }),
  function(rows) {
    df1$lonic_code[rows]
  }
)
# `result` is a list, so this is stored as a list column.
df2$Ionic_code <- result
输出:
Testcomponents Ionic_code
1 creatinine 3
2 blood 4
3 potassium
这比 Florian 的答案多了一些代码,但我认为它更易于阅读,足以弥补这一点:
# Components to look up.
df1 <- data.frame(
  Testcomponent = c(
    "Albumin",
    "HDL Cholesterol",
    "Erythrocyte Sedimentation Rate (ESR)",
    "Thyroid-stimulating Hormone (TSH)"
  )
)
# Reference table: full test names and their codes.
df2 <- data.frame(
  Names = c("Micro Albumin", "Serum Globulin", "CMV Antibody (IgG)"),
  lonic_code = c("10501-5", "5196", "EKC 1")
)
# Look up the code(s) in df2 for one test component.
#
# component.name: a single test-component name; it is matched as a literal,
#   case-sensitive substring of df2$Names.
# Returns a character vector holding the lonic_code of every matching df2 row,
#   or a single NA_character_ when no row matches.
get.test.component <- function(component.name) {
  # fixed = TRUE: names such as "Erythrocyte Sedimentation Rate (ESR)" contain
  # regex metacharacters and would otherwise be interpreted as a pattern.
  component <- grep(as.character(component.name), df2$Names, fixed = TRUE)
  if (length(component) == 0) {
    # Character NA keeps the return type the same as the matching branch.
    return(NA_character_)
  }
  as.character(df2$lonic_code[component])
}
# Look up every test component in turn, concatenate the per-component results
# into one vector, and bind that vector to df1 as an extra column.
new.ionic.codes <- Reduce(
  c,
  lapply(df1$Testcomponent, get.test.component)
)
df1.new <- cbind(df1, new.ionic.codes)
regex_right_join 在这种情况下会很方便。
library(fuzzyjoin)
library(dplyr)
# Right-join df1 onto df2: each df2$Testcomponents value is used as a
# case-insensitive regular expression matched against df1$name. Both key
# columns are converted to character first. Only the component and its code
# are kept in the result.
regex_right_join(
  mutate(df1, name = as.character(name)),
  mutate(df2, Testcomponents = as.character(Testcomponents)),
  by = c(name = "Testcomponents"),
  ignore_case = TRUE
) %>%
  select(Testcomponents, lonic_code)
输出为:
Testcomponents lonic_code
1 creatinine 27816-8-O
2 blood 33914-3
3 potassium <NA>
示例数据:
# Sample data with factor columns; levels are given explicitly so that the
# level order does not depend on the locale's sort order.
df1 <- data.frame(
  name = factor(
    c(
      "(CKMB)Creatinine Kinase Muscle & Brain",
      "24 Hours Urine for Sodium",
      "Antistreptolysin O Titer",
      "Blood group O"
    ),
    levels = c(
      "(CKMB)Creatinine Kinase Muscle & Brain",
      "24 Hours Urine for Sodium",
      "Antistreptolysin O Titer",
      "Blood group O"
    )
  ),
  lonic_code = factor(
    c("27816-8-O", "27816-8-B", "1869-7", "33914-3"),
    levels = c("1869-7", "27816-8-B", "27816-8-O", "33914-3")
  )
)
df2 <- data.frame(
  Testcomponents = factor(
    c("creatinine", "blood", "potassium"),
    levels = c("blood", "creatinine", "potassium")
  )
)
您可以使用 sapply 遍历各个测试组件:
# For each test component, take the code of the first df1 row whose name
# contains it (case-insensitive, literal substring match).
# vapply() pins the result to an unnamed character vector, so the assignment
# also works when df2 has zero rows (sapply() would return list() there).
df2$lonic_code <- vapply(
  tolower(df2$Testcomponents),
  function(x) {
    # grep(...)[1L] is NA when nothing matches, and indexing by NA yields NA.
    as.character(df1$lonic_code)[grep(x, tolower(df1$name), fixed = TRUE)[1L]]
  },
  character(1),
  USE.NAMES = FALSE
)
df2
# Testcomponents lonic_code
#1 creatinine 27816-8-O
#2 blood 33914-3
#3 potassium <NA>
如果有多个匹配项,这将始终只返回第一个匹配项。
这应该相当快,因为它只使用一个循环,并且我们在 grep 中指定了 fixed = TRUE。为了进一步提高速度,您可以使用 stringi 包的正则表达式函数。
我正在尝试将一个数据框的一列中的字符串与另一个数据框的一列中的字符串进行匹配,并映射相应的值。两个数据框的行数不同
# Lookup table: one row per test, with the test name and its code.
# (The closing parenthesis of the `name` vector was missing, which made the
# snippet unparseable.)
df1 <- data.frame(
  name = c(
    "(CKMB)Creatinine Kinase Muscle & Brain",
    "24 Hours Urine for Sodium",
    "Antistreptolysin O Titer",
    "Blood group O"
  ),
  lonic_code = c("27816-8-O", "27816-8-B", "1869-7", "33914-3")
)
# Components to search for inside df1$name.
df2 <- data.frame(Testcomponents = c("creatinine", "blood", "potassium"))
预期输出:
Test Components lonic_code
creatinine 27816-8-O
blood 1869-7
potassium NA
这是一个可能的解决方案。可能不是最漂亮的,很想看看其他解决方案。
# Lookup table of test names and codes, plus the components to search for.
df1 <- data.frame(
  name = c(
    "(CKMB)Creatinine Kinase Muscle & Brain",
    "24 Hours Urine for Sodium",
    "Antistreptolysin O Titer",
    "Blood group O"
  ),
  lonic_code = c("27816-8-O", "27816-8-B", "1869-7", "33914-3")
)
df2 <- data.frame(Testcomponents = c("creatinine", "blood", "potassium"))

# For every component, find the df1 rows whose name contains it
# (case-insensitive), then pull the lonic_code values at those rows.
# A component without any match yields a zero-length element.
result <- lapply(
  sapply(df2$Testcomponents, function(pattern) {
    hits <- sapply(df1$name, function(candidate) {
      grepl(pattern, candidate, ignore.case = TRUE)
    })
    which(hits)
  }),
  function(rows) {
    df1$lonic_code[rows]
  }
)
# `result` is a list, so this is stored as a list column.
df2$Ionic_code <- result
输出:
Testcomponents Ionic_code
1 creatinine 3
2 blood 4
3 potassium
这比 Florian 的答案多了一些代码,但我认为它更易于阅读,足以弥补这一点:
# Components to look up.
df1 <- data.frame(
  Testcomponent = c(
    "Albumin",
    "HDL Cholesterol",
    "Erythrocyte Sedimentation Rate (ESR)",
    "Thyroid-stimulating Hormone (TSH)"
  )
)
# Reference table: full test names and their codes.
df2 <- data.frame(
  Names = c("Micro Albumin", "Serum Globulin", "CMV Antibody (IgG)"),
  lonic_code = c("10501-5", "5196", "EKC 1")
)
# Look up the code(s) in df2 for one test component.
#
# component.name: a single test-component name; it is matched as a literal,
#   case-sensitive substring of df2$Names.
# Returns a character vector holding the lonic_code of every matching df2 row,
#   or a single NA_character_ when no row matches.
get.test.component <- function(component.name) {
  # fixed = TRUE: names such as "Erythrocyte Sedimentation Rate (ESR)" contain
  # regex metacharacters and would otherwise be interpreted as a pattern.
  component <- grep(as.character(component.name), df2$Names, fixed = TRUE)
  if (length(component) == 0) {
    # Character NA keeps the return type the same as the matching branch.
    return(NA_character_)
  }
  as.character(df2$lonic_code[component])
}
# Look up every test component in turn, concatenate the per-component results
# into one vector, and bind that vector to df1 as an extra column.
new.ionic.codes <- Reduce(
  c,
  lapply(df1$Testcomponent, get.test.component)
)
df1.new <- cbind(df1, new.ionic.codes)
regex_right_join 在这种情况下会很方便。
library(fuzzyjoin)
library(dplyr)
# Right-join df1 onto df2: each df2$Testcomponents value is used as a
# case-insensitive regular expression matched against df1$name. Both key
# columns are converted to character first. Only the component and its code
# are kept in the result.
regex_right_join(
  mutate(df1, name = as.character(name)),
  mutate(df2, Testcomponents = as.character(Testcomponents)),
  by = c(name = "Testcomponents"),
  ignore_case = TRUE
) %>%
  select(Testcomponents, lonic_code)
输出为:
Testcomponents lonic_code
1 creatinine 27816-8-O
2 blood 33914-3
3 potassium <NA>
示例数据:
# Sample data with factor columns; levels are given explicitly so that the
# level order does not depend on the locale's sort order.
df1 <- data.frame(
  name = factor(
    c(
      "(CKMB)Creatinine Kinase Muscle & Brain",
      "24 Hours Urine for Sodium",
      "Antistreptolysin O Titer",
      "Blood group O"
    ),
    levels = c(
      "(CKMB)Creatinine Kinase Muscle & Brain",
      "24 Hours Urine for Sodium",
      "Antistreptolysin O Titer",
      "Blood group O"
    )
  ),
  lonic_code = factor(
    c("27816-8-O", "27816-8-B", "1869-7", "33914-3"),
    levels = c("1869-7", "27816-8-B", "27816-8-O", "33914-3")
  )
)
df2 <- data.frame(
  Testcomponents = factor(
    c("creatinine", "blood", "potassium"),
    levels = c("blood", "creatinine", "potassium")
  )
)
您可以使用 sapply 遍历各个测试组件:
# For each test component, take the code of the first df1 row whose name
# contains it (case-insensitive, literal substring match).
# vapply() pins the result to an unnamed character vector, so the assignment
# also works when df2 has zero rows (sapply() would return list() there).
df2$lonic_code <- vapply(
  tolower(df2$Testcomponents),
  function(x) {
    # grep(...)[1L] is NA when nothing matches, and indexing by NA yields NA.
    as.character(df1$lonic_code)[grep(x, tolower(df1$name), fixed = TRUE)[1L]]
  },
  character(1),
  USE.NAMES = FALSE
)
df2
# Testcomponents lonic_code
#1 creatinine 27816-8-O
#2 blood 33914-3
#3 potassium <NA>
如果有多个匹配项,这将始终只返回第一个匹配项。
这应该相当快,因为它只使用一个循环,并且我们在 grep 中指定了 fixed = TRUE。为了进一步提高速度,您可以使用 stringi 包的正则表达式函数。