循环:如何在 R 中循环 case_when 函数?
Loops: How can I loop case_when function in R?
这是我的代码,我尝试通过检测地址中的单词并进行匹配来创建变量。这里我使用 dplyr 包的 mutate 函数,并结合 case_when。问题是,如您所见,我需要手动添加每一个值。如何通过某种循环函数让两者自动匹配?
# Reference vectors the address parts are supposed to be matched against.
city <- LETTERS # 26 cities: "A" to "Z"
district <- letters[10:20] # 11 districts: "j" to "t"
streets <- paste0(district, district) # doubled district names: "jj" to "tt"
streets <- streets[-c(5:26)] # drop positions 5 and up, keeping the first 4 streets
# Example data: each address is a comma-separated list of name parts.
df <- data.frame(x = c(1:5),
address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc"))
library(dplyr)
library(stringi)
# Hand-written lookup: case_when() returns the value of the first condition
# that is TRUE for a row and NA when none is. Every candidate has to be
# listed manually, which is what the question wants to automate.
df2 <- df %>%
mutate(districts = case_when(
stri_detect_fixed(address, "b") ~ "b", # matches address[1]
# address[2] matches none of the patterns
stri_detect_fixed(address, "a") ~ "a", # matches address[3] and address[5]
# address[4] matches none of the patterns
stri_detect_fixed(address, "cc") ~ "cc" # address[1] and address[5] contain "cc" but are already matched above
))
这段代码在 address 中扫描 district 向量中的值。我希望对 city 和 street 变量也做同样的处理。所以我使用了 Stack Overflow 上一段代码的修改版本,但它会产生错误。
# Attempted automation (reported to fail): loop over every candidate name in
# town_village2.
# NOTE(review): town_village2 and trn_house3 are not defined in this snippet.
for (j in town_village2) {
# Each iteration assigns column 93 again, so the result for the previous j
# is replaced rather than combined with it.
trn_house3[,93] <- case_when(
stri_detect_fixed(trn_house3[1:6469, 4], j) ~ j)
}
我试图产生这样的结果:
x address city district street
1 A, b, cc, A b cc
2 B, dd B NA dd
3 a, dd NA a dd
4 C C NA NA
5 D, a, cc D a cc
这会将元素分成向量:
library(tidyverse)
# Example data: one comma-separated address string per row.
df <- data.frame(
  x = c(1:5),
  address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc")
)
# Split every address into its parts and collect them as one character
# vector per `x` in the list-column `districts`.
df3 <-
  df %>%
  # one row per address part; runs of commas and spaces act as separators
  separate_rows(address, sep = "[, ]+") %>%
  # drop empty parts (e.g. what is left after a trailing comma)
  filter(nchar(address) > 0) %>%
  # name the nested column explicitly; the unnamed form nest(address) warns
  # "All elements of `...` must be named"
  nest(data = address) %>%
  # pull the single `address` column out of every nested tibble
  transmute(x, districts = map(data, function(part) part[[1]]))
df3
#> # A tibble: 5 × 2
#> x districts
#> <int> <list>
#> 1 1 <chr [3]>
#> 2 2 <chr [2]>
#> 3 3 <chr [2]>
#> 4 4 <chr [1]>
#> 5 5 <chr [3]>
df3$districts[[1]]
#> [1] "A" "b" "cc"
由 reprex 包 (v2.0.0) 创建于 2022-04-14
一种 data.table 方法:
library(data.table)
# create a long lookup table with one row per known name; building it
# directly avoids placing vectors of different lengths (city, streets,
# district) side by side in one data.table, which would need recycling
# NOTE(review): assumes city, streets and district are character vectors
# defined earlier in the session
parts <- list(city = city, streets = streets, district = district)
lookup <- data.table(
  # factor levels keep the column order city, streets, district after dcast
  variable = factor(rep(names(parts), lengths(parts)), levels = names(parts)),
  value = unlist(parts, use.names = FALSE)
)
# set df to data.table format (by reference)
setDT(df)
# one row per address part
final <- df[, .(address = unlist(tstrsplit(address, ",[ ]*", perl = TRUE))), by = .(x)]
# now add the type (city / streets / district) of every part
final[lookup, type := i.variable, on = .(address = value)]
# and dcast to wide: one column per type
dcast(final, x ~ type, value.var = "address")
# x city streets district
# 1: 1 A cc b
# 2: 2 B dd <NA>
# 3: 3 <NA> dd a
# 4: 4 C <NA> <NA>
# 5: 5 D cc a
如果你打算使用循环,再用 case_when() 就没有意义了;既然可以遍历所有选项,就不必把它们逐一写进去。
你可以用 for 循环来解决:
library(stringi)
df2 <- df
# create the result columns up front so that rows without any match hold an
# explicit NA instead of relying on sub-assignment into a missing column
df2$city <- rep(NA_character_, nrow(df2))
df2$district <- rep(NA_character_, nrow(df2))
df2$street <- rep(NA_character_, nrow(df2))
# for every known name, label the rows whose address contains it as a
# substring; when several names match the same row, the one visited last wins
for (city_name in city) {
  df2$city[stri_detect_fixed(df2$address, city_name)] <- city_name
}
for (district_name in district) {
  df2$district[stri_detect_fixed(df2$address, district_name)] <- district_name
}
for (street_name in streets) {
  df2$street[stri_detect_fixed(df2$address, street_name)] <- street_name
}
请注意,您的示例代码不起作用;在您的示例数据集中,地区名称是 'a' 和 'b',但您生成的名称是 'j' 到 't'。我在上面的代码中修复了这个问题。
另外,如果市、区和/或街道的名称互相重叠,结果就会出错。例如,如果某一行位于 'b' 区、'cc' 街,stri_detect_fixed 也会在 'cc' 中检测到 'c',从而认为该行位于 'c' 区。为了克服这个问题,我提出一种完全不同的方法:
替代方法
鉴于您的示例数据,最合理的做法是先按逗号(,)拆分给定的地址,然后查找与参考 city/district/street 名称完全匹配的部分。我们可以用 intersect() 找出这些完全匹配的部分。
# example reference address parts
cities <- LETTERS # "A" to "Z"
districts <- letters[1:11] # "a" to "k"
streets <- c("aa", "bb", "cc", "dd")
# example dataset
df <- data.frame(
  x = c(1:5),
  address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc")
)
# vectorize address into elements
# split by comma; fixed = TRUE treats "," as a literal string, not a regex
address_elems <- strsplit(df$address, ",", fixed = TRUE)
# trim whitespace; lapply() always returns a list, whereas sapply() would
# collapse the result into a matrix whenever every address happens to have
# the same number of parts
address_elems <- lapply(address_elems, trimws)
比较 df$address 和新建的 address_elems:
> df$address
[1] "A, b, cc," "B, dd" "a, dd" "C" "D, a, cc"
> address_elems
[[1]]
[1] "A" "b" "cc"
[[2]]
[1] "B" "dd"
[[3]]
[1] "a" "dd"
[[4]]
[1] "C"
[[5]]
[1] "D" "a" "cc"
我们可以用 intersect(cities, address_elems[[1]]) 为 address_elems 中的第一个向量找到匹配的 cities。
因为可能得到多个匹配项,所以我们只取第一个元素:intersect(cities, address_elems[[1]])[1](没有匹配项时结果为 NA)。
要将此应用于 address_elems 中的每个向量,我们可以使用 sapply() 或 lapply():
# intersect the respective reference lists with each list of
# address items, taking only the first element; indexing with [1] yields NA
# when nothing matches, and vapply() guarantees a character vector even for
# empty input (sapply() would return a list there)
df$cities <- vapply(
  address_elems, function(x) intersect(cities, x)[1], character(1)
)
df$district <- vapply(
  address_elems, function(x) intersect(districts, x)[1], character(1)
)
df$street <- vapply(
  address_elems, function(x) intersect(streets, x)[1], character(1)
)
综合起来(Putting it all together)
把以上步骤放在一起,我们得到:
# example reference address parts
cities <- LETTERS # "A" to "Z"
districts <- letters[1:11] # "a" to "k"
streets <- c("aa", "bb", "cc", "dd")
# example dataset
df <- data.frame(
  x = c(1:5),
  address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc")
)
# create a list with one character vector of address elements per row
# split by comma; fixed = TRUE treats "," as a literal string, not a regex
address_elems <- strsplit(df$address, ",", fixed = TRUE)
# trim whitespace; lapply() always returns a list, whereas sapply() would
# collapse the result into a matrix whenever every address happens to have
# the same number of parts
address_elems <- lapply(address_elems, trimws)
# intersect the respective reference lists with each list of
# address items, take only the first element ([1] yields NA when nothing
# matches); vapply() makes all three columns plain character vectors, where
# lapply() would have produced a list column for `cities`
df$cities <- vapply(
  address_elems, function(x) intersect(cities, x)[1], character(1)
)
df$district <- vapply(
  address_elems, function(x) intersect(districts, x)[1], character(1)
)
df$street <- vapply(
  address_elems, function(x) intersect(streets, x)[1], character(1)
)
# cleanup
rm(address_elems)
这是我的代码,我尝试通过检测地址中的单词并进行匹配来创建变量。这里我使用 dplyr 包的 mutate 函数,并结合 case_when。问题是,如您所见,我需要手动添加每一个值。如何通过某种循环函数让两者自动匹配?
# Reference vectors the address parts are supposed to be matched against.
city <- LETTERS # 26 cities: "A" to "Z"
district <- letters[10:20] # 11 districts: "j" to "t"
streets <- paste0(district, district) # doubled district names: "jj" to "tt"
streets <- streets[-c(5:26)] # drop positions 5 and up, keeping the first 4 streets
# Example data: each address is a comma-separated list of name parts.
df <- data.frame(x = c(1:5),
address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc"))
library(dplyr)
library(stringi)
# Hand-written lookup: case_when() returns the value of the first condition
# that is TRUE for a row and NA when none is. Every candidate has to be
# listed manually, which is what the question wants to automate.
df2 <- df %>%
mutate(districts = case_when(
stri_detect_fixed(address, "b") ~ "b", # matches address[1]
# address[2] matches none of the patterns
stri_detect_fixed(address, "a") ~ "a", # matches address[3] and address[5]
# address[4] matches none of the patterns
stri_detect_fixed(address, "cc") ~ "cc" # address[1] and address[5] contain "cc" but are already matched above
))
这段代码在 address 中扫描 district 向量中的值。我希望对 city 和 street 变量也做同样的处理。所以我使用了 Stack Overflow 上一段代码的修改版本,但它会产生错误:
# Attempted automation: loop over every candidate name in town_village2.
# NOTE(review): town_village2 and trn_house3 are not defined in this snippet.
for (j in town_village2) {
# Each iteration assigns column 93 again, so the result for the previous j
# is replaced rather than combined with it.
trn_house3[,93] <- case_when(
stri_detect_fixed(trn_house3[1:6469, 4], j) ~ j)
}
我试图产生这样的结果:
x address city district street
1 A, b, cc, A b cc
2 B, dd B NA dd
3 a, dd NA a dd
4 C C NA NA
5 D, a, cc D a cc
这会将元素分成向量:
library(tidyverse)
# Example data: one comma-separated address string per row.
df <- data.frame(
  x = c(1:5),
  address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc")
)
# Split every address into its parts and collect them as one character
# vector per `x` in the list-column `districts`.
df3 <-
  df %>%
  # one row per address part; runs of commas and spaces act as separators
  separate_rows(address, sep = "[, ]+") %>%
  # drop empty parts (e.g. what is left after a trailing comma)
  filter(nchar(address) > 0) %>%
  # name the nested column explicitly; the unnamed form nest(address) warns
  # "All elements of `...` must be named"
  nest(data = address) %>%
  # pull the single `address` column out of every nested tibble
  transmute(x, districts = map(data, function(part) part[[1]]))
df3
#> # A tibble: 5 × 2
#> x districts
#> <int> <list>
#> 1 1 <chr [3]>
#> 2 2 <chr [2]>
#> 3 3 <chr [2]>
#> 4 4 <chr [1]>
#> 5 5 <chr [3]>
df3$districts[[1]]
#> [1] "A" "b" "cc"
由 reprex 包 (v2.0.0) 创建于 2022-04-14
一种 data.table 方法:
library(data.table)
# create a long lookup table with one row per known name; building it
# directly avoids placing vectors of different lengths (city, streets,
# district) side by side in one data.table, which would need recycling
# NOTE(review): assumes city, streets and district are character vectors
# defined earlier in the session
parts <- list(city = city, streets = streets, district = district)
lookup <- data.table(
  # factor levels keep the column order city, streets, district after dcast
  variable = factor(rep(names(parts), lengths(parts)), levels = names(parts)),
  value = unlist(parts, use.names = FALSE)
)
# set df to data.table format (by reference)
setDT(df)
# one row per address part
final <- df[, .(address = unlist(tstrsplit(address, ",[ ]*", perl = TRUE))), by = .(x)]
# now add the type (city / streets / district) of every part
final[lookup, type := i.variable, on = .(address = value)]
# and dcast to wide: one column per type
dcast(final, x ~ type, value.var = "address")
# x city streets district
# 1: 1 A cc b
# 2: 2 B dd <NA>
# 3: 3 <NA> dd a
# 4: 4 C <NA> <NA>
# 5: 5 D cc a
如果你打算使用循环,再用 case_when() 就没有意义了;既然可以遍历所有选项,就不必把它们逐一写进去。
你可以用 for 循环来解决:
library(stringi)
df2 <- df
# create the result columns up front so that rows without any match hold an
# explicit NA instead of relying on sub-assignment into a missing column
df2$city <- rep(NA_character_, nrow(df2))
df2$district <- rep(NA_character_, nrow(df2))
df2$street <- rep(NA_character_, nrow(df2))
# for every known name, label the rows whose address contains it as a
# substring; when several names match the same row, the one visited last wins
for (city_name in city) {
  df2$city[stri_detect_fixed(df2$address, city_name)] <- city_name
}
for (district_name in district) {
  df2$district[stri_detect_fixed(df2$address, district_name)] <- district_name
}
for (street_name in streets) {
  df2$street[stri_detect_fixed(df2$address, street_name)] <- street_name
}
请注意,您的示例代码不起作用;在您的示例数据集中,地区名称是 'a' 和 'b',但您生成的名称是 'j' 到 't'。我在上面的代码中修复了这个问题。
另外,如果市、区和/或街道的名称互相重叠,结果就会出错。例如,如果某一行位于 'b' 区、'cc' 街,stri_detect_fixed 也会在 'cc' 中检测到 'c',从而认为该行位于 'c' 区。为了克服这个问题,我提出一种完全不同的方法:
替代方法
鉴于您的示例数据,最合理的做法是先按逗号(,)拆分给定的地址,然后查找与参考 city/district/street 名称完全匹配的部分。我们可以用 intersect() 找出这些完全匹配的部分。
# example reference address parts
cities <- LETTERS # "A" to "Z"
districts <- letters[1:11] # "a" to "k"
streets <- c("aa", "bb", "cc", "dd")
# example dataset
df <- data.frame(
  x = c(1:5),
  address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc")
)
# vectorize address into elements
# split by comma; fixed = TRUE treats "," as a literal string, not a regex
address_elems <- strsplit(df$address, ",", fixed = TRUE)
# trim whitespace; lapply() always returns a list, whereas sapply() would
# collapse the result into a matrix whenever every address happens to have
# the same number of parts
address_elems <- lapply(address_elems, trimws)
比较 df$address 和新建的 address_elems:
> df$address
[1] "A, b, cc," "B, dd" "a, dd" "C" "D, a, cc"
> address_elems
[[1]]
[1] "A" "b" "cc"
[[2]]
[1] "B" "dd"
[[3]]
[1] "a" "dd"
[[4]]
[1] "C"
[[5]]
[1] "D" "a" "cc"
我们可以用 intersect(cities, address_elems[[1]]) 为 address_elems 中的第一个向量找到匹配的 cities。
因为可能得到多个匹配项,所以我们只取第一个元素:intersect(cities, address_elems[[1]])[1](没有匹配项时结果为 NA)。
要将此应用于 address_elems 中的每个向量,我们可以使用 sapply() 或 lapply():
# intersect the respective reference lists with each list of
# address items, taking only the first element; indexing with [1] yields NA
# when nothing matches, and vapply() guarantees a character vector even for
# empty input (sapply() would return a list there)
df$cities <- vapply(
  address_elems, function(x) intersect(cities, x)[1], character(1)
)
df$district <- vapply(
  address_elems, function(x) intersect(districts, x)[1], character(1)
)
df$street <- vapply(
  address_elems, function(x) intersect(streets, x)[1], character(1)
)
综合起来(Putting it all together)
把以上步骤放在一起,我们得到:
# example reference address parts
cities <- LETTERS # "A" to "Z"
districts <- letters[1:11] # "a" to "k"
streets <- c("aa", "bb", "cc", "dd")
# example dataset
df <- data.frame(
  x = c(1:5),
  address = c("A, b, cc,", "B, dd", "a, dd", "C", "D, a, cc")
)
# create a list with one character vector of address elements per row
# split by comma; fixed = TRUE treats "," as a literal string, not a regex
address_elems <- strsplit(df$address, ",", fixed = TRUE)
# trim whitespace; lapply() always returns a list, whereas sapply() would
# collapse the result into a matrix whenever every address happens to have
# the same number of parts
address_elems <- lapply(address_elems, trimws)
# intersect the respective reference lists with each list of
# address items, take only the first element ([1] yields NA when nothing
# matches); vapply() makes all three columns plain character vectors, where
# lapply() would have produced a list column for `cities`
df$cities <- vapply(
  address_elems, function(x) intersect(cities, x)[1], character(1)
)
df$district <- vapply(
  address_elems, function(x) intersect(districts, x)[1], character(1)
)
df$street <- vapply(
  address_elems, function(x) intersect(streets, x)[1], character(1)
)
# cleanup
rm(address_elems)