逻辑函数:用于县选择的地理编码脚本
Function for logic: Geocoding script for county selection
我编写了以下脚本,用于根据给定的城市输入字符串(例如 "New York, NY")找到对应的或最匹配的县。所需的逻辑已写在内联注释中。我已尽力使代码可重现。您只需更改相应的 dat
和 place
输入 (2.2.1) 即可查看其工作方式。
# Exploratory script: look up a place with acs::geo.lookup() and walk through
# the manual steps for choosing one county row and attaching its FIPS codes.
# Load Packages
library(acs)
library(tidyverse)
library(tigris)
# Load the fips_codes lookup table into the workspace
data(fips_codes)
# 1. Select Data
# Exactly one lookup should be active; the commented-out lines are alternative inputs.
dat = geo.lookup(state = "NY", place = "New York")
#dat = geo.lookup(state = "TX", place = "Dallas")
#dat = geo.lookup(state = "OR", place = "Portland")
#dat = geo.lookup(state = "NY", place = "Manhattan")
#dat = geo.lookup(state = "NY", place = "Queens")
print(dat)
dat = na.omit(dat) # drop rows containing NA (the first row holds only state information)
# 2. Check whether county.name has multiple counties, separated by comma
# Note: the row_number() == 2L condition means only row 2 of dat is tested,
# so cvals has at most one row.
cvals <- dat %>% filter(str_detect(county.name, ","), row_number() == 2L)
nrow(cvals)
# 2.1 If nrow(cvals) = 0, take the first row
dat[1,]
# 2.2 If nrow(cvals) > 0, split the comma-separated county string and unnest
# so that every county gets its own row
unbundle <- dat %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest %>%
na.omit()
unbundle
# 2.2.1 If the "place =" input matches a county.name in unbundle, select that row
# (the place name is hard-coded here and must be changed together with step 1)
check <- unbundle %>% filter(str_detect(county.name, "New York"))
nrow(check)
# Select that row (same filter as `check`); note that this variable shares its
# name with dplyr's select() verb
select <- unbundle %>% filter(str_detect(county.name, "New York"))
# 2.2.2 Otherwise, if there is no match, i.e. nrow(select) = 0, take the first row from unbundle by default
unbundle[1,]
# 3.1 Merge countyfips from fips_codes into the selected table (for New York, the output of 2.2.1 is the selected one)
# The five fips_codes columns are renamed positionally so that the join keys
# state.name and county.name exist on both sides.
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
select %>% left_join(fips_codes, by = c("state.name", "county.name"))
我想知道如何把它编写成一个函数,以便像 "Portland, OR"、"Queens, NY" 或以上任何内容这样的输入都可以工作。也许还有一种更聪明的方法来编写整个脚本。由于我正在学习 dplyr,因此首选 dplyr 解决方案。
谢谢!
解决方案(更新):
library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)
# Rename the five fips_codes columns positionally so the table exposes the
# `state.name` and `county.name` keys that FUN() joins on.
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
#' Find the best-matching county (with FIPS codes) for a "Place, ST" string
#'
#' Splits `x` into a place name and a state abbreviation, looks the place up
#' with `geo.lookup()`, picks a single row, and joins the FIPS columns onto it.
#' Row selection:
#' 1. If no row lists several comma-separated counties, the first row is used.
#' 2. Otherwise every county gets its own row; the first county whose name
#'    contains the place name wins, falling back to the first row.
#'
#' Depends on the global `fips_codes` table having been renamed so that it
#' contains `state.name` and `county.name` columns.
#'
#' @param x A single string of the form "Place, ST", e.g. "New York, NY".
#' @return The selected row joined with `fips_codes`; it is also printed.
FUN <- function(x) {
  # Split the input once instead of once per component
  parts <- strsplit(x, ", ")[[1]]
  Place <- parts[1]
  State <- parts[2]

  # na.omit() drops the NA-bearing rows returned by the lookup
  dat <- na.omit(geo.lookup(state = State, place = Place))

  # 1 Check whether county.name has multiple counties
  cvals <- dat %>% filter(str_detect(county.name, ","))

  if (nrow(cvals) == 0) {
    # 2 No multi-county rows: take the first row of the lookup.
    # (Test nrow(cvals) itself: cvals[2, ] always has one row, because
    # out-of-range row indexing yields a row of NAs.)
    output <- dat[1, ]
  } else {
    # 3 Multi-county rows exist: give every county its own row
    unbundle <- dat %>%
      group_by(state.name, place.name) %>%
      mutate(county.name = strsplit(county.name, ", ")) %>%
      unnest(county.name) %>%
      ungroup() %>%
      na.omit()

    # 3.1 Prefer a county whose name contains the place name; the place is
    # matched literally, not as a regular expression
    check <- unbundle %>% filter(str_detect(county.name, fixed(Place)))
    if (nrow(check) > 0) {
      output <- check[1, ]
    } else {
      # 3.2 No match: default to the first row. This must sit in the else
      # branch, otherwise it overwrites the match found in 3.1.
      output <- unbundle[1, ]
    }
  }

  # Join county data with fips code table
  output <- output %>% left_join(fips_codes, by = c("state.name", "county.name"))
  print(output)
}
# Example calls; each one prints the selected row after the FIPS join.
FUN("New York, NY")
FUN("Portland, OR")
FUN("Manhattan, NY")
FUN("Cambridge, MA")
你的问题不是很清楚,不过直接使用 acs
包自带的数据框就可以了:
library(acs)
library(tidyverse)
#' Return the first county row for a place, using the acs place table
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A tibble with lower-cased column names holding the first matching
#'   row (zero rows when nothing matches).
place_to_county <- function(place, state = NULL) {
  # No explicit state: treat `place` as "place, state" and split it.
  if (is.null(state)) {
    # strsplit()'s third positional argument is `fixed`, so the 2 is coerced
    # to TRUE (literal split on ","); it is not a limit on the split count.
    pieces <- trimws(strsplit(place, ",", 2)[[1]])
    place <- pieces[1]
    state <- pieces[2]
  }

  # Case-insensitive pattern match on the place name, within the given state
  matches <- filter(
    tbl_df(acs::fips.place),
    grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state
  )

  # One row per county for multi-county places, then keep the first row only
  per_county <- separate_rows(matches, COUNTY, sep = ", ")
  first_row <- head(per_county, 1)

  setNames(first_row, tolower(colnames(first_row)))
}
一些示例:
# Example calls with their recorded output. Each place is queried twice: once
# with separate arguments and once as a single "place, state" string.
# Because the function keeps only the first row after splitting COUNTY, a
# place listed under several counties is reported with one county only
# (New York city -> Bronx County below).
place_to_county("New York", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 51000 New York city Incorporated Place A Bronx County
place_to_county("New York, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 51000 New York city Incorporated Place A Bronx County
place_to_county("Queens", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 60323 Queens borough County Subdivision G Queens County
place_to_county("Queens, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 60323 Queens borough County Subdivision G Queens County
place_to_county("Berwick", "ME")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 ME 23 4685 Berwick CDP Census Designated Place S York County
place_to_county("Berwick, ME")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 ME 23 4685 Berwick CDP Census Designated Place S York County
place_to_county("Manhattan", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 44919 Manhattan borough County Subdivision G New York County
place_to_county("Manhattan, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 44919 Manhattan borough County Subdivision G New York County
如您所见,如果参数是单独指定的或作为 "x, y" 字符串指定的,那将起作用。
这个版本更健壮一点:
#' Return the first county row for a place, or NULL when nothing matches
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A one-row tibble with lower-cased column names, or NULL if no row
#'   of the acs place table matches.
place_to_county <- function(place, state = NULL) {
  # No explicit state: treat `place` as "place, state" and split it.
  if (is.null(state)) {
    # strsplit()'s third positional argument is `fixed`, so the 2 is coerced
    # to TRUE (literal split on ","); it is not a limit on the split count.
    pieces <- trimws(strsplit(place, ",", 2)[[1]])
    place <- pieces[1]
    state <- pieces[2]
  }

  # Case-insensitive pattern match on the place name, within the given state
  hits <- filter(
    tbl_df(acs::fips.place),
    grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state
  )

  # Complete miss: signal it with NULL
  if (nrow(hits) == 0) {
    return(NULL)
  }

  # One row per county, keep the first, lower-case the column names
  first_row <- head(separate_rows(hits, COUNTY, sep = ", "), 1)
  setNames(first_row, tolower(colnames(first_row)))
}
因为它可以优雅地处理完全未命中。
更新以解决评论(我将使用代码的简化版本):
# Simplified lookup pipeline; `place` and `state` must already be defined.
# Wrap the acs place table as a tibble
tbl_df(acs::fips.place) %>%
# Keep rows whose PLACENAME matches `place` (case-insensitive) in the given state
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
# Split comma-separated COUNTY values into one row per county
separate_rows(COUNTY, sep=", ") %>%
# Keep only the first row
head(1) %>%
# Lower-case all column names
setNames(tolower(colnames(.)))
鉴于:
%>%
在 tidyverse/dplyr 代码中看到的管道符号。它(过度简化)避免使用临时变量赋值
tbl_df(…)
(同样,过度简化)只是确保输出更易于阅读(它用额外的 类 标记数据框)
filter(…)
完成您想要的工作。 grepl()
不区分大小写搜索地点,然后获取这些匹配项并按州进一步细化。这是一种 "dumb" 方法,因为它可以捕获字符串中任意位置的地名。有更聪明的方法,但这应该很有效。
separate_rows(…)
将获取 fips.place
中包含多个县的条目,并为每个县单独一行。
head(1)
简单地取第一个匹配项
setNames(…)
使列名小写,因为 fips.place
中的列名都是(呃)UPPER_CASE
倒数第二个更新
这合并了 tigris
place_name
数据:
#' Return the first county row for a place, with tigris FIPS columns attached
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A tibble for the first match, left-joined with
#'   `tigris::fips_codes` on county and state, or NULL if nothing matches.
place_to_county <- function(place, state = NULL) {
  # No explicit state: treat `place` as "place, state" and split it.
  if (is.null(state)) {
    # strsplit()'s third positional argument is `fixed`, so the 2 is coerced
    # to TRUE (literal split on ","); it is not a limit on the split count.
    pieces <- trimws(strsplit(place, ",", 2)[[1]])
    place <- pieces[1]
    state <- pieces[2]
  }

  # Case-insensitive pattern match on the place name, within the given state
  hits <- filter(
    tbl_df(acs::fips.place),
    grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state
  )

  # Complete miss: signal it with NULL
  if (nrow(hits) == 0) {
    return(NULL)
  }

  # One row per county, keep the first, lower-case the column names
  first_row <- head(separate_rows(hits, COUNTY, sep = ", "), 1)
  first_row <- setNames(first_row, tolower(colnames(first_row)))

  # Attach the FIPS code columns for that county
  left_join(first_row, tigris::fips_codes, by = c("county", "state"))
}
最终更新(真实):
这是一个非常简单的添加。我不打算进一步完善它了。如果不自己动手做一些实验,您在 R 方面是不会进步的。
#' Return the first county row for a place, with tigris FIPS columns attached
#'
#' A miss returns a one-row placeholder with the same columns as a hit, so
#' results from many calls can be stacked.
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL. It is used as a case-insensitive pattern against PLACENAME.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A one-row tibble: the first match left-joined with
#'   `tigris::fips_codes`, or a placeholder row that echoes `state` and
#'   `place` and is NA everywhere else.
place_to_county <- function(place, state = NULL) {
  if (is.null(state)) {
    # Literal split on ","; `fixed = TRUE` spells out what the former
    # positional `2` was silently coerced to
    parts <- trimws(strsplit(place, ",", fixed = TRUE)[[1]])
    place <- parts[1]
    state <- parts[2]
  }

  # as_tibble() replaces the deprecated tbl_df()
  xdf <- as_tibble(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state)

  if (nrow(xdf) > 0) {
    xdf %>%
      separate_rows(COUNTY, sep = ", ") %>% # one row per county
      head(1) %>% # keep the first match only
      setNames(tolower(colnames(.))) %>%
      left_join(tigris::fips_codes, by = c("county", "state"))
  } else {
    # Typed NAs keep the placeholder's column types aligned with a hit
    # (statefp/placefp print as <int>, the other place columns as <chr>).
    # NOTE(review): the tigris code/name columns are assumed to be character.
    tibble(
      state = state,
      statefp = NA_integer_,
      placefp = NA_integer_,
      placename = place,
      type = NA_character_,
      funcstat = NA_character_,
      county = NA_character_,
      state_code = NA_character_,
      state_name = NA_character_,
      county_code = NA_character_
    )
  }
}
任何其他 features/changes 由您决定。
我编写了以下脚本,用于根据给定的城市输入字符串(例如 "New York, NY")找到对应的或最匹配的县。所需的逻辑已写在内联注释中。我已尽力使代码可重现。您只需更改相应的 dat
和 place
输入 (2.2.1) 即可查看其工作方式。
# Exploratory script: look up a place with acs::geo.lookup() and walk through
# the manual steps for choosing one county row and attaching its FIPS codes.
# Load Packages
library(acs)
library(tidyverse)
library(tigris)
# Load the fips_codes lookup table into the workspace
data(fips_codes)
# 1. Select Data
# Exactly one lookup should be active; the commented-out lines are alternative inputs.
dat = geo.lookup(state = "NY", place = "New York")
#dat = geo.lookup(state = "TX", place = "Dallas")
#dat = geo.lookup(state = "OR", place = "Portland")
#dat = geo.lookup(state = "NY", place = "Manhattan")
#dat = geo.lookup(state = "NY", place = "Queens")
print(dat)
dat = na.omit(dat) # drop rows containing NA (the first row holds only state information)
# 2. Check whether county.name has multiple counties, separated by comma
# Note: the row_number() == 2L condition means only row 2 of dat is tested,
# so cvals has at most one row.
cvals <- dat %>% filter(str_detect(county.name, ","), row_number() == 2L)
nrow(cvals)
# 2.1 If nrow(cvals) = 0, take the first row
dat[1,]
# 2.2 If nrow(cvals) > 0, split the comma-separated county string and unnest
# so that every county gets its own row
unbundle <- dat %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest %>%
na.omit()
unbundle
# 2.2.1 If the "place =" input matches a county.name in unbundle, select that row
# (the place name is hard-coded here and must be changed together with step 1)
check <- unbundle %>% filter(str_detect(county.name, "New York"))
nrow(check)
# Select that row (same filter as `check`); note that this variable shares its
# name with dplyr's select() verb
select <- unbundle %>% filter(str_detect(county.name, "New York"))
# 2.2.2 Otherwise, if there is no match, i.e. nrow(select) = 0, take the first row from unbundle by default
unbundle[1,]
# 3.1 Merge countyfips from fips_codes into the selected table (for New York, the output of 2.2.1 is the selected one)
# The five fips_codes columns are renamed positionally so that the join keys
# state.name and county.name exist on both sides.
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
select %>% left_join(fips_codes, by = c("state.name", "county.name"))
我想知道如何把它编写成一个函数,以便像 "Portland, OR"、"Queens, NY" 或以上任何内容这样的输入都可以工作。也许还有一种更聪明的方法来编写整个脚本。由于我正在学习 dplyr,因此首选 dplyr 解决方案。
谢谢!
解决方案(更新):
library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)
# Rename the five fips_codes columns positionally so the table exposes the
# `state.name` and `county.name` keys that FUN() joins on.
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
#' Find the best-matching county (with FIPS codes) for a "Place, ST" string
#'
#' Splits `x` into a place name and a state abbreviation, looks the place up
#' with `geo.lookup()`, picks a single row, and joins the FIPS columns onto it.
#' Row selection:
#' 1. If no row lists several comma-separated counties, the first row is used.
#' 2. Otherwise every county gets its own row; the first county whose name
#'    contains the place name wins, falling back to the first row.
#'
#' Depends on the global `fips_codes` table having been renamed so that it
#' contains `state.name` and `county.name` columns.
#'
#' @param x A single string of the form "Place, ST", e.g. "New York, NY".
#' @return The selected row joined with `fips_codes`; it is also printed.
FUN <- function(x) {
  # Split the input once instead of once per component
  parts <- strsplit(x, ", ")[[1]]
  Place <- parts[1]
  State <- parts[2]

  # na.omit() drops the NA-bearing rows returned by the lookup
  dat <- na.omit(geo.lookup(state = State, place = Place))

  # 1 Check whether county.name has multiple counties
  cvals <- dat %>% filter(str_detect(county.name, ","))

  if (nrow(cvals) == 0) {
    # 2 No multi-county rows: take the first row of the lookup.
    # (Test nrow(cvals) itself: cvals[2, ] always has one row, because
    # out-of-range row indexing yields a row of NAs.)
    output <- dat[1, ]
  } else {
    # 3 Multi-county rows exist: give every county its own row
    unbundle <- dat %>%
      group_by(state.name, place.name) %>%
      mutate(county.name = strsplit(county.name, ", ")) %>%
      unnest(county.name) %>%
      ungroup() %>%
      na.omit()

    # 3.1 Prefer a county whose name contains the place name; the place is
    # matched literally, not as a regular expression
    check <- unbundle %>% filter(str_detect(county.name, fixed(Place)))
    if (nrow(check) > 0) {
      output <- check[1, ]
    } else {
      # 3.2 No match: default to the first row. This must sit in the else
      # branch, otherwise it overwrites the match found in 3.1.
      output <- unbundle[1, ]
    }
  }

  # Join county data with fips code table
  output <- output %>% left_join(fips_codes, by = c("state.name", "county.name"))
  print(output)
}
# Example calls; each one prints the selected row after the FIPS join.
FUN("New York, NY")
FUN("Portland, OR")
FUN("Manhattan, NY")
FUN("Cambridge, MA")
你的问题不是很清楚,不过直接使用 acs
包自带的数据框就可以了:
library(acs)
library(tidyverse)
#' Return the first county row for a place, using the acs place table
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A tibble with lower-cased column names holding the first matching
#'   row (zero rows when nothing matches).
place_to_county <- function(place, state = NULL) {
  # No explicit state: treat `place` as "place, state" and split it.
  if (is.null(state)) {
    # strsplit()'s third positional argument is `fixed`, so the 2 is coerced
    # to TRUE (literal split on ","); it is not a limit on the split count.
    pieces <- trimws(strsplit(place, ",", 2)[[1]])
    place <- pieces[1]
    state <- pieces[2]
  }

  # Case-insensitive pattern match on the place name, within the given state
  matches <- filter(
    tbl_df(acs::fips.place),
    grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state
  )

  # One row per county for multi-county places, then keep the first row only
  per_county <- separate_rows(matches, COUNTY, sep = ", ")
  first_row <- head(per_county, 1)

  setNames(first_row, tolower(colnames(first_row)))
}
一些示例:
# Example calls with their recorded output. Each place is queried twice: once
# with separate arguments and once as a single "place, state" string.
# Because the function keeps only the first row after splitting COUNTY, a
# place listed under several counties is reported with one county only
# (New York city -> Bronx County below).
place_to_county("New York", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 51000 New York city Incorporated Place A Bronx County
place_to_county("New York, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 51000 New York city Incorporated Place A Bronx County
place_to_county("Queens", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 60323 Queens borough County Subdivision G Queens County
place_to_county("Queens, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 60323 Queens borough County Subdivision G Queens County
place_to_county("Berwick", "ME")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 ME 23 4685 Berwick CDP Census Designated Place S York County
place_to_county("Berwick, ME")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 ME 23 4685 Berwick CDP Census Designated Place S York County
place_to_county("Manhattan", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 44919 Manhattan borough County Subdivision G New York County
place_to_county("Manhattan, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 44919 Manhattan borough County Subdivision G New York County
如您所见,如果参数是单独指定的或作为 "x, y" 字符串指定的,那将起作用。
这个版本更健壮一点:
#' Return the first county row for a place, or NULL when nothing matches
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A one-row tibble with lower-cased column names, or NULL if no row
#'   of the acs place table matches.
place_to_county <- function(place, state = NULL) {
  # No explicit state: treat `place` as "place, state" and split it.
  if (is.null(state)) {
    # strsplit()'s third positional argument is `fixed`, so the 2 is coerced
    # to TRUE (literal split on ","); it is not a limit on the split count.
    pieces <- trimws(strsplit(place, ",", 2)[[1]])
    place <- pieces[1]
    state <- pieces[2]
  }

  # Case-insensitive pattern match on the place name, within the given state
  hits <- filter(
    tbl_df(acs::fips.place),
    grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state
  )

  # Complete miss: signal it with NULL
  if (nrow(hits) == 0) {
    return(NULL)
  }

  # One row per county, keep the first, lower-case the column names
  first_row <- head(separate_rows(hits, COUNTY, sep = ", "), 1)
  setNames(first_row, tolower(colnames(first_row)))
}
因为它可以优雅地处理完全未命中。
更新以解决评论(我将使用代码的简化版本):
# Simplified lookup pipeline; `place` and `state` must already be defined.
# Wrap the acs place table as a tibble
tbl_df(acs::fips.place) %>%
# Keep rows whose PLACENAME matches `place` (case-insensitive) in the given state
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
# Split comma-separated COUNTY values into one row per county
separate_rows(COUNTY, sep=", ") %>%
# Keep only the first row
head(1) %>%
# Lower-case all column names
setNames(tolower(colnames(.)))
鉴于:
%>%
在 tidyverse/dplyr 代码中看到的管道符号。它(过度简化)避免使用临时变量赋值tbl_df(…)
(同样,过度简化)只是确保输出更易于阅读(它用额外的 类 标记数据框)filter(…)
完成您想要的工作。grepl()
不区分大小写搜索地点,然后获取这些匹配项并按州进一步细化。这是一种 "dumb" 方法,因为它可以捕获字符串中任意位置的地名。有更聪明的方法,但这应该很有效。separate_rows(…)
将获取fips.place
中包含多个县的条目,并为每个县单独一行。head(1)
简单地取第一个匹配项。setNames(…)
使列名小写,因为fips.place
中的列名都是(呃)UPPER_CASE
倒数第二个更新
这合并了 tigris
place_name
数据:
#' Return the first county row for a place, with tigris FIPS columns attached
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A tibble for the first match, left-joined with
#'   `tigris::fips_codes` on county and state, or NULL if nothing matches.
place_to_county <- function(place, state = NULL) {
  # No explicit state: treat `place` as "place, state" and split it.
  if (is.null(state)) {
    # strsplit()'s third positional argument is `fixed`, so the 2 is coerced
    # to TRUE (literal split on ","); it is not a limit on the split count.
    pieces <- trimws(strsplit(place, ",", 2)[[1]])
    place <- pieces[1]
    state <- pieces[2]
  }

  # Case-insensitive pattern match on the place name, within the given state
  hits <- filter(
    tbl_df(acs::fips.place),
    grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state
  )

  # Complete miss: signal it with NULL
  if (nrow(hits) == 0) {
    return(NULL)
  }

  # One row per county, keep the first, lower-case the column names
  first_row <- head(separate_rows(hits, COUNTY, sep = ", "), 1)
  first_row <- setNames(first_row, tolower(colnames(first_row)))

  # Attach the FIPS code columns for that county
  left_join(first_row, tigris::fips_codes, by = c("county", "state"))
}
最终更新(真实):
这是一个非常简单的添加。我不打算进一步完善它了。如果不自己动手做一些实验,您在 R 方面是不会进步的。
#' Return the first county row for a place, with tigris FIPS columns attached
#'
#' A miss returns a one-row placeholder with the same columns as a hit, so
#' results from many calls can be stacked.
#'
#' @param place Place name, or a combined "place, state" string when `state`
#'   is NULL. It is used as a case-insensitive pattern against PLACENAME.
#' @param state State abbreviation compared against the STATE column; NULL
#'   means it is parsed out of `place`.
#' @return A one-row tibble: the first match left-joined with
#'   `tigris::fips_codes`, or a placeholder row that echoes `state` and
#'   `place` and is NA everywhere else.
place_to_county <- function(place, state = NULL) {
  if (is.null(state)) {
    # Literal split on ","; `fixed = TRUE` spells out what the former
    # positional `2` was silently coerced to
    parts <- trimws(strsplit(place, ",", fixed = TRUE)[[1]])
    place <- parts[1]
    state <- parts[2]
  }

  # as_tibble() replaces the deprecated tbl_df()
  xdf <- as_tibble(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state)

  if (nrow(xdf) > 0) {
    xdf %>%
      separate_rows(COUNTY, sep = ", ") %>% # one row per county
      head(1) %>% # keep the first match only
      setNames(tolower(colnames(.))) %>%
      left_join(tigris::fips_codes, by = c("county", "state"))
  } else {
    # Typed NAs keep the placeholder's column types aligned with a hit
    # (statefp/placefp print as <int>, the other place columns as <chr>).
    # NOTE(review): the tigris code/name columns are assumed to be character.
    tibble(
      state = state,
      statefp = NA_integer_,
      placefp = NA_integer_,
      placename = place,
      type = NA_character_,
      funcstat = NA_character_,
      county = NA_character_,
      state_code = NA_character_,
      state_name = NA_character_,
      county_code = NA_character_
    )
  }
}
任何其他 features/changes 由您决定。