逻辑函数：用于县选择的地理编码脚本

Question

我编写了以下脚本，用于根据给定的城市输入字符串（例如 "New York, NY"）获取对应的或最佳匹配的县。所需的逻辑已写在内联注释中。我已尽力使代码可重现。您只需更改相应的 dat 和 place 输入 (2.2.1) 即可查看其工作方式。

# Load Packages
library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)

# 1. Select Data
# Look up one place; swap in one of the commented-out calls below to try
# another input.

dat = geo.lookup(state = "NY", place = "New York")
#dat = geo.lookup(state = "TX", place = "Dallas")
#dat = geo.lookup(state = "OR", place = "Portland")
#dat = geo.lookup(state = "NY", place = "Manhattan")
#dat = geo.lookup(state = "NY", place = "Queens")
print(dat)

dat = na.omit(dat) # drop the first row, which contains only state information and NA

# 2. Check whether county.name has multiple counties, separated by comma
# NOTE(review): the filter keeps a row only if it is row 2 AND contains a
# comma, so cvals has at most one row -- confirm this is the intended check.
cvals <- dat %>% filter(str_detect(county.name, ","), row_number() == 2L)
nrow(cvals)

# 2.1 If nrow(cvals) = 0, take first row
dat[1,]

# 2.2 If nrow(cvals) > 0, do split string and unnest
# strsplit() turns county.name into a list column; unnest then yields one row
# per county.
unbundle <- dat %>% 
  group_by(state.name, place.name) %>% 
  mutate(county.name = strsplit(county.name, ", ")) %>% 
  unnest %>%
  na.omit()
unbundle

# 2.2.1 If "place =" input matches a county.name in unbundle, select that row
# The place name is hard-coded here and must be changed together with the
# geo.lookup() call in step 1.
check <- unbundle %>% filter(str_detect(county.name, "New York"))
nrow(check)
# select that row
# NOTE(review): `select` is also the name of a dplyr verb; a different
# variable name would be less confusing.
select <- unbundle %>% filter(str_detect(county.name, "New York"))

# 2.2.2 Otherwise, if there is no match, i.e. nrow(select) = 0, take first row from unbundle by default
unbundle[1,]

# 3.1 Merge countyfips from fips_codes into selected table (For New York, the final output of 2.2.1 would have been selected)
# The fips_codes columns are renamed first so that the join keys state.name
# and county.name exist in both tables.
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
select %>% left_join(fips_codes, by = c("state.name", "county.name"))

我想知道如何编写此函数，以便像 "Portland, OR"、"Queens, NY" 或以上任何内容这样的输入都可以工作。也许，还有一种更聪明的方法来编写整个脚本。由于我正在学习 dplyr，因此首选 dplyr 解决方案。

谢谢！

解决方案（更新）：

library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)
# Rename the fips_codes columns so that state.name and county.name can be used
# as join keys against the geo.lookup() result.
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")

    # Select the best-matching county for a "Place, ST" string and attach its
    # FIPS codes.
    #
    # Args:
    #   x: a single string such as "New York, NY" (place, comma, state).
    #
    # Returns (and prints) a one-row table: the geo.lookup() row chosen for the
    # place, left-joined with `fips_codes`. Expects a `fips_codes` data frame
    # with `state.name` and `county.name` columns to be available in the
    # calling environment.
    FUN <- function(x) {
      # Split on the literal comma and trim, so "Place,ST" works as well as
      # "Place, ST".
      parts <- trimws(strsplit(x, ",", fixed = TRUE)[[1]])
      Place <- parts[1]
      State <- parts[2]

      dat <- geo.lookup(state = State, place = Place)
      dat <- na.omit(dat) # drop rows with NA (e.g. the state-only first row)

      # 1 Check whether any county.name lists multiple counties
      cvals <- dat %>% filter(str_detect(county.name, ","))

      if (nrow(cvals) == 0) {
        # 2 No multi-county entries: take the first row of the lookup.
        # (The previous test, nrow(cvals[2,]) == 0, could never be TRUE because
        # indexing a missing row yields one NA row.)
        output <- dat[1, ]
      } else {
        # 3 Otherwise split the county list into one row per county
        unbundle <- dat %>%
          group_by(state.name, place.name) %>%
          mutate(county.name = strsplit(county.name, ", ")) %>%
          unnest(county.name) %>%
          ungroup() %>%
          na.omit()

        # 3.1 Prefer a county whose name contains the place name; fixed()
        # makes the place a literal string rather than a regular expression.
        check <- unbundle %>% filter(str_detect(county.name, fixed(Place)))

        if (nrow(check) > 0) {
          output <- check[1, ]
        } else {
          # 3.2 No county matches the place name: default to the first row.
          # This must sit in the else branch, otherwise it overwrites 3.1.
          output <- unbundle[1, ]
        }
      }

      # Join county data with fips code table
      output <- output %>%
        left_join(fips_codes, by = c("state.name", "county.name"))
      print(output)
    }

    # Example calls; each one prints the selected row joined with fips_codes.
    FUN("New York, NY")
    FUN("Portland, OR")
    FUN("Manhattan, NY")
    FUN("Cambridge, MA")

Answer 1

你的问题不是很清楚，不过直接使用 acs 包自带的数据框就可以了：

library(acs)
library(tidyverse)

#' Find the county for a place name using the `acs::fips.place` table
#'
#' @param place Place name (e.g. `"Queens"`), or a combined `"Place, ST"`
#'   string when `state` is `NULL`. It is used as a case-insensitive regular
#'   expression against `PLACENAME`.
#' @param state State abbreviation such as `"NY"`, compared with the `STATE`
#'   column. When `NULL`, the second comma-separated field of `place` is used.
#' @return A tibble with lower-case column names holding the first matching
#'   place/county row; it has zero rows when nothing matches.
place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {
    # Split "Place, ST" on the literal comma and strip surrounding blanks.
    # (The third positional argument of strsplit() is `fixed`, not a limit.)
    parts <- trimws(strsplit(place, ",", fixed = TRUE)[[1]])
    place <- parts[1]
    state <- parts[2]
  }

  as_tibble(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state) %>%
    # One row per county for places that span several counties.
    separate_rows(COUNTY, sep = ", ") %>%
    # Keep only the first match.
    head(1) %>%
    setNames(tolower(colnames(.)))

}

一些示例：

place_to_county("New York", "NY")
## # A tibble: 1 x 7
##   state statefp placefp     placename               type funcstat       county
##   <chr>   <int>   <int>         <chr>              <chr>    <chr>        <chr>
## 1    NY      36   51000 New York city Incorporated Place        A Bronx County

place_to_county("New York, NY")
## # A tibble: 1 x 7
##   state statefp placefp     placename               type funcstat       county
##   <chr>   <int>   <int>         <chr>              <chr>    <chr>        <chr>
## 1    NY      36   51000 New York city Incorporated Place        A Bronx County

place_to_county("Queens", "NY")
## # A tibble: 1 x 7
##   state statefp placefp      placename               type funcstat        county
##   <chr>   <int>   <int>          <chr>              <chr>    <chr>         <chr>
## 1    NY      36   60323 Queens borough County Subdivision        G Queens County

place_to_county("Queens, NY")
## # A tibble: 1 x 7
##   state statefp placefp      placename               type funcstat        county
##   <chr>   <int>   <int>          <chr>              <chr>    <chr>         <chr>
## 1    NY      36   60323 Queens borough County Subdivision        G Queens County

place_to_county("Berwick", "ME")
## # A tibble: 1 x 7
##   state statefp placefp   placename                    type funcstat      county
##   <chr>   <int>   <int>       <chr>                   <chr>    <chr>       <chr>
## 1    ME      23    4685 Berwick CDP Census Designated Place        S York County

place_to_county("Berwick, ME")
## # A tibble: 1 x 7
##   state statefp placefp   placename                    type funcstat      county
##   <chr>   <int>   <int>       <chr>                   <chr>    <chr>       <chr>
## 1    ME      23    4685 Berwick CDP Census Designated Place        S York County

place_to_county("Manhattan", "NY")
## # A tibble: 1 x 7
##   state statefp placefp         placename               type funcstat          county
##   <chr>   <int>   <int>             <chr>              <chr>    <chr>           <chr>
## 1    NY      36   44919 Manhattan borough County Subdivision        G New York County

place_to_county("Manhattan, NY")
## # A tibble: 1 x 7
##   state statefp placefp         placename               type funcstat          county
##   <chr>   <int>   <int>             <chr>              <chr>    <chr>           <chr>
## 1    NY      36   44919 Manhattan borough County Subdivision        G New York County

如您所见，如果参数是单独指定的或作为 "x, y" 字符串指定的，那将起作用。

这个版本更健壮一点：

#' Find the county for a place name, returning `NULL` on a complete miss
#'
#' @param place Place name (e.g. `"Queens"`), or a combined `"Place, ST"`
#'   string when `state` is `NULL`. It is used as a case-insensitive regular
#'   expression against `PLACENAME`.
#' @param state State abbreviation such as `"NY"`, compared with the `STATE`
#'   column. When `NULL`, the second comma-separated field of `place` is used.
#' @return A one-row tibble with lower-case column names for the first
#'   matching place/county, or `NULL` when no place matches.
place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {
    # Split "Place, ST" on the literal comma and strip surrounding blanks.
    # (The third positional argument of strsplit() is `fixed`, not a limit.)
    parts <- trimws(strsplit(place, ",", fixed = TRUE)[[1]])
    place <- parts[1]
    state <- parts[2]
  }

  xdf <- as_tibble(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state)

  if (nrow(xdf) > 0) {
    xdf %>%
      # One row per county for places that span several counties.
      separate_rows(COUNTY, sep = ", ") %>%
      # Keep only the first match.
      head(1) %>%
      setNames(tolower(colnames(.)))
  } else {
    # Nothing matched: signal the miss explicitly.
    NULL
  }

}

因为它可以优雅地处理完全未命中。

更新以解决评论（我将使用代码的简化版本）：

# Simplified pipeline; assumes `place` and `state` are already defined, as
# they are inside place_to_county().
tbl_df(acs::fips.place) %>%
  filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
  separate_rows(COUNTY, sep=", ") %>%
  head(1) %>%
  setNames(tolower(colnames(.)))

鉴于：

%>% 是 tidyverse/dplyr 代码中常见的管道符号。它（简单来说）可以避免使用临时变量赋值。
tbl_df(…)（同样，过度简化）只是确保输出更易于阅读（它用额外的类标记数据框）
filter(…) 完成您想要的工作。 grepl() 不区分大小写搜索地点，然后获取这些匹配项并按州进一步细化。这是一种 "dumb" 方法，因为它可以捕获字符串中任意位置的地名。有更聪明的方法，但这应该很有效。
separate_rows(…) 将获取 fips.place 中包含多个县的条目，并为每个县单独一行。
head(1) 只是简单地取第一个匹配项。
setNames(…) 使列名小写，因为 fips.place 中的列名都是（呃）UPPER_CASE

倒数第二个更新

这合并了 tigris place_name 数据：

#' Find the county for a place name and attach the tigris FIPS codes
#'
#' @param place Place name (e.g. `"Queens"`), or a combined `"Place, ST"`
#'   string when `state` is `NULL`. It is used as a case-insensitive regular
#'   expression against `PLACENAME`.
#' @param state State abbreviation such as `"NY"`, compared with the `STATE`
#'   column. When `NULL`, the second comma-separated field of `place` is used.
#' @return A one-row tibble with lower-case column names for the first
#'   matching place/county, left-joined with `tigris::fips_codes` on `county`
#'   and `state`; `NULL` when no place matches.
place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {
    # Split "Place, ST" on the literal comma and strip surrounding blanks.
    # (The third positional argument of strsplit() is `fixed`, not a limit.)
    parts <- trimws(strsplit(place, ",", fixed = TRUE)[[1]])
    place <- parts[1]
    state <- parts[2]
  }

  xdf <- as_tibble(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state)

  if (nrow(xdf) > 0) {
    xdf %>%
      # One row per county for places that span several counties.
      separate_rows(COUNTY, sep = ", ") %>%
      # Keep only the first match.
      head(1) %>%
      setNames(tolower(colnames(.))) %>%
      left_join(tigris::fips_codes, by = c("county", "state"))
  } else {
    # Nothing matched: signal the miss explicitly.
    NULL
  }

}

最终更新（真实）：

这是一个非常简单的添加。我不打算再进一步完善它了。如果不自己动手在 R 中做一些实验，您是不会进步的。

#' Find the county for a place name and attach the tigris FIPS codes
#'
#' Always returns a one-row tibble, so results for hits and misses have the
#' same columns.
#'
#' @param place Place name (e.g. `"Queens"`), or a combined `"Place, ST"`
#'   string when `state` is `NULL`. It is used as a case-insensitive regular
#'   expression against `PLACENAME`.
#' @param state State abbreviation such as `"NY"`, compared with the `STATE`
#'   column. When `NULL`, the second comma-separated field of `place` is used.
#' @return A one-row tibble: the first matching place/county left-joined with
#'   `tigris::fips_codes` on `county` and `state`, or, when no place matches,
#'   a placeholder row that echoes `state` and `place` and is `NA` elsewhere.
place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {
    # Split "Place, ST" on the literal comma and strip surrounding blanks.
    # (The third positional argument of strsplit() is `fixed`, not a limit.)
    parts <- trimws(strsplit(place, ",", fixed = TRUE)[[1]])
    place <- parts[1]
    state <- parts[2]
  }

  xdf <- as_tibble(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case = TRUE) & STATE == state)

  if (nrow(xdf) > 0) {
    xdf %>%
      # One row per county for places that span several counties.
      separate_rows(COUNTY, sep = ", ") %>%
      # Keep only the first match.
      head(1) %>%
      setNames(tolower(colnames(.))) %>%
      left_join(tigris::fips_codes, by = c("county", "state"))
  } else {
    # Placeholder row for a miss. Typed NAs keep the column types in line
    # with the hit case (integer FIPS ids, character everywhere else).
    tibble(
      state = state,
      statefp = NA_integer_,
      placefp = NA_integer_,
      placename = place,
      type = NA_character_,
      funcstat = NA_character_,
      county = NA_character_,
      state_code = NA_character_,
      state_name = NA_character_,
      county_code = NA_character_
    )
  }

}

任何其他 features/changes 由您决定。

逻辑函数：用于县选择的地理编码脚本

Function for logic: Geocoding script for county selection

geocoding

r

function

dplyr