选择选项标签后抓取页面内容

Question

我想在选择省份（和市镇）后抓取页面内容。
以下代码正确输出省份及其值。

library(rvest)

# Download the search page once; the province drop-down is parsed from it.
page <- read_html(x = "https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati/")

# Visible label of every <option> inside <select name="provincia">.
text <- html_text(html_nodes(page, xpath = '//select[@name="provincia"]/option'))
# Matching "value" attribute of the same <option> elements.
values <- html_attr(html_nodes(page, xpath = '//select[@name="provincia"]/option'), "value")

# One row per option: label plus the value the form would submit.
Res <- data.frame(text, values, stringsAsFactors = FALSE)
Res

现在，我想访问每个值的页面，例如这可能有助于访问 value=19.

# Label of the <option> whose value attribute is "19", looked up in the
# already-downloaded `page`.
text <- html_text(html_nodes(page, xpath = "//*/option[@value = '19']"))
text

源码如下

<div class="row results_form_search">
        <form role="search" method="POST" class="search-form" action="/progetto-torelli/progetto-torelli-risultati/" id="search_location">
            <input type="hidden" name="comune_from" value="" />
            <div class="form-row">
                <input type="text" name="cognome" placeholder="Cognome" autocomplete="off" value="">
                <select name="provincia">
                    <option value="0" selected>Seleziona Provincia</option>
                                        <option value="74"
                        >-
</option>
                                        <option value="75"
                        >AGRIGENTO
</option>
                                        <option value="19"
                        >ALESSANDRIA

这就是我想要抓取的内容所在的位置。

    <div class="row">
        <ul class="listing_search">
        </ul>
    </div>

非常感谢您的建议！

Answer 1

RSelenium 可能最终成为要走的路。但是，如果您可以插入一些明智的等待或分块请求，这样服务器就不会被请求淹没，您可以使用 rvest 并发出与页面相同的请求。

您首先需要生成省和市镇的所有组合（过滤掉不需要的值）；这可以通过发出 xmlhttp 请求来完成，使用省份 select 中 options 的 value 属性，收集 comune 下拉菜单 options 及其相关值。

然后，您为每个组合对发出进一步的请求，以获取页面内容——也就是您手动从每个下拉菜单中进行选择并按下 CERCA 时获得的页面内容。

根据我的计算，有 10,389 个有效组合，因此需要在请求之间暂停；如果您在初始请求之后尝试一个接一个地连续发出所有这些请求，服务器将切断连接。

另一种选择是将 combined 分块成更小的数据帧，并按时间间隔发出请求，然后合并结果。

library(rvest)
library(dplyr)
library(purrr)

#' Scrape the province drop-down of the search page.
#'
#' @param link URL of the page containing the `name="provincia"` select.
#' @return A data frame with one row per retained `<option>`:
#'   `Provincia` (trimmed option label) and `id0` (the option's `value`
#'   attribute, kept as character).
get_provincias <- function(link) {
  # Drop the pre-selected placeholder option and any option whose label
  # contains "-" or the U+0085 character.
  # NOTE(review): `:contains("-")` also removes every province whose name
  # contains a hyphen -- confirm that this is intended.
  nodes <- read_html(link) %>%
    html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')

  # stringsAsFactors = FALSE keeps the ids as character on every R version,
  # so later joins on `id0` compare like with like.
  data.frame(
    Provincia = html_text(nodes, trim = TRUE),
    id0 = html_attr(nodes, "value"),
    stringsAsFactors = FALSE
  )
}

#' Fetch the comune options that belong to one province.
#'
#' @param id A single province id (the `id0` value returned by
#'   `get_provincias()`).
#' @return A data frame with one row per comune option: `id0` (the province
#'   id passed in), `Comune` (trimmed option label) and `id3` (the option's
#'   `value` attribute). Zero rows when the response has no usable options.
get_comunes <- function(id) {
  # The trailing `_` parameter carries today's date as epoch seconds;
  # presumably a cache-buster mirroring the site's own XHR -- TODO confirm.
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_comuni.php?id0=%s&id1=0&_=%i",
    id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  # Ignore the option with value "0".
  nodes <- read_html(link) %>% html_nodes('option:not([value="0"])')

  # Repeat the province id once per node: a bare scalar would make
  # data.frame() fail with "differing number of rows" when no option is found.
  data.frame(
    id0 = rep(id, length(nodes)),
    Comune = html_text(nodes, trim = TRUE),
    id3 = html_attr(nodes, "value"),
    stringsAsFactors = FALSE
  )
}

#' Download the result page for one province/comune pair.
#'
#' @param prov_id Province id, sent as the `id2` query parameter.
#' @param com_id Comune id, sent as the `id3` query parameter.
#' @return A one-row tibble with `id3` (the comune id) and `page`, a
#'   list-column holding the parsed HTML document.
get_page <- function(prov_id, com_id) {
  # The trailing `_` parameter carries today's date as epoch seconds;
  # presumably a cache-buster mirroring the site's own XHR -- TODO confirm.
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_soldati.php?id0=1&id1=&id2=%s&id3=%s&_=%i",
    prov_id,
    com_id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )

  page <- read_html(link)

  # An xml_document is an S3 list rather than a vector, so it cannot be a
  # tibble column by itself; wrapping it in list() stores exactly one
  # document per request as a list-column element.
  tibble(id3 = com_id, page = list(page))
}
 
# Step 1: every province offered by the search form.
provincias <- get_provincias("https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati")

# Step 2: the comuni of each province, minus the "-" placeholder entries.
comunes <- filter(map_df(provincias$id0, get_comunes), Comune != "-")

# Step 3: attach the province columns to every comune row.
combined <- dplyr::right_join(provincias, comunes, by = "id0")

# length(combined$Comune) ->  10389

# Step 4: one request per province/comune pair, results row-bound together.
results <- map2_dfr(combined$id0, combined$id3, .f = get_page)

# Step 5: keep only the pairs for which a page was retrieved.
final <- dplyr::inner_join(combined, results, by = "id3")

下面是一个更长的版本，包含您要求的额外信息，我在其中添加了停顿。我仍然发现我可以运行包括

combined <- dplyr::right_join(provincias, comunes, by = "id0")

一气呵成。但在那之后，我需要将请求分成大约 2000 个请求批次，中间间隔 20-30 分钟。您可以尝试调整以下时间。我最终将注释掉的部分用于运行每批，然后在两者之间暂停 30 分钟。

需要考虑的一些事项：

似乎存在像 ... 这样的 comune 值，它们仍然会返回列表结果。考虑到这一点，您可能希望删除下面选择器中的 :not 部分：

html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')

因为我当初只是假设这些 :not 条件过滤掉的是无效结果。

接下来，您可能会考虑使用 httr 和 retry 编写辅助函数，使用 backoff/retry 发出请求，而不是使用暂停。

这样的函数可能如下所示：

# Sketch of a request helper built on httr::RETRY(), which re-issues a
# failed request with a back-off pause instead of a fixed Sys.sleep().
# `<request url>` is a placeholder: substitute the real URL before running.
httr::RETRY(
  "GET", 
  <request url>,
  # Attempt the request at most three times.
  times = 3, 
  # Both pauses are given in seconds: 20 * 60 = 20 minutes.
  pause_min = 20*60,
  pause_base = 20*60)

无论如何，这些都是一些想法。即使服务器没有切断连接，通过使用等待，我仍然发现它开始限制请求，这意味着一些请求需要很长时间才能完成。对此进行优化可能会花费大量时间和精力。我花了好几天时间查看块大小并等待。

library(rvest)
library(dplyr)
library(purrr)

#' Scrape the province drop-down of the search page.
#'
#' @param link URL of the page containing the `name="provincia"` select.
#' @return A data frame with one row per retained `<option>`:
#'   `Provincia` (trimmed option label) and `id0` (the option's `value`
#'   attribute, kept as character).
get_provincias <- function(link) {
  # Drop the pre-selected placeholder option and any option whose label
  # contains "-" or the U+0085 character.
  # NOTE(review): `:contains("-")` also removes every province whose name
  # contains a hyphen -- confirm that this is intended.
  nodes <- read_html(link) %>%
    html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')

  # stringsAsFactors = FALSE keeps the ids as character on every R version,
  # so later joins on `id0` compare like with like.
  data.frame(
    Provincia = html_text(nodes, trim = TRUE),
    id0 = html_attr(nodes, "value"),
    stringsAsFactors = FALSE
  )
}

#' Fetch the comune options that belong to one province.
#'
#' @param id A single province id (the `id0` value returned by
#'   `get_provincias()`).
#' @return A data frame with one row per comune option: `id0` (the province
#'   id passed in), `Comune` (trimmed option label) and `id3` (the option's
#'   `value` attribute). Zero rows when the response has no usable options.
get_comunes <- function(id) {
  # The trailing `_` parameter carries today's date as epoch seconds;
  # presumably a cache-buster mirroring the site's own XHR -- TODO confirm.
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_comuni.php?id0=%s&id1=0&_=%i",
    id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  # Ignore the option with value "0".
  nodes <- read_html(link) %>% html_nodes('option:not([value="0"])')

  # Repeat the province id once per node: a bare scalar would make
  # data.frame() fail with "differing number of rows" when no option is found.
  data.frame(
    id0 = rep(id, length(nodes)),
    Comune = html_text(nodes, trim = TRUE),
    id3 = html_attr(nodes, "value"),
    stringsAsFactors = FALSE
  )
}

#' Scrape the listing entries for one province/comune pair.
#'
#' @param prov_id Province id, sent as the `id2` query parameter.
#' @param com_id Comune id, sent as the `id3` query parameter.
#' @return A data frame with one row per listing: `cognome` (text of the
#'   `.listing_name` nodes), `livello` (text of the `.listing_level` nodes),
#'   `id3` (comune id) and `id0` (province id). Zero rows when the page
#'   contains no listings.
get_data <- function(prov_id, com_id) {
  # The trailing `_` parameter carries today's date as epoch seconds;
  # presumably a cache-buster mirroring the site's own XHR -- TODO confirm.
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_soldati.php?id0=1&id1=&id2=%s&id3=%s&_=%i",
    prov_id,
    com_id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  page <- read_html(link)

  cognome <- page %>% html_nodes(".listing_name") %>% html_text(trim = TRUE)
  livello <- page %>% html_nodes(".listing_level") %>% html_text(trim = TRUE)

  # Repeat the ids once per listing: bare scalars would make data.frame()
  # fail with "differing number of rows" for a comune without any listing,
  # aborting the whole batch.
  df <- data.frame(
    cognome = cognome,
    livello = livello,
    id3 = rep(com_id, length(cognome)), # for later join back on comune
    id0 = rep(prov_id, length(cognome)),
    stringsAsFactors = FALSE
  )
  Sys.sleep(.25) # pause for a quarter of a second between requests
  df
}

#' Split a data frame into consecutive chunks of at most `chunk_size` rows.
#'
#' @param df A data frame.
#' @param chunk_size Maximum number of rows per chunk; a single number >= 1.
#' @return A list of data frames named "1", "2", ... in row order. Every
#'   chunk has `chunk_size` rows except possibly the last one. A zero-row
#'   `df` yields an empty list.
get_chunks <- function(df, chunk_size) { # adapted from @BenBolker
  stopifnot(is.numeric(chunk_size), length(chunk_size) == 1L, chunk_size >= 1)

  # seq_len() is empty for a zero-row input, whereas 1:n would count down
  # (1, 0) and produce a bogus chunk. as.integer() keeps the chunk names in
  # plain digits even for very large chunk counts.
  chunk_id <- as.integer(ceiling(seq_len(nrow(df)) / chunk_size))
  split(df, chunk_id)
}

#' Scrape one chunk of province/comune pairs and append it to a CSV file.
#'
#' @param df A data frame with columns `id0` (province id) and `id3`
#'   (comune id); `get_data()` is called once per row.
#' @param filename Path of the CSV file. It is created with a header row if
#'   it does not exist yet; otherwise rows are appended without a header.
#' @return Called for its side effects (file write, then a 10-minute pause).
write_rows <- function(df, filename) {
  # Decide header/append mode before scraping: only the first chunk written
  # to a fresh file gets the column names.
  flag <- file.exists(filename)
  df2 <- purrr::map2_dfr(df$id0, df$id3, .f = get_data)

  # Quote fields (doubling embedded quotes, as read.csv() expects) so that a
  # scraped value containing a comma cannot shift the columns of its row.
  write.table(df2,
    file = filename, sep = ",",
    append = flag,
    quote = TRUE, qmethod = "double",
    col.names = !flag,
    row.names = FALSE
  )
  # Pause 10 minutes after each chunk before the next batch of requests.
  Sys.sleep(60 * 10)
}

# Scrape the province list, then pause 5 minutes before the next requests.
provincias <- get_provincias("https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati")

Sys.sleep(60 * 5)

# One request per province; drop the "-" placeholder comuni.
comunes <- map_df(provincias$id0, get_comunes) %>% filter(Comune != "-")

Sys.sleep(60 * 10)

combined <- dplyr::right_join(provincias, comunes, by = "id0")

Sys.sleep(60 * 10)

# Batches of 2000 province/comune pairs; write_rows() pauses after each one.
chunked <- get_chunks(combined, 2000)

filename <- "prov_com_cog_liv.csv"

# walk() rather than map(): write_rows() is called only for its side effects.
walk(chunked, ~ write_rows(.x, filename))

## #### test case #####################
# Run a single batch by hand (here the sixth), e.g. to resume after the
# server has cut the connection.

# df <- chunked[[6]]
#
#   flag <- file.exists(filename)
#
#   df2 <- map2_dfr(df$id0, df$id3, .f = get_data)
#
#   write.table(df2,
#     file = filename, sep = ",",
#     append = flag,
#     quote = FALSE, col.names = !flag,
#     row.names = FALSE
#   )
####################################

# Read the ids back as character: read.csv() would otherwise convert them to
# integer, and the join against the character `id3` in `combined` would fail
# on incompatible types.
results <- read.csv(
  filename,
  colClasses = c(id0 = "character", id3 = "character")
)

final <- dplyr::right_join(combined, results, by = "id3")

选择选项标签后抓取页面内容

Scrape page content after option tag is selected

html

r

web-scraping

rvest