需要帮助在 URL 列表上循环这些函数并将聚合数据合并到 table

Question

有一个旧的 HTML 网站提供植物病毒的综合研究数据。我有兴趣提取单个病毒物种的易感宿主物种和不敏感宿主物种数据。

The parent URL containing all individual, paginated URLs for individual species is here.

URL分页从 001 到 911。An example of one of these URLs, in this case for Abelia latent tymovirus, is found here.

在这个论坛上有人的热心帮助下，我成功地生成了从单个 URL 中仅提取 susceptible/insusceptible 数据的代码。代码：

library(rvest)
library(stringr)

#### Preparing for the loop ####

## URL template
url <- 'http://bio-mirror.im.ac.cn/mirrors/pvo/vide/'

## List of all URLs to visit: descr001.htm, descr002.htm, ..., descr911.htm.
## "%03d" left-pads the page number with zeros to three digits, so the 99
## hand-typed strings "001".."099" are no longer needed; sprintf() is
## vectorised over the page numbers and recycles the single `url` value.
list_of_pages <- sprintf("%sdescr%03d.htm", url, seq_len(911L))

#### Functions to be executed for each URL

## Download and parse the page of a single species (the URL is hard-coded
## here and has to be replaced by the loop variable later)
pvo <- read_html("http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr001.htm")

## Species identifier: text of the <h1> matched by "center+ h1", with every
## newline replaced by a space
virus_species <- pvo %>%
  html_element("center+ h1") %>%
  html_text() %>%
  str_replace_all(pattern = "[\n]", replacement = " ")

## Text of all <h4> headings and <li> list items in document order, with
## newlines stripped
all_values <- pvo %>%
  html_elements("h4, li") %>%
  html_text() %>%
  str_replace_all(pattern = "[\n]", replacement = "")

## Positions of the three headings that delimit the susceptibility sections
## (literal, case-sensitive matching because fixed = TRUE)
sus_index <- grep(
  pattern = 'Susceptible host species',
  x = all_values,
  fixed = TRUE
)
insus_index <- grep(
  pattern = 'Insusceptible host species',
  x = all_values,
  fixed = TRUE
)
family_sus_index <- grep(
  pattern = 'Families containing susceptible hosts',
  x = all_values,
  fixed = TRUE
)

## Susceptible species: everything between the "Susceptible" heading and the
## "Insusceptible" heading
susceptible_species <- all_values[(sus_index + 1):(insus_index - 1)]

## Insusceptible species: everything between the "Insusceptible" heading and
## the "Families containing susceptible hosts" heading
insusceptible_species <- all_values[(insus_index + 1):(family_sus_index - 1)]

现在我想在我的 list_of_pages 上应用这些函数并将数据聚合到一个数据帧中，理想情况下看起来像：

有人可以帮我完成这个吗？我对循环的经验很少，主要使用 dplyr。

编辑：我忘了补充一件非常重要的事情：许多 URL 不包含 susceptibility/insusceptibility 数据，因为根本没有进行研究。循环必须能够在这些情况下继续而不是终止。

这些URL的例子：

包含易感（susceptible）数据但不包含不易感（insusceptible）数据： http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr565.htm

既不包含易感（susceptible）数据也不包含不易感（insusceptible）数据： http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr562.htm

Answer 1

试试这个：

library(rvest)
library(stringr)

#### Preparing for the loop ####

## URL template
url <- 'http://bio-mirror.im.ac.cn/mirrors/pvo/vide/'

## List of all URLs to visit: descr001.htm, descr002.htm, ..., descr911.htm.
## "%03d" left-pads the page number with zeros to three digits, so the 99
## hand-typed strings "001".."099" are no longer needed; sprintf() is
## vectorised over the page numbers and recycles the single `url` value.
list_of_pages <- sprintf("%sdescr%03d.htm", url, seq_len(911L))
#list_of_pages <- list_of_pages[641:n_distinct(list_of_pages)]
#### Functions to be executed for each URL

## One data frame per page is stored in a preallocated list and everything is
## bound together once after the loop; calling rbind() inside the loop would
## re-copy the growing result on every iteration.
page_results <- vector("list", length(list_of_pages))

## Return the elements of `values` that lie strictly between two heading
## positions. Returns NA when either heading was not found or when nothing
## lies between the two headings.
values_between <- function(values, start_index, end_index) {
  if (length(start_index) == 0L || length(end_index) == 0L) {
    return(NA)
  }
  ## Only the first match of each heading is used.
  first <- start_index[1] + 1L
  last <- end_index[1] - 1L
  ## Without this guard `first:last` would count backwards when the two
  ## headings are adjacent and return the headings themselves.
  if (first > last) {
    return(NA)
  }
  values[first:last]
}

for (i in seq_along(list_of_pages)) {
  pages <- list_of_pages[[i]]
  print(pages)

  ## Read the page; on failure keep the condition object instead of stopping
  ## the whole loop.
  pvo <- tryCatch(read_html(pages), error = function(e) e)

  ## If reading failed, record the error message in the virus_species column
  ## (e.g. "Failed to parse text" for pages that cannot be parsed) and move
  ## on to the next page.
  if (inherits(pvo, "error")) {
    page_results[[i]] <- data.frame(
      pages = pages,
      virus_species = conditionMessage(pvo),
      susceptible_species = NA,
      insusceptible_species = NA
    )
    next
  }

  ## Extract URL-specific virus species identifier and cleanup
  virus_species <- pvo %>%
    html_element("center+ h1") %>%
    html_text() %>%
    str_replace_all("[\n]", " ")

  ## Select only data in node h4, include list items, and cleanup
  all_values <- pvo %>%
    html_elements("h4, li") %>%
    html_text() %>%
    str_replace_all("[\n]", "")

  ## Index headings in h4 to isolate susceptibility data
  sus_index <-
    grep('Susceptible host species', all_values, fixed = TRUE)
  insus_index <-
    grep('Insusceptible host species', all_values, fixed = TRUE)
  family_sus_index <-
    grep('Families containing susceptible hosts', all_values, fixed = TRUE)

  ## Extract susceptible and insusceptible species data (NA when the section
  ## is missing or empty)
  susceptible_species <-
    values_between(all_values, sus_index, insus_index)
  insusceptible_species <-
    values_between(all_values, insus_index, family_sus_index)

  ## The two species vectors usually differ in length; pad the shorter ones
  ## with NA so that all columns have as many rows as the longest vector.
  ll <- lengths(list(
    virus_species,
    susceptible_species,
    insusceptible_species
  ))
  mll <- max(ll)

  page_results[[i]] <- data.frame(
    pages = rep(pages, mll),
    virus_species = rep(virus_species, mll),
    susceptible_species = c(susceptible_species, rep(NA, mll - ll[2])),
    insusceptible_species = c(insusceptible_species, rep(NA, mll - ll[3]))
  )

  ## uncomment below line if you need the delay
  ##Sys.sleep(1)
}

## Combine the per-page results; fall back to an empty data frame when there
## were no pages to visit.
df <- if (length(page_results) > 0L) {
  do.call(rbind, page_results)
} else {
  data.frame()
}

df

For better error handling in r see here

需要帮助在 URL 列表上循环这些函数并将聚合数据合并到 table

Need help looping these functions over a list of URLs and combining the aggregate data in a table

r

web-scraping

rvest