R 程序未输出正确的刮擦日记帐分录

R program is not outputting the correct scraped journal entries

library(rvest)
library(RCurl)
library(XML)
library(stringr)


#Getting the number of Page
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  pageNumber <- parsedDocument %>%
    html_node(".al-currentPage + a:last-child") %>%
    html_text() %>%
    as.integer()
  return(pageNumber)
}


#Getting all articles based off of their DOI
getAllArticles <-function(URL){
  parsedDocument = read_html(URL)
  findLocationDiv <- html_nodes(parsedDocument,'div')
  foundClass <-  findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
  ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(foundClass)))
  DOImain <- "https://doi.org/10.1093/dnares/"
  fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
  return(fullDOI)
}

CorrespondingAuthors <- function(parsedDocument){
  CorrespondingAuthors <- parsedDocument %>%
    html_node("a.linked-name js-linked-name-trigger") %>%
    html_text() %>%
    return(CorrespondingAuthors)
}

CoAuthorEmail <- function(parsedDocument){
  CoAuthorEmail <- parsedDocument %>%
    html_node(".icon-general-mail") %>%
    html_text() %>%
    return(CoAuthorEmail)
}
FullText <- function(parsedDocument){
  FullText <- parsedDocument %>%
    html_node('.PdfOnlyLink .article-pdfLink') %>% html_attr('href')
    return(FullText)
}

#main function with input as parameter year
findURL <- function(year_chosen){
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    firstPage <- getPageNumber(URL)
    
    if (firstPage == 5) {
      nextPage <- 0
      while (firstPage < nextPage | firstPage != nextPage) {
        firstPage <- nextPage
        URLwithPageNum <- paste(URL, firstPage-1, sep = "")
        nextPage <- getPageNumber(URLwithPageNum)
      }
    }
  DNAresearch <- data.frame()
    for (i in 1:firstPage) {
      URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
      for (j in 1:(length(URLallArticles))) {
        parsedDocument <- read_html(URLallArticles[j])
        #"Title" = Title(parsedDocument),"Authors" = Authors(parsedDocument),"Author Affiliations" = AuthorAffil(parsedDocument),"Corresponding  Authors" CorrespondingAuthors=(parsedDocument),"CoAuthor Email" = CoAuthorEmail(parsedDocument),"Publication Date" = PublicationDate(parsedDocument),"Keywords" = Keywords(parsedDocument),"Abstract" = Abstract(parsedDocument), "Full Text" = FullText(parsedDocument)
        allData <- data.frame("Corresponding Authors" = (parsedDocument),"CoAuthor Email" = CoAuthorEmail(parsedDocument),"Full Text" = FullText(parsedDocument),stringsAsFactors = FALSE)
        #for(i in 1:allData == "NA"){
          #i == "NO"
        #}
        DNAresearch <- rbind(DNAresearch, allData)
      }
    }
    write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
  } else {
    print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
  }
}

##################### Main function test
findURL(1994)

在上面的程序中,我正在从网站上抓取期刊。然后输出在一个名为 DNAresearch.csv 的文件中。我有三件事需要解决。

  1. 在CorrespondingAuthors中我不断得到期刊的第一作者。我实际上需要除第一作者以外的所有作者。

  2. 在 CoAuthorEmail 中我找不到作者的电子邮件,所以在 csv 文件中它 returns NA。它应该输出 NA ,因为我相信没有引用电子邮件,但是我希望 CSV 文件为 return NO 而不是 NA.

  3. 在 FullText 中,我正在尝试获取期刊的全文。必须通过 pdf link 抓取全文。我的 csv 当前 returns NA .

一切都正确,但我有上面的三个问题。预先感谢您的帮助!

这是一个不完整的答案,只是比将所有这些都放入评论中更容易:

  1. 为了return多个节点而不是第一个节点。您需要使用带有 s 的“html_nodes”。这将 return 所有节点,但缺点是如果节点缺少函数 returns 零长度向量。所以如果你确定有作者,那应该是有问题的

    CorrespondingAuthors <- function(parsedDocument){
      CorrespondingAuthors <- parsedDocument %>%
      html_nodes("a.linked-name js-linked-name-trigger") %>%
      html_text() 
      #probably need to add: CorrespondingAuthors  <- paste(CorrespondingAuthor, collapse =", ")
     return(CorrespondingAuthors)
    }
    
  2. “NA”和“NA”是有区别的。第一个只是N和A的字符串。要检查不可用的NA,最好使用is.na()函数。

  3. 有多种方法可以下载 PDF 文件并提取内容。最好回答一个严格关注该问题的新问题。更有可能得到解答,成为未来更有用的资源。

更新
基于此处评论中提供的 link 是有效的 CorrespondingAuthors 和 AuthorEmail

url <- "https://academic.oup.com/dnaresearch/article/25/6/655/5123538?searchresult=1"
page <- read_html(url)

    CorrespondingAuthors <- function(parsedDocument){
       CorrespondingAuthors <- parsedDocument %>%
          html_nodes("a.linked-name") %>%
          html_text() 
       #Comma separate string of names
       CorrespondingAuthors  <- paste(CorrespondingAuthors, collapse =", ")
       # Comment the above line for a vector names
       return(CorrespondingAuthors)
    }
    
    
   CoAuthorEmail <- function(parsedDocument){
      CoAuthorEmail <- parsedDocument %>%
           html_node("div.info-author-correspondence a") %>%
           html_text() 
      CoAuthorEmail <- ifelse(is.na(CoAuthorEmail), "No", CoAuthorEmail)
      return(CoAuthorEmail)
   }