R program is not outputting the correct scraped journal entries
library(rvest)
library(RCurl)
library(XML)
library(stringr)
# Get the number of result pages
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  pageNumber <- parsedDocument %>%
    html_node(".al-currentPage + a:last-child") %>%
    html_text() %>%
    as.integer()
  return(pageNumber)
}
# Get all articles on a results page via their DOIs
getAllArticles <- function(URL) {
  parsedDocument <- read_html(URL)
  findLocationDiv <- html_nodes(parsedDocument, 'div')
  foundClass <- findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(foundClass)))
  DOImain <- "https://doi.org/10.1093/dnares/"
  fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
  return(fullDOI)
}
CorrespondingAuthors <- function(parsedDocument) {
  CorrespondingAuthors <- parsedDocument %>%
    html_node("a.linked-name js-linked-name-trigger") %>%
    html_text()
  return(CorrespondingAuthors)
}
CoAuthorEmail <- function(parsedDocument) {
  CoAuthorEmail <- parsedDocument %>%
    html_node(".icon-general-mail") %>%
    html_text()
  return(CoAuthorEmail)
}
FullText <- function(parsedDocument) {
  FullText <- parsedDocument %>%
    html_node('.PdfOnlyLink .article-pdfLink') %>%
    html_attr('href')
  return(FullText)
}
# Main function; takes the publication year as its parameter
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURL <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURL, sep = "")
    # URL is working with parameter year_chosen
    firstPage <- getPageNumber(URL)
    if (firstPage == 5) {
      nextPage <- 0
      while (firstPage < nextPage | firstPage != nextPage) {
        firstPage <- nextPage
        URLwithPageNum <- paste(URL, firstPage - 1, sep = "")
        nextPage <- getPageNumber(URLwithPageNum)
      }
    }
    DNAresearch <- data.frame()
    for (i in 1:firstPage) {
      URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
      for (j in 1:length(URLallArticles)) {
        parsedDocument <- read_html(URLallArticles[j])
        # Full set of columns once all the scrapers work:
        # "Title" = Title(parsedDocument), "Authors" = Authors(parsedDocument), "Author Affiliations" = AuthorAffil(parsedDocument), "Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Publication Date" = PublicationDate(parsedDocument), "Keywords" = Keywords(parsedDocument), "Abstract" = Abstract(parsedDocument), "Full Text" = FullText(parsedDocument)
        allData <- data.frame("Corresponding Authors" = CorrespondingAuthors(parsedDocument),
                              "CoAuthor Email" = CoAuthorEmail(parsedDocument),
                              "Full Text" = FullText(parsedDocument),
                              stringsAsFactors = FALSE)
        # Failed attempt to convert NA values to "NO":
        # for (i in 1:allData == "NA") {
        #   i == "NO"
        # }
        DNAresearch <- rbind(DNAresearch, allData)
      }
    }
    write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 1994 to present")
  }
}
##################### Main function test
findURL(1994)
In the program above I am scraping a journal from a website, and the output goes into a file named DNAresearch.csv. I have three things that need to be fixed.
In CorrespondingAuthors I keep getting only the first author of the article; I actually need all of the authors, not just the first one.
In CoAuthorEmail I cannot find the authors' email, so in the csv file it returns NA. Outputting NA may be correct, since I believe no email is cited on the page, but I would like the CSV file to return NO instead of NA.
In FullText I am trying to get the full text of the article, which has to be scraped through the pdf link. My csv currently returns NA.
Everything else is correct, but I have the three problems above. Thanks in advance for your help!
This is an incomplete answer, but it is easier than putting all of this into comments:
To return multiple nodes instead of just the first one, you need to use html_nodes (with an s). This will return all matching nodes; the downside is that if the node is missing, the function returns a zero-length vector, but as long as you are sure there are authors that should not be a problem.
CorrespondingAuthors <- function(parsedDocument) {
  CorrespondingAuthors <- parsedDocument %>%
    html_nodes("a.linked-name js-linked-name-trigger") %>%
    html_text()
  # probably need to add: CorrespondingAuthors <- paste(CorrespondingAuthors, collapse = ", ")
  return(CorrespondingAuthors)
}
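To cope with the zero-length case mentioned above, you could fall back to NA when nothing matches. A minimal sketch (the helper name safeText is my own, not part of the original code):
# Hypothetical helper: returns NA instead of a zero-length vector
# when the selector matches nothing on the page.
safeText <- function(parsedDocument, selector) {
  result <- parsedDocument %>%
    html_nodes(selector) %>%
    html_text()
  if (length(result) == 0) {
    return(NA_character_)
  }
  return(result)
}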
There is a difference between "NA" and NA. The first is just a character string containing N and A. To check for the not-available NA, it is best to use the is.na() function.
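A quick illustration of the difference:
x <- c("NA", NA)    # a string "NA" and a real missing value
x == "NA"           # TRUE NA   -- string comparison cannot detect the real NA
is.na(x)            # FALSE TRUE -- is.na() finds the missing value
x[is.na(x)] <- "NO" # replace missing values with "NO", as the question asks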
There are multiple ways to download PDF files and extract their content. That would be best asked as a new question strictly focused on that problem; it is more likely to get answered and will be a more useful resource in the future.
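That said, one common approach is to download the file and then extract the text with the pdftools package. A rough sketch, assuming pdftools is installed and that the href scraped by FullText is a direct PDF link (this is my suggestion, not part of the original answer):
library(pdftools)  # assumed to be installed; one of several options

pdfURL <- FullText(parsedDocument)        # the href scraped earlier
tmp <- tempfile(fileext = ".pdf")
download.file(pdfURL, tmp, mode = "wb")   # "wb" so the binary file is not corrupted
pages <- pdf_text(tmp)                    # one character string per page
fullText <- paste(pages, collapse = "\n")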
Update
Based on the link provided in the comments, here are working versions of CorrespondingAuthors and CoAuthorEmail:
url <- "https://academic.oup.com/dnaresearch/article/25/6/655/5123538?searchresult=1"
page <- read_html(url)
CorrespondingAuthors <- function(parsedDocument) {
  CorrespondingAuthors <- parsedDocument %>%
    html_nodes("a.linked-name") %>%
    html_text()
  # Comma-separated string of names
  CorrespondingAuthors <- paste(CorrespondingAuthors, collapse = ", ")
  # Comment out the above line for a vector of names
  return(CorrespondingAuthors)
}
CoAuthorEmail <- function(parsedDocument) {
  CoAuthorEmail <- parsedDocument %>%
    html_node("div.info-author-correspondence a") %>%
    html_text()
  CoAuthorEmail <- ifelse(is.na(CoAuthorEmail), "No", CoAuthorEmail)
  return(CoAuthorEmail)
}
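With the page parsed above, the functions can be called directly; the exact strings depend on the article page:
CorrespondingAuthors(page)
# a comma-separated string of all linked author names
CoAuthorEmail(page)
# the correspondence address, or "No" when none is listed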