我怎么错了 rvest 包?
How am I wrong with rvest package?
我在使用 r 中的 rvest 包进行抓取时遇到了问题。
我试图从网站上收集信息并创建一个数据框,其中包含循环内指定的向量。
除非我使用循环函数,否则我会得到正确的数据。
谁能告诉我以下代码有什么问题吗?
我的预感是我未能组合向量...
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(stringi)
#This is the URL from which I would like to get information.
source_url <- "https://go2senkyo.com/local/senkyo/"
senkyo <- data.frame()
*start for loop
for (i in 50:60) {
target_page <- paste0(source_url, i)
recall_html <- read_html(source_url, encoding = "UTF-8")
prefecture <- recall_html %>%
html_nodes(xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "column_ttl_small", " " ))]') %>%
html_text()
city <- recall_html %>%
html_nodes(xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "column_ttl", " " ))]') %>%
html_text()
city <- trimws(gsub("[\r\n]", "", city )) %>% unlist() %>% str
candidate <- recall_html %>%
html_nodes(xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "m_senkyo_result_table", " " ))]') %>%
html_text()
candidate <- trimws(gsub("[\r\n\t]", "", candidate ))
all <- recall_html %>%
html_nodes(xpath='//td') %>%
html_text()
all <- trimws(gsub("[\r\n\t]", "", all))
election_day <- all[1]
turnout <- all[2]
magnitude_candidates <- all[3]
notificationday <- all[4]
turnout_lasttime <- all[5]
others <- all[6]
senkyo2 <- cbind(prefecture, city, candidate, election_day, turnout, magnitude_candidates, notificationday,
turnout_lasttime, others)
senkyo <- rbind(senkyo , senkyo2)
}
这似乎是你的错误:
recall_html <- read_html(source_url, encoding = "UTF-8")
应该使用target_page
而不是source_url
recall_html <- read_html(target_page, encoding = "UTF-8")
我在使用 r 中的 rvest 包进行抓取时遇到了问题。 我试图从网站上收集信息并创建一个数据框,其中包含循环内指定的向量。
除非我使用循环函数,否则我会得到正确的数据。 谁能告诉我以下代码有什么问题吗?
我的预感是我未能组合向量...
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(stringi)
#This is the URL from which I would like to get information.
source_url <- "https://go2senkyo.com/local/senkyo/"
senkyo <- data.frame()
*start for loop
for (i in 50:60) {
target_page <- paste0(source_url, i)
recall_html <- read_html(source_url, encoding = "UTF-8")
prefecture <- recall_html %>%
html_nodes(xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "column_ttl_small", " " ))]') %>%
html_text()
city <- recall_html %>%
html_nodes(xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "column_ttl", " " ))]') %>%
html_text()
city <- trimws(gsub("[\r\n]", "", city )) %>% unlist() %>% str
candidate <- recall_html %>%
html_nodes(xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "m_senkyo_result_table", " " ))]') %>%
html_text()
candidate <- trimws(gsub("[\r\n\t]", "", candidate ))
all <- recall_html %>%
html_nodes(xpath='//td') %>%
html_text()
all <- trimws(gsub("[\r\n\t]", "", all))
election_day <- all[1]
turnout <- all[2]
magnitude_candidates <- all[3]
notificationday <- all[4]
turnout_lasttime <- all[5]
others <- all[6]
senkyo2 <- cbind(prefecture, city, candidate, election_day, turnout, magnitude_candidates, notificationday,
turnout_lasttime, others)
senkyo <- rbind(senkyo , senkyo2)
}
这似乎是你的错误:
recall_html <- read_html(source_url, encoding = "UTF-8")
应该使用target_page
而不是source_url
recall_html <- read_html(target_page, encoding = "UTF-8")