错误消息:.htm "value" 在当前工作目录中不存在,但它是我的网络抓取数据框的一部分
Error message: .htm "value" does not exist in current working directory, yet is it part of my webscraping dataframe
我知道这可能是一个简单的解决方案的幼稚问题。
基本的网络抓取练习,所有内容都在我的 df 中并且结构化。从下面的代码可以看出
library(rvest)
library(dplyr)
library(tidyr)
#load webpage
pres.library <- read_html(x = "http://stateoftheunion.onetwothree.net/texts/index.html")
#get URL of links
links <- pres.library %>%
html_nodes("#text li a") %>%
html_attr("href")
#get link text
text <- pres.library %>%
html_nodes("#text li a") %>%
html_text()
#combine into df, add structure
sotu <- data.frame (text = text, links = links, stringsAsFactors = F) %>%
separate(text, c("President", "Date", "Year"), ",")
#check for unneccesary whitespaces
sotu$President <- trimws(sotu$President)
sotu$links <- trimws(sotu$links)
sotu$Date <- trimws(sotu$Date)
sotu$Year <- trimws(sotu$Year)
#remove data from pre-modern era
sotu.modern <- sotu[-c(1:156),]
#separate democrats from republicans
democrats <- c("Harry S. Truman", "John F. Kennedy", "Lyndon B. Johnson", "Jimmy Carter", "William J. Clinton", "Barack Obama")
sotu.modern$Party <- ifelse(sotu.modern$President %in% democrats, "Democrat", "Republican")
到这里为止一切正常。此处的代码用于将每个 link 中的文本提取到目录
中
for (i in seq(nrow(sotu.modern))) {
sotu.text <- read_html(sotu.modern$links[i]) %>%
html_nodes("#text li a") %>%
html_text()
filename <- paste0("SOTU", President,"-", Date, Year, ".txt")
sink(file = filename) %>%
cat(text) %>%
sink()
}
运行 上面的代码给我这个错误信息:
'19460121.html' does not exist in current working directory ('C:/Users/User/Desktop/R/R Projects/SOTU project')
"19460121.html" 是 sotu.modern$link 列的第一个元素,所以我不明白为什么它说它不在目录中。
您向 read_html
提供了不完整的 URL。
例如,在第一次迭代中它被指向 20200204.html
,而不是完整的 URL http://stateoftheunion.onetwothree.net/texts/20200204.html
我也修正了html_node
和filename
。
for (i in seq(nrow(sotu.modern))) {
sotu.text <- read_html(paste0("http://stateoftheunion.onetwothree.net/texts/",sotu.modern$links[i])) %>%
html_nodes("#text") %>%
html_text()
filename <- paste0("SOTU", sotu.modern$President[i],"-", sotu.modern$Date[i], sotu.modern$Year[i], ".txt")
cat(sotu.text, file=filename)
}
我知道这可能是一个简单的解决方案的幼稚问题。
基本的网络抓取练习,所有内容都在我的 df 中并且结构化。从下面的代码可以看出
library(rvest)
library(dplyr)
library(tidyr)
#load webpage
pres.library <- read_html(x = "http://stateoftheunion.onetwothree.net/texts/index.html")
#get URL of links
links <- pres.library %>%
html_nodes("#text li a") %>%
html_attr("href")
#get link text
text <- pres.library %>%
html_nodes("#text li a") %>%
html_text()
#combine into df, add structure
sotu <- data.frame (text = text, links = links, stringsAsFactors = F) %>%
separate(text, c("President", "Date", "Year"), ",")
#check for unneccesary whitespaces
sotu$President <- trimws(sotu$President)
sotu$links <- trimws(sotu$links)
sotu$Date <- trimws(sotu$Date)
sotu$Year <- trimws(sotu$Year)
#remove data from pre-modern era
sotu.modern <- sotu[-c(1:156),]
#separate democrats from republicans
democrats <- c("Harry S. Truman", "John F. Kennedy", "Lyndon B. Johnson", "Jimmy Carter", "William J. Clinton", "Barack Obama")
sotu.modern$Party <- ifelse(sotu.modern$President %in% democrats, "Democrat", "Republican")
到这里为止一切正常。此处的代码用于将每个 link 中的文本提取到目录
中for (i in seq(nrow(sotu.modern))) {
sotu.text <- read_html(sotu.modern$links[i]) %>%
html_nodes("#text li a") %>%
html_text()
filename <- paste0("SOTU", President,"-", Date, Year, ".txt")
sink(file = filename) %>%
cat(text) %>%
sink()
}
运行 上面的代码给我这个错误信息:
'19460121.html' does not exist in current working directory ('C:/Users/User/Desktop/R/R Projects/SOTU project')
"19460121.html" 是 sotu.modern$link 列的第一个元素,所以我不明白为什么它说它不在目录中。
您向 read_html
提供了不完整的 URL。
例如,在第一次迭代中它被指向 20200204.html
,而不是完整的 URL http://stateoftheunion.onetwothree.net/texts/20200204.html
我也修正了html_node
和filename
。
for (i in seq(nrow(sotu.modern))) {
sotu.text <- read_html(paste0("http://stateoftheunion.onetwothree.net/texts/",sotu.modern$links[i])) %>%
html_nodes("#text") %>%
html_text()
filename <- paste0("SOTU", sotu.modern$President[i],"-", sotu.modern$Date[i], sotu.modern$Year[i], ".txt")
cat(sotu.text, file=filename)
}