如何正确使用 tryCatch() 并忽略此 rvest 函数中的 404 错误?
How do I correctly use tryCatch() and ignore 404 errors in this rvest function?
为了抓取歌词,我写了这个函数:
# Scrape an artist's song list and lyrics from azlyrics.com.
#
# @param x Artist slug as used in azlyrics URLs (lowercase, no punctuation).
# @return A data.frame with columns `lyrics2` and `albums2` (album name with
#   punctuation stripped). Songs whose lyrics page fails to load (e.g. a
#   404) yield NA in both columns instead of aborting the whole run.
songscrape <- function(x) {
  url <- paste0("https://www.azlyrics.com/", substring(x, 1, 1), "/", x, ".html")
  artist <- x

  # Pull the list of song titles from the artist's index page.
  SongsListScrapper <- function(x) {
    page <- x
    songs <- page %>%
      read_html() %>%
      html_nodes(xpath = "/html/body/div[2]/div/div[2]/div[4]/div/a") %>%
      html_text() %>%
      as.data.frame()
    chart <- cbind(songs)
    names(chart) <- c("Songs")
    # as.tibble() is deprecated; as_tibble() is the supported spelling.
    chart <- as_tibble(chart)
    return(chart)
  }

  SongsList <- map_df(url, SongsListScrapper)

  # Normalize titles into URL slugs: drop punctuation and spaces, lowercase.
  SongsList %<>%
    mutate(
      Songs = as.character(Songs),
      Songs = gsub("[[:punct:]]", "", Songs),
      Songs = tolower(Songs),
      Songs = gsub(" ", "", Songs)
    )

  # Strip HTML tags from a raw HTML string.
  wipe_html <- function(str_html) {
    gsub("<.*?>", "", str_html)
  }

  # Scrape Lyrics
  n_songs <- nrow(SongsList)
  # Preallocate result vectors instead of growing them inside the loop.
  lyrics2 <- rep(NA_character_, n_songs)
  albums2 <- rep(NA_character_, n_songs)

  for (i in seq_len(n_songs)) {
    for_url_name <- SongsList$Songs[i]
    # clean name — NOTE: "\s" is an invalid escape in an R string; it must
    # be written "\\s", and we want punctuation OR whitespace removed.
    for_url_name <- tolower(gsub("[[:punct:]]|\\s", "", for_url_name))
    # create url
    paste_url <- paste0("https://www.azlyrics.com/lyrics/", artist, "/",
                        for_url_name, ".html")

    # The error handler must be passed as the `error` argument of
    # tryCatch(), NOT written inside the expression braces; otherwise a
    # 404 from read_html() still aborts the loop.
    for_html_code <- tryCatch(
      read_html(paste_url),
      error = function(e) NULL
    )

    if (is.null(for_html_code)) {
      # Page failed to load (e.g. HTTP 404): leave NA and keep going.
      show(paste0(for_url_name, " Scrape FAILED - skipped ",
                  "[", i, "/", n_songs, "]"))
      next
    }

    for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
    for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")

    lyrics2[i] <- wipe_html(for_lyrics)
    albums2[i] <- wipe_html(for_albums)
    show(paste0(for_url_name, " Scrape Complete!", "[", i, "/", n_songs, "]"))
    Sys.sleep(10)  # be polite to the server between requests
  }

  songs2 <- cbind(lyrics2, albums2) %>% as.data.frame()
  songs2$albums2 <- gsub("[[:punct:]]", "", songs2$albums2)
  return(songs2)
}
您会注意到我在代码中使用了 tryCatch()
(如下所示),因为我意识到在某些边缘情况下,URL 不会匹配并在中途停止函数:
# NOTE(review): the `error = function(e){NA}` line below sits INSIDE the
# braces of the expression being tried, so it merely assigns a local
# variable named `error`. tryCatch() receives no error handler at all,
# which is why a 404 from read_html() still stops the loop.
tryCatch( {
# open connection to url
for_html_code <-read_html(paste_url)
for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")
error = function(e){NA}
})
但是,我仍然收到此错误并且代码停止运行,而不是忽略错误并继续:
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_len(n)) { :
我做错了什么,如何解决?
正如用户 @27ϕ9 所述,tryCatch()
没有被正确使用。错误处理函数需要作为 error 参数放在表达式块的右花括号之外:
# Correct form: the handler is supplied via the `error` argument,
# outside the braces that delimit the expression being tried.
tryCatch(
  expr = {
    # open connection to url
    for_html_code <- read_html(paste_url)
    for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
    for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")
  },
  error = function(e) NA
)
有关详细信息,请参阅此处的回答。
为了抓取歌词,我写了这个函数:
# Scrape an artist's song list and lyrics from azlyrics.com.
#
# @param x Artist slug as used in azlyrics URLs (lowercase, no punctuation).
# @return A data.frame with columns `lyrics2` and `albums2` (album name with
#   punctuation stripped). Songs whose lyrics page fails to load (e.g. a
#   404) yield NA in both columns instead of aborting the whole run.
songscrape <- function(x) {
  url <- paste0("https://www.azlyrics.com/", substring(x, 1, 1), "/", x, ".html")
  artist <- x

  # Pull the list of song titles from the artist's index page.
  SongsListScrapper <- function(x) {
    page <- x
    songs <- page %>%
      read_html() %>%
      html_nodes(xpath = "/html/body/div[2]/div/div[2]/div[4]/div/a") %>%
      html_text() %>%
      as.data.frame()
    chart <- cbind(songs)
    names(chart) <- c("Songs")
    # as.tibble() is deprecated; as_tibble() is the supported spelling.
    chart <- as_tibble(chart)
    return(chart)
  }

  SongsList <- map_df(url, SongsListScrapper)

  # Normalize titles into URL slugs: drop punctuation and spaces, lowercase.
  SongsList %<>%
    mutate(
      Songs = as.character(Songs),
      Songs = gsub("[[:punct:]]", "", Songs),
      Songs = tolower(Songs),
      Songs = gsub(" ", "", Songs)
    )

  # Strip HTML tags from a raw HTML string.
  wipe_html <- function(str_html) {
    gsub("<.*?>", "", str_html)
  }

  # Scrape Lyrics
  n_songs <- nrow(SongsList)
  # Preallocate result vectors instead of growing them inside the loop.
  lyrics2 <- rep(NA_character_, n_songs)
  albums2 <- rep(NA_character_, n_songs)

  for (i in seq_len(n_songs)) {
    for_url_name <- SongsList$Songs[i]
    # clean name — NOTE: "\s" is an invalid escape in an R string; it must
    # be written "\\s", and we want punctuation OR whitespace removed.
    for_url_name <- tolower(gsub("[[:punct:]]|\\s", "", for_url_name))
    # create url
    paste_url <- paste0("https://www.azlyrics.com/lyrics/", artist, "/",
                        for_url_name, ".html")

    # The error handler must be passed as the `error` argument of
    # tryCatch(), NOT written inside the expression braces; otherwise a
    # 404 from read_html() still aborts the loop.
    for_html_code <- tryCatch(
      read_html(paste_url),
      error = function(e) NULL
    )

    if (is.null(for_html_code)) {
      # Page failed to load (e.g. HTTP 404): leave NA and keep going.
      show(paste0(for_url_name, " Scrape FAILED - skipped ",
                  "[", i, "/", n_songs, "]"))
      next
    }

    for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
    for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")

    lyrics2[i] <- wipe_html(for_lyrics)
    albums2[i] <- wipe_html(for_albums)
    show(paste0(for_url_name, " Scrape Complete!", "[", i, "/", n_songs, "]"))
    Sys.sleep(10)  # be polite to the server between requests
  }

  songs2 <- cbind(lyrics2, albums2) %>% as.data.frame()
  songs2$albums2 <- gsub("[[:punct:]]", "", songs2$albums2)
  return(songs2)
}
您会注意到我在代码中使用了 tryCatch()
(如下所示),因为我意识到在某些边缘情况下,URL 不会匹配并在中途停止函数:
# NOTE(review): the `error = function(e){NA}` line below sits INSIDE the
# braces of the expression being tried, so it merely assigns a local
# variable named `error`. tryCatch() receives no error handler at all,
# which is why a 404 from read_html() still stops the loop.
tryCatch( {
# open connection to url
for_html_code <-read_html(paste_url)
for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")
error = function(e){NA}
})
但是,我仍然收到此错误并且代码停止运行,而不是忽略错误并继续:
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_len(n)) { :
我做错了什么,如何解决?
正如用户 @27ϕ9 所述,tryCatch()
没有被正确使用。错误处理函数需要作为 error 参数放在表达式块的右花括号之外:
# Correct form: the handler is supplied via the `error` argument,
# outside the braces that delimit the expression being tried.
tryCatch(
  expr = {
    # open connection to url
    for_html_code <- read_html(paste_url)
    for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
    for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")
  },
  error = function(e) NA
)
有关详细信息,请参阅此处的回答。