R脚本在抓取基本页面时返回一些错误
R Script coming back with a few errors when scraping basic page
这是我的脚本:
library(rvest)
library(dplyr)
# Index page to scrape (the URL suggests it lists decisions by judge).
link = "http://www.mmadecisions.com/decisions-by-judge/"
page = read_html(link)
# Text of every anchor under #page1 (presumably the judge names -- confirm against the page).
name = page %>% html_nodes("#page1 a") %>% html_text()
# href of the same anchors, prefixed with the site root to build a full URL.
# NOTE(review): the href is pasted in unchanged, so any whitespace it carries
# becomes part of the URL that is later handed to read_html().
name_links = page %>% html_nodes("#page1 a") %>%
html_attr("href") %>% paste("http://www.mmadecisions.com/", ., sep="")
# Read one page and extract four sets of text from it by CSS selector.
#
# name_link: location passed straight to read_html().
# Returns:   the parsed page object produced by read_html().
# NOTE(review): date, event, fight and decisions are assigned as locals and then
# dropped -- the function returns judge_page, not the extracted text.
get_decisions = function(name_link) {
judge_page = read_html(name_link)
# Each selector targets elements with class "list"; presumably one table
# column each -- confirm against the page markup.
date = judge_page %>% html_nodes(".list:nth-child(1)") %>% html_text()
event = judge_page %>% html_nodes(".list:nth-child(2) a") %>% html_text()
fight = judge_page %>% html_nodes(".list~ .list+ .list a") %>% html_text()
decisions = judge_page %>% html_nodes(".list:nth-child(4)") %>% html_text()
return(judge_page)
}
# Call get_decisions() once for every URL in name_links.
decision = sapply(name_links, FUN = get_decisions)
# NOTE(review): in the script as shown, date, event and fight are only ever
# assigned inside get_decisions(), so no top-level variables with those names
# exist when this data.frame() call runs.
judges = data.frame(name, date, event, fight, decision, stringsAsFactors = FALSE)
我不断收到的错误如下:
> library(rvest)
> library(dplyr)
>
>
> link = "http://www.mmadecisions.com/decisions-by-judge/"
> page = read_html(link)
>
> name = page %>% html_nodes("#page1 a") %>% html_text()
> name_links = page %>% html_nodes("#page1 a") %>%
+ html_attr("href") %>% paste("http://www.mmadecisions.com/", ., sep="")
>
> get_decisions = function(name_link) {
+ judge_page = read_html(name_link)
+ date = judge_page %>% html_nodes(".list:nth-child(1)") %>% html_text()
+ event = judge_page %>% html_nodes(".list:nth-child(2) a") %>% html_text()
+ fight = judge_page %>% html_nodes(".list~ .list+ .list a") %>% html_text()
+ decisions = judge_page %>% html_nodes(".list:nth-child(4)") %>% html_text()
+ return(judge_page)
+ }
>
> decisions = sapply(name_links, FUN = get_decisions)
Error in open.connection(x, "rb") : HTTP error 400.
Called from: open.connection(x, "rb")
Browse[1]>
> judges = data.frame(name, date, event, fight, score, stringsAsFactors = FALSE)
Error in data.frame(name, date, event, fight, score, stringsAsFactors = FALSE) :
object 'score' not found
我的目标是从父页面导航到多个子页面,抓取四列数据 'judge-decisions',然后打印到列中。我很感激任何人可以提供的任何见解。
不太确定为什么最后会有两个结果变量。如果你想要一个包含这 4 列的最终数据框,可以用 purrr::map_dfr 遍历每个页面,并确保你的函数 return 的是一个 tibble。至于连接错误(HTTP error 400),你需要用 trimws() 去掉网址中多余的空白字符。
library(rvest)
library(dplyr)
library(purrr)
# Index page to scrape; the per-judge pages are linked from it.
link <- "http://www.mmadecisions.com/decisions-by-judge/"
page <- read_html(link)

# Query the anchors once and reuse the node set for both the text and the href.
judge_anchors <- page %>% html_nodes("#page1 a")
name <- judge_anchors %>% html_text()

# Strip whitespace from each href *before* prefixing the site root. Trimming
# the finished URL would only remove trailing whitespace, because the prefix
# shields anything at the start of the href and leaves it embedded mid-URL.
name_links <- judge_anchors %>%
  html_attr("href") %>%
  trimws() %>%
  paste0("http://www.mmadecisions.com/", .)
#' Scrape the decision columns from a single page.
#'
#' @param name_link Location of the page, passed unchanged to `read_html()`.
#' @return A tibble with the character columns `date`, `event`, `fight` and
#'   `decisions`, holding the text of the nodes matched by each selector.
#'   `tibble()` needs the four vectors to have compatible lengths, so a page
#'   where the selectors match different numbers of nodes raises an error.
get_decisions <- function(name_link) {
  judge_page <- read_html(name_link)

  # Local helper: text of every node on this page matching one CSS selector.
  cell_text <- function(css) {
    judge_page %>% html_nodes(css) %>% html_text()
  }

  # The tibble is the function's value; no temporary is needed (the previous
  # temporary was named `t`, which masks base::t inside the function).
  tibble(
    date = cell_text(".list:nth-child(1)"),
    event = cell_text(".list:nth-child(2) a"),
    fight = cell_text(".list ~ .list + .list a"),
    decisions = cell_text(".list:nth-child(4)")
  )
}
# Run get_decisions() on every URL and row-bind the results into one data frame.
df <- map_dfr(.x = name_links, .f = get_decisions)
这是我的脚本:
library(rvest)
library(dplyr)
# Index page to scrape (the URL suggests it lists decisions by judge).
link = "http://www.mmadecisions.com/decisions-by-judge/"
page = read_html(link)
# Text of every anchor under #page1 (presumably the judge names -- confirm against the page).
name = page %>% html_nodes("#page1 a") %>% html_text()
# href of the same anchors, prefixed with the site root to build a full URL.
# NOTE(review): the href is pasted in unchanged, so any whitespace it carries
# becomes part of the URL that is later handed to read_html().
name_links = page %>% html_nodes("#page1 a") %>%
html_attr("href") %>% paste("http://www.mmadecisions.com/", ., sep="")
# Read one page and extract four sets of text from it by CSS selector.
#
# name_link: location passed straight to read_html().
# Returns:   the parsed page object produced by read_html().
# NOTE(review): date, event, fight and decisions are assigned as locals and then
# dropped -- the function returns judge_page, not the extracted text.
get_decisions = function(name_link) {
judge_page = read_html(name_link)
# Each selector targets elements with class "list"; presumably one table
# column each -- confirm against the page markup.
date = judge_page %>% html_nodes(".list:nth-child(1)") %>% html_text()
event = judge_page %>% html_nodes(".list:nth-child(2) a") %>% html_text()
fight = judge_page %>% html_nodes(".list~ .list+ .list a") %>% html_text()
decisions = judge_page %>% html_nodes(".list:nth-child(4)") %>% html_text()
return(judge_page)
}
# Call get_decisions() once for every URL in name_links.
decision = sapply(name_links, FUN = get_decisions)
# NOTE(review): in the script as shown, date, event and fight are only ever
# assigned inside get_decisions(), so no top-level variables with those names
# exist when this data.frame() call runs.
judges = data.frame(name, date, event, fight, decision, stringsAsFactors = FALSE)
我不断收到的错误如下:
> library(rvest)
> library(dplyr)
>
>
> link = "http://www.mmadecisions.com/decisions-by-judge/"
> page = read_html(link)
>
> name = page %>% html_nodes("#page1 a") %>% html_text()
> name_links = page %>% html_nodes("#page1 a") %>%
+ html_attr("href") %>% paste("http://www.mmadecisions.com/", ., sep="")
>
> get_decisions = function(name_link) {
+ judge_page = read_html(name_link)
+ date = judge_page %>% html_nodes(".list:nth-child(1)") %>% html_text()
+ event = judge_page %>% html_nodes(".list:nth-child(2) a") %>% html_text()
+ fight = judge_page %>% html_nodes(".list~ .list+ .list a") %>% html_text()
+ decisions = judge_page %>% html_nodes(".list:nth-child(4)") %>% html_text()
+ return(judge_page)
+ }
>
> decisions = sapply(name_links, FUN = get_decisions)
Error in open.connection(x, "rb") : HTTP error 400.
Called from: open.connection(x, "rb")
Browse[1]>
> judges = data.frame(name, date, event, fight, score, stringsAsFactors = FALSE)
Error in data.frame(name, date, event, fight, score, stringsAsFactors = FALSE) :
object 'score' not found
我的目标是从父页面导航到多个子页面,抓取四列数据 'judge-decisions',然后打印到列中。我很感激任何人可以提供的任何见解。
不太确定为什么最后会有两个结果变量。如果你想要一个包含这 4 列的最终数据框,可以用 purrr::map_dfr 遍历每个页面,并确保你的函数 return 的是一个 tibble。至于连接错误(HTTP error 400),你需要用 trimws() 去掉网址中多余的空白字符。
library(rvest)
library(dplyr)
library(purrr)
# Index page to scrape; the per-judge pages are linked from it.
link <- "http://www.mmadecisions.com/decisions-by-judge/"
page <- read_html(link)

# Query the anchors once and reuse the node set for both the text and the href.
judge_anchors <- page %>% html_nodes("#page1 a")
name <- judge_anchors %>% html_text()

# Strip whitespace from each href *before* prefixing the site root. Trimming
# the finished URL would only remove trailing whitespace, because the prefix
# shields anything at the start of the href and leaves it embedded mid-URL.
name_links <- judge_anchors %>%
  html_attr("href") %>%
  trimws() %>%
  paste0("http://www.mmadecisions.com/", .)
#' Scrape the decision columns from a single page.
#'
#' @param name_link Location of the page, passed unchanged to `read_html()`.
#' @return A tibble with the character columns `date`, `event`, `fight` and
#'   `decisions`, holding the text of the nodes matched by each selector.
#'   `tibble()` needs the four vectors to have compatible lengths, so a page
#'   where the selectors match different numbers of nodes raises an error.
get_decisions <- function(name_link) {
  judge_page <- read_html(name_link)

  # Local helper: text of every node on this page matching one CSS selector.
  cell_text <- function(css) {
    judge_page %>% html_nodes(css) %>% html_text()
  }

  # The tibble is the function's value; no temporary is needed (the previous
  # temporary was named `t`, which masks base::t inside the function).
  tibble(
    date = cell_text(".list:nth-child(1)"),
    event = cell_text(".list:nth-child(2) a"),
    fight = cell_text(".list ~ .list + .list a"),
    decisions = cell_text(".list:nth-child(4)")
  )
}
# Run get_decisions() on every URL and row-bind the results into one data frame.
df <- map_dfr(.x = name_links, .f = get_decisions)