使用 rvest 抓取数据时遇到特定错误
Scraping data using rvest and a specific error
我有这个数据抓取函数:
#' Scrape an ESPN MLB team stats table (version as posted in the question).
#'
#' @param team,side,season Only read when filling the `Year` column at the
#'   end of the body; they take no part in building the URL.
#' @return The parsed table with an added `Year` column, provided the
#'   request and the parsing succeed.
espn_team_stats <- function(team, side, season) {
# Attach the packages on every call so the pipe and the scraping functions
# used below are available.
library(tidyverse)
library(rvest)
# Build every side x team x season combination: 2 * 32 * 17 = 1088 rows.
url_factors <- expand.grid(side = c("batting", "fielding"),
team = c("ari", "atl", "bal", "bos", "chc", "chw", "cws",
"cin", "cle", "det", "fla", "mia", "hou", "kan",
"laa", "lad", "mil", "min", "nyy", "nym", "oak",
"phi", "pit", "sd", "sf", "sea", "stl", "tb",
"tex", "tor", "was", "wsh"),
season = c(2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
2018))
# paste0() is vectorised, so this yields one URL per row of url_factors.
team_url <- paste0("http://www.espn.com/mlb/team/stats/",
url_factors$side, "/_/name/",
url_factors$team, "/year/",
url_factors$season, "/")
# toString() collapses the whole vector into ONE comma-separated string, so
# read_html() below is handed a single "URL" made of all 1088 addresses.
# NOTE(review): this is the likely source of an HTTP 414 (URI Too Long).
team_url <- toString(team_url)
# Fetch the page and parse the first node matched by the CSS selector.
team_page <- team_url %>%
read_html %>%
html_node("#my-players-table > div.mod-container.mod-table >
div.mod-content > table:nth-child(1)") %>%
html_table(header = T)
# NOTE(review): c(side, team, season) is a length-3 vector rather than one
# value per row, so this assignment probably does not do what was intended.
team_tables <- team_page
team_tables$Year <- c(side, team, season)
return(team_tables)
}
# NOTE(review): `bal` and `batting` are passed as bare symbols, not strings;
# they must exist as objects (or be quoted) by the time the function
# evaluates its `team` and `side` arguments.
espn_team_stats(bal, batting, 2018)
我不断收到以下错误:
Error in open.connection(x, "rb") : HTTP error 414. # URLs are too long?
Called from: open.connection(x, "rb")
该函数无法运行——我怀疑问题出在我的 expand.grid() 调用,
以及 team_url 的拼接方式上。
我想要抓取的 URL 示例:
http://www.espn.com/mlb/team/stats/batting/_/name/ari/year/2017
http://www.espn.com/mlb/team/stats/fielding/_/name/ari/year/2017
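原函数中的 toString() 会把 expand.grid() 生成的全部 1088 个 URL 合并成一个以逗号分隔的超长字符串,这正是 HTTP 414 错误的原因。由于您已经把所需的值作为参数传入,因此无需使用 expand.grid():只需执行以下操作: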
#' Scrape one ESPN MLB team statistics table.
#'
#' Builds the stats URL for a single team, side and season, downloads the
#' page and parses the first table matched inside `#my-players-table`.
#'
#' @param team Team abbreviation used in the ESPN URL, e.g. `"bal"`.
#' @param side Stats page to request, e.g. `"batting"` or `"fielding"`.
#' @param season Season year, e.g. `2018`.
#' @return The parsed table with an extra `Year` column holding `season`.
espn_team_stats <- function(team, side, season) {
  # Fail early with a clear message: a vector argument would otherwise
  # produce several URLs and an obscure error further down.
  check_scalar <- function(value, label) {
    if (length(value) != 1L || is.na(value)) {
      stop("`", label, "` must be a single non-missing value", call. = FALSE)
    }
  }
  check_scalar(team, "team")
  check_scalar(side, "side")
  check_scalar(season, "season")

  team_url <- paste0(
    "http://www.espn.com/mlb/team/stats/", side,
    "/_/name/", team,
    "/year/", season, "/"
  )

  # Same selector text as before (including the line break after `>`),
  # written with an explicit "\n" so it no longer depends on how the
  # source file is indented.
  table_selector <- paste0(
    "#my-players-table > div.mod-container.mod-table >\n",
    "div.mod-content > table:nth-child(1)"
  )

  # Namespace-qualified calls so the function works without the caller
  # having attached rvest or a pipe operator first.
  page <- rvest::read_html(team_url)
  table_node <- rvest::html_node(page, table_selector)
  if (inherits(table_node, "xml_missing")) {
    stop("No stats table found at ", team_url, call. = FALSE)
  }

  team_tables <- rvest::html_table(table_node, header = TRUE)
  # rep_len() keeps the assignment valid even for a table with zero rows.
  team_tables$Year <- rep_len(season, nrow(team_tables))
  team_tables
}
# Example call: the `bal` batting table for the 2018 season.
espn_team_stats(team = "bal", side = "batting", season = 2018)
我有这个数据抓取函数:
#' Scrape an ESPN MLB team stats table (version as posted in the question).
#'
#' @param team,side,season Only read when filling the `Year` column at the
#'   end of the body; they take no part in building the URL.
#' @return The parsed table with an added `Year` column, provided the
#'   request and the parsing succeed.
espn_team_stats <- function(team, side, season) {
# Attach the packages on every call so the pipe and the scraping functions
# used below are available.
library(tidyverse)
library(rvest)
# Build every side x team x season combination: 2 * 32 * 17 = 1088 rows.
url_factors <- expand.grid(side = c("batting", "fielding"),
team = c("ari", "atl", "bal", "bos", "chc", "chw", "cws",
"cin", "cle", "det", "fla", "mia", "hou", "kan",
"laa", "lad", "mil", "min", "nyy", "nym", "oak",
"phi", "pit", "sd", "sf", "sea", "stl", "tb",
"tex", "tor", "was", "wsh"),
season = c(2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
2018))
# paste0() is vectorised, so this yields one URL per row of url_factors.
team_url <- paste0("http://www.espn.com/mlb/team/stats/",
url_factors$side, "/_/name/",
url_factors$team, "/year/",
url_factors$season, "/")
# toString() collapses the whole vector into ONE comma-separated string, so
# read_html() below is handed a single "URL" made of all 1088 addresses.
# NOTE(review): this is the likely source of an HTTP 414 (URI Too Long).
team_url <- toString(team_url)
# Fetch the page and parse the first node matched by the CSS selector.
team_page <- team_url %>%
read_html %>%
html_node("#my-players-table > div.mod-container.mod-table >
div.mod-content > table:nth-child(1)") %>%
html_table(header = T)
# NOTE(review): c(side, team, season) is a length-3 vector rather than one
# value per row, so this assignment probably does not do what was intended.
team_tables <- team_page
team_tables$Year <- c(side, team, season)
return(team_tables)
}
# NOTE(review): `bal` and `batting` are passed as bare symbols, not strings;
# they must exist as objects (or be quoted) by the time the function
# evaluates its `team` and `side` arguments.
espn_team_stats(bal, batting, 2018)
我不断收到以下错误:
Error in open.connection(x, "rb") : HTTP error 414. # URLs are too long?
Called from: open.connection(x, "rb")
该函数无法运行——我怀疑问题出在我的 expand.grid() 调用,
以及 team_url 的拼接方式上。
我想要抓取的 URL 示例:
http://www.espn.com/mlb/team/stats/batting/_/name/ari/year/2017
http://www.espn.com/mlb/team/stats/fielding/_/name/ari/year/2017
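原函数中的 toString() 会把 expand.grid() 生成的全部 1088 个 URL 合并成一个以逗号分隔的超长字符串,这正是 HTTP 414 错误的原因。由于您已经把所需的值作为参数传入,因此无需使用 expand.grid():只需执行以下操作: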
#' Scrape one ESPN MLB team statistics table.
#'
#' Builds the stats URL for a single team, side and season, downloads the
#' page and parses the first table matched inside `#my-players-table`.
#'
#' @param team Team abbreviation used in the ESPN URL, e.g. `"bal"`.
#' @param side Stats page to request, e.g. `"batting"` or `"fielding"`.
#' @param season Season year, e.g. `2018`.
#' @return The parsed table with an extra `Year` column holding `season`.
espn_team_stats <- function(team, side, season) {
  # Fail early with a clear message: a vector argument would otherwise
  # produce several URLs and an obscure error further down.
  check_scalar <- function(value, label) {
    if (length(value) != 1L || is.na(value)) {
      stop("`", label, "` must be a single non-missing value", call. = FALSE)
    }
  }
  check_scalar(team, "team")
  check_scalar(side, "side")
  check_scalar(season, "season")

  team_url <- paste0(
    "http://www.espn.com/mlb/team/stats/", side,
    "/_/name/", team,
    "/year/", season, "/"
  )

  # Same selector text as before (including the line break after `>`),
  # written with an explicit "\n" so it no longer depends on how the
  # source file is indented.
  table_selector <- paste0(
    "#my-players-table > div.mod-container.mod-table >\n",
    "div.mod-content > table:nth-child(1)"
  )

  # Namespace-qualified calls so the function works without the caller
  # having attached rvest or a pipe operator first.
  page <- rvest::read_html(team_url)
  table_node <- rvest::html_node(page, table_selector)
  if (inherits(table_node, "xml_missing")) {
    stop("No stats table found at ", team_url, call. = FALSE)
  }

  team_tables <- rvest::html_table(table_node, header = TRUE)
  # rep_len() keeps the assignment valid even for a table with zero rows.
  team_tables$Year <- rep_len(season, nrow(team_tables))
  team_tables
}
# Example call: the `bal` batting table for the 2018 season.
espn_team_stats(team = "bal", side = "batting", season = 2018)