Create aggregate df out of web scraping multiple pages using Rvest and Glue
I am scraping data from the table at the following site:
https://fantasy.nfl.com/research/scoringleaders?position=1&statCategory=stats&statSeason=2019&statType=weekStats&statWeek=1
I want one scrape that pulls all 17 weeks, all four positions (QB, RB, WR, TE), and the first 4 pages for each, to get the top 100 rows (only 25 rows are shown per page).
library(tidyverse)
library(rvest)
library(glue)
scrape_19 <- function(week, position, page) {
  # Be polite to the server: pause between requests and print a progress dot
  Sys.sleep(3)
  cat(".")
  url <- glue("https://fantasy.nfl.com/research/scoringleaders?{page}position={position}&sort=pts&statCategory=stats&statSeason=2019&statType=weekStats&statWeek={week}")
  read_html(url) %>%
    html_nodes("table") %>%
    html_table(header = TRUE) %>%
    simplify() %>%
    first() %>%
    setNames(paste0(colnames(.), as.character(.[1, ]))) %>%  # merge the two header rows
    slice(-1) %>%
    list()
}
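Called on its own, with the page offset supplied as a string, a single request would look roughly like this (illustrative only; the [[1]] pulls the table out of the one-element list the function returns):

# Week 1, position 1 (QB), first page of results
qb_wk1_page1 <- scrape_19(week = 1, position = 1, page = "")[[1]]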
Here are all of the iterations needed for the glue() calls (17 weeks x 4 positions x 4 pages = 272 calls in total):
week = 1:17;
position = 1:4;
page = c("", "offset=26&", "offset=51&", "offset=76&")
The problem I run into is when I try to make one df with all of the data for every week, position, and page. Here is code that works for week and position, but it does not handle the extra page nesting.
weeks <- 1:17
positions <- 1:4
scaffold <- tibble(week = weeks,
                   position = list(positions)) %>%
  tidyr::unnest(position)   # one row per week/position pair
scaffold
tbl_data <- scaffold %>%
  mutate(data = purrr::map2(week, position, ~ scrape_19(.x, .y)[[1]]))
Basically, I need help building the scaffold and then turning that scaffold into the final aggregate data set covering every week, position, and page.
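For reference, one way to extend that scaffold to all three variables is tidyr::crossing() plus purrr::pmap(); the following is only a minimal sketch built on the scrape_19() defined above, untested against the live site, and since the column sets differ by position the nested tables may need per-position cleanup before flattening:

# All 17 x 4 x 4 = 272 combinations of week, position and page offset
scaffold <- tidyr::crossing(week     = 1:17,
                            position = 1:4,
                            page     = c("", "offset=26&", "offset=51&", "offset=76&"))
tbl_data <- scaffold %>%
  mutate(data = purrr::pmap(list(week, position, page),
                            ~ scrape_19(..1, ..2, ..3)[[1]]))
# tidyr::unnest(tbl_data, data) would then flatten the list column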
Here is my attempt at an answer. I am not sure glue() is feasible here; see below.
first_name <- c("Fred", "Ana", "Bob")
last_name <- c("Johnson", "Trump")
glue('My name is {first_name} {last_name}.')
Error: Variables must be length 1 or 3
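glue() does accept equal-length vectors (and recycles length-1 inputs), so the error goes away once every variable has been expanded to the same length first. A small illustrative sketch with the toy values above:

# Expand to equal-length combinations first, then interpolate
combos <- expand.grid(first_name = c("Fred", "Ana", "Bob"),
                      last_name  = c("Johnson", "Trump"),
                      stringsAsFactors = FALSE)
glue_data(combos, 'My name is {first_name} {last_name}.')  # 6 strings, no error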
Your situation is similar to the failing glue() example above. So I used a map() loop to create all of the possible links instead. Then I checked whether every URL exists. Finally, I used map_dfr() to loop over the URLs and bind all of the data frames together, adding the week and position information along the way. If position is 1, it is QB; replace those numbers yourself as needed. Note that I only scrape four of the URLs in this demo.
library(httr)
library(rvest)
library(tidyverse)

# Create all URLs.
# Create 4 base URLs, one per page offset
paste("https://fantasy.nfl.com/research/scoringleaders?",
      c("", "offset=26&", "offset=51&", "offset=76&"),
      "position={position}&sort=pts&statCategory=stats&statSeason=2019&statType=weekStats&statWeek={week}",
      sep = "") -> mytemp

# For each base URL, create 4 URLs, one per position. (4 x 4 = 16 URLs)
map(.x = 1:4,
    .f = function(x) {gsub(x = mytemp, pattern = "\\{position\\}", replacement = x)}) %>%
  unlist -> mytemp

# For each of the 16 URLs, create 17 URLs, one per week. (16 x 17 = 272 URLs)
map(.x = 1:17,
    .f = function(x) {gsub(x = mytemp, pattern = "\\{week\\}", replacement = x)}) %>%
  unlist -> myurls

# Check whether any URLs are invalid
# (url_success() is deprecated in recent httr; !http_error(x) is the current equivalent)
sapply(myurls, url_success) %>% table
# TRUE
# 272

# Scrape the tables (only the first four URLs in this demo)
map_dfr(.x = myurls[1:4],
        .f = function(x) {read_html(x) %>%
            html_nodes("table") %>%
            html_table() %>%
            simplify() %>%
            first() %>%
            setNames(paste0(colnames(.), as.character(.[1, ]))) %>%
            slice(-1) %>%
            mutate(position = str_extract(string = x, pattern = "(?<=position=)\\d+(?=&)"),
                   week = str_extract(string = x, pattern = "(?<=statWeek=)\\d+"))},
        .id = "url") -> foo
url Rank Player Opp PassingYds PassingTD PassingInt RushingYds RushingTD ReceivingRec ReceivingYds
1 1 1 Lamar Jackson QB - BAL @MIA 324 5 - 6 - - -
2 1 2 Dak Prescott QB - DAL NYG 405 4 - 12 - - -
3 1 3 Deshaun Watson QB - HOU @NO 268 3 1 40 1 - -
4 1 4 Matthew Stafford QB - DET @ARI 385 3 - 22 - - -
5 1 5 Patrick Mahomes QB - KC @JAX 378 3 - 2 - - -
ReceivingTD RetTD MiscFumTD Misc2PT FumLost FantasyPoints position week
1 - - - - - 33.56 1 1
2 - - - - - 33.40 1 1
3 - - - - - 30.72 1 1
4 - - - - 1 27.60 1 1
5 - - - - - 27.32 1 1
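The scraped stat columns come back as character, with "-" standing in for missing values. For the QB rows shown above, a possible cleanup step (column names taken from that output; assumes dplyr >= 1.0 for across()) might be:

foo %>%
  mutate(across(PassingYds:FantasyPoints, ~ as.numeric(na_if(.x, "-"))),
         across(c(position, week), as.integer)) -> foo_clean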