Rvest 返回空值并且无法弄清楚原因
Rvest returning empty values and can't figure out why
我有一个处理这类任务的相当不错的模板,以前从未见过它失效。
library(tidyverse)
library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
library(zoo)
# Rivals250 ranking page for the 2021 class.
rivals_url <- "https://rivals.com/prospect_rankings/rivals250/2021"

# For each URL: download the page, take the text of the nodes matched by the
# selector, trim and split each text on single spaces, and lay the pieces out
# three per row before converting to a data frame.
t300 <- map_df(rivals_url, function(page_url) {
  node_text <- page_url %>%
    read_html() %>%
    html_nodes(".position .pos , .last-name , .first-name") %>%
    html_text()
  pieces <- str_split(str_trim(node_text), " ")
  as.data.frame(matrix(pieces, ncol = 3, byrow = TRUE))
})
当我运行它时,它只返回一组空值。是不是我抓取的这个网站有什么特殊之处导致了这个问题?例如,下面这个脚本就能正常工作:
# ESPN top-300 recruiting ranking page.
espn_url <- "http://www.espn.com/college-sports/football/recruiting/playerrankings/_/view/rn300"

# For each URL: download the page, take the text of the table cells matched by
# the selector, trim and split each text on single spaces, and lay the pieces
# out three per row before converting to a data frame.
t300 <- map_df(espn_url, function(page_url) {
  cell_text <- page_url %>%
    read_html() %>%
    html_nodes("td:nth-child(3), td:nth-child(8), Strong, .colhead td:nth-child(2)") %>%
    html_text()
  pieces <- str_split(str_trim(cell_text), " ")
  as.data.frame(matrix(pieces, ncol = 3, byrow = TRUE))
})
您的代码不起作用的原因是您试图抓取动态页面。您将需要使用诸如 RSelenium 之类的包。下面的代码应该可以工作:
library(tidyverse)
library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
library(zoo)
library(RSelenium)
# Rivals250 ranking page for the 2021 class. The page is loaded in a real
# browser through RSelenium and the resulting page source is parsed with rvest.
rivals_url <- "https://rivals.com/prospect_rankings/rivals250/2021"

##### Open remote browser
rD <- rsDriver(browser = "chrome")
remDr <- rD[["client"]]

# Everything that can fail after the browser is up runs inside tryCatch() so
# the `finally` clause always shuts the browser and the Selenium server down,
# even when navigation or parsing raises an error.
tryCatch(
  {
    remDr$navigate(rivals_url)
    # Hand the browser's current page source over to rvest
    rivals_page <- read_html(remDr$getPageSource()[[1]])

    ##### Scrape target page and format results
    t300 <- rivals_page %>%
      html_nodes(".position .pos , .last-name , .first-name") %>%
      html_text() %>%
      str_trim() %>%
      str_split(" ") %>%
      matrix(ncol = 3, byrow = TRUE) %>%
      as.data.frame() %>%
      # NOTE(review): these labels assume the matched nodes arrive in the
      # order last name, first name, position -- confirm against the page.
      `colnames<-`(c("Last Name", "First Name", "Position"))
  },
  finally = {
    ##### Close remote browser and terminate related processes
    remDr$close()
    rD$server$stop()
    rm(rD, remDr)
    gc()
    # taskkill only exists on Windows; skip it on other platforms
    if (.Platform$OS.type == "windows") {
      system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
    }
  }
)
该站点有一个面向公众的 API,您可以直接查询:
library(jsonlite)
library(dplyr)
# Query the site's JSON endpoint directly instead of scraping the HTML page.
dat <- fromJSON(
  "https://n.rivals.com/api/v1/ranks/4408/prospects?start=0&position=ALL%20POSITIONS&pageSize=250"
)
# Show the `prospects` element of the parsed response as a tibble
as_tibble(dat$prospects)
# A tibble: 250 x 26
id first_name last_name name position_abbrev~ stars prospect_url height weight verified_height verified_weight city hometown highschool profile_image_u~
<int> <chr> <chr> <chr> <chr> <int> <chr> <dbl> <dbl> <lgl> <lgl> <chr> <chr> <chr> <chr>
1 232205 "Maason" Smith Maas~ DT 5 https://n.r~ 77 297 NA NA Houma Houma, ~ Terrebonne https://images.~
2 209086 "JC" Latham JC L~ OT 5 https://n.r~ 78 310 NA NA Brad~ Bradent~ IMG Acade~ https://images.~
3 216686 "Korey" Foreman Kore~ SDE 5 https://n.r~ 77 254 TRUE FALSE Coro~ Corona,~ Centennial https://images.~
4 234618 "Amarius" Mims Amar~ OT 5 https://n.r~ 79 300 NA NA Coch~ Cochran~ Bleckley ~ https://images.~
5 210175 "Caleb" Williams Cale~ DUAL 5 https://n.r~ 73 200 NA NA Wash~ Washing~ Gonzaga https://images.~
6 208999 "J.T." Tuimoloau J.T.~ SDE 5 https://n.r~ 76 275 NA NA Bell~ Bellevu~ Eastside ~ https://images.~
7 210587 "Brock" Vandagri~ Broc~ PRO 5 https://n.r~ 75 198 NA NA Boga~ Bogart,~ Prince Av~ https://images.~
8 234013 "Tommy " Brockerm~ Tomm~ OT 5 https://n.r~ 78 288 NA NA Fort~ Fort Wo~ All Saint~ https://images.~
9 237486 "Ty" Thompson Ty T~ DUAL 5 https://n.r~ 76 203 NA NA Gilb~ Gilbert~ Mesquite https://images.~
10 194402 "Camar" Wheaton Cama~ RB 5 https://n.r~ 73 190 NA NA Garl~ Garland~ Lakeview ~ https://images.~
# ... with 240 more rows, and 11 more variables: state_abbreviation <chr>, school_id <int>, sport_school_id <int>, school_name <chr>, school_commits_url <chr>,
# recruit_year <int>, big_college_logo <chr>, status <chr>, commit_date <chr>, rank <int>, rank_change <int>
我有一个处理这类任务的相当不错的模板,以前从未见过它失效。
library(tidyverse)
library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
library(zoo)
# Rivals250 ranking page for the 2021 class.
rivals_url <- "https://rivals.com/prospect_rankings/rivals250/2021"

# For each URL: download the page, take the text of the nodes matched by the
# selector, trim and split each text on single spaces, and lay the pieces out
# three per row before converting to a data frame.
t300 <- map_df(rivals_url, function(page_url) {
  node_text <- page_url %>%
    read_html() %>%
    html_nodes(".position .pos , .last-name , .first-name") %>%
    html_text()
  pieces <- str_split(str_trim(node_text), " ")
  as.data.frame(matrix(pieces, ncol = 3, byrow = TRUE))
})
当我运行它时,它只返回一组空值。是不是我抓取的这个网站有什么特殊之处导致了这个问题?例如,下面这个脚本就能正常工作:
# ESPN top-300 recruiting ranking page.
espn_url <- "http://www.espn.com/college-sports/football/recruiting/playerrankings/_/view/rn300"

# For each URL: download the page, take the text of the table cells matched by
# the selector, trim and split each text on single spaces, and lay the pieces
# out three per row before converting to a data frame.
t300 <- map_df(espn_url, function(page_url) {
  cell_text <- page_url %>%
    read_html() %>%
    html_nodes("td:nth-child(3), td:nth-child(8), Strong, .colhead td:nth-child(2)") %>%
    html_text()
  pieces <- str_split(str_trim(cell_text), " ")
  as.data.frame(matrix(pieces, ncol = 3, byrow = TRUE))
})
您的代码不起作用的原因是您试图抓取动态页面。您将需要使用诸如 RSelenium 之类的包。下面的代码应该可以工作:
library(tidyverse)
library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
library(zoo)
library(RSelenium)
# Rivals250 ranking page for the 2021 class. The page is loaded in a real
# browser through RSelenium and the resulting page source is parsed with rvest.
rivals_url <- "https://rivals.com/prospect_rankings/rivals250/2021"

##### Open remote browser
rD <- rsDriver(browser = "chrome")
remDr <- rD[["client"]]

# Everything that can fail after the browser is up runs inside tryCatch() so
# the `finally` clause always shuts the browser and the Selenium server down,
# even when navigation or parsing raises an error.
tryCatch(
  {
    remDr$navigate(rivals_url)
    # Hand the browser's current page source over to rvest
    rivals_page <- read_html(remDr$getPageSource()[[1]])

    ##### Scrape target page and format results
    t300 <- rivals_page %>%
      html_nodes(".position .pos , .last-name , .first-name") %>%
      html_text() %>%
      str_trim() %>%
      str_split(" ") %>%
      matrix(ncol = 3, byrow = TRUE) %>%
      as.data.frame() %>%
      # NOTE(review): these labels assume the matched nodes arrive in the
      # order last name, first name, position -- confirm against the page.
      `colnames<-`(c("Last Name", "First Name", "Position"))
  },
  finally = {
    ##### Close remote browser and terminate related processes
    remDr$close()
    rD$server$stop()
    rm(rD, remDr)
    gc()
    # taskkill only exists on Windows; skip it on other platforms
    if (.Platform$OS.type == "windows") {
      system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
    }
  }
)
该站点有一个面向公众的 API,您可以直接查询:
library(jsonlite)
library(dplyr)
# Query the site's JSON endpoint directly instead of scraping the HTML page.
dat <- fromJSON(
  "https://n.rivals.com/api/v1/ranks/4408/prospects?start=0&position=ALL%20POSITIONS&pageSize=250"
)
# Show the `prospects` element of the parsed response as a tibble
as_tibble(dat$prospects)
# A tibble: 250 x 26
id first_name last_name name position_abbrev~ stars prospect_url height weight verified_height verified_weight city hometown highschool profile_image_u~
<int> <chr> <chr> <chr> <chr> <int> <chr> <dbl> <dbl> <lgl> <lgl> <chr> <chr> <chr> <chr>
1 232205 "Maason" Smith Maas~ DT 5 https://n.r~ 77 297 NA NA Houma Houma, ~ Terrebonne https://images.~
2 209086 "JC" Latham JC L~ OT 5 https://n.r~ 78 310 NA NA Brad~ Bradent~ IMG Acade~ https://images.~
3 216686 "Korey" Foreman Kore~ SDE 5 https://n.r~ 77 254 TRUE FALSE Coro~ Corona,~ Centennial https://images.~
4 234618 "Amarius" Mims Amar~ OT 5 https://n.r~ 79 300 NA NA Coch~ Cochran~ Bleckley ~ https://images.~
5 210175 "Caleb" Williams Cale~ DUAL 5 https://n.r~ 73 200 NA NA Wash~ Washing~ Gonzaga https://images.~
6 208999 "J.T." Tuimoloau J.T.~ SDE 5 https://n.r~ 76 275 NA NA Bell~ Bellevu~ Eastside ~ https://images.~
7 210587 "Brock" Vandagri~ Broc~ PRO 5 https://n.r~ 75 198 NA NA Boga~ Bogart,~ Prince Av~ https://images.~
8 234013 "Tommy " Brockerm~ Tomm~ OT 5 https://n.r~ 78 288 NA NA Fort~ Fort Wo~ All Saint~ https://images.~
9 237486 "Ty" Thompson Ty T~ DUAL 5 https://n.r~ 76 203 NA NA Gilb~ Gilbert~ Mesquite https://images.~
10 194402 "Camar" Wheaton Cama~ RB 5 https://n.r~ 73 190 NA NA Garl~ Garland~ Lakeview ~ https://images.~
# ... with 240 more rows, and 11 more variables: state_abbreviation <chr>, school_id <int>, sport_school_id <int>, school_name <chr>, school_commits_url <chr>,
# recruit_year <int>, big_college_logo <chr>, status <chr>, commit_date <chr>, rank <int>, rank_change <int>