无法使用 rvest 将函数映射到已抓取的链接列表
Trouble mapping a function to a list of scraped links using rvest
我正在尝试应用一个从已抓取链接列表中提取 table 的函数。我正处于将 get_injury_data
函数应用于链接的最后阶段 - 我在成功执行此操作时遇到了问题。我收到以下错误:
Error in matrix(unlist(values), ncol = width, byrow = TRUE) :
'data' must be of a vector type, was 'NULL'
不知道有没有人能帮我找出我哪里出错了。代码如下:
library(tidyverse)
library(rvest)
# create a function to grab the team links
get_team_links <- function(url){
url %>%
read_html %>%
html_nodes('td.hauptlink a') %>%
html_attr('href') %>%
.[. != '#'] %>% # remove rows with # string
paste0('https://www.transfermarkt.com', .) %>% # pat the website link to the url strings
unique() %>% # keep only unique links
as_tibble() %>% # turn strings into a tibble datatset
rename("links" = "value") %>% # rename the value column
filter(!grepl('profil', links)) %>% # remove link of players included
filter(!grepl('spielplan', links)) %>% # remove link of additional team pages included
mutate(links = gsub("startseite", "kader", links)) # change link to go to the detailed page
}
# create a function to grab the player links
get_player_links <- function(url){
url %>%
read_html %>%
html_nodes('td.hauptlink a') %>%
html_attr('href') %>%
.[. != '#'] %>% # remove rows with # string
paste0('https://www.transfermarkt.com', .) %>% # pat the website link to the url strings
unique() %>% # keep only unique links
as_tibble() %>% # turn strings into a tibble datatset
rename("links" = "value") %>% # rename the value column
filter(grepl('profil', links)) %>% # remove link of players included
mutate(links = gsub("profil", "verletzungen", links)) # change link to go to the injury page
}
# create a function to get the injury dataset
get_injury_data <- function(url){
url %>%
read_html() %>%
html_nodes('#yw1') %>%
html_table()
}
# get team links and save it as team_links
team_links <- get_team_links('https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1')
# get player links and by mapping the function on to the player_injury_links dataset
# and then unnest the list of lists as a long list
player_injury_links <- team_links %>%
mutate(links = map(team_links$links, get_player_links)) %>%
unnest(links)
# using the player_injury_links list create a dataset by web scrapping the play injury pages
player_injury_data <- map(player_injury_links$links, get_injury_data)
解决方案
所以我遇到的问题是我抓取的一些链接没有任何数据。
为了解决这个问题,我使用了 purrr
包中的 possibly
函数。这帮助我创建了一个新的、无错误的函数。
给我带来麻烦的行代码如下:
player_injury_data <- player_injury_links %>%
purrr::map(., purrr::possibly(get_injury_data, otherwise = NULL, quiet = TRUE))
我正在尝试应用一个从已抓取链接列表中提取 table 的函数。我正处于将 get_injury_data
函数应用于链接的最后阶段 - 我在成功执行此操作时遇到了问题。我收到以下错误:
Error in matrix(unlist(values), ncol = width, byrow = TRUE) :
'data' must be of a vector type, was 'NULL'
不知道有没有人能帮我找出我哪里出错了。代码如下:
library(tidyverse)
library(rvest)
# create a function to grab the team links
get_team_links <- function(url){
url %>%
read_html %>%
html_nodes('td.hauptlink a') %>%
html_attr('href') %>%
.[. != '#'] %>% # remove rows with # string
paste0('https://www.transfermarkt.com', .) %>% # pat the website link to the url strings
unique() %>% # keep only unique links
as_tibble() %>% # turn strings into a tibble datatset
rename("links" = "value") %>% # rename the value column
filter(!grepl('profil', links)) %>% # remove link of players included
filter(!grepl('spielplan', links)) %>% # remove link of additional team pages included
mutate(links = gsub("startseite", "kader", links)) # change link to go to the detailed page
}
# create a function to grab the player links
get_player_links <- function(url){
url %>%
read_html %>%
html_nodes('td.hauptlink a') %>%
html_attr('href') %>%
.[. != '#'] %>% # remove rows with # string
paste0('https://www.transfermarkt.com', .) %>% # pat the website link to the url strings
unique() %>% # keep only unique links
as_tibble() %>% # turn strings into a tibble datatset
rename("links" = "value") %>% # rename the value column
filter(grepl('profil', links)) %>% # remove link of players included
mutate(links = gsub("profil", "verletzungen", links)) # change link to go to the injury page
}
# create a function to get the injury dataset
get_injury_data <- function(url){
url %>%
read_html() %>%
html_nodes('#yw1') %>%
html_table()
}
# get team links and save it as team_links
team_links <- get_team_links('https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1')
# get player links and by mapping the function on to the player_injury_links dataset
# and then unnest the list of lists as a long list
player_injury_links <- team_links %>%
mutate(links = map(team_links$links, get_player_links)) %>%
unnest(links)
# using the player_injury_links list create a dataset by web scrapping the play injury pages
player_injury_data <- map(player_injury_links$links, get_injury_data)
解决方案
所以我遇到的问题是我抓取的一些链接没有任何数据。
为了解决这个问题,我使用了 purrr
包中的 possibly
函数。这帮助我创建了一个新的、无错误的函数。
给我带来麻烦的行代码如下:
player_injury_data <- player_injury_links %>%
purrr::map(., purrr::possibly(get_injury_data, otherwise = NULL, quiet = TRUE))