使用 rvest 抓取 'Artsy'
Scraping 'Artsy' using rvest
我正在尝试使用 R 的 rvest 包从 Artsy 抓取信息。我想获取每幅画的名称、年份、价格、地点(画廊名称、拍卖行等)、艺术家姓名,以及所使用的材料(material)。材料信息在每幅画的详情页中提供。下面是我尝试使用的代码:
library(rvest)
library(dplyr)
library(tidyverse)
# Read one painting page and return its material description.
#
# painting_link: URL of a single painting page.
# Returns one string: the text of every node matching the selector,
# joined with ",".
get_material <- function(painting_link) {
  material_nodes <- html_nodes(read_html(painting_link), "h2+ .kPqROo")
  paste(html_text(material_nodes), collapse = ",")
}
# Question's attempt: for result pages 2 and 3, read the page, pull the
# name/year, link, price, place and artist with CSS selectors, then fetch the
# material from each painting's own page via get_material().
# NOTE(review): every variable below is reassigned on each pass and nothing is
# accumulated, so after the loop only the values from the last page remain.
for(page_result in 2:3) {
link = paste0 ("https://www.artsy.net/collect?page=", page_result, "&additional_gene_ids%5B0%5D=painting")
page = read_html(link)
painting_name_year = page %>% html_nodes("#main .kjRHrZ") %>% html_text()
# NOTE(review): html_attr() is given a fragment of HTML markup where an
# attribute name would go, and the unescaped double quotes inside the
# double-quoted string end it early, so this line is not valid R as written.
painting_link = page %>% html_nodes('#main .kjRHrZ') %>% html_attr("<div color="black60" font-family="sans" class="Box-sc-15se88d-0 Text-sc-18gcpao-0 kjRHrZ">\n<i>") %>% paste("https://www.artsy.net", ., sep="/")
price = page %>% html_nodes('.ibabyz') %>% html_text()
# NOTE(review): unlike the other selectors, 'hWKLzd' has no leading '.', so it
# is read as a tag name rather than a class -- presumably a class was intended.
place = page %>% html_nodes('hWKLzd') %>% html_text()
artist = page %>% html_nodes('.bQOCym .bQOCym') %>% html_text()
# One request per painting link; the result is not used after the loop.
material = sapply(painting_link, FUN=get_material, USE.NAMES = FALSE)
}
# material and painting_link are not included in the data frame.
artsy <- data.frame(painting_name_year, price, place, artist)
view(artsy)
painting_link、place 和 material 这几行代码不起作用。此外,每条观测都会重复出现 3 次。我该如何解决这个问题?
您可以去掉循环。首先生成起始 url 列表。然后,与其从结果列表页上抓取部分信息,不如在访问各个作品详情页之前,先收集所有作品详情页的 url。
接着,您可以利用更多的 CPU 内核并行处理,对每个 url 调用同一个函数,从所有作品详情页中收集所需的数据,从而提高效率。
N.B. 由于此操作受 I/O 约束,您可能会发现异步方法的效率更高。如果我能找到一个像样的tutorial/reference,我可能会更新这个答案。
如果该函数针对每个作品详情页的 url 都返回(return)一个包含所需信息的 tibble,
那么只需对链接列表和这个自定义函数调用 future_map_dfr,
即可生成最终的数据框。
library(purrr)
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3
library(stringr)
# Collect the artwork links found on one results page.
#
# link: URL of a results page.
# Returns a character vector: the href of every element that has a class
# attribute and whose href contains "artwork", prefixed with the site root.
get_art_links <- function(link) {
  results_page <- read_html(link)
  anchors <- html_nodes(results_page, "[href*=artwork][class]")
  relative_paths <- html_attr(anchors, "href")
  paste0("https://www.artsy.net", relative_paths)
}
# Extract and parse the first JSON-LD block embedded in a page.
#
# page: a parsed HTML document (as returned by read_html()).
# Returns the parsed JSON as a nested R list.
get_listing_json <- function(page) {
  ld_node <- html_node(page, '[type="application/ld+json"]')
  jsonlite::parse_json(html_text(ld_node))
}
# Scrape one artwork page and return its details as a one-row tibble.
#
# link: URL of a single artwork page.
#
# Most fields are read from the page's embedded JSON-LD block (via
# get_listing_json()); title and material come from the sidebar markup.
# Returns a tibble with the columns artist, title, production_date, material,
# width, height, place, price, currency and availability. A JSON field that is
# absent is reported as NA.
get_listing_info <- function(link) {
  # An absent JSON field is NULL. tibble() drops NULL columns, and the stringr
  # calls below turn NULL into a zero-length vector, which would recycle the
  # whole result down to zero rows. Substituting NA keeps exactly one row with
  # a fixed set of columns for every listing.
  or_na <- function(x, na = NA) {
    if (is.null(x)) na else x
  }

  page <- read_html(link)
  json <- get_listing_json(page)

  artist <- or_na(json$brand$name)
  title <- page %>%
    html_node('[data-test="artworkSidebar"] h2 > i') %>%
    html_text()
  production_date <- or_na(json$productionDate)
  material <- page %>%
    html_node('[data-test="artworkSidebar"] h2 + div') %>%
    html_text()
  width <- or_na(json$width)
  height <- or_na(json$height)

  # The place is taken as the text between "from " and the next comma in the
  # description; NA when the description is missing or does not match.
  description <- or_na(json$description, NA_character_)
  place <- stringr::str_match(description, "from (.*?),")[, 2]

  price <- or_na(json$offers$price)
  currency <- or_na(json$offers$priceCurrency)
  # Strip the schema.org prefix from the availability URL.
  availability <- str_replace(
    or_na(json$offers$availability, NA_character_),
    "https://schema.org/",
    ""
  )

  tibble(
    artist, title, production_date, material, width, height,
    place, price, currency, availability
  )
}
# Result pages to scrape, as strings for sprintf().
pages <- as.character(2:3)
urls <- sprintf(
  "https://www.artsy.net/collect?page=%s&additional_gene_ids[0]=painting",
  pages
)

# Gather every artwork link from the result pages before visiting any of them.
links <- unlist(purrr::map(urls, get_art_links))

# Leave one core free, but never request fewer than one worker: on a
# single-core machine availableCores() - 1 would be 0.
no_cores <- max(1L, future::availableCores() - 1L)
future::plan(future::multisession, workers = no_cores)

# Scrape each artwork page in parallel and row-bind the resulting tibbles.
results <- future_map_dfr(links, .f = get_listing_info)
由 reprex package (v0.3.0)
于 2021-05-16 创建
我正在尝试使用 R 的 rvest 包从 Artsy 抓取信息。我想获取每幅画的名称、年份、价格、地点(画廊名称、拍卖行等)、艺术家姓名,以及所使用的材料(material)。材料信息在每幅画的详情页中提供。下面是我尝试使用的代码:
library(rvest)
library(dplyr)
library(tidyverse)
# Read one painting page and return its material description.
#
# painting_link: URL of a single painting page.
# Returns one string: the text of every node matching the selector,
# joined with ",".
get_material <- function(painting_link) {
  material_nodes <- html_nodes(read_html(painting_link), "h2+ .kPqROo")
  paste(html_text(material_nodes), collapse = ",")
}
# Question's attempt: for result pages 2 and 3, read the page, pull the
# name/year, link, price, place and artist with CSS selectors, then fetch the
# material from each painting's own page via get_material().
# NOTE(review): every variable below is reassigned on each pass and nothing is
# accumulated, so after the loop only the values from the last page remain.
for(page_result in 2:3) {
link = paste0 ("https://www.artsy.net/collect?page=", page_result, "&additional_gene_ids%5B0%5D=painting")
page = read_html(link)
painting_name_year = page %>% html_nodes("#main .kjRHrZ") %>% html_text()
# NOTE(review): html_attr() is given a fragment of HTML markup where an
# attribute name would go, and the unescaped double quotes inside the
# double-quoted string end it early, so this line is not valid R as written.
painting_link = page %>% html_nodes('#main .kjRHrZ') %>% html_attr("<div color="black60" font-family="sans" class="Box-sc-15se88d-0 Text-sc-18gcpao-0 kjRHrZ">\n<i>") %>% paste("https://www.artsy.net", ., sep="/")
price = page %>% html_nodes('.ibabyz') %>% html_text()
# NOTE(review): unlike the other selectors, 'hWKLzd' has no leading '.', so it
# is read as a tag name rather than a class -- presumably a class was intended.
place = page %>% html_nodes('hWKLzd') %>% html_text()
artist = page %>% html_nodes('.bQOCym .bQOCym') %>% html_text()
# One request per painting link; the result is not used after the loop.
material = sapply(painting_link, FUN=get_material, USE.NAMES = FALSE)
}
# material and painting_link are not included in the data frame.
artsy <- data.frame(painting_name_year, price, place, artist)
view(artsy)
painting_link、place 和 material 这几行代码不起作用。此外,每条观测都会重复出现 3 次。我该如何解决这个问题?
您可以去掉循环。首先生成起始 url 列表。然后,与其从结果列表页上抓取部分信息,不如在访问各个作品详情页之前,先收集所有作品详情页的 url。
接着,您可以利用更多的 CPU 内核并行处理,对每个 url 调用同一个函数,从所有作品详情页中收集所需的数据,从而提高效率。
N.B. 由于此操作受 I/O 约束,您可能会发现异步方法的效率更高。如果我能找到一个像样的 tutorial/reference,我可能会更新这个答案。
如果该函数针对每个作品详情页的 url 都返回(return)一个包含所需信息的 tibble,
那么只需对链接列表和这个自定义函数调用 future_map_dfr,
即可生成最终的数据框。
library(purrr)
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3
library(stringr)
# Collect the artwork links found on one results page.
#
# link: URL of a results page.
# Returns a character vector: the href of every element that has a class
# attribute and whose href contains "artwork", prefixed with the site root.
get_art_links <- function(link) {
  results_page <- read_html(link)
  anchors <- html_nodes(results_page, "[href*=artwork][class]")
  relative_paths <- html_attr(anchors, "href")
  paste0("https://www.artsy.net", relative_paths)
}
# Extract and parse the first JSON-LD block embedded in a page.
#
# page: a parsed HTML document (as returned by read_html()).
# Returns the parsed JSON as a nested R list.
get_listing_json <- function(page) {
  ld_node <- html_node(page, '[type="application/ld+json"]')
  jsonlite::parse_json(html_text(ld_node))
}
# Scrape one artwork page and return its details as a one-row tibble.
#
# link: URL of a single artwork page.
#
# Most fields are read from the page's embedded JSON-LD block (via
# get_listing_json()); title and material come from the sidebar markup.
# Returns a tibble with the columns artist, title, production_date, material,
# width, height, place, price, currency and availability. A JSON field that is
# absent is reported as NA.
get_listing_info <- function(link) {
  # An absent JSON field is NULL. tibble() drops NULL columns, and the stringr
  # calls below turn NULL into a zero-length vector, which would recycle the
  # whole result down to zero rows. Substituting NA keeps exactly one row with
  # a fixed set of columns for every listing.
  or_na <- function(x, na = NA) {
    if (is.null(x)) na else x
  }

  page <- read_html(link)
  json <- get_listing_json(page)

  artist <- or_na(json$brand$name)
  title <- page %>%
    html_node('[data-test="artworkSidebar"] h2 > i') %>%
    html_text()
  production_date <- or_na(json$productionDate)
  material <- page %>%
    html_node('[data-test="artworkSidebar"] h2 + div') %>%
    html_text()
  width <- or_na(json$width)
  height <- or_na(json$height)

  # The place is taken as the text between "from " and the next comma in the
  # description; NA when the description is missing or does not match.
  description <- or_na(json$description, NA_character_)
  place <- stringr::str_match(description, "from (.*?),")[, 2]

  price <- or_na(json$offers$price)
  currency <- or_na(json$offers$priceCurrency)
  # Strip the schema.org prefix from the availability URL.
  availability <- str_replace(
    or_na(json$offers$availability, NA_character_),
    "https://schema.org/",
    ""
  )

  tibble(
    artist, title, production_date, material, width, height,
    place, price, currency, availability
  )
}
# Result pages to scrape, as strings for sprintf().
pages <- as.character(2:3)
urls <- sprintf(
  "https://www.artsy.net/collect?page=%s&additional_gene_ids[0]=painting",
  pages
)

# Gather every artwork link from the result pages before visiting any of them.
links <- unlist(purrr::map(urls, get_art_links))

# Leave one core free, but never request fewer than one worker: on a
# single-core machine availableCores() - 1 would be 0.
no_cores <- max(1L, future::availableCores() - 1L)
future::plan(future::multisession, workers = no_cores)

# Scrape each artwork page in parallel and row-bind the resulting tibbles.
results <- future_map_dfr(links, .f = get_listing_info)
由 reprex package (v0.3.0)
于 2021-05-16 创建