用 rvest 抓取 - 按 div id 搜索
Scraping with rvest - search by div id
我是新手,想从以下网站按车型逐一提取信息:
https://www.plugndrive.ca/electric-cars-available-in-canada/
library(rvest)
library(tidyverse)

# Parse the listing page once; the car titles on it are lower-cased and
# hyphenated so they can be used as the last path segment of each car's URL.
elec_url <- read_html('https://www.plugndrive.ca/electric-cars-available-in-canada/')
car_list <- elec_url %>%
  html_nodes('.car-title') %>%
  html_text() %>%
  tolower() %>%
  gsub(' ', '-', .)

# Starting price of the first car, read from that car's own page.
# Backslashes are doubled because "\." and "\$" are not valid escapes inside
# an R string literal; the regex engine must receive `\.` and `\$`.
price <- read_html(paste0('https://www.plugndrive.ca/pnd_evcar_cat/', car_list[[1]], '/')) %>%
  html_nodes('.starting-price .value') %>%
  html_text() %>%
  sub("\\..*", "", .) %>%    # drop everything from the first "." onwards
  gsub('^\\$|,', '', .) %>%  # strip the leading "$" and any commas
  as.numeric()
我该如何抓取 Electric Range 这个 id 下的续航里程
(XPath = //*[@id="content"]/section[1]/div[2]/p[2]/strong/span)
以得到 42 公里?
或者抓取页面底部附近“性能”(Performance)选项卡中 Electric Range 这个 id 下的续航里程
(XPath = //*[@id="performance-container"]/ul/li[3]/span[2]/text())
以得到 35 公里?(别问我为什么这两个续航里程不一致!)
我尝试运行下面的代码时,收到了其后所示的错误:
# Fetch the first car's page and query it with the XPath string supplied as
# the first (positional) selector argument of html_nodes().
html_text(
  html_nodes(
    read_html(paste0('https://www.plugndrive.ca/pnd_evcar_cat/', car_list[[1]], '/')),
    '//*[@id="performance-container"]/ul/li[3]/span[2]/text()'
  )
)
Error in tokenize(css) : Unexpected character '/' found at position 1
您可以通过以下方式获取价格:
library(rvest)

# Text of every starting-price element found on the already-parsed listing page.
price <- html_text(
  html_nodes(elec_url, 'div.inner p.price-container span.starting-price')
)
price
# [1] "$32,930"  "$32,990"  "$33,749"  "$33,965"  "$37,895"  "$39,990"
# [7] "$41,499"  "$42,495"  "$42,595"  "$42,760"  "$43,998"  "$44,590"
#[13] "$44,898"  "$44,950"  "$44,995"  "$44,998"  "$44,999"  "$45,371"
#[19] "$55,990"  "$56,469"  "$66,400"  "$68,550"  "$69,400"  "$69,900"
#[25] "$72,200"  "$72,390"  "$74,950"  "$74,950"  "$80,500"  "$89,800"
#[31] "$90,000"  "$109,090" "$116,090" "$122,800" "$149,900" "$173,900"
如果您想将其转换为数字,可以使用 readr 包中的 parse_number:
# Convert the scraped price strings into plain numbers.
price %>% readr::parse_number()
# [1] 32930 32990 33749 33965 37895 39990 41499 42495 42595 42760
#[11] 43998 44590 44898 44950 44995 44998 44999 45371 55990 56469
#[21] 66400 68550 69400 69900 72200 72390 74950 74950 80500 89800
#[31] 90000 109090 116090 122800 149900 173900
编辑
也许我之前没有理解您想要的内容。如果您想从每辆车各自的页面获取电动续航里程(Electric Range)和汽油续航里程(Gasoline Range),可以先提取所有车型页面的 URL,然后再从这些页面中取出数字。
library(tidyverse)

# Link targets of the anchors inside each `div.evCar` on the listing page.
all_urls <- elec_url %>%
  html_nodes('div.evCar a') %>%
  html_attr('href')

# For every URL, download the page and keep the text of the first
# <strong> found under `div.info p`.
all_ranges <- map_chr(all_urls, ~ .x %>%
  read_html() %>%
  html_nodes('div.info p strong') %>%
  .[1] %>%
  html_text())

# Pull the digits that follow each label. Backslashes are doubled because
# '\s' and '\d' are not valid escapes inside an R string literal; the regex
# engine must receive `\s` and `\d`.
tibble(all_ranges, car_list) %>%
  mutate(
    electic_range = str_extract(all_ranges, '(?<=Electric Range:\\s)\\d+'),
    gasoline_range = str_extract(all_ranges, '(?<=Gasoline Range:\\s)\\d+')
  ) %>%
  select(-all_ranges)
# A tibble: 36 x 3
# car_list electic_range gasoline_range
# <chr> <chr> <chr>
# 1 ford-fusion-energi 42 940
# 2 toyota-prius-prime 40 995
# 3 hyundai-ioniq-phev 47 961
# 4 kia-niro-phev 42 853
# 5 volkswagen-e-golf 198 NA
# 6 mini-cooper-se 177 NA
# 7 hyundai-ioniq-electric 274 NA
# 8 subaru-crosstrek-phev 27 747
# 9 kia-soul-electric 383 NA
#10 honda-clarity-phev 77 475
# … with 26 more rows
我是新手,想从以下网站按车型逐一提取信息: https://www.plugndrive.ca/electric-cars-available-in-canada/
library(rvest)
library(tidyverse)

# Parse the listing page once; the car titles on it are lower-cased and
# hyphenated so they can be used as the last path segment of each car's URL.
elec_url <- read_html('https://www.plugndrive.ca/electric-cars-available-in-canada/')
car_list <- elec_url %>%
  html_nodes('.car-title') %>%
  html_text() %>%
  tolower() %>%
  gsub(' ', '-', .)

# Starting price of the first car, read from that car's own page.
# Backslashes are doubled because "\." and "\$" are not valid escapes inside
# an R string literal; the regex engine must receive `\.` and `\$`.
price <- read_html(paste0('https://www.plugndrive.ca/pnd_evcar_cat/', car_list[[1]], '/')) %>%
  html_nodes('.starting-price .value') %>%
  html_text() %>%
  sub("\\..*", "", .) %>%    # drop everything from the first "." onwards
  gsub('^\\$|,', '', .) %>%  # strip the leading "$" and any commas
  as.numeric()
我该如何抓取 Electric Range 这个 id 下的续航里程
(XPath = //*[@id="content"]/section[1]/div[2]/p[2]/strong/span)
以得到 42 公里?
或者抓取页面底部附近“性能”(Performance)选项卡中 Electric Range 这个 id 下的续航里程
(XPath = //*[@id="performance-container"]/ul/li[3]/span[2]/text())
以得到 35 公里?(别问我为什么这两个续航里程不一致!)
我尝试运行下面的代码时,收到了其后所示的错误:
read_html(paste0('https://www.plugndrive.ca/pnd_evcar_cat/',car_list[[1]],'/')) %>%
html_nodes('//*[@id="performance-container"]/ul/li[3]/span[2]/text()') %>% html_text()
Error in tokenize(css) : Unexpected character '/' found at position 1
您可以通过以下方式获取价格:
library(rvest)

# Text of every starting-price element found on the already-parsed listing page.
price <- html_text(
  html_nodes(elec_url, 'div.inner p.price-container span.starting-price')
)
price
# [1] "$32,930"  "$32,990"  "$33,749"  "$33,965"  "$37,895"  "$39,990"
# [7] "$41,499"  "$42,495"  "$42,595"  "$42,760"  "$43,998"  "$44,590"
#[13] "$44,898"  "$44,950"  "$44,995"  "$44,998"  "$44,999"  "$45,371"
#[19] "$55,990"  "$56,469"  "$66,400"  "$68,550"  "$69,400"  "$69,900"
#[25] "$72,200"  "$72,390"  "$74,950"  "$74,950"  "$80,500"  "$89,800"
#[31] "$90,000"  "$109,090" "$116,090" "$122,800" "$149,900" "$173,900"
如果您想将其转换为数字,可以使用 readr 包中的 parse_number:
# Convert the scraped price strings into plain numbers.
price %>% readr::parse_number()
# [1] 32930 32990 33749 33965 37895 39990 41499 42495 42595 42760
#[11] 43998 44590 44898 44950 44995 44998 44999 45371 55990 56469
#[21] 66400 68550 69400 69900 72200 72390 74950 74950 80500 89800
#[31] 90000 109090 116090 122800 149900 173900
编辑
也许我之前没有理解您想要的内容。如果您想从每辆车各自的页面获取电动续航里程(Electric Range)和汽油续航里程(Gasoline Range),可以先提取所有车型页面的 URL,然后再从这些页面中取出数字。
library(tidyverse)

# Link targets of the anchors inside each `div.evCar` on the listing page.
all_urls <- elec_url %>%
  html_nodes('div.evCar a') %>%
  html_attr('href')

# For every URL, download the page and keep the text of the first
# <strong> found under `div.info p`.
all_ranges <- map_chr(all_urls, ~ .x %>%
  read_html() %>%
  html_nodes('div.info p strong') %>%
  .[1] %>%
  html_text())

# Pull the digits that follow each label. Backslashes are doubled because
# '\s' and '\d' are not valid escapes inside an R string literal; the regex
# engine must receive `\s` and `\d`.
tibble(all_ranges, car_list) %>%
  mutate(
    electic_range = str_extract(all_ranges, '(?<=Electric Range:\\s)\\d+'),
    gasoline_range = str_extract(all_ranges, '(?<=Gasoline Range:\\s)\\d+')
  ) %>%
  select(-all_ranges)
# A tibble: 36 x 3
# car_list electic_range gasoline_range
# <chr> <chr> <chr>
# 1 ford-fusion-energi 42 940
# 2 toyota-prius-prime 40 995
# 3 hyundai-ioniq-phev 47 961
# 4 kia-niro-phev 42 853
# 5 volkswagen-e-golf 198 NA
# 6 mini-cooper-se 177 NA
# 7 hyundai-ioniq-electric 274 NA
# 8 subaru-crosstrek-phev 27 747
# 9 kia-soul-electric 383 NA
#10 honda-clarity-phev 77 475
# … with 26 more rows