用 rvest 抓取 - 按 div id 搜索

Scraping with rvest - search by div id

我是新手,我想从下面这个网站按车型逐一提取信息: https://www.plugndrive.ca/electric-cars-available-in-canada/

library(rvest);library(tidyverse)

# Parse the listing page once so later queries reuse the same document.
elec_url <- read_html(
  "https://www.plugndrive.ca/electric-cars-available-in-canada/"
)

# Turn each car title into a URL slug: lower-case, spaces become hyphens.
car_list <- gsub(
  " ", "-",
  tolower(html_text(html_nodes(elec_url, ".car-title")))
)

# Fetch the detail page of the first car and read its starting price as a
# number.
# Backslashes must be doubled inside R string literals: "\\." is the regex
# for a literal dot and "\\$" for a literal dollar sign. A single "\." or
# "\$" is an unrecognized escape and the script fails to parse.
price <- paste0("https://www.plugndrive.ca/pnd_evcar_cat/", car_list[[1]], "/") %>%
  read_html() %>%
  html_nodes(".starting-price .value") %>%
  html_text() %>%
  sub("\\..*", "", .) %>%    # drop everything from the first dot onwards
  gsub("^\\$|,", "", .) %>%  # strip the leading "$" and the comma separators
  as.numeric()

我该如何抓取页面上 Electric Range(纯电续航里程)的数值

(XPath = //*[@id="content"]/section[1]/div[2]/p[2]/strong/span)

也就是 42 公里?

或者抓取页面底部“Performance”(性能)选项卡中的 Electric Range 数值

(XPath = //*[@id="performance-container"]/ul/li[3]/span[2]/text())

也就是 35 公里(不要问我为什么这两个数值不相等!)

我尝试运行下面的代码时,收到了其后所示的错误:
# Attempt to read the range shown in the performance tab of the first car's
# detail page, addressing the node by its XPath.
# The XPath string is passed as the first unnamed selector argument of
# html_nodes(), and the error reported right after this snippet comes from
# tokenize(css), i.e. the string is being parsed as a CSS selector.
# NOTE(review): passing it as `xpath = "..."` should avoid the error --
# confirm against the rvest documentation.
read_html(paste0('https://www.plugndrive.ca/pnd_evcar_cat/',car_list[[1]],'/')) %>% 
html_nodes('//*[@id="performance-container"]/ul/li[3]/span[2]/text()') %>% html_text()

Error in tokenize(css) : Unexpected character '/' found at position 1

您可以通过以下方式获取价格:

library(rvest)

# Collect the text of every starting-price element on the listing page
# (elec_url is the already-parsed listing document).
price <- html_text(
  html_nodes(elec_url, "div.inner p.price-container span.starting-price")
)

# Print the scraped strings for inspection.
price
# [1] "$32,930"  "$32,990"  "$33,749"  "$33,965"  "$37,895"  "$39,990" 
# [7] "$41,499"  "$42,495"  "$42,595"  "$42,760"  "$43,998"  "$44,590" 
#[13] "$44,898"  "$44,950"  "$44,995"  "$44,998"  "$44,999"  "$45,371" 
#[19] "$55,990"  "$56,469"  "$66,400"  "$68,550"  "$69,400"  "$69,900" 
#[25] "$72,200"  "$72,390"  "$74,950"  "$74,950"  "$80,500"  "$89,800" 
#[31] "$90,000"  "$109,090" "$116,090" "$122,800" "$149,900" "$173,900"

如果您想将其转换为数字,可以使用 readr 包中的 parse_number 函数:

# Convert the scraped price strings to numbers and print the result.
price %>% readr::parse_number()
# [1]  32930  32990  33749  33965  37895  39990  41499  42495  42595  42760
#[11]  43998  44590  44898  44950  44995  44998  44999  45371  55990  56469
#[21]  66400  68550  69400  69900  72200  72390  74950  74950  80500  89800
#[31]  90000 109090 116090 122800 149900 173900

编辑

也许我之前没有理解您真正想要的内容。如果您想从每辆车各自的页面上获取纯电续航里程(Electric Range)和燃油续航里程(Gasoline Range),可以先提取所有车型页面的 URL,然后再从这些页面中提取数字。

library(tidyverse)

# Gather the href of every link inside a car tile on the listing page.
all_urls <- html_attr(html_nodes(elec_url, "div.evCar a"), "href")

# Visit each linked page and keep the text of the first <strong> element
# found under "div.info p"; map_chr() returns one string per URL.
all_ranges <- map_chr(all_urls, function(car_url) {
  strong_nodes <- html_nodes(read_html(car_url), "div.info p strong")
  html_text(strong_nodes[1])
})

# Pair each car slug with its scraped range text, then pull the digits that
# follow "Electric Range:" and "Gasoline Range:" into separate columns.
# str_extract() yields NA where the label is absent.
# Backslashes are doubled because "\s" and "\d" are not valid escapes in an
# R string literal; "\\s" and "\\d" reach the regex engine as \s and \d.
tibble(all_ranges, car_list) %>%
  mutate(
    electic_range = str_extract(all_ranges, "(?<=Electric Range:\\s)\\d+"),
    gasoline_range = str_extract(all_ranges, "(?<=Gasoline Range:\\s)\\d+")
  ) %>%
  select(-all_ranges)  # the raw text is no longer needed once parsed

# A tibble: 36 x 3
#   car_list               electic_range gasoline_range
#   <chr>                  <chr>         <chr>         
# 1 ford-fusion-energi     42            940           
# 2 toyota-prius-prime     40            995           
# 3 hyundai-ioniq-phev     47            961           
# 4 kia-niro-phev          42            853           
# 5 volkswagen-e-golf      198           NA            
# 6 mini-cooper-se         177           NA            
# 7 hyundai-ioniq-electric 274           NA            
# 8 subaru-crosstrek-phev  27            747           
# 9 kia-soul-electric      383           NA            
#10 honda-clarity-phev     77            475           
# … with 26 more rows