I am new to web scraping in R with rvest. I have been trying to scrape a real estate website in Kenya.

After running the program below, I get this error:

Error in data.frame(SalePrice, BedRooms, Park_Spaces, Toilets, Bathrooms, : arguments imply differing number of rows: 20, 13, 14
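This is the generic error data.frame() raises whenever its column vectors do not all have the same length (and cannot be recycled). A minimal illustration, unrelated to the scraper itself:

data.frame(a = 1:20, b = 1:13)
#> Error in data.frame(a = 1:20, b = 1:13) :
#>   arguments imply differing number of rows: 20, 13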


library(rvest)
library(tidyverse)
library(stringr)

df<-data.frame()
for(i in 1:10){
  Link<-paste0("https://kenyapropertycentre.com/for-sale/houses/nairobi/showtype?page=",i)
  page<-read_html(Link)
  SalePrice<-page %>% html_nodes(".price+.price") %>% 
      html_text() %>% str_remove_all(",") %>% as.numeric()
  BedRooms<-page %>% html_nodes(".fa-bed+ span")%>%
      html_text()%>% as.numeric()
  if (purrr::is_empty(BedRooms)){BedRooms=NA}
  Address<-page %>% html_nodes(".voffset-bottom-10 strong") %>% 
      html_text() %>% as.character()
  Bathrooms<-page %>% html_nodes(".fa-bath+ span")%>%
      html_text() %>% as.numeric()
  if (purrr::is_empty(Bathrooms)){Bathrooms=NA}
  
  Toilets<-page %>% html_nodes(".fa-toilet+ span")%>%
      html_text() %>% as.numeric()
  if (purrr::is_empty(Toilets)){Toilets=NA}
  
  Park_Spaces<-page %>% html_nodes(".fa-car+ span")%>% 
    html_text() %>% as.numeric() 
  if (purrr::is_empty(Park_Spaces)){Park_Spaces=NA} 
  
  Sales_Agency<-page %>% html_nodes(".text-right") %>% html_text()%>% 
      str_remove_all("[0-9]") %>% str_remove_all("\n") %>% str_remove_all(" ") %>%
      str_remove_all("[:punct:]") %>% as.character()
  Date_Added<-page %>% html_nodes(".added-on+.added-on") %>% html_text()%>%
      str_remove_all("Added on")
  df<-rbind(df,data.frame(SalePrice,BedRooms,Park_Spaces,Toilets,Bathrooms,
                            Sales_Agency,Date_Added))
  cat(paste("page",i),"\n")
}

First generate the list of URLs. Then apply a function over those URLs that collects the list of all the parent nodes (.property). Then use map_dfr to apply a function that extracts the required information from each property listing and maps it into the final data frame. Working per listing instead of per page is what avoids the length mismatch: html_nodes() applied to the whole page silently skips any listing that is missing a field, so the page-level vectors end up with different lengths. You also don't have to worry about inserting NA yourself, because html_element() returns a missing value by default when the node does not exist.
library(rvest)
library(tidyverse)
library(stringr)

links <- sprintf("https://kenyapropertycentre.com/for-sale/houses/nairobi/showtype?page=%i", 1:10)

get_listings <- function(url) {
  # One parent node per property card on the page
  read_html(url) %>%
    html_elements(".property")
}

get_listing_info <- function(listing) {
  # html_element() returns a missing node when a field is absent from a
  # listing, so every column stays the same length and simply holds NA
  tibble(
    SalePrice = listing %>% html_element(".price+.price") %>% html_text() %>% str_remove_all(",") %>% as.numeric(),
    BedRooms = listing %>% html_element(".fa-bed+ span") %>% html_text() %>% as.numeric(),
    Address = listing %>% html_element(".voffset-bottom-10 strong") %>% html_text() %>% as.character(),
    Bathrooms = listing %>% html_element(".fa-bath+ span") %>% html_text() %>% as.numeric(),
    Toilets = listing %>% html_element(".fa-toilet+ span") %>% html_text() %>% as.numeric(),
    Park_Spaces = listing %>% html_element(".fa-car+ span") %>% html_text() %>% as.numeric(),
    Sales_Agency = listing %>% html_element(".text-right") %>% html_text() %>% str_remove_all("[0-9]") %>% str_remove_all("\n") %>% str_remove_all(" ") %>% str_remove_all("[:punct:]") %>% as.character(),
    Date_Added = listing %>% html_element(".added-on+.added-on") %>% html_text() %>% str_remove_all("Added on")
  )
}


all_listings <- purrr::map(links, get_listings) %>% unlist(recursive = F)
df <- map_dfr(all_listings, get_listing_info)
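The behaviour that makes this robust: html_element() (singular) returns a missing node when the selector does not match inside a listing, and html_text() turns that into NA, whereas html_elements() silently drops non-matches. A quick sketch with a made-up HTML snippet (not taken from the actual site):

library(rvest)

snippet <- minimal_html('<div class="property"><span class="price">1,000</span></div>')
snippet %>% html_element(".fa-toilet + span") %>% html_text()
#> [1] NA
snippet %>% html_elements(".fa-toilet + span") %>% html_text()
#> character(0)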