我是使用 rvest 进行 R 网络抓取的新手。一直在尝试抓取肯尼亚的一个房地产网站
I am new to R web scraping using rvest. Have been trying to scrape a real estate website in Kenya
运行下面的程序后,我得到如下错误:Error in data.frame(SalePrice, BedRooms, Park_Spaces, Toilets, Bathrooms, : arguments imply differing number of rows: 20, 13, 14
块引用
library(rvest)
library(tidyverse)
library(stringr)
# Accumulates the rows scraped from every results page.
df<-data.frame()
# Read result pages 1 through 10, one page per iteration.
for(i in 1:10){
Link<-paste0("https://kenyapropertycentre.com/for-sale/houses/nairobi/showtype?page=",i)
page<-read_html(Link)
# Every field below is selected across the WHOLE page, so each vector holds
# one element per matching node rather than one element per listing.
SalePrice<-page %>% html_nodes(".price+.price") %>%
html_text() %>% str_remove_all(",") %>% as.numeric()
BedRooms<-page %>% html_nodes(".fa-bed+ span")%>%
html_text()%>% as.numeric()
# This guard only fires when no node matched at all; it does not pad a
# vector that is merely shorter than the others.
if (purrr::is_empty(BedRooms)){BedRooms=NA}
# NOTE(review): Address is extracted here but never passed to data.frame()
# at the bottom of the loop.
Address<-page %>% html_nodes(".voffset-bottom-10 strong") %>%
html_text() %>% as.character()
Bathrooms<-page %>% html_nodes(".fa-bath+ span")%>%
html_text() %>% as.numeric()
if (purrr::is_empty(Bathrooms)){Bathrooms=NA}
Toilets<-page %>% html_nodes(".fa-toilet+ span")%>%
html_text() %>% as.numeric()
if (purrr::is_empty(Toilets)){Toilets=NA}
Park_Spaces<-page %>% html_nodes(".fa-car+ span")%>%
html_text() %>% as.numeric()
if (purrr::is_empty(Park_Spaces)){Park_Spaces=NA}
# Strip digits, newlines, spaces and punctuation from the ".text-right" text.
Sales_Agency<-page %>% html_nodes(".text-right") %>% html_text()%>%
str_remove_all("[0-9]") %>% str_remove_all("\n") %>% str_remove_all(" ") %>%
str_remove_all("[:punct:]") %>% as.character()
# Drop the literal "Added on" label from the date text.
Date_Added<-page %>% html_nodes(".added-on+.added-on") %>% html_text()%>%
str_remove_all("Added on")
# data.frame() needs columns of equal (or recyclable) length, so this call
# stops with "arguments imply differing number of rows" whenever the
# per-field vectors above differ in length.
df<-rbind(df,data.frame(SalePrice,BedRooms,Park_Spaces,Toilets,Bathrooms,
Sales_Agency,Date_Added))
# Progress message for the page just processed.
cat(paste("page",i),"\n")
}
首先生成网址列表。然后对这些 URL 应用一个函数,该函数先获取所有父节点(`.property`)的列表。接着使用 `map_dfr` 应用另一个函数,从每个房源条目中提取所需信息,并汇总到最终的数据框中。您不必担心手动补 NA:当某个元素不存在时,`html_element` 会默认返回 NA。
library(rvest)
library(tidyverse)
library(stringr)
# Build the URLs of result pages 1 through 10 up front.
links <- sprintf(
  "https://kenyapropertycentre.com/for-sale/houses/nairobi/showtype?page=%i",
  seq_len(10)
)
#' Read one results page and collect its listing containers.
#'
#' @param url Address of a results page, passed straight to `read_html()`.
#' @return The elements of the parsed page that match the CSS selector
#'   ".property".
get_listings <- function(url) {
  page <- read_html(url)
  html_elements(page, ".property")
}
#' Extract the fields of a single listing into a one-row tibble.
#'
#' Every field is looked up with `html_element()` relative to `listing`, so a
#' field the listing lacks is expected to come back as `NA` instead of
#' shortening the column.
#'
#' @param listing One listing node, as produced by `get_listings()`.
#' @return A tibble with columns SalePrice, BedRooms, Address, Bathrooms,
#'   Toilets, Park_Spaces, Sales_Agency and Date_Added.
get_listing_info <- function(listing) {
  # Text of the first descendant of `listing` matching the CSS selector.
  text_of <- function(css) {
    html_text(html_element(listing, css))
  }
  # Same lookup, coerced to a number.
  number_of <- function(css) {
    as.numeric(text_of(css))
  }

  # Prices carry comma separators that must go before numeric coercion.
  price_text <- str_remove_all(text_of(".price+.price"), ",")

  # Agency name: drop digits, newlines, spaces and punctuation, in that order.
  agency_text <- text_of(".text-right")
  agency_text <- str_remove_all(agency_text, "[0-9]")
  agency_text <- str_remove_all(agency_text, "\n")
  agency_text <- str_remove_all(agency_text, " ")
  agency_text <- str_remove_all(agency_text, "[:punct:]")

  tibble(
    SalePrice = as.numeric(price_text),
    BedRooms = number_of(".fa-bed+ span"),
    Address = as.character(text_of(".voffset-bottom-10 strong")),
    Bathrooms = number_of(".fa-bath+ span"),
    Toilets = number_of(".fa-toilet+ span"),
    Park_Spaces = number_of(".fa-car+ span"),
    Sales_Agency = as.character(agency_text),
    Date_Added = str_remove_all(text_of(".added-on+.added-on"), "Added on")
  )
}
# Fetch the listing nodes of every page, then flatten the per-page results by
# one level so that each element is a single listing.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn this into a recursive unlist.
all_listings <- purrr::map(links, get_listings) %>%
  unlist(recursive = FALSE)

# One row per listing, row-bound into a single data frame.
df <- map_dfr(all_listings, get_listing_info)
运行下面的程序后,我得到如下错误:Error in data.frame(SalePrice, BedRooms, Park_Spaces, Toilets, Bathrooms, : arguments imply differing number of rows: 20, 13, 14
块引用
library(rvest)
library(tidyverse)
library(stringr)
# Accumulates the rows scraped from every results page.
df<-data.frame()
# Read result pages 1 through 10, one page per iteration.
for(i in 1:10){
Link<-paste0("https://kenyapropertycentre.com/for-sale/houses/nairobi/showtype?page=",i)
page<-read_html(Link)
# Every field below is selected across the WHOLE page, so each vector holds
# one element per matching node rather than one element per listing.
SalePrice<-page %>% html_nodes(".price+.price") %>%
html_text() %>% str_remove_all(",") %>% as.numeric()
BedRooms<-page %>% html_nodes(".fa-bed+ span")%>%
html_text()%>% as.numeric()
# This guard only fires when no node matched at all; it does not pad a
# vector that is merely shorter than the others.
if (purrr::is_empty(BedRooms)){BedRooms=NA}
# NOTE(review): Address is extracted here but never passed to data.frame()
# at the bottom of the loop.
Address<-page %>% html_nodes(".voffset-bottom-10 strong") %>%
html_text() %>% as.character()
Bathrooms<-page %>% html_nodes(".fa-bath+ span")%>%
html_text() %>% as.numeric()
if (purrr::is_empty(Bathrooms)){Bathrooms=NA}
Toilets<-page %>% html_nodes(".fa-toilet+ span")%>%
html_text() %>% as.numeric()
if (purrr::is_empty(Toilets)){Toilets=NA}
Park_Spaces<-page %>% html_nodes(".fa-car+ span")%>%
html_text() %>% as.numeric()
if (purrr::is_empty(Park_Spaces)){Park_Spaces=NA}
# Strip digits, newlines, spaces and punctuation from the ".text-right" text.
Sales_Agency<-page %>% html_nodes(".text-right") %>% html_text()%>%
str_remove_all("[0-9]") %>% str_remove_all("\n") %>% str_remove_all(" ") %>%
str_remove_all("[:punct:]") %>% as.character()
# Drop the literal "Added on" label from the date text.
Date_Added<-page %>% html_nodes(".added-on+.added-on") %>% html_text()%>%
str_remove_all("Added on")
# data.frame() needs columns of equal (or recyclable) length, so this call
# stops with "arguments imply differing number of rows" whenever the
# per-field vectors above differ in length.
df<-rbind(df,data.frame(SalePrice,BedRooms,Park_Spaces,Toilets,Bathrooms,
Sales_Agency,Date_Added))
# Progress message for the page just processed.
cat(paste("page",i),"\n")
}
首先生成网址列表。然后对这些 URL 应用一个函数,该函数先获取所有父节点(`.property`)的列表。接着使用 `map_dfr` 应用另一个函数,从每个房源条目中提取所需信息,并汇总到最终的数据框中。您不必担心手动补 NA:当某个元素不存在时,`html_element` 会默认返回 NA。
library(rvest)
library(tidyverse)
library(stringr)
# Build the URLs of result pages 1 through 10 up front.
links <- sprintf(
  "https://kenyapropertycentre.com/for-sale/houses/nairobi/showtype?page=%i",
  seq_len(10)
)
#' Read one results page and collect its listing containers.
#'
#' @param url Address of a results page, passed straight to `read_html()`.
#' @return The elements of the parsed page that match the CSS selector
#'   ".property".
get_listings <- function(url) {
  page <- read_html(url)
  html_elements(page, ".property")
}
#' Extract the fields of a single listing into a one-row tibble.
#'
#' Every field is looked up with `html_element()` relative to `listing`, so a
#' field the listing lacks is expected to come back as `NA` instead of
#' shortening the column.
#'
#' @param listing One listing node, as produced by `get_listings()`.
#' @return A tibble with columns SalePrice, BedRooms, Address, Bathrooms,
#'   Toilets, Park_Spaces, Sales_Agency and Date_Added.
get_listing_info <- function(listing) {
  # Text of the first descendant of `listing` matching the CSS selector.
  text_of <- function(css) {
    html_text(html_element(listing, css))
  }
  # Same lookup, coerced to a number.
  number_of <- function(css) {
    as.numeric(text_of(css))
  }

  # Prices carry comma separators that must go before numeric coercion.
  price_text <- str_remove_all(text_of(".price+.price"), ",")

  # Agency name: drop digits, newlines, spaces and punctuation, in that order.
  agency_text <- text_of(".text-right")
  agency_text <- str_remove_all(agency_text, "[0-9]")
  agency_text <- str_remove_all(agency_text, "\n")
  agency_text <- str_remove_all(agency_text, " ")
  agency_text <- str_remove_all(agency_text, "[:punct:]")

  tibble(
    SalePrice = as.numeric(price_text),
    BedRooms = number_of(".fa-bed+ span"),
    Address = as.character(text_of(".voffset-bottom-10 strong")),
    Bathrooms = number_of(".fa-bath+ span"),
    Toilets = number_of(".fa-toilet+ span"),
    Park_Spaces = number_of(".fa-car+ span"),
    Sales_Agency = as.character(agency_text),
    Date_Added = str_remove_all(text_of(".added-on+.added-on"), "Added on")
  )
}
# Fetch the listing nodes of every page, then flatten the per-page results by
# one level so that each element is a single listing.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn this into a recursive unlist.
all_listings <- purrr::map(links, get_listings) %>%
  unlist(recursive = FALSE)

# One row per listing, row-bound into a single data frame.
df <- map_dfr(all_listings, get_listing_info)