使用 rvest 和 purrr 在 R 中抓取多页数据
scraping with R using rvest and purrr, multiple pages
我正在尝试抓取一个数据库,其中包含丹麦某个地区已售出房屋的信息。我不仅想获取第 1 页的信息,还想获取第 2、3、4 页等后续页面的信息。
我是 R 的新手,下面的代码是我参照教程写出来的。
library(purrr)
library(rvest)
# URL template for the listing pages; the trailing %d is filled with the
# page number by sprintf() below.
urlbase <- "https://www.boliga.dk/solgt/alle_boliger-4000ipostnr=4000&so=1&p=%d"
# Call the anonymous function once for each page number 1..5; map_df() tries
# to row-bind whatever the function returns, and the combined result is
# assigned to BOLIGA.ROSKILDE by the right-hand arrow after the closing brace.
map_df(1:5,function(i){
# Progress indicator: print one dot per page.
cat(".")
# Download and parse page i.
page <- read_html(sprintf(urlbase,i))
# NOTE(review): the data.frame( call is closed at the end of this line, so it
# holds only the Address column and its value is discarded.
data.frame(Address = html_text(html_nodes(page,".d-md-table-cell a")))
# The next four lines are therefore standalone `=` assignments to local
# variables, not arguments of data.frame().
Price = html_text(html_nodes(page,".text-md-left+ .d-md-table-cell .text-right"))
Rooms = html_text(html_nodes(page,".d-md-table-cell:nth-child(5) .paddingR"))
m2 = html_text(html_nodes(page,".qtipped+ .d-md-table-cell .paddingR"))
# This assignment is the last expression in the body, so the function returns
# FALSE rather than a data frame -- most likely the cause of the
# "Argument 1 must have names" error from the row-binding step.
stringsAsFactors = FALSE
}) -> BOLIGA.ROSKILDE
# Open the result in the data viewer.
View(BOLIGA.ROSKILDE)
运行后我得到了以下错误信息:
Error in bind_rows_(x, .id) : Argument 1 must have names
欢迎任何帮助
试试这个:
library(rvest)
library(tidyverse)
# First results page; it is read below to work out how many pages to scrape.
url <- "https://www.boliga.dk/solgt/alle_boliger-4000ipostnr=4000?ipostnr=4000ipostnr&so=1&p=1"
# Find the number of pages in the table: take the text of the <b> element(s)
# inside ".d-print-none" (presumably the total hit count -- confirm), strip
# every non-digit character, convert to numeric, divide by 40 and round up.
# NOTE(review): 40 is presumably the number of rows per page -- confirm.
# The digit class must be written "\\d": a bare "\d" is not a valid escape in
# an R string literal and stops the script at parse time.
pgs <- ceiling(
  read_html(url) %>%
    html_nodes(".d-print-none") %>%
    html_nodes("b") %>%
    html_text() %>%
    gsub("[^\\d]+", "", ., perl = TRUE) %>%
    as.numeric() / 40
)
# Scrape the results table of a single page.
#
# pg: page number, appended to the listing URL as the value of `p`.
# Returns the ".searchResultTable" table reduced to columns 1, 2, 5 and 4
# (in that order), renamed Address / Price / Rooms / m2, with m2 converted
# to numeric.
scrap <- function(pg) {
  page_url <- paste0("https://www.boliga.dk/solgt/alle_boliger-4000ipostnr=4000?ipostnr=4000ipostnr&so=1&p=", pg)
  listing <- html_table(html_node(read_html(page_url), ".searchResultTable"))
  listing <- listing[, c(1, 2, 5, 4)]
  colnames(listing) <- c("Address", "Price", "Rooms", "m2")
  mutate(listing, m2 = as.numeric(m2))
}
# Scrape every page with scrap() and row-bind the per-page tables into one
# data frame. seq_len() yields an empty sequence when pgs is 0, whereas
# seq(1, pgs) would count down and request pages 1 and 0.
df <- map_df(seq_len(pgs), scrap)
我正在尝试抓取一个数据库,其中包含丹麦某个地区已售出房屋的信息。我不仅想获取第 1 页的信息,还想获取第 2、3、4 页等后续页面的信息。
我是 R 的新手,下面的代码是我参照教程写出来的。
library(purrr)
library(rvest)
# URL template for the listing pages; the trailing %d is filled with the
# page number by sprintf() below.
urlbase <- "https://www.boliga.dk/solgt/alle_boliger-4000ipostnr=4000&so=1&p=%d"
# Call the anonymous function once for each page number 1..5; map_df() tries
# to row-bind whatever the function returns, and the combined result is
# assigned to BOLIGA.ROSKILDE by the right-hand arrow after the closing brace.
map_df(1:5,function(i){
# Progress indicator: print one dot per page.
cat(".")
# Download and parse page i.
page <- read_html(sprintf(urlbase,i))
# NOTE(review): the data.frame( call is closed at the end of this line, so it
# holds only the Address column and its value is discarded.
data.frame(Address = html_text(html_nodes(page,".d-md-table-cell a")))
# The next four lines are therefore standalone `=` assignments to local
# variables, not arguments of data.frame().
Price = html_text(html_nodes(page,".text-md-left+ .d-md-table-cell .text-right"))
Rooms = html_text(html_nodes(page,".d-md-table-cell:nth-child(5) .paddingR"))
m2 = html_text(html_nodes(page,".qtipped+ .d-md-table-cell .paddingR"))
# This assignment is the last expression in the body, so the function returns
# FALSE rather than a data frame -- most likely the cause of the
# "Argument 1 must have names" error from the row-binding step.
stringsAsFactors = FALSE
}) -> BOLIGA.ROSKILDE
# Open the result in the data viewer.
View(BOLIGA.ROSKILDE)
运行后我得到了以下错误信息:
Error in bind_rows_(x, .id) : Argument 1 must have names
欢迎任何帮助
试试这个:
library(rvest)
library(tidyverse)
# First results page; it is read below to work out how many pages to scrape.
url <- "https://www.boliga.dk/solgt/alle_boliger-4000ipostnr=4000?ipostnr=4000ipostnr&so=1&p=1"
# Find the number of pages in the table: take the text of the <b> element(s)
# inside ".d-print-none" (presumably the total hit count -- confirm), strip
# every non-digit character, convert to numeric, divide by 40 and round up.
# NOTE(review): 40 is presumably the number of rows per page -- confirm.
# The digit class must be written "\\d": a bare "\d" is not a valid escape in
# an R string literal and stops the script at parse time.
pgs <- ceiling(
  read_html(url) %>%
    html_nodes(".d-print-none") %>%
    html_nodes("b") %>%
    html_text() %>%
    gsub("[^\\d]+", "", ., perl = TRUE) %>%
    as.numeric() / 40
)
# Scrape the results table of a single page.
#
# pg: page number, appended to the listing URL as the value of `p`.
# Returns the ".searchResultTable" table reduced to columns 1, 2, 5 and 4
# (in that order), renamed Address / Price / Rooms / m2, with m2 converted
# to numeric.
scrap <- function(pg) {
  page_url <- paste0("https://www.boliga.dk/solgt/alle_boliger-4000ipostnr=4000?ipostnr=4000ipostnr&so=1&p=", pg)
  listing <- html_table(html_node(read_html(page_url), ".searchResultTable"))
  listing <- listing[, c(1, 2, 5, 4)]
  colnames(listing) <- c("Address", "Price", "Rooms", "m2")
  mutate(listing, m2 = as.numeric(m2))
}
# Scrape every page with scrap() and row-bind the per-page tables into one
# data frame. seq_len() yields an empty sequence when pgs is 0, whereas
# seq(1, pgs) would count down and request pages 1 and 0.
df <- map_df(seq_len(pgs), scrap)