Web Scraping with R inside an iframe
I'm trying to extract a list of clinic names by scraping a website, but I've found that the page contains multiple iframes. This is my first time doing web scraping, and I've spent hours researching solutions to no avail.
Here is my code so far:
library(rvest)
library(stringr)

healthhub.url <- "https://www.healthhub.sg/directory/clinics-and-polyclinics/"

iframe_src <- html_session(healthhub.url) %>%
  html_node("iframe") %>%
  html_attr("src")

iframe_url <- str_c(healthhub.url, iframe_src)

html_session(iframe_url) %>%
  html_nodes(".app_ment") %>%
  html_text()
Of course it doesn't work, otherwise I wouldn't be asking the experts.
I've tried inspecting iframe_url, but it only shows one result, which contradicts what I see when I view the page source. It turns out iframe_url is not the one with the clinic names embedded in it; it's actually the second iframe, which doesn't show up.
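For reference, a quick way to list every iframe src on the page (rather than only the first one) is something along these lines:

html_session(healthhub.url) %>%
  html_nodes("iframe") %>%   # all iframes, not just the first match
  html_attr("src")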
I have no idea what went wrong. Any help would be greatly appreciated. Thanks!
Edit: To clarify, I am able to extract the first page of the list, but when I try to extract information from the subsequent pages, the URL doesn't change, which is why I suspect they are embedded in an iframe.
You can scrape the names and addresses of the clinics in the directory using HTML nodes alone, as I've done in the following code:
library(rvest)
library(stringr)

healthhub.url <- "https://www.healthhub.sg/directory/clinics-and-polyclinics/"

# Combined name-and-address text for each clinic
clinic_dets <- html_session(healthhub.url) %>%
  html_nodes(".app_ment") %>%
  html_text() %>%
  str_extract("[^\r\n].*") %>%
  trimws()

# Address text only
clinic_address <- html_session(healthhub.url) %>%
  html_nodes(".add_sign") %>%
  html_text() %>%
  str_extract("[^\r\n].*") %>%
  trimws()

# Strip the address from each combined block to isolate the clinic name;
# fixed = TRUE stops addresses from being treated as regular expressions
clinic_name <- character(length(clinic_dets))
for (clinic_idx in seq_along(clinic_dets)) {
  clinic_name[clinic_idx] <- sub(clinic_address[clinic_idx], '', clinic_dets[clinic_idx], fixed = TRUE)
}
clinic_name <- trimws(clinic_name)

df <- data.frame(clinic_name, clinic_address)
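As an aside, stringr patterns are vectorised over both the string and the pattern, so the name-extraction loop above could be collapsed into a one-liner; a sketch, equivalent given the same inputs:

clinic_name <- trimws(str_remove(clinic_dets, fixed(clinic_address)))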
Now the tricky part is navigating between the results pages, which is done via JavaScript requests. You can use Selenium to open a browser instance, simulate the navigation, and extract the data from there, as I've done in the code below:
library(rvest)
library(stringr)
library(RSelenium)

scrape_clinics_data <- function(page_html){
  #'
  #' @description Retrieves clinic names and addresses from the directory on
  #' the main source URL.
  #'
  #' @param page_html (xml_document): An XML document containing the webpage's HTML code.
  #'
  #' @return page_df (data.frame): i x 2 data.frame with columns
  #' clinic_name (chr) and clinic_address (chr), where i equals the number of
  #' clinics retrieved from the given page.
  #'
  clinic_dets <- page_html %>%
    html_nodes(".app_ment") %>%
    html_text() %>%
    str_extract("[^\r\n].*") %>%
    trimws()
  clinic_address <- page_html %>%
    html_nodes(".add_sign") %>%
    html_text() %>%
    str_extract("[^\r\n].*") %>%
    trimws()
  # Strip the address from each block to isolate the clinic name
  clinic_name <- character(length(clinic_dets))
  for (clinic_idx in seq_along(clinic_dets)) {
    clinic_name[clinic_idx] <- sub(clinic_address[clinic_idx], '', clinic_dets[clinic_idx], fixed = TRUE)
  }
  clinic_name <- trimws(clinic_name)
  # Store the scraped data in a data.frame
  page_df <- data.frame(clinic_name, clinic_address)
  return(page_df)
}
# Source URL
url <- "https://www.healthhub.sg/directory/clinics-and-polyclinics/"
# Instantiate a Selenium server and browser
# (chromever must match the version of Chrome installed locally)
rD <- rsDriver(browser = c("chrome"), chromever = "91.0.4472.19")
# Assign the client to an object
rem_dr <- rD[["client"]]
# Navigate to the URL
rem_dr$navigate(url)
# Create a data.frame to store results
df <- data.frame()
# GET HTML of the first page of results in the directory
page <- read_html(rem_dr$getPageSource()[[1]])
# Store scraped data in temporary data.frame
temp_df <- scrape_clinics_data(page)
# Append temporary data.frame to main data.frame
df <- rbind(df, temp_df)
# Collect list of navigation links
links <- rem_dr$findElements(using = 'class', value = 'page-link')
# Retrieve number of results pages
results_pages <- c()
for (idx in 1:length(links)) {
  link_txt <- unlist(links[[idx]]$getElementText())
  # Skip the empty and arrow links; for link text containing a '/',
  # keep only the part after it
  if (!link_txt %in% c("", "❮", "❯")) {
    if (length(str_split(link_txt, '/')[[1]]) > 1) {
      results_pages <- c(results_pages, str_split(link_txt, '/')[[1]][2])
    } else {
      results_pages <- c(results_pages, link_txt)
    }
  }
}
# Identify number of last results page
max_results_page <- max(as.numeric(results_pages))
# Move on to the second page of results
links[[length(links)]]$clickElement()
# Repeat the procedure from 2 to the maximum number of results pages, since we
# are already on the second page of results
for (i in 2:max_results_page) {
  page <- read_html(rem_dr$getPageSource()[[1]])
  temp_df <- scrape_clinics_data(page)
  df <- rbind(df, temp_df)
  if (i < max_results_page) {
    links <- rem_dr$findElements(using = 'class', value = 'page-link')
    links[[length(links)]]$clickElement()
  }
}
# Close the client and the server
rem_dr$close()
rD$server$stop()
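One caveat with the loop above: getPageSource() runs immediately after each clickElement(), so on a slow connection you may read the old page before the next set of results has rendered. A short pause after each click is a cheap safeguard (the two-second figure is an assumption; tune it to the site's actual load time):

links[[length(links)]]$clickElement()
Sys.sleep(2)  # assumed fixed wait; adjust or replace with an explicit wait condition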
If you're new to Selenium, I highly recommend taking a look at this introductory guide by Ivan Millanes, along with the other resources linked there.
This is a simple implementation of the code. In any case, it's always good to consider the ethics of web scraping first.