向下滚动以抓取 Google 评论时出现问题
Issue with scrolling down to scrape Google Reviews
我正在尝试从 Google 评论(星级、评论、日期等)中抓取数据。
我尝试改编我在网上找到的代码,但无法让它正常工作。显然,R 无法向下滚动 Google 评论列表,只返回前十条评论(即 Google 无需滚动即可显示的评论)。
有人遇到过同样的问题吗?谢谢!
#install.packages("rvest")
#install.packages("xml2")
#install.packages("RSelenium")
library(rvest)
library(xml2)
library(RSelenium)
# Scrape Google reviews (text, star rating, post time) for one place:
# start a Selenium-driven Firefox session, open the reviews dialog, scroll to
# load more reviews, expand truncated reviews, then parse the page with rvest.
rmDr <- rsDriver(port = 4444L, browser = "firefox")
myclient <- rmDr$client
myclient$navigate("https://www.google.com/search?client=firefox-b-d&q=emporio+santa+maria#lrd=0x94ce576a4e45ed99:0xa36a342d3ceb06c3,1,,,")

# Click on the snippet to switch focus ----
webEle <- myclient$findElement(using = "css", value = ".review-snippet")
webEle$clickElement()

# Simulate scrolling down several times ----
# NOTE(review): the key press goes to whichever element currently has focus;
# if the review list does not scroll, send the keys to ".review-dialog-list"
# directly with sendKeysToElement() instead.
scroll_down_times <- 1000
for (i in seq_len(scroll_down_times)) {
  webEle$sendKeysToActiveElement(sendKeys = list(key = "page_down"))
  # The content needs time to load: wait 3 seconds after every 5 scroll downs.
  if (i %% 5 == 0) {
    Sys.sleep(3)
  }
}

# Click every "more" link so that truncated reviews are expanded ----
more_links <- myclient$findElements(using = "css", value = ".review-more-link")
for (more_link in more_links) {
  # A failed click is reported but must not stop the loop.
  tryCatch(
    more_link$clickElement(),
    error = function(e) {
      message("Could not click a 'more' link: ", conditionMessage(e))
    }
  )
}

# Parse the rendered page once and reuse it for every field ----
pagesource <- myclient$getPageSource()[[1]]
page <- read_html(pagesource)
review_list <- html_node(page, ".review-dialog-list")

# Full review text, including translation and original text
reviews <- page %>%
  html_nodes(".review-full-text") %>%
  html_text()

# Number of stars (taken from the aria-label of each rating widget)
stars <- review_list %>%
  html_nodes("g-review-stars > span") %>%
  html_attr("aria-label")

# Time posted
post_time <- review_list %>%
  html_nodes(".dehysf") %>%
  html_text()
代码基本正确,但您没有定位到正确的元素,请改用 CSS 选择器 .review-dialog-list。该元素是滚动条所在的位置。
library(RSelenium)
# Open the Google reviews dialog in a Selenium-driven Firefox session and
# scroll the review list itself, i.e. the element that owns the scrollbar.
reviews_url <- "https://www.google.com/search?client=firefox-b-d&q=emporio+santa+maria#lrd=0x94ce576a4e45ed99:0xa36a342d3ceb06c3,1,,,"

rmDr <- rsDriver(browser = "firefox")
driver <- rmDr$client
driver$navigate(reviews_url)

# Give the browser a few seconds to render the review window.
Sys.sleep(3)

webEle <- driver$findElement(using = "css", value = ".review-dialog-list")

# Press Page Down five times on the list, pausing one second after each press.
page_down <- list(key = "page_down")
for (press in seq_len(5)) {
  webEle$sendKeysToElement(sendKeys = page_down)
  Sys.sleep(1)
}
我正在尝试从 Google 评论(星级、评论、日期等)中抓取数据。
我尝试改编我在网上找到的代码,但无法让它正常工作。显然,R 无法向下滚动 Google 评论列表,只返回前十条评论(即 Google 无需滚动即可显示的评论)。
有人遇到过同样的问题吗?谢谢!
#install.packages("rvest")
#install.packages("xml2")
#install.packages("RSelenium")
library(rvest)
library(xml2)
library(RSelenium)
# Scrape Google reviews (text, star rating, post time) for one place:
# start a Selenium-driven Firefox session, open the reviews dialog, scroll to
# load more reviews, expand truncated reviews, then parse the page with rvest.
rmDr <- rsDriver(port = 4444L, browser = "firefox")
myclient <- rmDr$client
myclient$navigate("https://www.google.com/search?client=firefox-b-d&q=emporio+santa+maria#lrd=0x94ce576a4e45ed99:0xa36a342d3ceb06c3,1,,,")

# Click on the snippet to switch focus ----
webEle <- myclient$findElement(using = "css", value = ".review-snippet")
webEle$clickElement()

# Simulate scrolling down several times ----
# NOTE(review): the key press goes to whichever element currently has focus;
# if the review list does not scroll, send the keys to ".review-dialog-list"
# directly with sendKeysToElement() instead.
scroll_down_times <- 1000
for (i in seq_len(scroll_down_times)) {
  webEle$sendKeysToActiveElement(sendKeys = list(key = "page_down"))
  # The content needs time to load: wait 3 seconds after every 5 scroll downs.
  if (i %% 5 == 0) {
    Sys.sleep(3)
  }
}

# Click every "more" link so that truncated reviews are expanded ----
more_links <- myclient$findElements(using = "css", value = ".review-more-link")
for (more_link in more_links) {
  # A failed click is reported but must not stop the loop.
  tryCatch(
    more_link$clickElement(),
    error = function(e) {
      message("Could not click a 'more' link: ", conditionMessage(e))
    }
  )
}

# Parse the rendered page once and reuse it for every field ----
pagesource <- myclient$getPageSource()[[1]]
page <- read_html(pagesource)
review_list <- html_node(page, ".review-dialog-list")

# Full review text, including translation and original text
reviews <- page %>%
  html_nodes(".review-full-text") %>%
  html_text()

# Number of stars (taken from the aria-label of each rating widget)
stars <- review_list %>%
  html_nodes("g-review-stars > span") %>%
  html_attr("aria-label")

# Time posted
post_time <- review_list %>%
  html_nodes(".dehysf") %>%
  html_text()
代码基本正确,但您没有定位到正确的元素,请改用 CSS 选择器 .review-dialog-list。该元素是滚动条所在的位置。
library(RSelenium)
# Open the Google reviews dialog in a Selenium-driven Firefox session and
# scroll the review list itself, i.e. the element that owns the scrollbar.
reviews_url <- "https://www.google.com/search?client=firefox-b-d&q=emporio+santa+maria#lrd=0x94ce576a4e45ed99:0xa36a342d3ceb06c3,1,,,"

rmDr <- rsDriver(browser = "firefox")
driver <- rmDr$client
driver$navigate(reviews_url)

# Give the browser a few seconds to render the review window.
Sys.sleep(3)

webEle <- driver$findElement(using = "css", value = ".review-dialog-list")

# Press Page Down five times on the list, pausing one second after each press.
page_down <- list(key = "page_down")
for (press in seq_len(5)) {
  webEle$sendKeysToElement(sendKeys = page_down)
  Sys.sleep(1)
}