如何使用 selenium 和 scrapy 来自动化这个过程?
How to use selenium along with scrapy to automate the process?
我一度了解到您需要使用像 selenium 这样的网络工具包来自动化抓取。
我如何才能单击 google Play 商店上的下一步按钮,以便只为我的大学目的抓取评论!!
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time
class Product(scrapy.Item):
title = scrapy.Field()
class FooSpider(CrawlSpider):
name = 'foo'
start_urls = ["https://play.google.com/store/apps/details?id=com.gaana&hl=en"]
def __init__(self, *args, **kwargs):
super(FooSpider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Chrome(executable_path="C:\chrm\chromedriver.exe")
self.browser.implicitly_wait(60) #
def parse(self,response):
self.browser.get(response.url)
sites = response.xpath('//div[@class="single-review"]/div[@class="review-header"]')
items = []
for i in range(0,200):
time.sleep(20)
button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
button.click()
self.browser.implicitly_wait(30)
for site in sites:
item = Product()
item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
yield item
我已经更新了我的代码,它只再次给我重复 40 个项目,again.whats 我的 for 循环有问题吗?
似乎正在更新的源代码没有传递给 xpath,这就是为什么它返回相同的 40 个项目
我会这样做:
from scrapy import CrawlSpider
from selenium import webdriver
import time
class FooSpider(CrawlSpider):
name = 'foo'
allow_domains = 'foo.com'
start_urls = ['foo.com']
def __init__(self, *args, **kwargs):
super(FooSpider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Firefox()
self.browser.implicitly_wait(60)
def parse_foo(self.response):
self.browser.get(response.url) # load response to the browser
button = self.browser.find_element_by_xpath("path") # find
# the element to click to
button.click() # click
time.sleep(1) # wait until the page is fully loaded
source = self.browser.page_source # get source of the loaded page
sel = Selector(text=source) # create a Selector object
data = sel.xpath('path/to/the/data') # select data
...
不过,最好不要等待固定的时间。因此,您可以使用此处描述的方法之一代替 time.sleep(1)
。
我一度了解到您需要使用像 selenium 这样的网络工具包来自动化抓取。
我如何才能单击 google Play 商店上的下一步按钮,以便只为我的大学目的抓取评论!!
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time
class Product(scrapy.Item):
title = scrapy.Field()
class FooSpider(CrawlSpider):
name = 'foo'
start_urls = ["https://play.google.com/store/apps/details?id=com.gaana&hl=en"]
def __init__(self, *args, **kwargs):
super(FooSpider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Chrome(executable_path="C:\chrm\chromedriver.exe")
self.browser.implicitly_wait(60) #
def parse(self,response):
self.browser.get(response.url)
sites = response.xpath('//div[@class="single-review"]/div[@class="review-header"]')
items = []
for i in range(0,200):
time.sleep(20)
button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
button.click()
self.browser.implicitly_wait(30)
for site in sites:
item = Product()
item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
yield item
我已经更新了我的代码,它只再次给我重复 40 个项目,again.whats 我的 for 循环有问题吗?
似乎正在更新的源代码没有传递给 xpath,这就是为什么它返回相同的 40 个项目
我会这样做:
from scrapy import CrawlSpider
from selenium import webdriver
import time
class FooSpider(CrawlSpider):
name = 'foo'
allow_domains = 'foo.com'
start_urls = ['foo.com']
def __init__(self, *args, **kwargs):
super(FooSpider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Firefox()
self.browser.implicitly_wait(60)
def parse_foo(self.response):
self.browser.get(response.url) # load response to the browser
button = self.browser.find_element_by_xpath("path") # find
# the element to click to
button.click() # click
time.sleep(1) # wait until the page is fully loaded
source = self.browser.page_source # get source of the loaded page
sel = Selector(text=source) # create a Selector object
data = sel.xpath('path/to/the/data') # select data
...
不过,最好不要等待固定的时间。因此,您可以使用此处描述的方法之一代替 time.sleep(1)
。