使用 Selenium 和 Scrapy 抓取所有下一页
Grab all next pages using Selenium and Scrapy
我正在尝试获取所有“下一页”,并继续抓取这些页面,方法是使用 Selenium 单击页面底部的“下一页”按钮。我想获得所有这些(第 2、3、4 页等)。但是,我不确定我在这里做错了什么,但我无法使 'click' 选项起作用。
这是我的代码:
import scrapy
import re
import math
from selenium import webdriver
import time
class PropertyFoxSpider(scrapy.Spider):
name = 'property_fox'
start_urls = [
'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
]
def __init__(self):
#path to driver
self.driver = webdriver.Chrome('my_path_here')
def parse(self, response):
self.driver.get(response.url)
for prop in response.css('div.property-item'):
link = prop.css('a::attr(href)').get()
banner = prop.css('div.property-figure-icon div::text').get()
sold_tag = None
if banner:
banner = banner.strip()
sold_tag = 'sold' if 'sold' in banner.lower() else None
yield scrapy.Request(
link,
callback=self.parse_property,
meta={'item': {
'agency': self.name,
'url': link,
'offering': 'buy',
'banners': banner,
'sold_tag': sold_tag,
}},
)
elem = self.driver.find_element_by_id('pagerNext')
#elem = self.driver.find_element_by_xpath('//*[@id="pagerNext"]')
elem.click()
time.sleep(0.2)
def parse_property(self, response):
item = response.meta.get('item')
. . .
尝试等到元素可点击:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
我正在尝试获取所有“下一页”,并继续抓取这些页面,方法是使用 Selenium 单击页面底部的“下一页”按钮。我想获得所有这些(第 2、3、4 页等)。但是,我不确定我在这里做错了什么,但我无法使 'click' 选项起作用。
这是我的代码:
import scrapy
import re
import math
from selenium import webdriver
import time
class PropertyFoxSpider(scrapy.Spider):
name = 'property_fox'
start_urls = [
'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
]
def __init__(self):
#path to driver
self.driver = webdriver.Chrome('my_path_here')
def parse(self, response):
self.driver.get(response.url)
for prop in response.css('div.property-item'):
link = prop.css('a::attr(href)').get()
banner = prop.css('div.property-figure-icon div::text').get()
sold_tag = None
if banner:
banner = banner.strip()
sold_tag = 'sold' if 'sold' in banner.lower() else None
yield scrapy.Request(
link,
callback=self.parse_property,
meta={'item': {
'agency': self.name,
'url': link,
'offering': 'buy',
'banners': banner,
'sold_tag': sold_tag,
}},
)
elem = self.driver.find_element_by_id('pagerNext')
#elem = self.driver.find_element_by_xpath('//*[@id="pagerNext"]')
elem.click()
time.sleep(0.2)
def parse_property(self, response):
item = response.meta.get('item')
. . .
尝试等到元素可点击:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()