超时异常硒
TimeoutException Selenium
启动抓取工具后,发生了一些奇怪的事情:它要么正常工作,要么在访问第二页并单击“下一步”按钮后结束,要么以某种方式结束在 属性 页面上,当我使用代码时当前被注释掉的行。然而,当那条线像现在这样放置时,它似乎工作了,它访问了所有页面并抓取了它们,最终,我得到了一个超时。我不确定是什么问题?有小费吗?
当前代码:
class PropertyFoxSpider(scrapy.Spider):
name = 'property_fox'
start_urls = [
'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
]
def __init__(self):
#path to driver
self.driver = webdriver.Chrome('my_path')
def parse(self,response):
url = self.driver.get(response.url)
while True:
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
try:
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
#WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
except TimeoutException:
break
def parse_page(self, response):
for prop in response.css('div.property-item'):
link = prop.css('a::attr(href)').get()
banner = prop.css('div.property-figure-icon div::text').get()
sold_tag = None
if banner:
banner = banner.strip()
sold_tag = 'sold' if 'sold' in banner.lower() else None
yield scrapy.Request(
link,
callback=self.parse_property,
meta={'item': {
'agency': self.name,
'url': link,
'offering': 'buy',
'banners': banner,
'sold_tag': sold_tag,
}},
)
def parse_property(self, response):
item = response.meta.get('item')
...
Selenium 似乎“识别”甚至禁用了“下一步”按钮作为可点击元素,并且即使在最后一页仍然尝试点击它。您可以尝试以下代码使其工作:
def parse(self,response):
self.driver.get(response.url)
url = self.driver.current_url
while True:
try:
elem = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[@id="pagerNext" and not(@class="disabled")]')))
elem.click()
except TimeoutException:
break
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
请注意,我将 (By.ID, "pagerNext")
定位器替换为 (By.XPATH, '//span[@id="pagerNext" and not(@class="disabled")]')
,因此现在仅启用下一步按钮
启动抓取工具后,发生了一些奇怪的事情:它要么正常工作,要么在访问第二页并单击“下一步”按钮后结束,要么以某种方式结束在 属性 页面上,当我使用代码时当前被注释掉的行。然而,当那条线像现在这样放置时,它似乎工作了,它访问了所有页面并抓取了它们,最终,我得到了一个超时。我不确定是什么问题?有小费吗? 当前代码:
class PropertyFoxSpider(scrapy.Spider):
name = 'property_fox'
start_urls = [
'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
]
def __init__(self):
#path to driver
self.driver = webdriver.Chrome('my_path')
def parse(self,response):
url = self.driver.get(response.url)
while True:
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
try:
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
#WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
except TimeoutException:
break
def parse_page(self, response):
for prop in response.css('div.property-item'):
link = prop.css('a::attr(href)').get()
banner = prop.css('div.property-figure-icon div::text').get()
sold_tag = None
if banner:
banner = banner.strip()
sold_tag = 'sold' if 'sold' in banner.lower() else None
yield scrapy.Request(
link,
callback=self.parse_property,
meta={'item': {
'agency': self.name,
'url': link,
'offering': 'buy',
'banners': banner,
'sold_tag': sold_tag,
}},
)
def parse_property(self, response):
item = response.meta.get('item')
...
Selenium 似乎“识别”甚至禁用了“下一步”按钮作为可点击元素,并且即使在最后一页仍然尝试点击它。您可以尝试以下代码使其工作:
def parse(self,response):
self.driver.get(response.url)
url = self.driver.current_url
while True:
try:
elem = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[@id="pagerNext" and not(@class="disabled")]')))
elem.click()
except TimeoutException:
break
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
请注意,我将 (By.ID, "pagerNext")
定位器替换为 (By.XPATH, '//span[@id="pagerNext" and not(@class="disabled")]')
,因此现在仅启用下一步按钮