Only click the next-page button after the first page has been scraped, then scrape the next page

How can I make Selenium wait until Scrapy has scraped the needed information from the first page before clicking the next-page button, and only then scrape the next page? Ultimately, I am trying to repeat this process until the last page, page 301, is reached.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

#login info
username = "xxx"
password = "xxx"

class HtSpiderSelenium(scrapy.Spider):
    name = 'ht_selenium1'
    allowed_domains = ['app.xxx.bootstart.tech']
    start_urls = ['https://app.xxx.bootstart.tech']
    
    def __init__(self):
        chrome_options = Options()
        #chrome_options.add_argument("--headless")

        #get login page
        driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        driver.get("https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https%3A%2F%2Fapp.xxx.bootstart.tech%2F%3Fredirect_fragment%3D%252Fstartup&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code")

        #login
        driver.find_element_by_id("username").send_keys(username)
        driver.find_element_by_id("password").send_keys(password)
        driver.find_element_by_name("login").click()
        sleep(15)

        #next page button
        driver.find_element_by_xpath("/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a").click()
        sleep(10)


        self.html = driver.page_source
        driver.close()

    #scrape needed info
    def parse(self, response):
        resp = Selector(text=self.html)
        for startup in resp.xpath("//div[contains(@class, 'col-sm-12')]"):
            yield {
                'startup name': startup.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get(),
                'startup description': ''.join(startup.xpath('.//div//p//div//text()').getall()),  # leading '.' keeps the XPath relative to this startup
                'startup location': startup.xpath(".//h4//small[@class='ng-binding']//text()").get(),
                'startup industry': startup.xpath(".//h4//span[@class='ng-binding']/text()").get(),
            }
    

You can try something like this:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

#login info
username = "xxx"
password = "xxx"


class HtSpiderSelenium(scrapy.Spider):
    name = 'ht_selenium1'
    allowed_domains = ['app.xxx.bootstart.tech']


    def __init__(self):
        chrome_options = Options()
        #chrome_options.add_argument("--headless")

        #get login page
        self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        self.driver.get("https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https%3A%2F%2Fapp.xxx.bootstart.tech%2F%3Fredirect_fragment%3D%252Fstartup&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code")

        #login
        self.driver.find_element_by_id("username").send_keys(username)
        self.driver.find_element_by_id("password").send_keys(password)
        self.driver.find_element_by_name("login").click()
        sleep(15)  # wait for the post-login redirect to finish before reading the URL

        self.start_urls = [self.driver.current_url]

    #scrape needed info
    def parse(self, response):
        self.driver.get(response.url)
        while True:
            resp = Selector(text=self.driver.page_source)
            for startup in resp.xpath("//div[contains(@class, 'col-sm-12')]"):
                yield {
                    'startup name': startup.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get(),
                    'startup description': ''.join(startup.xpath('.//div//p//div//text()').getall()),  # leading '.' keeps the XPath relative to this startup
                    'startup location': startup.xpath(".//h4//small[@class='ng-binding']//text()").get(),
                    'startup industry': startup.xpath(".//h4//span[@class='ng-binding']/text()").get(),
                }

            #next page button; stop when it can no longer be found or clicked
            try:
                next_page = self.driver.find_element_by_xpath("/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a")
                next_page.click()
                sleep(10)  # give the next page time to render before scraping it
            except Exception:
                break
        self.driver.quit()
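
If the fixed sleep() calls prove flaky, the loop can advance on explicit waits instead. Below is a minimal sketch of the same pagination logic using WebDriverWait; the next-page XPath and the 301-page cap are taken from the question, and scrape_page is a hypothetical callback standing in for the parsing code above, so adapt both to your site:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

NEXT_PAGE_XPATH = "/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a"  # from the question
LAST_PAGE = 301  # the question's last page

def paginate(driver, scrape_page):
    wait = WebDriverWait(driver, 20)
    for page in range(1, LAST_PAGE + 1):
        # scrape the current page before touching the next-page button
        scrape_page(driver.page_source)
        if page == LAST_PAGE:
            break
        try:
            # wait until the next-page link is clickable, then follow it
            next_link = wait.until(
                EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_XPATH)))
            next_link.click()
            # wait for the listing container of the new page to be present
            wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class, 'col-sm-12')]")))
        except TimeoutException:
            break  # no clickable next-page link: treat this as the last page

Waiting on element_to_be_clickable moves on as soon as the button is ready instead of pausing a fixed number of seconds, and the explicit page cap guarantees the spider stops at page 301 even if the site keeps rendering a next-page link.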