如何使用 python selenium 关闭 popover-form？

Question

我想从 indeed.com 自动抓取职位名称、雇主和职位描述。

它工作正常，直到循环进入第二页。

到第二页时会出现一个弹出窗口，我需要点击其中的 "Nein, danke."（"不，谢谢"）按钮才能继续。这个弹出窗口一出现，循环就停止了。

我的代码看起来像这样，并且在第二页之前工作正常：

# Import the packages
from selenium import webdriver
from time import sleep
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import gensim

# Start Webscraping
# Open a Safari-driven browser session and maximize its window.
driver = webdriver.Safari()
driver.maximize_window()

# List with indeed URLs to scrape through
# Each entry is a search-result URL; the later entries differ only by the
# `start` query parameter (10, 20, 30, 40).
indeed_url_list = ['https://de.indeed.com/Jobs?q=data&l=&sort=date',
                   'https://de.indeed.com/jobs?q=Data&sort=date&start=10',
                   'https://de.indeed.com/jobs?q=Data&sort=date&start=20',
                   'https://de.indeed.com/jobs?q=Data&sort=date&start=30',
                   'https://de.indeed.com/jobs?q=Data&sort=date&start=40'
                   ]

# Empty lists that will be filled
# All four lists are appended to once per job link, inside the loop below.
indeed_job_links = []         # list with links to scrape through
indeed_job_titles = []        # list with job titles
indeed_job_employers = []     # list with job employers
indeed_job_descriptions = []  # list with job descriptions

# for loop for scraping
for indeed_page in indeed_url_list:    
    driver.get(indeed_page)    
    # Collect the <a> elements nested under the job-card <div>s; the cards are
    # matched by the exact class strings enumerated in the XPath.
    links = driver.find_elements_by_xpath('//div[@class="jobsearch-SerpJobCard row result clickcard" or @class="jobsearch-SerpJobCard row sjlast result clickcard" or @class="jobsearch-SerpJobCard row result clickcard vjs-highlight" or @class="jobsearch-SerpJobCard lastRow row result clickcard" or @class="jobsearch-SerpJobCard row result clickcard vjs-highlight"]/*/a')     

    # get job link to list
    for i in list(links):
        indeed_job_links.append(i.get_attribute('href'))

    # scrape through the job descriptions
    for link in links:
        # open the link
        # NOTE: according to the question text, a popover appears once the loop
        # reaches the second page and the loop stops; nothing in this script
        # closes that popover.
        link.click()
        # Fixed pause after the click — presumably to let the job details
        # render before they are read; TODO confirm the delay is sufficient.
        sleep(0.6)
        # get job title to list
        indeed_title = driver.find_element_by_xpath('//div[@id="vjs-jobtitle"]').text
        indeed_job_titles.append(indeed_title)
        # get job employer to list
        indeed_employer = driver.find_element_by_xpath('//span[@id="vjs-cn"]').text
        indeed_job_employers.append(indeed_employer)
        # get job description to list
        # The description text is tokenized and the tokens are re-joined with
        # single spaces before being stored.
        indeed_description = ' '.join(word_tokenize(driver.find_element_by_xpath('//div[@id="vjs-desc"]').text))
        indeed_job_descriptions.append(indeed_description)

我真的不知道在这里做什么。有人有想法吗？非常感谢。

Answer 1

如果它是一个真正的浏览器警报（alert），下面的代码应该适合您。作为参考，可参阅 Selenium 文档中关于警报的相关章节。

# Get a handle on the alert via the driver's switch_to API.
alertObj = driver.switch_to.alert
# Confirm the alert ...
alertObj.accept()
# ... or cancel it.
# NOTE(review): accept() and dismiss() are presumably meant as alternatives —
# once the alert has been accepted there should be no alert left to dismiss,
# so call whichever one applies rather than both.
alertObj.dismiss()

我在测试我们的应用程序时经常处理警报。我发现它们在网页呈现它们所需的时间方面可能非常不可靠。这是我的标准实现。

def test(self):
    """Fill in the form, then verify and dismiss the alert it triggers.

    ``a``, ``b`` and ``c`` are placeholders for the ``name`` attributes of the
    form elements being exercised. If no alert is open after the pause, the
    ``except`` branch is the place for the fallback handling.
    """
    # Imported locally so the snippet brings its own exception type into scope.
    from selenium.common.exceptions import NoAlertPresentException

    self.driver.find_element_by_name(a).clear()
    self.driver.find_element_by_name(b).send_keys()
    self.driver.find_element_by_name(c).click()

    # wait for an alert box to render
    time.sleep(1)

    try:
        # Look the alert up once and reuse the handle for both the text check
        # and the dismissal.
        alert = self.driver.switch_to.alert
        self.assertEqual('alert text', alert.text)
        alert.dismiss()
    except NoAlertPresentException:
        # Selenium raises NoAlertPresentException (not TypeError) when no
        # alert is open, so this is the exception that must be caught here.
        pass  # [do stuff] — handle the missing alert here

Answer 2

我查看了您提到的 indeed.com 上的弹出窗口。它是一个对话框而不是浏览器警报，因此之前的答案（即切换到警报的做法）将不起作用。

driver.switch_to.alert

仅在 JavaScript 警报的情况下有效。

您在 indeed.com 上看到的是一个 HTML 对话框，应该像处理任何其他页面元素一样处理它。

第一次翻页时，您知道该对话框会出现，所以等待它出现并将其关闭即可。我不确定在您翻过 N 页之后它是否会再次出现。如果确实会再次出现，您可以考虑在对话框中填写一个电子邮件地址来阻止它继续弹出，而不是每次都关闭它。或者，每次翻页时都检查对话框是否显示并将其关闭——不过这种做法不太理想。

这是您的代码，稍微整理一下以支持关闭对话框：

from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class IndeedScraper:
    """Scrape job links, titles, employers and descriptions from Indeed.

    The scraper walks through ``INDEED_URL_LIST``, clicks every job link on
    each result page and records what it finds in the ``INDEED_JOB_*`` lists.
    The HTML popover dialog is closed the first time it is detected.

    NOTE: the result lists are class attributes, so they are shared by all
    instances and keep growing across repeated ``scrape()`` calls.
    """

    # List with indeed URLs to scrape through
    INDEED_URL_LIST = ['https://de.indeed.com/Jobs?q=data&l=&sort=date',
                       'https://de.indeed.com/jobs?q=Data&sort=date&start=10',
                       'https://de.indeed.com/jobs?q=Data&sort=date&start=20',
                       'https://de.indeed.com/jobs?q=Data&sort=date&start=30',
                       'https://de.indeed.com/jobs?q=Data&sort=date&start=40'
                       ]

    # Empty lists that will be filled
    INDEED_JOB_LINKS = []         # list with links to scrape through
    INDEED_JOB_TITLES = []        # list with job titles
    INDEED_JOB_EMPLOYERS = []     # list with job employers
    INDEED_JOB_DESCRIPTIONS = []  # list with job descriptions

    def __init__(self, driver):
        """Remember the WebDriver to use; no dialog has been closed yet."""
        self.driver = driver
        self.have_closed_dialog = False

    def scrape(self):
        """Visit every result page and record the details of each job on it."""
        for indeed_page in IndeedScraper.INDEED_URL_LIST:
            self.driver.get(indeed_page)
            # Same By-based locator API that __close_dialog uses.
            links = self.driver.find_elements(
                By.XPATH,
                '//div[@class="jobsearch-SerpJobCard row result clickcard" or '
                '@class="jobsearch-SerpJobCard row sjlast result clickcard" or '
                '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight" or '
                '@class="jobsearch-SerpJobCard lastRow row result clickcard" or '
                '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight"]/*/a')

            # get job link to list
            IndeedScraper.INDEED_JOB_LINKS.extend(
                link.get_attribute('href') for link in links)

            # scrape through the job descriptions
            for link in links:
                # open the link
                link.click()
                sleep(0.6)
                self.__close_dialog()  # Will close the dialog once and only if it is shown
                self.__get_job_titles()
                self.__get_job_employers()
                self.__get_job_descriptions()

    def __is_dialog_shown(self):
        """Return True if the popover element shows up in the DOM within 0.3 s."""
        # Imported locally so the class brings its own exception type into scope.
        from selenium.common.exceptions import TimeoutException

        try:
            WebDriverWait(self.driver, 0.3).until(
                EC.presence_of_element_located((By.ID, "popover-foreground")))
        except TimeoutException:
            # Only "the wait timed out" means "no dialog"; any other error
            # (or Ctrl-C) must propagate instead of being silently swallowed.
            return False
        return True

    def __close_dialog(self):
        """Close the popover dialog the first time it is detected; return self."""
        if not self.have_closed_dialog and self.__is_dialog_shown():
            self.driver.find_element(By.ID, "popover-link-x").click()
            self.have_closed_dialog = True
        return self

    def __get_job_titles(self):
        """Append the currently displayed job title to INDEED_JOB_TITLES; return self."""
        # get job title to list
        title = self.driver.find_element(By.XPATH, '//div[@id="vjs-jobtitle"]').text
        IndeedScraper.INDEED_JOB_TITLES.append(title)
        return self

    def __get_job_employers(self):
        """Append the currently displayed employer to INDEED_JOB_EMPLOYERS; return self."""
        # get job employer to list
        employer = self.driver.find_element(By.XPATH, '//span[@id="vjs-cn"]').text
        IndeedScraper.INDEED_JOB_EMPLOYERS.append(employer)
        return self

    def __get_job_descriptions(self):
        """Append the tokenized, space-joined description to INDEED_JOB_DESCRIPTIONS; return self."""
        # get job description to list
        raw_text = self.driver.find_element(By.XPATH, '//div[@id="vjs-desc"]').text
        description = ' '.join(word_tokenize(raw_text))
        IndeedScraper.INDEED_JOB_DESCRIPTIONS.append(description)
        return self


if __name__ == "__main__":

    # Run the scraper in a maximized Safari session.
    driver = webdriver.Safari()
    try:
        driver.maximize_window()
        scraper = IndeedScraper(driver)
        scraper.scrape()
    finally:
        # Always end the WebDriver session, even if scraping fails part-way,
        # so the browser process is not left running.
        driver.quit()

如何使用 python selenium 关闭 popover-form？

How to close popover-form with python selenium?

python

selenium

alert

web-scraping