如何使用 python selenium 关闭 popover-form?
How to close popover-form with python selenium?
我尝试从 indeed.com 自动获取职位名称、雇主和职位描述。
它工作正常,直到循环进入第二页。
这时会出现一个弹出窗口,我需要在其中按 "Nein, danke." 按钮才能继续。当这个弹出窗口出现时,循环就停止了。
我的代码看起来像这样,并且在第二页之前工作正常:
# Import the packages
from selenium import webdriver
from time import sleep
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import gensim
# Start Webscraping: open a maximised Safari session
driver = webdriver.Safari()
driver.maximize_window()

# Result pages on indeed.com, visited in this order
indeed_url_list = [
    'https://de.indeed.com/Jobs?q=data&l=&sort=date',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=10',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=20',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=30',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=40',
]

# Accumulators that are filled while the pages are walked
indeed_job_links = []         # href of every job card
indeed_job_titles = []        # job titles
indeed_job_employers = []     # job employers
indeed_job_descriptions = []  # tokenised job descriptions

for indeed_page in indeed_url_list:
    driver.get(indeed_page)

    # Anchor elements of all job cards on the current result page
    links = driver.find_elements_by_xpath(
        '//div[@class="jobsearch-SerpJobCard row result clickcard" or '
        '@class="jobsearch-SerpJobCard row sjlast result clickcard" or '
        '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight" or '
        '@class="jobsearch-SerpJobCard lastRow row result clickcard" or '
        '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight"]/*/a')

    # Remember the target URL of every card
    indeed_job_links.extend(card.get_attribute('href') for card in links)

    # Click each card in turn and read the fields by their element ids
    for link in links:
        link.click()
        sleep(0.6)  # fixed pause after the click before the fields are read

        title_element = driver.find_element_by_xpath('//div[@id="vjs-jobtitle"]')
        indeed_title = title_element.text
        indeed_job_titles.append(indeed_title)

        employer_element = driver.find_element_by_xpath('//span[@id="vjs-cn"]')
        indeed_employer = employer_element.text
        indeed_job_employers.append(indeed_employer)

        # The description is tokenised and re-joined with single spaces
        description_element = driver.find_element_by_xpath('//div[@id="vjs-desc"]')
        indeed_description = ' '.join(word_tokenize(description_element.text))
        indeed_job_descriptions.append(indeed_description)
我真的不知道在这里做什么。有人有想法吗?非常感谢。
如果它是一个真正的浏览器警报(alert),下面的代码应该适合您。作为参考,请查阅 Selenium 文档中关于警报的相关章节。
# Grab the JavaScript alert that is currently open in the browser.
alertObj = driver.switch_to.alert
# An alert can be answered only once, so use exactly ONE of the two actions:
# accept() confirms it ("OK"); call alertObj.dismiss() here instead to cancel.
alertObj.accept()
我在测试我们的应用程序时经常处理警报。我发现它们在网页呈现它们所需的时间方面可能非常不可靠。这是我的标准实现。
def test(self):
    """Submit a form and verify the alert that is shown afterwards.

    ``a``, ``b`` and ``c`` are placeholders for the ``name`` attributes of
    the elements involved; they are not defined in this snippet.
    """
    # Local imports keep the snippet self-contained.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    self.driver.find_element(By.NAME, a).clear()
    self.driver.find_element(By.NAME, b).send_keys()
    self.driver.find_element(By.NAME, c).click()

    try:
        # Alerts take an unpredictable time to render, so poll for the
        # alert (for at most one second) instead of sleeping a fixed time.
        WebDriverWait(self.driver, 1).until(EC.alert_is_present())
    except TimeoutException:
        # [do stuff] - placeholder: handle the case that no alert showed
        # up.  Re-raising keeps the problem visible until real handling
        # is added.
        raise

    alert = self.driver.switch_to.alert
    self.assertEqual('alert text', alert.text)
    alert.dismiss()
我查看了您在 indeed.com 上引用的弹出窗口。
它是一个对话框而不是浏览器警报,因此之前的答案(您切换到警报的地方)将不起作用。
driver.switch_to.alert
仅在 JavaScript 警报的情况下有效。
您在 indeed.com 上看到的是一个 HTML 对话框,应该像处理任何其他页面元素一样处理它。
第一次更改页面时,您知道会出现该对话框,所以等待它并关闭它。我不确定在您翻过 N 页之后它是否会再次出现。但如果确实如此,您可能需要考虑在对话框中填写一个电子邮件地址以阻止它再次弹出,而不是关闭它。或者每次更改页面时,您可以检查是否显示对话框并关闭它 - 这是不太理想的。
这是您的代码,稍微整理一下以支持关闭对话框:
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class IndeedScraper:
    """Collect job links, titles, employers and descriptions from indeed.com.

    ``scrape()`` visits every URL in ``INDEED_URL_LIST``, clicks each job
    card found on the page and appends what it reads to the ``INDEED_JOB_*``
    lists.  The HTML popover (element id ``popover-foreground``) is closed
    at most once per scraper instance, the first time it is detected.
    """

    # List with indeed URLs to scrape through
    INDEED_URL_LIST = [
        'https://de.indeed.com/Jobs?q=data&l=&sort=date',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=10',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=20',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=30',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=40',
    ]

    # Result lists that will be filled by scrape().
    # NOTE: these are class attributes, so all instances append to the
    # same lists.
    INDEED_JOB_LINKS = []         # list with links to scrape through
    INDEED_JOB_TITLES = []        # list with job titles
    INDEED_JOB_EMPLOYERS = []     # list with job employers
    INDEED_JOB_DESCRIPTIONS = []  # list with job descriptions

    def __init__(self, driver):
        """Store the Selenium ``driver`` used for all page interaction."""
        self.driver = driver
        # Flipped to True once the popover has been closed.
        self.have_closed_dialog = False

    def scrape(self):
        """Walk all result pages and record the data of every job card."""
        for indeed_page in IndeedScraper.INDEED_URL_LIST:
            self.driver.get(indeed_page)
            links = self.driver.find_elements(
                By.XPATH,
                '//div[@class="jobsearch-SerpJobCard row result clickcard" or '
                '@class="jobsearch-SerpJobCard row sjlast result clickcard" or '
                '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight" or '
                '@class="jobsearch-SerpJobCard lastRow row result clickcard" or '
                '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight"]/*/a')

            # get job link to list
            IndeedScraper.INDEED_JOB_LINKS.extend(
                link.get_attribute('href') for link in links)

            # scrape through the job descriptions
            for link in links:
                # open the link
                link.click()
                sleep(0.6)
                self.__close_dialog()  # Will close the dialog once and only if it is shown
                self.__get_job_titles()
                self.__get_job_employers()
                self.__get_job_descriptions()

    def __is_dialog_shown(self):
        """Return True if the popover appears in the DOM within 0.3 seconds."""
        # Local import: only the wait timeout means "no dialog"; any other
        # error (e.g. a dead browser session) must not be swallowed.
        from selenium.common.exceptions import TimeoutException

        try:
            WebDriverWait(self.driver, 0.3).until(
                EC.presence_of_element_located((By.ID, "popover-foreground")))
        except TimeoutException:
            return False
        return True

    def __close_dialog(self):
        """Close the popover if it is present and was not closed before.

        Returns ``self`` so calls can be chained.
        """
        if not self.have_closed_dialog and self.__is_dialog_shown():
            self.driver.find_element(By.ID, "popover-link-x").click()
            self.have_closed_dialog = True
        return self

    def __get_job_titles(self):
        """Append the title of the currently opened job; return ``self``."""
        # get job title to list
        title = self.driver.find_element(By.XPATH, '//div[@id="vjs-jobtitle"]').text
        IndeedScraper.INDEED_JOB_TITLES.append(title)
        return self

    def __get_job_employers(self):
        """Append the employer of the currently opened job; return ``self``."""
        # get job employer to list
        employer = self.driver.find_element(By.XPATH, '//span[@id="vjs-cn"]').text
        IndeedScraper.INDEED_JOB_EMPLOYERS.append(employer)
        return self

    def __get_job_descriptions(self):
        """Append the tokenised description of the opened job; return ``self``."""
        # get job description to list
        raw_text = self.driver.find_element(By.XPATH, '//div[@id="vjs-desc"]').text
        description = ' '.join(word_tokenize(raw_text))
        IndeedScraper.INDEED_JOB_DESCRIPTIONS.append(description)
        return self
if __name__ == "__main__":
    # Entry point: run the scraper in a maximised Safari window.
    driver = webdriver.Safari()
    try:
        driver.maximize_window()
        scraper = IndeedScraper(driver)
        scraper.scrape()
    finally:
        # Always end the browser session, even if scraping raised.
        driver.quit()
我尝试从 indeed.com 自动获取职位名称、雇主和职位描述。
它工作正常,直到循环进入第二页。
这时会出现一个弹出窗口,我需要在其中按 "Nein, danke." 按钮才能继续。当这个弹出窗口出现时,循环就停止了。
我的代码看起来像这样,并且在第二页之前工作正常:
# Import the packages
from selenium import webdriver
from time import sleep
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import gensim
# Start Webscraping: open a maximised Safari session
driver = webdriver.Safari()
driver.maximize_window()

# Result pages on indeed.com, visited in this order
indeed_url_list = [
    'https://de.indeed.com/Jobs?q=data&l=&sort=date',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=10',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=20',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=30',
    'https://de.indeed.com/jobs?q=Data&sort=date&start=40',
]

# Accumulators that are filled while the pages are walked
indeed_job_links = []         # href of every job card
indeed_job_titles = []        # job titles
indeed_job_employers = []     # job employers
indeed_job_descriptions = []  # tokenised job descriptions

for indeed_page in indeed_url_list:
    driver.get(indeed_page)

    # Anchor elements of all job cards on the current result page
    links = driver.find_elements_by_xpath(
        '//div[@class="jobsearch-SerpJobCard row result clickcard" or '
        '@class="jobsearch-SerpJobCard row sjlast result clickcard" or '
        '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight" or '
        '@class="jobsearch-SerpJobCard lastRow row result clickcard" or '
        '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight"]/*/a')

    # Remember the target URL of every card
    indeed_job_links.extend(card.get_attribute('href') for card in links)

    # Click each card in turn and read the fields by their element ids
    for link in links:
        link.click()
        sleep(0.6)  # fixed pause after the click before the fields are read

        title_element = driver.find_element_by_xpath('//div[@id="vjs-jobtitle"]')
        indeed_title = title_element.text
        indeed_job_titles.append(indeed_title)

        employer_element = driver.find_element_by_xpath('//span[@id="vjs-cn"]')
        indeed_employer = employer_element.text
        indeed_job_employers.append(indeed_employer)

        # The description is tokenised and re-joined with single spaces
        description_element = driver.find_element_by_xpath('//div[@id="vjs-desc"]')
        indeed_description = ' '.join(word_tokenize(description_element.text))
        indeed_job_descriptions.append(indeed_description)
我真的不知道在这里做什么。有人有想法吗?非常感谢。
如果它是一个真正的浏览器警报(alert),下面的代码应该适合您。作为参考,请查阅 Selenium 文档中关于警报的相关章节。
# Grab the JavaScript alert that is currently open in the browser.
alertObj = driver.switch_to.alert
# An alert can be answered only once, so use exactly ONE of the two actions:
# accept() confirms it ("OK"); call alertObj.dismiss() here instead to cancel.
alertObj.accept()
我在测试我们的应用程序时经常处理警报。我发现它们在网页呈现它们所需的时间方面可能非常不可靠。这是我的标准实现。
def test(self):
    """Submit a form and verify the alert that is shown afterwards.

    ``a``, ``b`` and ``c`` are placeholders for the ``name`` attributes of
    the elements involved; they are not defined in this snippet.
    """
    # Local imports keep the snippet self-contained.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    self.driver.find_element(By.NAME, a).clear()
    self.driver.find_element(By.NAME, b).send_keys()
    self.driver.find_element(By.NAME, c).click()

    try:
        # Alerts take an unpredictable time to render, so poll for the
        # alert (for at most one second) instead of sleeping a fixed time.
        WebDriverWait(self.driver, 1).until(EC.alert_is_present())
    except TimeoutException:
        # [do stuff] - placeholder: handle the case that no alert showed
        # up.  Re-raising keeps the problem visible until real handling
        # is added.
        raise

    alert = self.driver.switch_to.alert
    self.assertEqual('alert text', alert.text)
    alert.dismiss()
我查看了您在 indeed.com 上引用的弹出窗口。 它是一个对话框而不是浏览器警报,因此之前的答案(您切换到警报的地方)将不起作用。
driver.switch_to.alert
仅在 JavaScript 警报的情况下有效。
您在 indeed.com 上看到的是一个 HTML 对话框,应该像处理任何其他页面元素一样处理它。
第一次更改页面时,您知道会出现该对话框,所以等待它并关闭它。我不确定在您翻过 N 页之后它是否会再次出现。但如果确实如此,您可能需要考虑在对话框中填写一个电子邮件地址以阻止它再次弹出,而不是关闭它。或者每次更改页面时,您可以检查是否显示对话框并关闭它 - 这是不太理想的。
这是您的代码,稍微整理一下以支持关闭对话框:
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class IndeedScraper:
    """Collect job links, titles, employers and descriptions from indeed.com.

    ``scrape()`` visits every URL in ``INDEED_URL_LIST``, clicks each job
    card found on the page and appends what it reads to the ``INDEED_JOB_*``
    lists.  The HTML popover (element id ``popover-foreground``) is closed
    at most once per scraper instance, the first time it is detected.
    """

    # List with indeed URLs to scrape through
    INDEED_URL_LIST = [
        'https://de.indeed.com/Jobs?q=data&l=&sort=date',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=10',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=20',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=30',
        'https://de.indeed.com/jobs?q=Data&sort=date&start=40',
    ]

    # Result lists that will be filled by scrape().
    # NOTE: these are class attributes, so all instances append to the
    # same lists.
    INDEED_JOB_LINKS = []         # list with links to scrape through
    INDEED_JOB_TITLES = []        # list with job titles
    INDEED_JOB_EMPLOYERS = []     # list with job employers
    INDEED_JOB_DESCRIPTIONS = []  # list with job descriptions

    def __init__(self, driver):
        """Store the Selenium ``driver`` used for all page interaction."""
        self.driver = driver
        # Flipped to True once the popover has been closed.
        self.have_closed_dialog = False

    def scrape(self):
        """Walk all result pages and record the data of every job card."""
        for indeed_page in IndeedScraper.INDEED_URL_LIST:
            self.driver.get(indeed_page)
            links = self.driver.find_elements(
                By.XPATH,
                '//div[@class="jobsearch-SerpJobCard row result clickcard" or '
                '@class="jobsearch-SerpJobCard row sjlast result clickcard" or '
                '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight" or '
                '@class="jobsearch-SerpJobCard lastRow row result clickcard" or '
                '@class="jobsearch-SerpJobCard row result clickcard vjs-highlight"]/*/a')

            # get job link to list
            IndeedScraper.INDEED_JOB_LINKS.extend(
                link.get_attribute('href') for link in links)

            # scrape through the job descriptions
            for link in links:
                # open the link
                link.click()
                sleep(0.6)
                self.__close_dialog()  # Will close the dialog once and only if it is shown
                self.__get_job_titles()
                self.__get_job_employers()
                self.__get_job_descriptions()

    def __is_dialog_shown(self):
        """Return True if the popover appears in the DOM within 0.3 seconds."""
        # Local import: only the wait timeout means "no dialog"; any other
        # error (e.g. a dead browser session) must not be swallowed.
        from selenium.common.exceptions import TimeoutException

        try:
            WebDriverWait(self.driver, 0.3).until(
                EC.presence_of_element_located((By.ID, "popover-foreground")))
        except TimeoutException:
            return False
        return True

    def __close_dialog(self):
        """Close the popover if it is present and was not closed before.

        Returns ``self`` so calls can be chained.
        """
        if not self.have_closed_dialog and self.__is_dialog_shown():
            self.driver.find_element(By.ID, "popover-link-x").click()
            self.have_closed_dialog = True
        return self

    def __get_job_titles(self):
        """Append the title of the currently opened job; return ``self``."""
        # get job title to list
        title = self.driver.find_element(By.XPATH, '//div[@id="vjs-jobtitle"]').text
        IndeedScraper.INDEED_JOB_TITLES.append(title)
        return self

    def __get_job_employers(self):
        """Append the employer of the currently opened job; return ``self``."""
        # get job employer to list
        employer = self.driver.find_element(By.XPATH, '//span[@id="vjs-cn"]').text
        IndeedScraper.INDEED_JOB_EMPLOYERS.append(employer)
        return self

    def __get_job_descriptions(self):
        """Append the tokenised description of the opened job; return ``self``."""
        # get job description to list
        raw_text = self.driver.find_element(By.XPATH, '//div[@id="vjs-desc"]').text
        description = ' '.join(word_tokenize(raw_text))
        IndeedScraper.INDEED_JOB_DESCRIPTIONS.append(description)
        return self
if __name__ == "__main__":
    # Entry point: run the scraper in a maximised Safari window.
    driver = webdriver.Safari()
    try:
        driver.maximize_window()
        scraper = IndeedScraper(driver)
        scraper.scrape()
    finally:
        # Always end the browser session, even if scraping raised.
        driver.quit()