Selenium Python: How to check/simulate [ERROR] exit code to continue a FOR loop instead of exiting completely
I currently have a selenium function that does the following, in summary:

def function(list):
    FOR LOOP in list:  # Page A (initial), contains 12
        requests, bs4 grab element coordinates
        [f-string transforms into CSS selector]  # this is the list it loops through
        selenium driver opens, detects and selects that element
        FOR LOOP in [f-string...]:  # Page B, contains 1
            driver.current_url, used to prepare new elements to be detected
            requests, bs4 grab element coordinates  # this is the list it loops through
            f-string transforms into CSS selector
            selenium driver opens, detects and selects that element
            download begins
            sleep for .5 sec
            driver goes back to previous page
Now, my problem is that at a predictable iteration, specifically when for loop B is on element 6/12 of the list, it crashes with the following error:
'//OBJECT//' is not clickable at point (591, 797). Other element would receive the click: <div style="position: relative" class="cookie-consent-inner">...</div>
(Session info: MicrosoftEdge=...)
Stacktrace:
Backtrace:
...
Up to that point it runs without any problems, but I want it to continue on to PAGE B 7/12 and so on, since it does have driver.back(). Instead, the application stops.
I have tried wrapping the whole thing in try and except: PASS to catch this error. However, it then starts over from page A and still misses the rest.
I want a way to execute a 'continue' statement somehow, somewhere, but I am just starting out and running out of ideas. You can see in the original code that I tried a FOR IF: ERROR statement hoping it would pass, but that appears to be a syntax error. See the original code below:
import concurrent.futures
import os
import time
import requests
import re
import selenium.common.exceptions
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import multiprocessing

edge_driver = 'C:selenium\webdriver\edge'
os.environ['PATH'] += edge_driver

web_links = {'digital archive': 'https://digital.nmla.metoffice.gov.uk/SO_1118bfbb-f2c9-476f-aa07-eb58b6db5ce6/', }


def scraping_bot(css_selector):
    # First stage: Years
    print('FIRST STAGE INITIATED....')
    driver = webdriver.Edge()
    driver.get(web_links.get('digital archive'))
    year_args = (By.CSS_SELECTOR, f'a[href="{css_selector}"]')
    driver.find_element(*year_args).click()

    # Second Stage: Months
    print('SECOND STAGE INITIATED....')
    sTWO_url = driver.current_url
    sTWO_site = requests.get(sTWO_url)
    sTWO_web_objects = BeautifulSoup(sTWO_site.text, 'lxml')
    monthly_placeholders = sTWO_web_objects.find(name='div', attrs={'class': 'twelve columns last results'})
    months = monthly_placeholders.find_all(name='h5')

    month_css_selector = {}
    for month_href_tags in months:
        month_tag = f'{month_href_tags.get_text()}'
        month_hrefs = re.findall(regex, str(month_href_tags))
        for month_href in month_hrefs:
            month_css_selector.update({month_tag: month_href})

    for v, y in zip(month_css_selector.values(), month_css_selector.keys()):
        print(v)  ##############################
        month_args = (By.CSS_SELECTOR, f'a[href="{v}/"]')
        driver.find_element(*month_args).click()

        # Third Stage: Download
        print(f'THIRD STAGE INITIATED for: {y}: {v}')
        sTWO_url = driver.current_url
        download_site = requests.get(sTWO_url)
        content = BeautifulSoup(download_site.text, 'lxml')
        nav_controls = content.find_all('nav')
        download_button = [nav_controls.find(attrs={'title': 'download'}) for nav_controls in nav_controls]
        download_regex = r'(?<=href=\").{1,}(?=\" title)'

        for button in download_button:
            if button is not None:
                print(button)  ##############################
                downl = re.findall(download_regex, str(button))
                if len(downl) == 1:
                    for downl_button in downl:
                        download_args = (By.CSS_SELECTOR, f'a[href="{downl_button}"]')
                        driver.find_element(*download_args).click()
                        time.sleep(2)
                        print(f'THIRD STAGE DOWNLOAD COMPLETE: {y}; {v}')
                        ##### END OF TREE HERE ####
                        driver.back()  # goes back to Second Stage and so on
                else:
                    print(f'Your download button matches exceeds 1: {len(downl)}')

        if selenium.common.exceptions.ElementClickInterceptedException:
            continue


if __name__ == '__main__':
    sONE_url = requests.get(web_links.get('digital archive'))
    sONE_web_objects = BeautifulSoup(sONE_url.text, 'lxml')
    year_placeholder = sONE_web_objects.find(name='div', attrs={'class': 'sixteen columns results-and-filters'})
    years = year_placeholder.find_all(name='div', attrs={'class': ['one_sixth grey_block new-secondary-background result-item',
                                                                   'one_sixth grey_block new-secondary-background result-item last']})  # don't skip, needed for titles.
    unit = [years.find('h5') for years in years]
    regex = r'(?<=href=\").{1,}(?=\/")'  # lookaround = PositiveLookBehind...PositiveLookAhead
    year_css_selector = []
    titles = [years.get('title') for years in years]
    for year_href_tags, year_tag in zip(unit, titles):  # href_tag -> bs4 component
        hrefs = re.findall(regex, str(year_href_tags.get_text))  # href_tag.get_text -> method that enables str.
        for year_href in hrefs:
            year_css_selector.append(f'{year_href}/')

    for i in year_css_selector:
        scraping_bot(i)
So, all I want for my expected output is to simply pass or continue past this one erroring web page, which I can download manually myself.
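Conceptually, the pattern I am after is something like this minimal sketch (process_month is a placeholder name, not a real function of mine):

from selenium.common.exceptions import ElementClickInterceptedException

for month in months:
    try:
        process_month(month)  # placeholder for the click-and-download steps
    except ElementClickInterceptedException:
        continue  # skip this month and carry on with the next one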
If I understand your question, I think you just need a try/catch in the right place, namely one surrounding all the code in the for v, y in zip(month_css_selector.values(), month_css_selector.keys()): block of function scraping_bot:
import concurrent.futures
import os
import time
import requests
import re
import selenium.common.exceptions
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import multiprocessing

edge_driver = 'C:selenium\webdriver\edge'
os.environ['PATH'] += edge_driver

web_links = {'digital archive': 'https://digital.nmla.metoffice.gov.uk/SO_1118bfbb-f2c9-476f-aa07-eb58b6db5ce6/', }


def scraping_bot(css_selector):
    # First stage: Years
    print('FIRST STAGE INITIATED....')
    driver = webdriver.Edge()
    driver.get(web_links.get('digital archive'))
    year_args = (By.CSS_SELECTOR, f'a[href="{css_selector}"]')
    driver.find_element(*year_args).click()

    # Second Stage: Months
    print('SECOND STAGE INITIATED....')
    sTWO_url = driver.current_url
    sTWO_site = requests.get(sTWO_url)
    sTWO_web_objects = BeautifulSoup(sTWO_site.text, 'lxml')
    monthly_placeholders = sTWO_web_objects.find(name='div', attrs={'class': 'twelve columns last results'})
    months = monthly_placeholders.find_all(name='h5')

    month_css_selector = {}
    for month_href_tags in months:
        month_tag = f'{month_href_tags.get_text()}'
        month_hrefs = re.findall(regex, str(month_href_tags))
        for month_href in month_hrefs:
            month_css_selector.update({month_tag: month_href})

    for v, y in zip(month_css_selector.values(), month_css_selector.keys()):
        try:
            print(v)  ##############################
            month_args = (By.CSS_SELECTOR, f'a[href="{v}/"]')
            driver.find_element(*month_args).click()

            # Third Stage: Download
            print(f'THIRD STAGE INITIATED for: {y}: {v}')
            sTWO_url = driver.current_url
            download_site = requests.get(sTWO_url)
            content = BeautifulSoup(download_site.text, 'lxml')
            nav_controls = content.find_all('nav')
            download_button = [nav_controls.find(attrs={'title': 'download'}) for nav_controls in nav_controls]
            download_regex = r'(?<=href=\").{1,}(?=\" title)'

            for button in download_button:
                if button is not None:
                    print(button)  ##############################
                    downl = re.findall(download_regex, str(button))
                    if len(downl) == 1:
                        for downl_button in downl:
                            download_args = (By.CSS_SELECTOR, f'a[href="{downl_button}"]')
                            driver.find_element(*download_args).click()
                            time.sleep(2)
                            print(f'THIRD STAGE DOWNLOAD COMPLETE: {y}; {v}')
                            ##### END OF TREE HERE ####
                            driver.back()  # goes back to Second Stage and so on
                    else:
                        print(f'Your download button matches exceeds 1: {len(downl)}')
        except selenium.common.exceptions.ElementClickInterceptedException:
            # This is sort of expected:
            pass
        except Exception as e:
            # If it is something else, print it out:
            print('Got exception:', e)


if __name__ == '__main__':
    sONE_url = requests.get(web_links.get('digital archive'))
    sONE_web_objects = BeautifulSoup(sONE_url.text, 'lxml')
    year_placeholder = sONE_web_objects.find(name='div', attrs={'class': 'sixteen columns results-and-filters'})
    years = year_placeholder.find_all(name='div', attrs={'class': ['one_sixth grey_block new-secondary-background result-item',
                                                                   'one_sixth grey_block new-secondary-background result-item last']})  # don't skip, needed for titles.
    unit = [years.find('h5') for years in years]
    regex = r'(?<=href=\").{1,}(?=\/")'  # lookaround = PositiveLookBehind...PositiveLookAhead
    year_css_selector = []
    titles = [years.get('title') for years in years]
    for year_href_tags, year_tag in zip(unit, titles):  # href_tag -> bs4 component
        hrefs = re.findall(regex, str(year_href_tags.get_text))  # href_tag.get_text -> method that enables str.
        for year_href in hrefs:
            year_css_selector.append(f'{year_href}/')

    for i in year_css_selector:
        scraping_bot(i)
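The key point is that the try surrounds only the body of the month loop: when a click is intercepted, only that iteration is abandoned and the for statement simply moves on to the next (v, y) pair, which is exactly the continue behaviour you were after. Wrapping the whole function instead unwinds all the way out of it, which is why your earlier attempt started over from page A.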
Updated answer
I found the not-clickable error comes down to one of two things:

The driver that selenium opens is a separate browser instance. This is an important distinction when debugging the error. These selenium-spawned browsers usually lack the prerequisites your everyday browser has, such as cookie preferences or other manual preferences. This means the page opens with small differences, for example a cookie pop-up or different zoom properties. It follows that a path-based script built against your normal browser may not work on the default browser selenium ships with. (For one way to dismiss such a cookie banner, see the sketch after the examples below.)

When clicking an element, even if selenium detects it without issue, be aware that the click is, quite literally, like a manual click. If the element is not visible, it will fail. This is easily fixed by moving to the element or, better still, scrolling it into view (see below):
This:
first_element = driver.find_element(By.CSS_SELECTOR, f'a[href="{urls}"]')
driver.execute_script("arguments[0].scrollIntoView();", first_element) ##
first_element.click()
Or, even better, this (note that move_to_element needs an ActionChains object and a perform() call to actually run):

from selenium.webdriver.common.action_chains import ActionChains

first_element = driver.find_element(By.CSS_SELECTOR, f'a[href="{urls}"]')
ActionChains(driver).move_to_element(first_element).perform()  ##
first_element.click()
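For the cookie pop-up case specifically, one option is to dismiss the overlay once, before the loops start. This is a minimal sketch: the cookie-consent-inner class is taken from the error message above, but the button selector inside it is an assumption, so inspect the actual page to confirm it:

from selenium.common.exceptions import NoSuchElementException

def dismiss_cookie_banner(driver):
    # 'div.cookie-consent-inner' comes from the error message; the inner
    # 'button' selector is hypothetical - check the real markup.
    try:
        banner = driver.find_element(By.CSS_SELECTOR, 'div.cookie-consent-inner')
        banner.find_element(By.CSS_SELECTOR, 'button').click()
    except NoSuchElementException:
        pass  # no banner on this page, nothing to do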
Given how useful this post may be to those looking for a solution, @Booboo's answer helps in most cases. However, regarding the selenium driver and the for loop, I found that correct indentation (as noted above) and the arguments were the culprits.
Specifically, my regex did not capture instances where the url was surrounded by either of two patterns:
Where:
link rel="alternate" type="application/rss+xml" title="Met Office UA » DWS_2003_06 Comments Feed" href="https://digital.nmla.metoffice.gov.uk/IO_e273bcd1-7131-482d-aec0-04755809ec3a/feed/
and where there are other elements after it:
a class="new-primary new-primary-tint-hover fa fa-download" href="https://digital.nmla.metoffice.gov.uk/download/file/IO_efa3ef81-4812-4c8e-a4ab-055b147644d2" title="download
I found that simply changing the download button regex to include an 'OR' alternative addressed this case:
(?<=href=").{1,}(?=" title|/">)
instead of
(?<=href=").{1,}(?=" title)
...obviously along with the answer posted above.
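As a quick check, here is a minimal sketch with shortened, hypothetical URLs standing in for the two patterns above:

import re

new_regex = r'(?<=href=").{1,}(?=" title|/">)'

# Hypothetical snippets mirroring the two cases:
feed = '<link rel="alternate" title="... Comments Feed" href="https://example.org/IO_e273/feed/">'
button = '<a class="fa fa-download" href="https://example.org/download/file/IO_efa3" title="download">'

print(re.findall(new_regex, feed))    # ['https://example.org/IO_e273/feed']
print(re.findall(new_regex, button))  # ['https://example.org/download/file/IO_efa3']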