尝试使用 selenium 抓取标题

Try to scrape title using selenium

我正在尝试抓取标题:脚本应该进入每个 link 并抓取其中的标题,但运行时会报错。

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from time import sleep

# Original script from the question, kept exactly as posted (this is the code that raises the reported error).
# Path to the local chromedriver executable; passed positionally to webdriver.Chrome below.
PATH="C:\Program Files (x86)\chromedriver.exe"
# Page to open.
url='https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
driver =webdriver.Chrome(PATH)
# NOTE(review): `wait` is assigned but never used below; a second WebDriverWait is built inline instead.
wait = WebDriverWait(driver, 20)
driver.get(url)
# Wait (timeout 20) until an element matching `h4.card-title` is clickable, then click it.
# NOTE(review): `list_button` receives the return value of .click(), not the element — presumably None; confirm.
list_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
# NOTE(review): likely source of the error — the XPath 'h1' has no leading '//' (relative path),
# and `.text` is invoked with call parentheses; verify whether `.text` is a property rather than a method.
title=driver.find_element_by_xpath('h1').text()
print(title)

您需要更改选择器以获取 h1 标签文本。

在此代码段中,抓取工具将访问第一个 link 并打印标题

# Wait until an element matching h4.card-title is clickable, then click it.
first_card = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))
)
first_card.click()
time.sleep(2)

# Locate the h1.productView-title element and print its text.
heading = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title')
title = heading.text
print(title)

在此代码段中,driver 将访问每一个 link 并抓取标题

# Gather the href of every anchor that is a direct child of an h4.card-title.
anchors = driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")
page_links = [anchor.get_attribute('href') for anchor in anchors]

# Load each collected link in turn and print the text of its h1.productView-title.
for link in page_links:
    driver.get(link)
    time.sleep(2)
    heading = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title')
    title = heading.text
    print(title)
    time.sleep(2)

包含两个片段的完整代码 -


import time

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Chrome launch options for the scraping session.
options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome's --window-size switch takes "width,height" (comma-separated);
# the previous "1920x1080" form does not match that format.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")

# The Service is given whatever ChromeDriverManager().install() returns
# (presumably the path of the chromedriver binary it resolved — confirm
# against the webdriver_manager docs).
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)


def supplyvan_scraper():
    """Print the product title of every item linked from the listing page.

    Uses the module-level ``chrome_driver`` to open the cordless-drills
    listing page, collects the ``href`` of each ``h4.card-title > a`` anchor,
    then loads every collected link and prints the text of its
    ``h1.productView-title`` element.

    The driver is used as a context manager, so it is quit when the ``with``
    block exits (also on error); no explicit ``quit()`` call is needed.
    """
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
        driver.get(URL)
        time.sleep(3)

        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        # WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
        # time.sleep(2)
        #
        # # parse the h1 tag text
        # title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
        # print(title)

        # opt #2 visit all links, print titles
        # Collect the hrefs up front: the anchor elements belong to the
        # listing page and cannot be reused after navigating away from it.
        anchors = driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        # Skip anchors without an href so driver.get() is never handed None.
        page_links = [href for href in hrefs if href]

        # visit all the links
        for link in page_links:
            driver.get(link)
            time.sleep(2)
            title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text

            # print the title of the current product page
            print(title)
            time.sleep(2)

        time.sleep(2)
        # No driver.quit() here: leaving the ``with`` block already quits the
        # driver, so an explicit call would quit it a second time.


supplyvan_scraper()

所有访问过的 links 的输出 -
Bosch Professional Cordless Drill, GSR-120-Li, 12V, Blue/Black
Makita LXT Cordless Drill Driver, DDF481RTJ, 18V, 13MM
Bosch Cordless Drill, GSR-1000, 10.8V
.....