尝试使用 selenium 抓取标题
Try to scrape title using selenium
我正在尝试抓取 title
我想进入每个 link 抓取其中的标题,但运行时会报错
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from time import sleep
# Question's original attempt, kept exactly as posted: open the listing page,
# click the first product card, then try to print the product page's <h1> text.
# NOTE(review): "\P" and "\c" are not recognized escape sequences in a normal
# string literal; a raw string (r"...") is the safer spelling for Windows paths.
PATH="C:\Program Files (x86)\chromedriver.exe"
url='https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
# NOTE(review): passing the driver path positionally looks like the Selenium 3
# calling style - confirm against the installed Selenium version.
driver =webdriver.Chrome(PATH)
# Explicit wait with a 20-second timeout; this `wait` object is not used below
# (a second WebDriverWait is constructed inline instead).
wait = WebDriverWait(driver, 20)
driver.get(url)
# Wait until the first "h4.card-title" element is clickable, then click it.
# `list_button` receives the return value of click() and is never read again.
list_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
# NOTE(review): this is most likely the failing line. `.text` on a Selenium
# WebElement is a property, so `.text()` tries to call a str; the XPath 'h1'
# also looks too narrow (probably '//h1' or a CSS selector is needed), and
# find_element_by_xpath appears to be the legacy locator API - confirm.
title=driver.find_element_by_xpath('h1').text()
print(title)
您需要更改选择器才能获取 h1 标签的文本。
在此代码段中,抓取工具将访问第一个 link 并打印标题
# Open the first product: wait (up to 20 s) until its card title is clickable,
# then click it.
card_locator = (By.CSS_SELECTOR, "h4.card-title")
first_card = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(card_locator))
first_card.click()
# Fixed pause before reading the product page.
time.sleep(2)
# Read the product heading and print its text.
heading = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title')
title = heading.text
print(title)
driver 将逐个访问每个 link 并抓取标题
# Collect the href of every product anchor on the listing page.
anchors = driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")
page_links = []
for anchor in anchors:
    page_links.append(anchor.get_attribute('href'))
# Open each collected link in turn and print the product heading.
for link in page_links:
    driver.get(link)
    time.sleep(2)
    heading = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title')
    title = heading.text
    print(title)
    time.sleep(2)
包含两个片段的完整代码 -
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Chrome launch options used by the scraper.
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # uncomment to run without a visible window
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome's --window-size switch takes WIDTH,HEIGHT separated by a comma;
# the previous "1920x1080" value was not in that format.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")
# ChromeDriverManager().install() supplies the chromedriver path for the Service.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
def supplyvan_scraper():
    """Print the title of every product linked from the cordless-drills listing.

    Uses the module-level ``chrome_driver``. The ``with`` block quits the
    browser when it exits (also on an exception), so no explicit
    ``driver.quit()`` is needed inside it.

    Returns:
        list[str]: The product titles, in the order the pages were visited.
    """
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        listing_url = 'https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
        driver.get(listing_url)
        time.sleep(3)

        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        # WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
        # time.sleep(2)
        #
        # # parse the h1 tag text
        # title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
        # print(title)

        # opt #2 visit all links, print titles
        # Collect every href up front: the anchor elements go stale as soon
        # as the driver navigates away from the listing page.
        hrefs = [
            element.get_attribute('href')
            for element in driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")
        ]
        # Skip anchors without an href so driver.get() never receives None.
        page_links = [href for href in hrefs if href]

        titles = []
        for link in page_links:
            driver.get(link)
            time.sleep(2)
            title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
            print(title)
            titles.append(title)
            time.sleep(2)
    return titles


supplyvan_scraper()
所有访问过的 links 的输出 -
Bosch Professional Cordless Drill, GSR-120-Li, 12V, Blue/Black
Makita LXT Cordless Drill Driver, DDF481RTJ, 18V, 13MM
Bosch Cordless Drill, GSR-1000, 10.8V
.....
我正在尝试抓取 title
我想进入每个 link 抓取其中的标题,但运行时会报错
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from time import sleep
# Question's original attempt, kept exactly as posted: open the listing page,
# click the first product card, then try to print the product page's <h1> text.
# NOTE(review): "\P" and "\c" are not recognized escape sequences in a normal
# string literal; a raw string (r"...") is the safer spelling for Windows paths.
PATH="C:\Program Files (x86)\chromedriver.exe"
url='https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
# NOTE(review): passing the driver path positionally looks like the Selenium 3
# calling style - confirm against the installed Selenium version.
driver =webdriver.Chrome(PATH)
# Explicit wait with a 20-second timeout; this `wait` object is not used below
# (a second WebDriverWait is constructed inline instead).
wait = WebDriverWait(driver, 20)
driver.get(url)
# Wait until the first "h4.card-title" element is clickable, then click it.
# `list_button` receives the return value of click() and is never read again.
list_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
# NOTE(review): this is most likely the failing line. `.text` on a Selenium
# WebElement is a property, so `.text()` tries to call a str; the XPath 'h1'
# also looks too narrow (probably '//h1' or a CSS selector is needed), and
# find_element_by_xpath appears to be the legacy locator API - confirm.
title=driver.find_element_by_xpath('h1').text()
print(title)
您需要更改选择器才能获取 h1 标签的文本。
在此代码段中,抓取工具将访问第一个 link 并打印标题
# Open the first product: wait (up to 20 s) until its card title is clickable,
# then click it.
card_locator = (By.CSS_SELECTOR, "h4.card-title")
first_card = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(card_locator))
first_card.click()
# Fixed pause before reading the product page.
time.sleep(2)
# Read the product heading and print its text.
heading = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title')
title = heading.text
print(title)
driver 将逐个访问每个 link 并抓取标题
# Collect the href of every product anchor on the listing page.
anchors = driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")
page_links = []
for anchor in anchors:
    page_links.append(anchor.get_attribute('href'))
# Open each collected link in turn and print the product heading.
for link in page_links:
    driver.get(link)
    time.sleep(2)
    heading = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title')
    title = heading.text
    print(title)
    time.sleep(2)
包含两个片段的完整代码 -
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Chrome launch options used by the scraper.
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # uncomment to run without a visible window
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome's --window-size switch takes WIDTH,HEIGHT separated by a comma;
# the previous "1920x1080" value was not in that format.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")
# ChromeDriverManager().install() supplies the chromedriver path for the Service.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
def supplyvan_scraper():
    """Print the title of every product linked from the cordless-drills listing.

    Uses the module-level ``chrome_driver``. The ``with`` block quits the
    browser when it exits (also on an exception), so no explicit
    ``driver.quit()`` is needed inside it.

    Returns:
        list[str]: The product titles, in the order the pages were visited.
    """
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        listing_url = 'https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
        driver.get(listing_url)
        time.sleep(3)

        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        # WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
        # time.sleep(2)
        #
        # # parse the h1 tag text
        # title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
        # print(title)

        # opt #2 visit all links, print titles
        # Collect every href up front: the anchor elements go stale as soon
        # as the driver navigates away from the listing page.
        hrefs = [
            element.get_attribute('href')
            for element in driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")
        ]
        # Skip anchors without an href so driver.get() never receives None.
        page_links = [href for href in hrefs if href]

        titles = []
        for link in page_links:
            driver.get(link)
            time.sleep(2)
            title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
            print(title)
            titles.append(title)
            time.sleep(2)
    return titles


supplyvan_scraper()
所有访问过的 links 的输出 -
Bosch Professional Cordless Drill, GSR-120-Li, 12V, Blue/Black
Makita LXT Cordless Drill Driver, DDF481RTJ, 18V, 13MM
Bosch Cordless Drill, GSR-1000, 10.8V
.....