如何使用 Selenium WebDriver 和 Python 通过滚动找到网页上的所有元素
How to find all elements on the webpage through scrolling using Selenium WebDriver and Python
无论我用 Selenium 怎么尝试,似乎都无法获取网页上的所有元素。我确定我遗漏了什么。这是我的代码。该 URL 对应的页面上至少有 30 个元素,但我每次抓取都只返回 6 个元素。我遗漏了什么?
import requests
import webbrowser
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
url = 'https://www.adidas.com/us/men-shoes-new_arrivals'

# Fetch the static HTML with requests and count the product cards found in it.
res = requests.get(url, headers=headers)
page_soup = bs(res.text, "html.parser")
containers = page_soup.findAll("div", {"class": "gl-product-card-container show-variation-carousel"})
print(len(containers))

# For each container that carries a review count, read the shoe model and
# the number of reviews.
shoe_colors = []
for container in containers:
    if container.find("div", {'class': 'gl-product-card__reviews-number'}) is not None:
        shoe_model = container.div.div.img["title"]
        review = container.find('div', {'class': 'gl-product-card__reviews-number'})
        review = int(review.text)

# Open the same page in Chrome and print every price element that is present
# after a fixed 5 second pause, followed by how many were found.
# NOTE: the page is never scrolled here, so only price elements that exist
# without scrolling can be found by this script.
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(5)
    shoe_prices = driver.find_elements(By.CSS_SELECTOR, '.gl-price')
    for price in shoe_prices:
        print(price.text)
    print(len(shoe_prices))
finally:
    # Always close the browser, even if the page load or a lookup fails.
    driver.quit()
你必须慢慢向下滚动页面。只有当产品滚动到可视区域内时,页面才会通过 AJAX 请求其价格数据。
# Scroll the product grid down in fixed steps, pausing after each step, then
# print every price element and the number of prices found.
options = Options()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)
try:
    url = 'https://www.adidas.com/us/men-shoes-new_arrivals'
    driver.get(url)

    # One scroll step per product row (divide by 4 column product per row).
    # Ceiling division so that a partially filled last row still gets a step.
    product_count = len(driver.find_elements(By.CLASS_NAME, 'col-s-6'))
    scroll_times = (product_count + 3) // 4
    scroll_size = 400

    for step in range(1, scroll_times + 1):
        # Absolute scroll target: 400, 800, 1200, ... pixels from the top.
        driver.execute_script('window.scrollTo(0, arguments[0]);', step * scroll_size)
        # Pause between steps so the page can react to the new scroll position.
        time.sleep(1)

    shoe_prices = driver.find_elements(By.CLASS_NAME, 'gl-price')
    for price in shoe_prices:
        print(price.text)
    print(len(shoe_prices))
finally:
    # Always close the browser, even if a lookup or script call fails.
    driver.quit()
我用您的代码试验,得到的结果似乎有些不同:
- 您使用 requests 找到 30 个项目,使用 Selenium 找到 6 个项目
- 而我使用 requests 找到 40 个项目,使用 Selenium 找到 4 个项目
本网站的这些项目是通过延迟加载(Lazy Loading)动态生成的,因此您必须向下滚动页面,并等待新元素在 HTML DOM 中渲染完成。您可以使用以下解决方案:
代码块:
import requests
import webbrowser
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
url = 'https://www.adidas.com/us/men-shoes-new_arrivals'

# Fetch the static HTML with requests and count the product cards found in it.
res = requests.get(url, headers=headers)
page_soup = bs(res.text, "html.parser")
containers = page_soup.findAll("div", {"class": "gl-product-card-container show-variation-carousel"})
print(len(containers))

# For each container that carries a review count, read the shoe model and
# the number of reviews.
shoe_colors = []
for container in containers:
    if container.find("div", {'class': 'gl-product-card__reviews-number'}) is not None:
        shoe_model = container.div.div.img["title"]
        review = container.find('div', {'class': 'gl-product-card__reviews-number'})
        review = int(review.text)

options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
# NOTE(review): newer Selenium releases appear to deprecate `executable_path`
# in favour of a Service object -- confirm against the installed version.
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
try:
    driver.get(url)

    # Wait for the first batch of visible prices; keep the elements so that
    # `titles` is defined even when no further prices ever appear.
    titles = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span.gl-price"))
    )
    myLength = len(titles)

    # Keep scrolling by 400px until a scroll adds no new price element
    # within 20 seconds.
    while True:
        driver.execute_script("window.scrollBy(0,400)", "")
        try:
            WebDriverWait(driver, 20).until(
                lambda drv: len(drv.find_elements(By.CSS_SELECTOR, "span.gl-price")) > myLength
            )
        except TimeoutException:
            break
        titles = driver.find_elements(By.CSS_SELECTOR, "span.gl-price")
        myLength = len(titles)

    print(myLength)
    for title in titles:
        print(title.text)
finally:
    # Always close the browser, even if one of the waits raises.
    driver.quit()
控制台输出:
47
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
无论我用 Selenium 怎么尝试,似乎都无法获取网页上的所有元素。我确定我遗漏了什么。这是我的代码。该 URL 对应的页面上至少有 30 个元素,但我每次抓取都只返回 6 个元素。我遗漏了什么?
import requests
import webbrowser
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
url = 'https://www.adidas.com/us/men-shoes-new_arrivals'

# Fetch the static HTML with requests and count the product cards found in it.
res = requests.get(url, headers=headers)
page_soup = bs(res.text, "html.parser")
containers = page_soup.findAll("div", {"class": "gl-product-card-container show-variation-carousel"})
print(len(containers))

# For each container that carries a review count, read the shoe model and
# the number of reviews.
shoe_colors = []
for container in containers:
    if container.find("div", {'class': 'gl-product-card__reviews-number'}) is not None:
        shoe_model = container.div.div.img["title"]
        review = container.find('div', {'class': 'gl-product-card__reviews-number'})
        review = int(review.text)

# Open the same page in Chrome and print every price element that is present
# after a fixed 5 second pause, followed by how many were found.
# NOTE: the page is never scrolled here, so only price elements that exist
# without scrolling can be found by this script.
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(5)
    shoe_prices = driver.find_elements(By.CSS_SELECTOR, '.gl-price')
    for price in shoe_prices:
        print(price.text)
    print(len(shoe_prices))
finally:
    # Always close the browser, even if the page load or a lookup fails.
    driver.quit()
你必须慢慢向下滚动页面。只有当产品滚动到可视区域内时,页面才会通过 AJAX 请求其价格数据。
# Scroll the product grid down in fixed steps, pausing after each step, then
# print every price element and the number of prices found.
options = Options()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)
try:
    url = 'https://www.adidas.com/us/men-shoes-new_arrivals'
    driver.get(url)

    # One scroll step per product row (divide by 4 column product per row).
    # Ceiling division so that a partially filled last row still gets a step.
    product_count = len(driver.find_elements(By.CLASS_NAME, 'col-s-6'))
    scroll_times = (product_count + 3) // 4
    scroll_size = 400

    for step in range(1, scroll_times + 1):
        # Absolute scroll target: 400, 800, 1200, ... pixels from the top.
        driver.execute_script('window.scrollTo(0, arguments[0]);', step * scroll_size)
        # Pause between steps so the page can react to the new scroll position.
        time.sleep(1)

    shoe_prices = driver.find_elements(By.CLASS_NAME, 'gl-price')
    for price in shoe_prices:
        print(price.text)
    print(len(shoe_prices))
finally:
    # Always close the browser, even if a lookup or script call fails.
    driver.quit()
我用您的代码试验,得到的结果似乎有些不同:
- 您使用 requests 找到 30 个项目,使用 Selenium 找到 6 个项目
- 而我使用 requests 找到 40 个项目,使用 Selenium 找到 4 个项目
本网站的这些项目是通过延迟加载(Lazy Loading)动态生成的,因此您必须向下滚动页面,并等待新元素在 HTML DOM 中渲染完成。您可以使用以下解决方案:
代码块:
import requests
import webbrowser
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
url = 'https://www.adidas.com/us/men-shoes-new_arrivals'

# Fetch the static HTML with requests and count the product cards found in it.
res = requests.get(url, headers=headers)
page_soup = bs(res.text, "html.parser")
containers = page_soup.findAll("div", {"class": "gl-product-card-container show-variation-carousel"})
print(len(containers))

# For each container that carries a review count, read the shoe model and
# the number of reviews.
shoe_colors = []
for container in containers:
    if container.find("div", {'class': 'gl-product-card__reviews-number'}) is not None:
        shoe_model = container.div.div.img["title"]
        review = container.find('div', {'class': 'gl-product-card__reviews-number'})
        review = int(review.text)

options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
# NOTE(review): newer Selenium releases appear to deprecate `executable_path`
# in favour of a Service object -- confirm against the installed version.
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
try:
    driver.get(url)

    # Wait for the first batch of visible prices; keep the elements so that
    # `titles` is defined even when no further prices ever appear.
    titles = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span.gl-price"))
    )
    myLength = len(titles)

    # Keep scrolling by 400px until a scroll adds no new price element
    # within 20 seconds.
    while True:
        driver.execute_script("window.scrollBy(0,400)", "")
        try:
            WebDriverWait(driver, 20).until(
                lambda drv: len(drv.find_elements(By.CSS_SELECTOR, "span.gl-price")) > myLength
            )
        except TimeoutException:
            break
        titles = driver.find_elements(By.CSS_SELECTOR, "span.gl-price")
        myLength = len(titles)

    print(myLength)
    for title in titles:
        print(title.text)
finally:
    # Always close the browser, even if one of the waits raises.
    driver.quit()
控制台输出:
47 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0