BeautifulSoup find_all 函数在 main 中不起作用
BeautifulSoup find_all function doesn't work inside of main
我正在尝试在网站 conforama 上抓取数据,为此,我正在使用 BeautifulSoup。我正在尝试检索商品的价格、描述、评分、url 和评论数量,并在 3 个页面上循环执行此操作。
首先,我导入了所需的库
import csv
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
我定义了第一个函数:get_url,它根据特定的 search_term 生成格式正确的 url,并在 url 末尾预留一个用于填入页码的占位符
def get_url(search_term):
    """Build the Conforama search URL for *search_term*.

    Spaces in the term become '+', and a '{}' placeholder for the page
    number is appended so callers can do ``get_url(t).format(page)``.
    """
    base = 'https://www.conforama.fr/recherche-conforama/{}'.format(
        search_term.replace(' ', '+'))
    return base + '?P1-PRODUCTS%5Bpage%5D={}'
我定义了第二个来去除一些使数据不可读的内容
def format_number(number):
    """Return the leading run of price-like characters in *number*.

    Keeps digits, the euro sign, commas and dots, stopping at the first
    other character (e.g. ``"123€ sur 456"`` -> ``"123€"``).

    Bug fix: the original only returned from inside the loop, so a string
    made entirely of valid characters fell off the end and returned None.
    """
    allowed = '0123456789€,.'
    new_number = ''
    for ch in number:
        if ch not in allowed:
            break
        new_number += ch
    return new_number
我定义了第三个函数,它将记录并从中提取我需要的所有信息:它的价格、描述、url、评分和评论数量。
def extract_record(item):
    """Extract (description, price, rating, review-count, url) from one
    product ``<li>`` Tag, or return None when a required field is missing.

    Bug fix: the original wrapped list indexing in ``except AttributeError``,
    but ``find_all`` returns a list — an empty result raises IndexError and a
    missing attribute raises KeyError, so those failures were never caught.
    """
    descriptions = item.find_all("a", {"class": "bindEvent"})
    # Need at least 4 anchors: [1]+[2] form the description, [2] carries the
    # product URL, [3] carries the review count.
    if len(descriptions) < 4:
        return None
    description = descriptions[1].text.strip() + ' ' + descriptions[2].text.strip()
    url = descriptions[2]['href']
    # Number of reviews, reduced to its leading numeric part.
    nor = format_number(descriptions[3].text.strip())
    # Rating: catch the exceptions that can actually occur here.
    try:
        rating = item.find_all("span", {"class": "stars"})[0]['data']
    except (IndexError, KeyError):
        return None
    # Price.
    try:
        price = item.find_all("div", {"class": "price-product"})[0].text.strip()
    except IndexError:
        return None
    return (description, format_number(price), rating, nor, url)
最后,我将所有函数集中在一个主函数中,这将使我能够遍历我需要从中提取的所有页面
def main(search_term):
    """Scrape the first 3 Conforama result pages for *search_term*."""
    #product_name = search_term
    driver = webdriver.Chrome(ChromeDriverManager().install())
    records = []  # accumulated (description, price, rating, reviews, url) tuples
    url = get_url(search_term)
    somme = 0  # running total of product cards seen across all pages
    for page in range (1,4):
        driver.get(url.format(page))
        # NOTE(review): page_source is read immediately after get(); the
        # JS-rendered product list is likely not in the DOM yet at this
        # point, which is why find_all below comes back empty — the answer
        # further down fixes this by waiting for the elements first.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print('longueur soup', len(soup))
        print(soup)
        results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
        print(len(results))
        somme+=len(results)
        for result in results:
            record = extract_record(result)
            if record:  # extract_record returns None when a field is missing
                print(record)
                records.append(record)
    driver.close()
    print('somme',somme)
现在的问题是,当我 运行 所有命令一一执行时:
# Manual, step-by-step reproduction: run interactively, the page has time to
# finish its JavaScript rendering between statements, so the product cards
# are already in the DOM when page_source is read — which is why this works
# while the uninterrupted loop in main() does not.
driver = webdriver.Chrome(ChromeDriverManager().install())
url = get_url('couch').format(1)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
item = results[0]
extracted = extract_record(item)
一切都很好,extract_record 函数 return 正是我需要的。
但是,当我运行主函数时,这行代码:
results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
没有return任何结果,即使我知道当我在主函数之外执行它时它有结果
有没有人遇到同样的问题,你知道我做错了什么以及如何解决吗?
非常感谢阅读和尝试回答
会发生什么?
主要问题是元素需要一些时间来生成/显示,并且在您抓取 driver.page_source
的那一刻它们不可用。
如何修复?
使用seleniums waits直到找到特定元素:
# Block for up to 10 s until at least one product card with a price has been
# rendered, then parse the now-complete DOM.
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.ais-Hits-item.box-product.fragItem div.price-product')))
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
例子
...
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
...
def main(search_term):
    """Scrape the first 3 Conforama result pages for *search_term*,
    waiting for the JS-rendered product list before parsing each page."""
    # Bug fix: By is used below but never imported in this snippet's import
    # list; import it locally so the function is self-contained.
    from selenium.webdriver.common.by import By

    #product_name = search_term
    driver = webdriver.Chrome(ChromeDriverManager().install())
    records = []  # accumulated (description, price, rating, reviews, url) tuples
    url = get_url(search_term)
    somme = 0  # running total of product cards seen across all pages
    for page in range(1, 4):
        driver.get(url.format(page))
        print(url.format(page))
        # The result list is injected by JavaScript after page load; wait
        # until at least one priced product card exists before reading
        # page_source, otherwise find_all() returns nothing.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.ais-Hits-item.box-product.fragItem div.price-product')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
        somme += len(results)
        for result in results:
            record = extract_record(result)
            if record:  # extract_record returns None when a field is missing
                print(record)
                records.append(record)
    driver.close()
    print('somme', somme)


main('matelas')
我正在尝试在网站 conforama 上抓取数据,为此,我正在使用 BeautifulSoup。我正在尝试检索商品的价格、描述、评分、url 和评论数量,并在 3 个页面上循环执行此操作。
首先,我导入了所需的库
import csv
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
我定义了第一个函数:get_url,它根据特定的 search_term 生成格式正确的 url,并在 url 末尾预留一个用于填入页码的占位符
def get_url(search_term):
    """Build the Conforama search URL for *search_term*.

    Spaces in the term become '+', and a '{}' placeholder for the page
    number is appended so callers can do ``get_url(t).format(page)``.
    """
    base = 'https://www.conforama.fr/recherche-conforama/{}'.format(
        search_term.replace(' ', '+'))
    return base + '?P1-PRODUCTS%5Bpage%5D={}'
我定义了第二个来去除一些使数据不可读的内容
def format_number(number):
    """Return the leading run of price-like characters in *number*.

    Keeps digits, the euro sign, commas and dots, stopping at the first
    other character (e.g. ``"123€ sur 456"`` -> ``"123€"``).

    Bug fix: the original only returned from inside the loop, so a string
    made entirely of valid characters fell off the end and returned None.
    """
    allowed = '0123456789€,.'
    new_number = ''
    for ch in number:
        if ch not in allowed:
            break
        new_number += ch
    return new_number
我定义了第三个函数,它将记录并从中提取我需要的所有信息:它的价格、描述、url、评分和评论数量。
def extract_record(item):
    """Extract (description, price, rating, review-count, url) from one
    product ``<li>`` Tag, or return None when a required field is missing.

    Bug fix: the original wrapped list indexing in ``except AttributeError``,
    but ``find_all`` returns a list — an empty result raises IndexError and a
    missing attribute raises KeyError, so those failures were never caught.
    """
    descriptions = item.find_all("a", {"class": "bindEvent"})
    # Need at least 4 anchors: [1]+[2] form the description, [2] carries the
    # product URL, [3] carries the review count.
    if len(descriptions) < 4:
        return None
    description = descriptions[1].text.strip() + ' ' + descriptions[2].text.strip()
    url = descriptions[2]['href']
    # Number of reviews, reduced to its leading numeric part.
    nor = format_number(descriptions[3].text.strip())
    # Rating: catch the exceptions that can actually occur here.
    try:
        rating = item.find_all("span", {"class": "stars"})[0]['data']
    except (IndexError, KeyError):
        return None
    # Price.
    try:
        price = item.find_all("div", {"class": "price-product"})[0].text.strip()
    except IndexError:
        return None
    return (description, format_number(price), rating, nor, url)
最后,我将所有函数集中在一个主函数中,这将使我能够遍历我需要从中提取的所有页面
def main(search_term):
    """Scrape the first 3 Conforama result pages for *search_term*."""
    #product_name = search_term
    driver = webdriver.Chrome(ChromeDriverManager().install())
    records = []  # accumulated (description, price, rating, reviews, url) tuples
    url = get_url(search_term)
    somme = 0  # running total of product cards seen across all pages
    for page in range (1,4):
        driver.get(url.format(page))
        # NOTE(review): page_source is read immediately after get(); the
        # JS-rendered product list is likely not in the DOM yet at this
        # point, which is why find_all below comes back empty — the answer
        # further down fixes this by waiting for the elements first.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print('longueur soup', len(soup))
        print(soup)
        results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
        print(len(results))
        somme+=len(results)
        for result in results:
            record = extract_record(result)
            if record:  # extract_record returns None when a field is missing
                print(record)
                records.append(record)
    driver.close()
    print('somme',somme)
现在的问题是,当我 运行 所有命令一一执行时:
# Manual, step-by-step reproduction: run interactively, the page has time to
# finish its JavaScript rendering between statements, so the product cards
# are already in the DOM when page_source is read — which is why this works
# while the uninterrupted loop in main() does not.
driver = webdriver.Chrome(ChromeDriverManager().install())
url = get_url('couch').format(1)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
item = results[0]
extracted = extract_record(item)
一切都很好,extract_record 函数 return 正是我需要的。 但是,当我运行主函数时,这行代码:
results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
没有return任何结果,即使我知道当我在主函数之外执行它时它有结果
有没有人遇到同样的问题,你知道我做错了什么以及如何解决吗? 非常感谢阅读和尝试回答
会发生什么?
主要问题是元素需要一些时间来生成/显示,并且在您抓取 driver.page_source
的那一刻它们不可用。
如何修复?
使用seleniums waits直到找到特定元素:
# Block for up to 10 s until at least one product card with a price has been
# rendered, then parse the now-complete DOM.
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.ais-Hits-item.box-product.fragItem div.price-product')))
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
例子
...
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
...
def main(search_term):
    """Scrape the first 3 Conforama result pages for *search_term*,
    waiting for the JS-rendered product list before parsing each page."""
    # Bug fix: By is used below but never imported in this snippet's import
    # list; import it locally so the function is self-contained.
    from selenium.webdriver.common.by import By

    #product_name = search_term
    driver = webdriver.Chrome(ChromeDriverManager().install())
    records = []  # accumulated (description, price, rating, reviews, url) tuples
    url = get_url(search_term)
    somme = 0  # running total of product cards seen across all pages
    for page in range(1, 4):
        driver.get(url.format(page))
        print(url.format(page))
        # The result list is injected by JavaScript after page load; wait
        # until at least one priced product card exists before reading
        # page_source, otherwise find_all() returns nothing.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.ais-Hits-item.box-product.fragItem div.price-product')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('li', {'class' : 'ais-Hits-item box-product fragItem'})
        somme += len(results)
        for result in results:
            record = extract_record(result)
            if record:  # extract_record returns None when a field is missing
                print(record)
                records.append(record)
    driver.close()
    print('somme', somme)


main('matelas')