Scraped the <li> of a product detail in a website via BeautifulSoup of Python, but can't export it to csv
I'm having trouble with a scraping script I wrote in Python.
I'm scraping the features and specifications (they are bullet points) from a list of product URLs:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.academy.com/shop/browse/footwear/womens-footwear/womens-work-boots?&page_{}'

def trade_spider(max_pages):
    data = []
    page = 1
    while page <= max_pages:
        current_url = url.format(page)
        source_code = requests.get(current_url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'product-detail-container'}):
            href = "https://www.academy.com/" + link.get('href')
            name = get_single_item_data1(href)
            features = get_single_item_data5(href)
            specs = get_single_item_data6(href)
            entry = [name, features, specs]
            data.append(entry)
        page += 1
    return data

def get_single_item_data1(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for item_name in soup.findAll('div', {'class': 'flex-wrap flex-fill'}):
        print('name:', item_name.string)
        return item_name.string

def get_single_item_data5(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for features in soup.findAll('li', {'data-auid': 'feature-benefits-listing'}):
        print('features: ', features.string)
        return features.string

def get_single_item_data6(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for spec in soup.findAll('li', {'data-auid': 'specifications_listing'}):
        print('specifications: ', spec.text)
        return spec.text

row_data = trade_spider(1)
row_headers = ['name', 'features', 'specs']

with open('data.csv', 'w') as f:
    write = csv.writer(f)
    write.writerow(row_headers)
    write.writerows(row_data)
print()
The scraping works whenever I print out the results, but when they are saved to the csv file only the first bullet point is written.
Desired sample output:
Name: Lace Up Work Boots
Features: Steel toes safeguard your feet Suede uppers provide durability 5.5" boot height Meets ASTM F2413-11 I/75 C/75 standards Goodyear welt construction EVA insoles offer cushioning
Specifications: Activity: Work Steel toe: Yes Safety Toe: Steel Gender: Women's Material: Suede Boot height (in.): 6" and Under Slip Resistant: Yes Waterproof: No Electrical hazard (EH) rated: No
But this is all I'm getting right now:
Name: Lace Up Work Boots
Features: Steel toes safeguard your feet
Specifications: Activity: Work
Any help?
You can use this example of how to grab the data from the items and save it to CSV:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.academy.com/shop/browse/footwear/womens-footwear/womens-work-boots?&page_{}"

def grab_data(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    title = soup.h1.get_text(strip=True)
    features = [
        f.get_text(strip=True)
        for f in soup.select('[data-auid="feature-benefits-listing"]')
    ]
    specs = [
        f.get_text(strip=True)
        for f in soup.select('[data-auid="specifications_listing"]')
    ]

    return title, features, specs

with open("data.csv", "w") as f_in:
    csv_writer = csv.writer(f_in)
    csv_writer.writerow(["Name", "Features", "Specifications"])

    for page in range(1, 2):  # <-- adjust to your page count
        soup = BeautifulSoup(
            requests.get(url.format(page)).content, "html.parser"
        )

        for a in soup.select("a.detail-card"):
            u = "https://www.academy.com" + a["href"]
            title, feats, specs = grab_data(u)

            print(title)
            print("*** Features:")
            print(*feats, sep="\n")
            print("*** Specifications:")
            print(*specs, sep="\n")
            print("-" * 80)

            csv_writer.writerow([title, " ".join(feats), " ".join(specs)])
Prints:
Brazos Women's Tradesman Steel Toe Lace Up Work Boots
*** Features:
Steel toes safeguard your feet
Suede uppers provide durability
5.5" boot height
Meets ASTM F2413-11 I/75 C/75 standards
Goodyear welt construction
EVA insoles offer cushioning
*** Specifications:
Activity:Work
Steel toe:Yes
Safety Toe:Steel
Gender:Women's
Material:Suede
Boot height (in.):6" and Under
Slip Resistant:Yes
Waterproof:No
Electrical hazard (EH) rated:No
--------------------------------------------------------------------------------
Fila Women's Memory Workshift Service Shoes
*** Features:
Slip-resistant rubber outsoles
Synthetic uppers
Low-top style
EVA midsoles
*** Specifications:
Activity:Work
Steel toe:No
Safety Toe:Soft
Gender:Women's
Material:Man-made Materials
Slip Resistant:Yes
Waterproof:No
Electrical hazard (EH) rated:No
--------------------------------------------------------------------------------
...and so on.
And saves data.csv (screenshot from LibreOffice):
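For reference, the root cause in the original script is that each get_single_item_data* helper returns from inside its for loop, so the function exits after the first matching <li>. A minimal sketch of a fix that keeps the original structure would be to collect every match and join them before returning (get_all_bullets is a hypothetical helper; the data-auid values are copied from the question, and whether they still match the live site is an assumption):

import requests
from bs4 import BeautifulSoup

def get_all_bullets(item_url, auid):
    # Fetch the product page and return ALL matching bullet points as one
    # string, instead of returning from inside the loop after the first one.
    soup = BeautifulSoup(requests.get(item_url).text, 'html.parser')
    bullets = [li.get_text(strip=True)
               for li in soup.find_all('li', {'data-auid': auid})]
    return ' '.join(bullets)

# Inside trade_spider's loop, href is the product detail URL:
# features = get_all_bullets(href, 'feature-benefits-listing')
# specs = get_all_bullets(href, 'specifications_listing')

One more note: if the CSV ends up with blank lines between rows on Windows, open the file with open('data.csv', 'w', newline='') so csv.writer controls the line endings itself, as the csv module documentation recommends.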