Python 中的网页抓取问题
Issue With Web Scraping In Python
不知为何,当我运行这个脚本想得到结果时,它直接崩溃了,而且在我得到任何输出之前完全没有显示任何错误。请帮我让它正常工作。我不知道原因是什么,我猜可能与获取 items 变量的方式有关,但我就是想不通!任何帮助都将不胜感激。
这是脚本:
from bs4 import BeautifulSoup
import requests
import re
import time
# Interactive Newegg deal finder: asks for a search term, walks every result
# page, and prints the matching products from cheapest to most expensive.
print("Computer Deal Finder")
print("\nBy: ViridianTelamon.")
print("\nThis Program Will Help You Find The Best Computers, Adapters, Electronics, And Computer Components Using The Website New Egg.")

item_thing = input("\nEnter The Item You Want To Find The Best Deals On: ")

time.sleep(2)

SEARCH_URL = "https://www.newegg.ca/p/pl"

# Passing the query through `params` lets requests URL-encode the search term
# and join the fields with "&" (a hand-built URL is easy to get wrong).
first_page = requests.get(SEARCH_URL, params={"d": item_thing, "N": 4131}).text
doc = BeautifulSoup(first_page, "html.parser")

# .find() returns None when nothing matches, so check before dereferencing.
pagination = doc.find(class_="list-tool-pagination-text")
page_text = pagination.strong if pagination is not None else None
if page_text is None:
    # No pagination element on the page: only fetch a single page.
    pages = 1
else:
    # Render the tag to markup, take the second-to-last "/"-separated piece,
    # keep what follows its last ">", and drop the trailing character.
    # NOTE(review): presumably the markup is "<strong>1<!-- -->/<!-- -->N</strong>"
    # and this yields N, the page count - confirm against the live site.
    pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

# A dict keyed by product name. It has to be a dict: assigning to a list with
# a text key raises TypeError.
items_found = {}

# Escape the term so characters such as "+" or "(" are matched literally
# instead of being interpreted as regex syntax. The match is case-sensitive.
name_pattern = re.compile(re.escape(item_thing))

for page_number in range(1, pages + 1):
    html = requests.get(
        SEARCH_URL,
        params={"d": item_thing, "N": 4131, "page": page_number},
    ).text
    doc = BeautifulSoup(html, "html.parser")

    for item in doc.find_all(string=name_pattern):
        # Keep only matches whose direct parent is an <a> tag, because the
        # link's href is needed for the output.
        parent = item.parent
        if parent is None or parent.name != "a":
            continue

        link = parent.get("href")
        container = item.find_parent(class_="item-container")
        if link is None or container is None:
            continue

        # Not every listing carries a price; skip the ones that do not
        # instead of hiding every possible error behind a bare except.
        price_tag = container.find(class_="price-current")
        strong = price_tag.find("strong") if price_tag is not None else None
        if strong is None or strong.string is None:
            continue

        try:
            price = int(strong.string.replace(",", ""))
        except ValueError:
            # The <strong> text was not a whole number.
            continue

        items_found[str(item)] = {"price": price, "link": link}

# Sort the (name, info) pairs by price, cheapest first.
sorted_items = sorted(items_found.items(), key=lambda entry: entry[1]["price"])

print("\n--------------------")

for name, info in sorted_items:
    print(f"\nName: {name}")
    print(f"\nPrice: ${info['price']}")
    print(f"\nURL: {info['link']}")
    print("\n--------------------")
    time.sleep(0.2)
我建议您检查 .find() 调用的结果,因为并非所有商品条目都包含您需要的信息。例如:
from bs4 import BeautifulSoup
import requests
import re
import time
# Example: list every "adapter" search result on Newegg, cheapest first.
# Each .find() result is checked before use, because .find() returns None
# when a listing lacks the element being looked for.
item_thing = "adapter"

url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
html = requests.get(url).text
doc = BeautifulSoup(html, "html.parser")

pagination = doc.find(class_="list-tool-pagination-text")
page_text = pagination.strong if pagination is not None else None
if page_text is None:
    # No pagination element on the page: only fetch a single page.
    pages = 1
else:
    # Render the tag to markup, take the second-to-last "/"-separated piece,
    # keep what follows its last ">", and drop the trailing character.
    # NOTE(review): presumably the markup is "<strong>1<!-- -->/<!-- -->N</strong>"
    # and this yields N, the page count - confirm against the live site.
    pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

# (price, link, name) tuples; price comes first so a plain sort orders by it.
items_found = []

for page in range(1, pages + 1):
    print(f"Getting page {page}")
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131&page={page}"
    req = requests.get(url)
    doc = BeautifulSoup(req.content, "html.parser")

    for div in doc.find_all('div', class_="item-container"):
        # A container without a titled link cannot be reported; skip it
        # rather than subscripting None.
        a_tag = div.find('a', class_='item-title', href=True)
        if a_tag is None:
            continue

        price = 0   # assume unknown price
        li_price = div.find(class_='price-current')
        strong = li_price.find('strong') if li_price is not None else None
        if strong is not None:
            try:
                price = float(strong.text.replace(',', ''))
            except ValueError:
                # Non-numeric price text: keep the "unknown" default.
                pass

        items_found.append((price, a_tag['href'], a_tag.text))

for price, link, name in sorted(items_found):
    print(f"Name: {name}")
    print(f"Price: ${price}")
    print(f"URL: {link}")
    print("--------------------")
这样得到的输出开头如下:
Name: axGear Universal Brass 3.5mm Male to 6.5mm Female Stereo Audio Adapter Jack Connector
Price: $0.0
URL: https://www.newegg.ca/p/231-0099-00023?Description=adapter&cm_re=adapter-_-9SIAD1NC9E3870-_-Product
--------------------
Name: axGear USB-C Female to USB 3.0 Male Adapter Converter Type C to USB 3 F/M
Price: $0.0
URL: https://www.newegg.ca/p/231-0099-00018?Description=adapter&cm_re=adapter-_-9SIAD1NB4E4533-_-Product
--------------------
Name: ORICO USB to Bluetooth 4.0 Portable Adapter Wireless Receiver Adapter Dongle -White
Price: $0.0
URL: https://www.newegg.ca/orico-bta-403/p/0XM-000H-00009?Description=adapter&cm_re=adapter-_-0XM-000H-00009-_-Product
--------------------
不知为何,当我运行这个脚本想得到结果时,它直接崩溃了,而且在我得到任何输出之前完全没有显示任何错误。请帮我让它正常工作。我不知道原因是什么,我猜可能与获取 items 变量的方式有关,但我就是想不通!任何帮助都将不胜感激。
这是脚本:
from bs4 import BeautifulSoup
import requests
import re
import time
# Interactive Newegg deal finder: asks for a search term, walks every result
# page, and prints the matching products from cheapest to most expensive.
print("Computer Deal Finder")
print("\nBy: ViridianTelamon.")
print("\nThis Program Will Help You Find The Best Computers, Adapters, Electronics, And Computer Components Using The Website New Egg.")

item_thing = input("\nEnter The Item You Want To Find The Best Deals On: ")

time.sleep(2)

SEARCH_URL = "https://www.newegg.ca/p/pl"

# Passing the query through `params` lets requests URL-encode the search term
# and join the fields with "&" (a hand-built URL is easy to get wrong).
first_page = requests.get(SEARCH_URL, params={"d": item_thing, "N": 4131}).text
doc = BeautifulSoup(first_page, "html.parser")

# .find() returns None when nothing matches, so check before dereferencing.
pagination = doc.find(class_="list-tool-pagination-text")
page_text = pagination.strong if pagination is not None else None
if page_text is None:
    # No pagination element on the page: only fetch a single page.
    pages = 1
else:
    # Render the tag to markup, take the second-to-last "/"-separated piece,
    # keep what follows its last ">", and drop the trailing character.
    # NOTE(review): presumably the markup is "<strong>1<!-- -->/<!-- -->N</strong>"
    # and this yields N, the page count - confirm against the live site.
    pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

# A dict keyed by product name. It has to be a dict: assigning to a list with
# a text key raises TypeError.
items_found = {}

# Escape the term so characters such as "+" or "(" are matched literally
# instead of being interpreted as regex syntax. The match is case-sensitive.
name_pattern = re.compile(re.escape(item_thing))

for page_number in range(1, pages + 1):
    html = requests.get(
        SEARCH_URL,
        params={"d": item_thing, "N": 4131, "page": page_number},
    ).text
    doc = BeautifulSoup(html, "html.parser")

    for item in doc.find_all(string=name_pattern):
        # Keep only matches whose direct parent is an <a> tag, because the
        # link's href is needed for the output.
        parent = item.parent
        if parent is None or parent.name != "a":
            continue

        link = parent.get("href")
        container = item.find_parent(class_="item-container")
        if link is None or container is None:
            continue

        # Not every listing carries a price; skip the ones that do not
        # instead of hiding every possible error behind a bare except.
        price_tag = container.find(class_="price-current")
        strong = price_tag.find("strong") if price_tag is not None else None
        if strong is None or strong.string is None:
            continue

        try:
            price = int(strong.string.replace(",", ""))
        except ValueError:
            # The <strong> text was not a whole number.
            continue

        items_found[str(item)] = {"price": price, "link": link}

# Sort the (name, info) pairs by price, cheapest first.
sorted_items = sorted(items_found.items(), key=lambda entry: entry[1]["price"])

print("\n--------------------")

for name, info in sorted_items:
    print(f"\nName: {name}")
    print(f"\nPrice: ${info['price']}")
    print(f"\nURL: {info['link']}")
    print("\n--------------------")
    time.sleep(0.2)
我建议您检查 .find() 调用的结果,因为并非所有商品条目都包含您需要的信息。例如:
from bs4 import BeautifulSoup
import requests
import re
import time
# Example: list every "adapter" search result on Newegg, cheapest first.
# Each .find() result is checked before use, because .find() returns None
# when a listing lacks the element being looked for.
item_thing = "adapter"

url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
html = requests.get(url).text
doc = BeautifulSoup(html, "html.parser")

pagination = doc.find(class_="list-tool-pagination-text")
page_text = pagination.strong if pagination is not None else None
if page_text is None:
    # No pagination element on the page: only fetch a single page.
    pages = 1
else:
    # Render the tag to markup, take the second-to-last "/"-separated piece,
    # keep what follows its last ">", and drop the trailing character.
    # NOTE(review): presumably the markup is "<strong>1<!-- -->/<!-- -->N</strong>"
    # and this yields N, the page count - confirm against the live site.
    pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

# (price, link, name) tuples; price comes first so a plain sort orders by it.
items_found = []

for page in range(1, pages + 1):
    print(f"Getting page {page}")
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131&page={page}"
    req = requests.get(url)
    doc = BeautifulSoup(req.content, "html.parser")

    for div in doc.find_all('div', class_="item-container"):
        # A container without a titled link cannot be reported; skip it
        # rather than subscripting None.
        a_tag = div.find('a', class_='item-title', href=True)
        if a_tag is None:
            continue

        price = 0   # assume unknown price
        li_price = div.find(class_='price-current')
        strong = li_price.find('strong') if li_price is not None else None
        if strong is not None:
            try:
                price = float(strong.text.replace(',', ''))
            except ValueError:
                # Non-numeric price text: keep the "unknown" default.
                pass

        items_found.append((price, a_tag['href'], a_tag.text))

for price, link, name in sorted(items_found):
    print(f"Name: {name}")
    print(f"Price: ${price}")
    print(f"URL: {link}")
    print("--------------------")
这样得到的输出开头如下:
Name: axGear Universal Brass 3.5mm Male to 6.5mm Female Stereo Audio Adapter Jack Connector
Price: $0.0
URL: https://www.newegg.ca/p/231-0099-00023?Description=adapter&cm_re=adapter-_-9SIAD1NC9E3870-_-Product
--------------------
Name: axGear USB-C Female to USB 3.0 Male Adapter Converter Type C to USB 3 F/M
Price: $0.0
URL: https://www.newegg.ca/p/231-0099-00018?Description=adapter&cm_re=adapter-_-9SIAD1NB4E4533-_-Product
--------------------
Name: ORICO USB to Bluetooth 4.0 Portable Adapter Wireless Receiver Adapter Dongle -White
Price: $0.0
URL: https://www.newegg.ca/orico-bta-403/p/0XM-000H-00009?Description=adapter&cm_re=adapter-_-0XM-000H-00009-_-Product
--------------------