Python 中的网页抓取问题

Question

不知为何，每当我运行这个脚本想获取结果时，它就直接崩溃了，而且在我得到任何输出之前没有显示任何错误信息。请帮我让它正常工作。我不知道原因是什么，我猜可能在某种程度上与获取 items 变量有关，但我就是想不明白！如有任何帮助，将不胜感激。

这是脚本：

from bs4 import BeautifulSoup
import requests
import re
import time

# Question script, kept exactly as posted: it asks for a search term, walks
# the Newegg search result pages, collects links whose text matches the term
# together with their price, and tries to print them sorted by price.
# The NOTE(review) comments below mark the defects visible in the code.
print("Computer Deal Finder")

print("\nBy:  ViridianTelamon.")

print("\nThis Program Will Help You Find The Best  Computers, Adapters, Electronics, And Computer Components Using The Website New Egg.")

# Raw user input; it is later interpolated into URLs and compiled as a regex.
item_thing = input("\nEnter The Item You Want To Find The Best Deals On:  ")

time.sleep(2)

#url = f"https://www.amazon.com/s?k={item}&page=1&crid=1BE844NMMQSV7&sprefix={item}%2Caps%2C1923&ref=nb_sb_noss_1"

# First request: fetch the search page only to read the pagination text.
# NOTE(review): the search term is placed in the URL without URL-encoding.
url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")

#page_text = doc.find(class_="s-pagination-item s-pagination-selected")

# NOTE(review): find() returns None when nothing matches, in which case
# `.strong` raises AttributeError.
page_text = doc.find(class_="list-tool-pagination-text").strong
# Parses the tag's HTML string: take the second-to-last "/"-separated piece,
# keep what follows its last ">", drop the final character and convert to
# int. Presumably this yields the total page count -- depends on the exact
# markup Newegg returns.
pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

# Declared as a list here, but assigned to like a dict further down.
items_found = []

# `page` is the page number here and is rebound to the HTML text inside the
# loop body; the next iteration resets it to the next number.
for page in range(1, pages + 1):
    # NOTE(review): there is no "&" between "N=4131" and "page=", so the
    # query string ends in "N=4131page=<n>" instead of carrying a separate
    # page parameter.
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131page={page}"
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")

    # All text nodes matching the search term. The term is compiled as a
    # regular expression as-is: the match is case-sensitive and regex
    # metacharacters in the input are interpreted, not escaped.
    items = doc.find_all(text=re.compile(item_thing))

    #items = div.find_all(text=re.compile(item_thing))

    for item in items:
        parent = item.parent
        link = None
        # Only keep text nodes that sit directly inside an <a> tag.
        if parent.name != "a":
            continue

        # Outside the try block: an <a> without href raises KeyError here.
        link = parent['href']

        # May be None when the link is not inside an "item-container".
        next_parent = item.find_parent(class_="item-container")
        try:
            price = next_parent.find(class_="price-current").find("strong").string
            # NOTE(review): items_found is a list, so indexing it with `item`
            # (a string node, not an integer) raises TypeError. The bare
            # except below swallows it, so nothing is ever stored. The keys
            # "Price:  " / "URL:  " also differ from the 'price' / 'link'
            # keys read later.
            items_found[item] = {"Price:  ": int(price.replace(",", "")), "URL:  ": link}
        except:
            # NOTE(review): bare except hides every failure above, including
            # the TypeError from the list assignment and any AttributeError
            # from a None lookup.
            pass

#sorted_items = sorted(items_found.items(), key=lambda x: x[1]['price'])
# The key function expects (name, {"price": ...}) pairs; with an empty list it
# is never called, so this yields an empty list without raising.
sorted_items = sorted(items_found, key=lambda x: x[1]['price'])

print("\n--------------------")

# With nothing stored in items_found this loop body never runs, so only the
# separator line above is printed.
for item in sorted_items:
    print("\n"f"Name:  {item[0]}")
    # NOTE(review): reads `items` (the last page's search results) instead of
    # the loop variable `item`.
    print("\n"f"Price:  ${items[1]['price']}")
    # NOTE(review): no braces in this f-string, so the text
    # items[1]['link'] is printed literally.
    print("\n"f"URL:  items[1]['link']")
    print("\n--------------------")
    time.sleep(0.2)

Answer 1

我建议您测试 .find() 调用的结果，因为并非所有项目都包含您需要的信息。例如：

from bs4 import BeautifulSoup
import requests
import re
import time

# Example scraper: list Newegg search results for one term, cheapest first.
# Every .find() result is checked before use, because not every result
# container carries a price or a title link.

# Search term; hard-coded for the example.
item_thing = "adapter"

# Let requests build and percent-encode the query string instead of
# interpolating the raw term into the URL.
search_url = "https://www.newegg.ca/p/pl"
search_params = {"d": item_thing, "N": 4131}
request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

# First request is only used to read the pagination text (e.g. "1/100").
req = requests.get(search_url, params=search_params, timeout=request_timeout)
req.raise_for_status()
doc = BeautifulSoup(req.content, "html.parser")

# Default to a single page when the pagination element is missing.
pages = 1
pagination = doc.find(class_="list-tool-pagination-text")
page_text = pagination.strong if pagination is not None else None

if page_text is not None:
    # The last number in the pagination text is the total page count.
    numbers = re.findall(r"\d+", page_text.get_text())
    if numbers:
        pages = int(numbers[-1])

# Each entry is [price, link, name]; price first so a plain sort orders by it.
items_found = []

for page in range(1, pages + 1):
    print(f"Getting page {page}")
    req = requests.get(
        search_url,
        params={**search_params, "page": page},
        timeout=request_timeout,
    )
    req.raise_for_status()
    doc = BeautifulSoup(req.content, "html.parser")

    for div in doc.find_all("div", class_="item-container"):
        a_tag = div.find("a", class_="item-title", href=True)
        if a_tag is None:
            # No title link: there is nothing useful to report for this entry.
            continue

        price = 0   # assume unknown price
        li_price = div.find(class_="price-current")
        strong = li_price.find("strong") if li_price is not None else None

        if strong is not None:
            try:
                price = float(strong.text.replace(",", ""))
            except ValueError:
                # Price text is not a plain number; keep it as unknown (0).
                pass

        items_found.append([price, a_tag["href"], a_tag.text])

for price, link, name in sorted(items_found):
    print(f"Name:  {name}")
    print(f"Price:  ${price}")
    print(f"URL:  {link}")
    print("--------------------")

运行后，输出结果的开头部分如下：

Name:  axGear Universal Brass 3.5mm Male to 6.5mm Female Stereo Audio Adapter Jack Connector
Price:  $1.0
URL:  https://www.newegg.ca/p/231-0099-00023?Description=adapter&cm_re=adapter-_-9SIAD1NC9E3870-_-Product
--------------------
Name:  axGear USB-C Female to USB 3.0 Male Adapter Converter Type C to USB 3 F/M
Price:  $1.0
URL:  https://www.newegg.ca/p/231-0099-00018?Description=adapter&cm_re=adapter-_-9SIAD1NB4E4533-_-Product
--------------------
Name:  ORICO USB to Bluetooth 4.0 Portable Adapter Wireless Receiver Adapter Dongle -White
Price:  $1.0
URL:  https://www.newegg.ca/orico-bta-403/p/0XM-000H-00009?Description=adapter&cm_re=adapter-_-0XM-000H-00009-_-Product
--------------------

Python 中的网页抓取问题

Issue With Web Scraping In Python

python

windows

beautifulsoup

web-scraping