Beautifulsoup 亚马逊产品详情
Beautifoulsoup Amazon Product Detail
我无法使用请求或 requests_html 抓取“产品详细信息”部分(向下滚动网页您会找到它)html。
Find_all returns 一个 0 大小的对象...有帮助吗?
from requests import session
from requests_html import HTMLSession
s = HTMLSession()
#s = session()
r = s.get("https://www.amazon.com/dp/B094HWN66Y")
soup = BeautifulSoup(r.text, 'html.parser')
len(soup.find_all("div", {"id":"detailBulletsWrapper_feature_div"}))
这是一个示例,说明如何使用 bs4
和 requests
抓取产品标题,可轻松扩展以获取产品的其他信息。
你的请求不起作用的原因是你的请求没有 headers 所以亚马逊意识到你是一个机器人并且不希望你抓取他们的网站。这由您的请求显示为 <Response [503]>
并在 r.text
中解释。
我相信亚马逊有一个 API 用于此(他们可能希望您使用)但是像这样抓取 small-scale 东西会很好。
import requests
import bs4
# Amazon don't like you scrapeing them however these headers should stop them from noticing a small number of requests
HEADERS = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US, en;q=0.5'})
def main():
url = "https://www.amazon.com/dp/B094HWN66Y"
title = get_title(url)
print("The title of %s is: %s" % (url, title))
def get_title(url: str) -> str:
"""Returns the title of the amazon product."""
# The request
r = requests.get(url, headers=HEADERS)
# Parse the content
soup = bs4.BeautifulSoup(r.content, 'html.parser')
title = soup.find("span", attrs={"id": 'productTitle'}).string
return title
if __name__ == "__main__":
main()
输出:
The title of https://www.amazon.com/dp/B094HWN66Y is: Will They, Won't They?
具有不同信息的产品详细信息:
代码:
from bs4 import BeautifulSoup
import requests
cookies = {'session': '131-1062572-6801905'}
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
r = requests.get("https://www.amazon.com/dp/B094HWN66Y",headers=headers,cookies=cookies)
print(r)
soup = BeautifulSoup(r.text, 'lxml')
key = [x.get_text(strip=True).replace('\u200f\n','').replace('\u200e','').replace(':\n','').replace('\n', '').strip() for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span.a-text-bold')][:13]
#print(key)
value = [x.get_text(strip=True) for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span:nth-child(2)')]
#print(value)
product_details = {k:v for k, v, in zip(key, value)}
print(product_details)
输出:
{'ASIN': 'B094HWN66Y', 'Publisher': 'Boldwood Books (September 7, 2021)', 'Publication date':
'September 7, 2021', 'Language': 'English', 'File size': '1883 KB', 'Text-to-Speech': 'Enabled', 'Screen Reader': 'Supported', 'Enhanced typesetting': 'Enabled', 'X-Ray': 'Enabled', 'Word
Wise': 'Enabled', 'Print length': '332 pages', 'Page numbers source ISBN': '1800487622', 'Lending': 'Not Enabled'}
我无法使用请求或 requests_html 抓取“产品详细信息”部分(向下滚动网页您会找到它)html。 Find_all returns 一个 0 大小的对象...有帮助吗?
from requests import session
from requests_html import HTMLSession
s = HTMLSession()
#s = session()
r = s.get("https://www.amazon.com/dp/B094HWN66Y")
soup = BeautifulSoup(r.text, 'html.parser')
len(soup.find_all("div", {"id":"detailBulletsWrapper_feature_div"}))
这是一个示例,说明如何使用 bs4
和 requests
抓取产品标题,可轻松扩展以获取产品的其他信息。
你的请求不起作用的原因是你的请求没有 headers 所以亚马逊意识到你是一个机器人并且不希望你抓取他们的网站。这由您的请求显示为 <Response [503]>
并在 r.text
中解释。
我相信亚马逊有一个 API 用于此(他们可能希望您使用)但是像这样抓取 small-scale 东西会很好。
import requests
import bs4
# Amazon don't like you scrapeing them however these headers should stop them from noticing a small number of requests
HEADERS = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US, en;q=0.5'})
def main():
url = "https://www.amazon.com/dp/B094HWN66Y"
title = get_title(url)
print("The title of %s is: %s" % (url, title))
def get_title(url: str) -> str:
"""Returns the title of the amazon product."""
# The request
r = requests.get(url, headers=HEADERS)
# Parse the content
soup = bs4.BeautifulSoup(r.content, 'html.parser')
title = soup.find("span", attrs={"id": 'productTitle'}).string
return title
if __name__ == "__main__":
main()
输出:
The title of https://www.amazon.com/dp/B094HWN66Y is: Will They, Won't They?
具有不同信息的产品详细信息:
代码:
from bs4 import BeautifulSoup
import requests
cookies = {'session': '131-1062572-6801905'}
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
r = requests.get("https://www.amazon.com/dp/B094HWN66Y",headers=headers,cookies=cookies)
print(r)
soup = BeautifulSoup(r.text, 'lxml')
key = [x.get_text(strip=True).replace('\u200f\n','').replace('\u200e','').replace(':\n','').replace('\n', '').strip() for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span.a-text-bold')][:13]
#print(key)
value = [x.get_text(strip=True) for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span:nth-child(2)')]
#print(value)
product_details = {k:v for k, v, in zip(key, value)}
print(product_details)
输出:
{'ASIN': 'B094HWN66Y', 'Publisher': 'Boldwood Books (September 7, 2021)', 'Publication date':
'September 7, 2021', 'Language': 'English', 'File size': '1883 KB', 'Text-to-Speech': 'Enabled', 'Screen Reader': 'Supported', 'Enhanced typesetting': 'Enabled', 'X-Ray': 'Enabled', 'Word
Wise': 'Enabled', 'Print length': '332 pages', 'Page numbers source ISBN': '1800487622', 'Lending': 'Not Enabled'}