使用 Python 进行网络抓取以从 sitemap.xml 中提取每个页面的微数据
Web-scraping with Python to extract microdata for each page from a sitemap.xml
我正在尝试从 sitemap.xml 列出的页面中提取名称、品牌、价格、库存等微数据,
但是我卡在了下面的步骤上。我是新手,看不出是哪里出了问题,感谢您的帮助。
- 抓取 sitemap.xml 以获得 url 列表:OK
- 提取元数据:OK
- 提取产品 schema:OK
- 提取产品不成功
- 抓取网站并存储产品不成功
- 抓取 sitemap.xml 以获得 url 列表:OK
import pandas as pd
import requests
import extruct
from w3lib.html import get_base_url
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import advertools as adv
# Load the sitemap into a DataFrame, then keep only the rows whose 'loc'
# matches 'boutique' and, of those, only the ones that also match '/fr/'.
proximus_sitemap = (
    adv.sitemap_to_df('https://www.proximus.be/iportal/sitemap.xml')
    .loc[lambda df: df['loc'].str.contains('boutique')]
    .loc[lambda df: df['loc'].str.contains('/fr/')]
)
- 提取元数据:OK
def extract_metadata(url, timeout=30):
    """Download a page and extract its structured data with extruct.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a limit, a single stalled server would block the
            whole crawl indefinitely.

    Returns:
        Whatever ``extruct.extract`` returns for the page when restricted
        to the JSON-LD, microdata and OpenGraph syntaxes with
        ``uniform=True``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    # The base URL is derived from the markup and the final response URL
    # and handed to extruct together with the page text.
    base_url = get_base_url(response.text, response.url)
    return extruct.extract(
        response.text,
        base_url=base_url,
        uniform=True,
        syntaxes=['json-ld', 'microdata', 'opengraph'],
    )
# Try the extraction on a single product page.
metadata = extract_metadata('https://www.proximus.be/fr/id_cr_apple-iphone-13-128gb-blue/particuliers/equipement/boutique/apple-iphone-13-128gb-blue.html')
# Bare expression: presumably shows the result when run in a notebook/REPL.
metadata
- 提取产品 schema:OK
def get_dictionary_by_key_value(dictionary, target_key, target_value):
    """Return the first item whose ``target_key`` equals ``target_value``.

    Args:
        dictionary: Mapping whose values are sequences of dict items
            (for example the per-syntax lists built by ``extruct``).
        target_key: Key to look up in every item, e.g. ``"@type"``.
        target_value: Value the key must have for the item to match.

    Returns:
        The first matching item, or ``None`` when nothing matches.
    """
    for items in dictionary.values():
        # ``or ()`` tolerates empty or missing (None) groups.
        for item in items or ():
            # Items without the key (or non-dict entries) are skipped
            # instead of raising KeyError/TypeError.
            if isinstance(item, dict) and item.get(target_key) == target_value:
                return item
    return None
# Pick the item whose "@type" is "Product" out of the extracted metadata.
Product = get_dictionary_by_key_value(metadata, "@type", "Product")
# Bare expression: presumably shows the result when run in a notebook/REPL.
Product
- 提取产品不成功 => 错误信息:KeyError: 'offers'
def get_products(metadata):
    """Build one flat record per offer of the page's Product item.

    The ``offers`` value of a Product is not always shaped the same way:
    it may be a single Offer/AggregateOffer dict, a list of offers, or an
    aggregate that nests the individual offers under its own ``offers``
    key. Indexing ``Product['offers']['offers']`` unconditionally raises
    ``KeyError: 'offers'`` for the flat shapes, so every shape is
    normalised to a list first.

    Args:
        metadata: Structured data as returned by ``extract_metadata``.

    Returns:
        A list of dicts with the keys ``product_name``, ``brand``,
        ``availability``, ``lowprice``, ``highprice``, ``price`` and
        ``priceCurrency`` (missing values are ``''``), or ``None`` when
        the page has no Product item.
    """
    product_schema = get_dictionary_by_key_value(metadata, "@type", "Product")
    if not product_schema:
        return None

    offers = product_schema.get('offers') or []
    if isinstance(offers, dict):
        # Use the nested offers when present, otherwise the dict itself
        # is the (aggregate) offer.
        offers = offers.get('offers') or [offers]
    if isinstance(offers, dict):
        offers = [offers]

    # The brand belongs to the product, either as a nested item carrying
    # a 'name' or as plain text.
    brand = product_schema.get('brand', '')
    if isinstance(brand, dict):
        brand = brand.get('name', '')

    products = []
    for offer in offers:
        if not isinstance(offer, dict):
            continue
        products.append({
            'product_name': product_schema.get('name', ''),
            # Fall back to the offer description (the previous source of
            # this field) when the product declares no brand.
            'brand': brand or offer.get('description', ''),
            'availability': offer.get('availability', ''),
            'lowprice': offer.get('lowPrice', ''),
            'highprice': offer.get('highPrice', ''),
            'price': offer.get('price', ''),
            'priceCurrency': offer.get('priceCurrency', ''),
        })
    return products
- 抓取网站并存储产品:不成功,因为卡在了上一步
def scrape_products(proximus_sitemap, url='url'):
    """Scrape every page listed in the sitemap and collect its products.

    Args:
        proximus_sitemap: DataFrame with one row per page to scrape.
        url: Name of the column that holds the page URLs.

    Returns:
        A DataFrame with one row per product record returned by
        ``get_products``; empty (but with the expected columns) when
        nothing was found.
    """
    # 'name' is never filled by get_products; it is kept so the output
    # keeps the same columns as before.
    columns = ['product_name', 'brand', 'name', 'availability',
               'lowprice', 'highprice', 'price', 'priceCurrency']
    rows = []
    for page_url in proximus_sitemap[url]:
        products = get_products(extract_metadata(page_url))
        if products:
            rows.extend(products)
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call; build the frame once from the collected rows.
    return pd.DataFrame(rows, columns=columns)
# Scrape every URL found in the 'loc' column of the filtered sitemap.
df_products = scrape_products(proximus_sitemap, url='loc')
# Save the result to patch.csv with index=False.
df_products.to_csv('patch.csv', index=False)
# Bare expression: presumably previews the first rows in a notebook/REPL.
df_products.head()
您可以直接使用 advertools 的 SEO 爬虫来完成。它有一个 crawl
函数,默认情况下还会提取结构化数据(JSON-LD、OpenGraph 和 Twitter)。
我尝试抓取十页的样本,结果如下所示:
# Crawl the sitemap URLs with advertools; the output goes to 'proximums.jl'.
adv.crawl(proximus_sitemap['loc'], 'proximums.jl')
# Load the crawl output, read as line-delimited JSON (lines=True).
proximus_crawl = pd.read_json('proximums.jl', lines=True)
# List the columns whose name matches 'jsonld'.
proximus_crawl.filter(regex='jsonld').columns
Index(['jsonld_@context', 'jsonld_@type', 'jsonld_name', 'jsonld_url',
'jsonld_potentialAction.@type', 'jsonld_potentialAction.target',
'jsonld_potentialAction.query-input', 'jsonld_1_@context',
'jsonld_1_@type', 'jsonld_1_name', 'jsonld_1_url', 'jsonld_1_logo',
'jsonld_1_sameAs', 'jsonld_2_@context', 'jsonld_2_@type',
'jsonld_2_itemListElement', 'jsonld_2_name', 'jsonld_2_image',
'jsonld_2_description', 'jsonld_2_sku', 'jsonld_2_review',
'jsonld_2_brand.@type', 'jsonld_2_brand.name',
'jsonld_2_aggregateRating.@type',
'jsonld_2_aggregateRating.ratingValue',
'jsonld_2_aggregateRating.reviewCount', 'jsonld_2_offers.@type',
'jsonld_2_offers.priceCurrency', 'jsonld_2_offers.availability',
'jsonld_2_offers.price', 'jsonld_3_@context', 'jsonld_3_@type',
'jsonld_3_itemListElement', 'jsonld_image', 'jsonld_description',
'jsonld_sku', 'jsonld_review', 'jsonld_brand.@type',
'jsonld_brand.name', 'jsonld_aggregateRating.@type',
'jsonld_aggregateRating.ratingValue',
'jsonld_aggregateRating.reviewCount', 'jsonld_offers.@type',
'jsonld_offers.lowPrice', 'jsonld_offers.highPrice',
'jsonld_offers.priceCurrency', 'jsonld_offers.availability',
'jsonld_offers.price', 'jsonld_offers.offerCount',
'jsonld_1_itemListElement', 'jsonld_2_offers.lowPrice',
'jsonld_2_offers.highPrice', 'jsonld_2_offers.offerCount',
'jsonld_itemListElement'],
dtype='object')
这些是您可能感兴趣的一些列(包含价格、货币、可用性等)
| | jsonld_2_description | jsonld_2_offers.priceCurrency | jsonld_2_offers.availability | jsonld_2_offers.price | jsonld_description | jsonld_offers.lowPrice | jsonld_offers.priceCurrency | jsonld_offers.availability | jsonld_offers.price | jsonld_2_offers.lowPrice |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| 1 | Numéro 7 | EUR | OutOfStock | 369.99 | nan | nan | nan | nan | nan | nan |
| 2 | nan | nan | nan | nan | Apple | 81.82 | EUR | InStock | 487.6 | nan |
| 3 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| 4 | nan | nan | nan | nan | Huawei | nan | EUR | OutOfStock | 330.57 | nan |
| 5 | nan | nan | nan | nan | Apple | 81.82 | EUR | LimitedAvailability | 487.6 | nan |
| 6 | Apple | EUR | InStock | 589.99 | nan | nan | nan | nan | nan | 99 |
| 7 | Apple | EUR | LimitedAvailability | 589.99 | nan | nan | nan | nan | nan | 99 |
| 8 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| 9 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
我正在尝试从 sitemap.xml 列出的页面中提取名称、品牌、价格、库存等微数据,但是我卡在了下面的步骤上。我是新手,看不出是哪里出了问题,感谢您的帮助。
- 抓取 sitemap.xml 以获得 url 列表:OK
- 提取元数据:OK
- 提取产品 schema:OK
- 提取产品不成功
- 抓取网站并存储产品不成功
- 抓取 sitemap.xml 以获得 url 列表:OK
import pandas as pd
import requests
import extruct
from w3lib.html import get_base_url
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import advertools as adv
# Load the sitemap into a DataFrame, then keep only the rows whose 'loc'
# matches 'boutique' and, of those, only the ones that also match '/fr/'.
proximus_sitemap = (
    adv.sitemap_to_df('https://www.proximus.be/iportal/sitemap.xml')
    .loc[lambda df: df['loc'].str.contains('boutique')]
    .loc[lambda df: df['loc'].str.contains('/fr/')]
)
- 提取元数据:OK
def extract_metadata(url, timeout=30):
    """Download a page and extract its structured data with extruct.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a limit, a single stalled server would block the
            whole crawl indefinitely.

    Returns:
        Whatever ``extruct.extract`` returns for the page when restricted
        to the JSON-LD, microdata and OpenGraph syntaxes with
        ``uniform=True``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    # The base URL is derived from the markup and the final response URL
    # and handed to extruct together with the page text.
    base_url = get_base_url(response.text, response.url)
    return extruct.extract(
        response.text,
        base_url=base_url,
        uniform=True,
        syntaxes=['json-ld', 'microdata', 'opengraph'],
    )
# Try the extraction on a single product page.
metadata = extract_metadata('https://www.proximus.be/fr/id_cr_apple-iphone-13-128gb-blue/particuliers/equipement/boutique/apple-iphone-13-128gb-blue.html')
# Bare expression: presumably shows the result when run in a notebook/REPL.
metadata
- 提取产品 schema:OK
def get_dictionary_by_key_value(dictionary, target_key, target_value):
    """Return the first item whose ``target_key`` equals ``target_value``.

    Args:
        dictionary: Mapping whose values are sequences of dict items
            (for example the per-syntax lists built by ``extruct``).
        target_key: Key to look up in every item, e.g. ``"@type"``.
        target_value: Value the key must have for the item to match.

    Returns:
        The first matching item, or ``None`` when nothing matches.
    """
    for items in dictionary.values():
        # ``or ()`` tolerates empty or missing (None) groups.
        for item in items or ():
            # Items without the key (or non-dict entries) are skipped
            # instead of raising KeyError/TypeError.
            if isinstance(item, dict) and item.get(target_key) == target_value:
                return item
    return None
# Pick the item whose "@type" is "Product" out of the extracted metadata.
Product = get_dictionary_by_key_value(metadata, "@type", "Product")
# Bare expression: presumably shows the result when run in a notebook/REPL.
Product
- 提取产品不成功 => 错误信息:KeyError: 'offers'
def get_products(metadata):
    """Build one flat record per offer of the page's Product item.

    The ``offers`` value of a Product is not always shaped the same way:
    it may be a single Offer/AggregateOffer dict, a list of offers, or an
    aggregate that nests the individual offers under its own ``offers``
    key. Indexing ``Product['offers']['offers']`` unconditionally raises
    ``KeyError: 'offers'`` for the flat shapes, so every shape is
    normalised to a list first.

    Args:
        metadata: Structured data as returned by ``extract_metadata``.

    Returns:
        A list of dicts with the keys ``product_name``, ``brand``,
        ``availability``, ``lowprice``, ``highprice``, ``price`` and
        ``priceCurrency`` (missing values are ``''``), or ``None`` when
        the page has no Product item.
    """
    product_schema = get_dictionary_by_key_value(metadata, "@type", "Product")
    if not product_schema:
        return None

    offers = product_schema.get('offers') or []
    if isinstance(offers, dict):
        # Use the nested offers when present, otherwise the dict itself
        # is the (aggregate) offer.
        offers = offers.get('offers') or [offers]
    if isinstance(offers, dict):
        offers = [offers]

    # The brand belongs to the product, either as a nested item carrying
    # a 'name' or as plain text.
    brand = product_schema.get('brand', '')
    if isinstance(brand, dict):
        brand = brand.get('name', '')

    products = []
    for offer in offers:
        if not isinstance(offer, dict):
            continue
        products.append({
            'product_name': product_schema.get('name', ''),
            # Fall back to the offer description (the previous source of
            # this field) when the product declares no brand.
            'brand': brand or offer.get('description', ''),
            'availability': offer.get('availability', ''),
            'lowprice': offer.get('lowPrice', ''),
            'highprice': offer.get('highPrice', ''),
            'price': offer.get('price', ''),
            'priceCurrency': offer.get('priceCurrency', ''),
        })
    return products
- 抓取网站并存储产品:不成功,因为卡在了上一步
def scrape_products(proximus_sitemap, url='url'):
    """Scrape every page listed in the sitemap and collect its products.

    Args:
        proximus_sitemap: DataFrame with one row per page to scrape.
        url: Name of the column that holds the page URLs.

    Returns:
        A DataFrame with one row per product record returned by
        ``get_products``; empty (but with the expected columns) when
        nothing was found.
    """
    # 'name' is never filled by get_products; it is kept so the output
    # keeps the same columns as before.
    columns = ['product_name', 'brand', 'name', 'availability',
               'lowprice', 'highprice', 'price', 'priceCurrency']
    rows = []
    for page_url in proximus_sitemap[url]:
        products = get_products(extract_metadata(page_url))
        if products:
            rows.extend(products)
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call; build the frame once from the collected rows.
    return pd.DataFrame(rows, columns=columns)
# Scrape every URL found in the 'loc' column of the filtered sitemap.
df_products = scrape_products(proximus_sitemap, url='loc')
# Save the result to patch.csv with index=False.
df_products.to_csv('patch.csv', index=False)
# Bare expression: presumably previews the first rows in a notebook/REPL.
df_products.head()
您可以直接使用 advertools 的 SEO 爬虫来完成。它有一个 crawl
函数,默认情况下还会提取结构化数据(JSON-LD、OpenGraph 和 Twitter)。
我尝试抓取十页的样本,结果如下所示:
# Crawl the sitemap URLs with advertools; the output goes to 'proximums.jl'.
adv.crawl(proximus_sitemap['loc'], 'proximums.jl')
# Load the crawl output, read as line-delimited JSON (lines=True).
proximus_crawl = pd.read_json('proximums.jl', lines=True)
# List the columns whose name matches 'jsonld'.
proximus_crawl.filter(regex='jsonld').columns
Index(['jsonld_@context', 'jsonld_@type', 'jsonld_name', 'jsonld_url',
'jsonld_potentialAction.@type', 'jsonld_potentialAction.target',
'jsonld_potentialAction.query-input', 'jsonld_1_@context',
'jsonld_1_@type', 'jsonld_1_name', 'jsonld_1_url', 'jsonld_1_logo',
'jsonld_1_sameAs', 'jsonld_2_@context', 'jsonld_2_@type',
'jsonld_2_itemListElement', 'jsonld_2_name', 'jsonld_2_image',
'jsonld_2_description', 'jsonld_2_sku', 'jsonld_2_review',
'jsonld_2_brand.@type', 'jsonld_2_brand.name',
'jsonld_2_aggregateRating.@type',
'jsonld_2_aggregateRating.ratingValue',
'jsonld_2_aggregateRating.reviewCount', 'jsonld_2_offers.@type',
'jsonld_2_offers.priceCurrency', 'jsonld_2_offers.availability',
'jsonld_2_offers.price', 'jsonld_3_@context', 'jsonld_3_@type',
'jsonld_3_itemListElement', 'jsonld_image', 'jsonld_description',
'jsonld_sku', 'jsonld_review', 'jsonld_brand.@type',
'jsonld_brand.name', 'jsonld_aggregateRating.@type',
'jsonld_aggregateRating.ratingValue',
'jsonld_aggregateRating.reviewCount', 'jsonld_offers.@type',
'jsonld_offers.lowPrice', 'jsonld_offers.highPrice',
'jsonld_offers.priceCurrency', 'jsonld_offers.availability',
'jsonld_offers.price', 'jsonld_offers.offerCount',
'jsonld_1_itemListElement', 'jsonld_2_offers.lowPrice',
'jsonld_2_offers.highPrice', 'jsonld_2_offers.offerCount',
'jsonld_itemListElement'],
dtype='object')
这些是您可能感兴趣的一些列(包含价格、货币、可用性等)
| | jsonld_2_description | jsonld_2_offers.priceCurrency | jsonld_2_offers.availability | jsonld_2_offers.price | jsonld_description | jsonld_offers.lowPrice | jsonld_offers.priceCurrency | jsonld_offers.availability | jsonld_offers.price | jsonld_2_offers.lowPrice |
---|---|---|---|---|---|---|---|---|---|---|
0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
1 | Numéro 7 | EUR | OutOfStock | 369.99 | nan | nan | nan | nan | nan | nan |
2 | nan | nan | nan | nan | Apple | 81.82 | EUR | InStock | 487.6 | nan |
3 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
4 | nan | nan | nan | nan | Huawei | nan | EUR | OutOfStock | 330.57 | nan |
5 | nan | nan | nan | nan | Apple | 81.82 | EUR | LimitedAvailability | 487.6 | nan |
6 | Apple | EUR | InStock | 589.99 | nan | nan | nan | nan | nan | 99 |
7 | Apple | EUR | LimitedAvailability | 589.99 | nan | nan | nan | nan | nan | 99 |
8 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
9 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |