Python requests 网页抓取 API

Python requests web scraping API

我有一些网络抓取和使用 API 的经验,但在这个网站上,我找不到能用来完成这项工作的正确 API:

https://www.giga.com.vc/Bebida (注:/Bebida 只是一个类别,相当于“/Drinks”)

问题是,我找到了几个 API,但它们要么只适用于单个产品,要么只适用于部分产品;我似乎找不到正确的规则来按类别或页码进行分页,并遍历某个类别下的产品以获取价格、EAN 等信息。

import requests
import pandas as pd
from bs4 import BeautifulSoup

例如:这行得通,但格式很糟糕:

# Download the "padaria" category listing page and dump the raw response bytes.
page_url = 'https://www.giga.com.vc/padaria?initialMap=c&initialQuery=padaria&map=category-1&page=1'
page_response = requests.get(page_url)
print(page_response.content)

# Persisted GraphQL query URL (operationName=Products). The `variables` field
# inside `extensions` is base64-encoded JSON; decoded, it asks for collection
# "16/" with "from": 0 and "to": 11.
urlx = 'https://www.giga.com.vc/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=pt-BR&operationName=Products&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%2249a77e3e2082563773aff56ad9c0432d59302e86fd1baaad9ca0f4bca2630d46%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJBTExfQVZBSUxBQkxFIiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwiY2F0ZWdvcnkiOiIiLCJjb2xsZWN0aW9uIjoiMTYvIiwic3BlY2lmaWNhdGlvbkZpbHRlcnMiOltdLCJvcmRlckJ5IjoiIiwiZnJvbSI6MCwidG8iOjExfQ%3D%3D%22%7D'
# Run the persisted `Products` query and print every product record as returned.
response = requests.get(urlx)
products = response.json()['data']['products']
for product in products:
    print(product)

这也有效:

# Persisted GraphQL query URL (operationName=ProductRecommendations). The
# base64 `variables` payload decodes to
# {"identifier": {"field": "id", "value": "147523"}, "type": "view"}.
url2 = 'https://www.giga.com.vc/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=pt-BR&__bindingId=3f6e91e6-44f2-4fb0-a2d9-e238b53082e0&operationName=ProductRecommendations&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22e5782bd9e8bc64d337a7d7f96b9c280b462cdb0754d15b415192dac2755ad280%22%2C%22sender%22%3A%22vtex.shelf%401.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJpZGVudGlmaWVyIjp7ImZpZWxkIjoiaWQiLCJ2YWx1ZSI6IjE0NzUyMyJ9LCJ0eXBlIjoidmlldyJ9%22%7D'

# Fetch the recommendations payload; the final bare expression is left
# unassigned so an interactive session still displays the list.
recommendations_response = requests.get(url2)
recommendations_response.json()['data']['productRecommendations']

预期输出如下:

# Run the persisted `Products` query and print one flat record per product.
response = requests.get(urlx)
for product in response.json()['data']['products']:
    record = dict(
        product_id=product['productId'],
        price=product['priceRange']['sellingPrice']['highPrice'],
        product_name=product['productName'],
        category_id=product['categoryId'],
        # EAN is read from the first entry of the product's `items` list.
        ean=product['items'][0]['ean'],
        # First specification of the first specification group.
        # NOTE(review): assumed to be the box quantity — confirm per product.
        box_qty=product['specificationGroups'][0]['specifications'][0]['values'],
    )
    print(record)

原始输出:

{'product_id': '141917', 'price': 20.54, 'product_name': 'Banana Nanica Kg', 'category_id': '433', 'ean': '4511', 'box_qty': ['0']}
{'product_id': '148077', 'price': 1.45, 'product_name': 'Água de Coco Tradicional Quadrado 200Ml', 'category_id': '148', 'ean': '0751320333650', 'box_qty': ['27']}

它把 `variables` 以 base64 编码的形式发送,解码后的内容如下:

'{"hideUnavailableItems":false,"skusFilter":"ALL","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":"bebida","orderBy":"OrderByScoreDESC","from":40,"to":59,"selectedFacets":[{"key":"c","value":"bebida"}],"operator":"and","fuzzy":"0","searchState":null,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false}'

如果我解析该 URL,把其中的参数全部转换为字典,并把其中的值替换为 'from': 0、'to': 99,就能得到 100 个产品。

但如果你使用大于 99 的值,它就不起作用。也许它需要在 URL 中进行一些其他更改。

import base64
import urllib.parse
import urllib.request
import json

# Two persisted `productSearchV3` query URLs for the "bebida" category. They
# differ only in the base64-encoded `variables` payload: url1 decodes to
# "from": 20, "to": 39 and url2 decodes to "from": 40, "to": 59.
url1 = 'https://www.giga.com.vc/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=pt-BR&__bindingId=3f6e91e6-44f2-4fb0-a2d9-e238b53082e0&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJBTEwiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJiZWJpZGEiLCJvcmRlckJ5IjoiT3JkZXJCeVNjb3JlREVTQyIsImZyb20iOjIwLCJ0byI6MzksInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJiZWJpZGEifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D'
url2 = 'https://www.giga.com.vc/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=pt-BR&__bindingId=3f6e91e6-44f2-4fb0-a2d9-e238b53082e0&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJBTEwiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJiZWJpZGEiLCJvcmRlckJ5IjoiT3JkZXJCeVNjb3JlREVTQyIsImZyb20iOjQwLCJ0byI6NTksInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJiZWJpZGEifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D'

#print('--- url ---')
#print(url)

def decode_variables(url):
    """Return the decoded GraphQL ``variables`` dict carried by *url*.

    The URL must have an ``extensions`` query parameter holding a JSON object
    whose ``variables`` entry is base64-encoded JSON.

    Raises:
        KeyError: if ``extensions`` or its ``variables`` entry is missing.
    """
    query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    extensions = json.loads(query['extensions'][0])
    return json.loads(base64.b64decode(extensions['variables']).decode())


def with_result_window(url, first, last):
    """Return a copy of *url* whose paging window is ``first``..``last``.

    Only the ``from`` and ``to`` entries of the base64-encoded ``variables``
    payload are replaced; every other query parameter is carried over.

    Args:
        url: persisted-query URL with an ``extensions`` query parameter.
        first: value stored under ``from``.
        last: value stored under ``to``; the original post got 100 products
            for 0..99, so the bound appears to be inclusive.

    Raises:
        ValueError: if ``first`` is negative or ``last`` is below ``first``.
        KeyError: if ``extensions`` or its ``variables`` entry is missing.
    """
    if first < 0 or last < first:
        raise ValueError(f'invalid result window: from={first!r}, to={last!r}')

    parts = urllib.parse.urlparse(url)
    query = urllib.parse.parse_qs(parts.query)

    extensions = json.loads(query['extensions'][0])
    variables = json.loads(base64.b64decode(extensions['variables']).decode())

    variables['from'] = first
    variables['to'] = last

    # Re-encode in reverse order: JSON -> base64 -> JSON `extensions` -> query.
    encoded = base64.b64encode(json.dumps(variables).encode()).decode()
    extensions['variables'] = encoded
    query['extensions'][0] = json.dumps(extensions)

    # parse_qs yields lists of values, so doseq=True is needed to flatten them.
    new_query = urllib.parse.urlencode(query, doseq=True)
    return urllib.parse.urlunparse(parts._replace(query=new_query))


# Guarded so the helpers above can be imported without a network request.
if __name__ == "__main__":
    print('--- replace values ---')

    print(decode_variables(url1))

    # NOTE(review): the post reports that values above 99 do not work on this
    # endpoint, so larger result sets presumably need several consecutive
    # windows — not verified here.
    url = with_result_window(url1, 0, 99)

    print(decode_variables(url))

    print('---')

    # Context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(url) as response:
        payload = json.loads(response.read())

    for number, item in enumerate(payload['data']['productSearch']['products'], 1):
        print(number, '|', item['productName'])

结果:

1 | Água de Coco Tradicional Quadrado 200Ml
2 | Leite Longa Vida Integral com Tampa Italac 1L
3 | Leite Longa Vida Quatá  Integral 1L
4 | Água Mineral sem Gás Minalba 1,5L
5 | Água Mineral Sem Gás Minalba 510Ml
6 | Refrigerante Coca-Cola200Ml
7 | Leite Longa Vida Integral Shefa Garrafa 1L
8 | Refrigerante Coca-Cola sem Açúcar 1L
9 | Cerveja Heineken Lata 350Ml
10 | Leite Integral Longa Vida com Tampa Ninho 1L
11 | Leite Longa Vida Semidesnatado Com Tampa Italac 1L
12 | Cerveja Heineken Long Neck 330Ml
13 | Cerveja Amstel Lata 269Ml
14 | Água Mineral Minalba Com Gás 510Ml
15 | Água Mineral Pureza Vital Sem Gás Nestlé Pet 510Ml
16 | Água Mineral Sem Gás Bonafont 500Ml
17 | Refrigerante Coca-Cola sem Açúcar 200ml
18 | Água de Coco Kero Coco 1L
19 | Refrigerante Coca-Cola 2,5L
20 | Leite Longa Vida Desnatado com Tampa Italac 1L
21 | Refrigerante Coca-Cola Lata 350Ml
22 | Leite Longa Vida Integral Tirol 1L
23 | Água Mineral com Gás Pureza Vital 510ML
24 | Suco de Uva Integral Tinto Aurora 1,5L
25 | Achocolatado Toddynho 200Ml
26 | Refrigerante Coca-Cola sem Açúcar Lata 220ml
27 | Energético Red Bull Energy Drink 250Ml
28 | Água Mineral Sem Gás Pureza Vital Nestlé 1,5 L
29 | Suco Natural One Laranja 2L
30 | Refrigerante Coca-Cola 2L
31 | Cerveja Skol Lata 350Ml
32 | Refrigerante Coca-Cola 1L
33 | Suco De Maçã Yakult 200Ml
34 | Cerveja Império Puro Malte Lata 269Ml
35 | Refrigerante Guaraná Antarctica 2L
36 | Refrigerante Coca-Cola Lata 220Ml
37 | Leite Em Pó Integral Italac Sachê 400G
38 | Água Coco Puro Coco 200Ml
39 | Água Mineral Com Gás Crystal Pet 1,5 L
40 | Achocolatado sabor Chocolate Italakinho  200Ml
41 | Cerveja Duplo Malte Brahma Lata 350Ml
42 | Água Mineral Indaiá sem Gás 500Ml
43 | Refresco em Pó Sabor Laranja Tang 25G
44 | Água De Coco Puro Coco 1L
45 | Cerveja Budweiser Lata 269Ml
46 | Cerveja Skol Lata 269Ml
47 | Refresco em Pó Sabor Uva Tang 25G
48 | Refrigerante Guaraná Antarctica Lata 350Ml
49 | Cerveja Eisenbahn Pilsen Lata 350Ml
50 | Cerveja Itaipava Lata 350Ml
51 | Cerveja Itaipava Lata 269Ml
52 | Refrigerante Coca-Cola 600Ml
53 | Refrigerante Guaraná Antarctica 1,5L
54 | Cerveja Stella Artois Lata 269Ml
55 | Whisky Escocês Johnnie Walker Red Label 1L
56 | Refrigerante Coca-Cola sem Açúcar 2L
57 | Água Mineral sem Gás Crystal Pet 1,5 L
58 | Cerveja Amstel Lata 350Ml
59 | Cerveja Corona Extra Long Neck 330Ml
60 | Cerveja Stella Artois Long Neck 330Ml
61 | Água Mineral Sem Gás Bonafont 1,5 L
62 | Cerveja Puro Malte Petra Lata 350Ml
63 | Água de Coco Kero Coco 200Ml
64 | Cerveja Heineken Garrafa 600Ml
65 | Refrigerante de Laranja Sukita 2L
66 | Chopp De Vinho Draft 600Ml
67 | Refrigerante De Limão H2Oh! 500Ml
68 | Suco Natural de Uva e Maca One Ambiente 2L
69 | Refrigerante Dolly Guaraná 2L
70 | Energético Energy Monster Lata 473Ml
71 | Refresco em Pó Sabor Limão Tang 25g
72 | Suco De Laranja Integral Prat's 4 Ls
73 | Energético Red Bull Tropical Energy Drink  250Ml
74 | Refrigerante Limoneto H2Oh! Pet 500Ml
75 | Água Tônica Antarctica Zero 350Ml
76 | Água Mineral Sem Gás Minalba 10 Ls
77 | Vodka Red Smirnoff 998Ml
78 | Suco De Laranja Natural Xandô Garrafa 900Ml
79 | Energético Red Bull Melancia Energy Drink 250Ml
80 | Bebida Láctea de Proteína Zero Lactose sabor Chocolate YoPro 15G
81 | Água Verão Sense Lindoya 510ml
82 | Vodka Nacional Smirnoff Ice Red 269ml
83 | Whisky Escocês White Horse 8 Anos 1L
84 | Refrigerante Coca-Cola sem Açúcar 2,5L
85 | Refresco em Pó Sabor Maracujá Tang 25g
86 | Cerveja Império Puro Malte Lata 350Ml
87 | Vodka Ice Smirnoff 275Ml
88 | Cerveja Eisenbahn Pilsen Long Neck 355Ml
89 | Guaraná Com Açaí Natural Guaraviton Pet 500Ml
90 | Cerveja Budweiser Long Neck 330Ml
91 | Água Mineral Com Gás Pet Crystal 500Ml
92 | Água Tônica Antarctica Lata 350Ml
93 | Refrigerante Sabor Guaraná Mini Dolly Pet 350Ml
94 | Água Tônica Schweppes lata 350ml
95 | Cachaça 51 965Ml
96 | Cerveja Skol Lata 550Ml
97 | Refresco em Pó Sabor Abacaxi Tang 25g
98 | Cerveja Puro Malte Petra Lata 269Ml
99 | Cachaça Velho Barreiro 910Ml
100 | Refrigerante Fanta Laranja 2L