使用 scrapy 将值存储到键中
Store values into keys with scrapy
我想从网站中提取价格等信息,并将其作为值存储在字典中。但是,我正在尝试学习 scrapy,所以我想知道如何使用它来实现这一目标。
下面是使用 requests 和 BeautifulSoup 的写法:
# Scrape card names and prices from five eBay listing pages with
# requests + BeautifulSoup, accumulating them in a dict of lists.
import numpy as np
import requests            # fixed: was "import requests as r", then requests.get was called on an unbound name
import pandas as pd
from bs4 import BeautifulSoup   # fixed typo: was "BeauitfulSoup"
from collections import defaultdict   # was used below but never imported

# The five result pages; only the _pgn= query parameter differs.
html = ['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16']

data = defaultdict(list)
for page_url in html:   # iterate the URLs directly; was range(0, len(html) with a missing ")"
    resp = requests.get(page_url)   # don't shadow the imported module name
    soup = BeautifulSoup(resp.content, 'lxml')
    names = soup.select(".s-item__title")
    prices = soup.select(".ITALIC")
    for n, v in zip(names, prices):
        data["card"].append(n.text.strip())
        data["price"].append(v.text.strip())
这是我用 scrapy 尝试过的,但在查看 json 输出后我没有得到任何值。我只是得到了链接,如何得到像上面代码那样的输出?:
# Scrapy attempt: crawl the same listing pages and export the price div
# plus the page URL to a jsonlines feed.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
import numpy as np   # np and pd were used below but never imported
import pandas as pd

html = np.array(['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16'],
                dtype=object)
url = pd.DataFrame(html, columns=['data'])


class StatisticsItem(scrapy.Item):
    # Declared fields; ItemLoader.add_value() keys must match these names.
    statistics_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    name = 'statistics'
    start_urls = url.data.values

    def start_requests(self):
        for page_url in self.start_urls:
            yield scrapy.Request(page_url)

    def parse(self, response):
        table = response.xpath("//div[@class='s-item__price']").get()
        loader = ItemLoader(StatisticsItem())
        # BUG FIX: the original called add_value('values', ...), but 'values'
        # is not a declared field on StatisticsItem, so the scraped price was
        # silently dropped and only the URL appeared in the feed.
        loader.add_value('statistics_div', table)
        loader.add_value('url', response.url)
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'ebay_data.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()
我设置 custom_settings 以 json 格式写入 'cards_info.json'。
在 parse 方法中,我遍历页面上的每张卡片(参见 xpath),获取卡片的标题和价格,然后将它们作为 item 生成(yield)。Scrapy 会把它们写入 'cards_info.json'。
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
class StatisticsItem(scrapy.Item):
    """One scraped record; TakeFirst keeps a single value per field."""
    url = Field(output_processor=TakeFirst())
    statistics_div = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Crawl five eBay listing pages and yield one {'card', 'price'} dict
    per listing; items are written to 'cards_info.json' by the feed export.
    """
    name = 'statistics'
    # Only the _pgn= query parameter differs between pages, so build the
    # five URLs instead of repeating the full string five times.
    start_urls = [
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/'
        f'bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn={page}&_sop=16'
        for page in range(1, 6)
    ]
    # FEEDS replaces the FEED_FORMAT/FEED_URI pair, deprecated since Scrapy 2.1.
    custom_settings = {
        'FEEDS': {'cards_info.json': {'format': 'json'}},
    }

    # start_requests() is omitted: the default implementation already yields
    # a scrapy.Request for every entry in start_urls.

    def parse(self, response):
        """Extract name and price from every card wrapper on the page."""
        for card in response.xpath('//div[@class="s-item__wrapper clearfix"]'):
            yield {
                'card': card.xpath('.//h3/text()').get(),
                'price': card.xpath('.//span[@class="s-item__price"]//text()').get(),
            }
输出:
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon 1st Edition Shadowless Base Set 11 Blister Booster Pack Lot - DM To Buy!', 'price': '£93,805.84'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon Team Rocket Complete Complete 83/82, German, 1. Edition', 'price': '£102,026.04'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Yugioh E Hero Pit Boss 2013 World Championship Prize Card BGS 9.5 Gem Mint', 'price': '£100,000.00'}
...
...
cards_info.json:
[
{"card": "1999 Pokemon Base Set Booster Box GREEN WING", "price": "\u00a340,000.00"},
{"card": "1996 MEDIA FACTORY POKEMON NO RARITY BASE SET CHARIZARD 006 BECKETT BGS MINT 9 ", "price": "\u00a339,999.99"},
{"card": "Yugioh - BGS8.5 Jump Festa Blue Eyes White Dragon -1999 - Limited - PSA", "price": "\u00a340,000.00"},
{"card": "PSA 8 CHARIZARD 1999 POKEMON 1ST EDITION THICK STAMP SHADOWLESS #4 HOLO NM-MINT", "price": "\u00a337,224.53"},
{"card": "PSA 9 MINT Pok\u00e9mon Play Promo 50000 PTS Gold Star Japanese Pokemon", "price": "\u00a338,261.06"},
...
...
]
我想从网站中提取价格等信息,并将其作为值存储在字典中。但是,我正在尝试学习 scrapy,所以我想知道如何使用它来实现这一目标。
下面是使用 requests 和 BeautifulSoup 的写法:
# Scrape card names and prices from five eBay listing pages with
# requests + BeautifulSoup, accumulating them in a dict of lists.
import numpy as np
import requests            # fixed: was "import requests as r", then requests.get was called on an unbound name
import pandas as pd
from bs4 import BeautifulSoup   # fixed typo: was "BeauitfulSoup"
from collections import defaultdict   # was used below but never imported

# The five result pages; only the _pgn= query parameter differs.
html = ['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16']

data = defaultdict(list)
for page_url in html:   # iterate the URLs directly; was range(0, len(html) with a missing ")"
    resp = requests.get(page_url)   # don't shadow the imported module name
    soup = BeautifulSoup(resp.content, 'lxml')
    names = soup.select(".s-item__title")
    prices = soup.select(".ITALIC")
    for n, v in zip(names, prices):
        data["card"].append(n.text.strip())
        data["price"].append(v.text.strip())
这是我用 scrapy 尝试过的,但在查看 json 输出后我没有得到任何值。我只是得到了链接,如何得到像上面代码那样的输出?:
# Scrapy attempt: crawl the same listing pages and export the price div
# plus the page URL to a jsonlines feed.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
import numpy as np   # np and pd were used below but never imported
import pandas as pd

html = np.array(['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
                 'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16'],
                dtype=object)
url = pd.DataFrame(html, columns=['data'])


class StatisticsItem(scrapy.Item):
    # Declared fields; ItemLoader.add_value() keys must match these names.
    statistics_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    name = 'statistics'
    start_urls = url.data.values

    def start_requests(self):
        for page_url in self.start_urls:
            yield scrapy.Request(page_url)

    def parse(self, response):
        table = response.xpath("//div[@class='s-item__price']").get()
        loader = ItemLoader(StatisticsItem())
        # BUG FIX: the original called add_value('values', ...), but 'values'
        # is not a declared field on StatisticsItem, so the scraped price was
        # silently dropped and only the URL appeared in the feed.
        loader.add_value('statistics_div', table)
        loader.add_value('url', response.url)
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'ebay_data.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()
我设置 custom_settings 以 json 格式写入 'cards_info.json'。
在 parse 方法中,我遍历页面上的每张卡片(参见 xpath),获取卡片的标题和价格,然后将它们作为 item 生成(yield)。Scrapy 会把它们写入 'cards_info.json'。
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
class StatisticsItem(scrapy.Item):
    """One scraped record; TakeFirst keeps a single value per field."""
    url = Field(output_processor=TakeFirst())
    statistics_div = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Crawl five eBay listing pages and yield one {'card', 'price'} dict
    per listing; items are written to 'cards_info.json' by the feed export.
    """
    name = 'statistics'
    # Only the _pgn= query parameter differs between pages, so build the
    # five URLs instead of repeating the full string five times.
    start_urls = [
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/'
        f'bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn={page}&_sop=16'
        for page in range(1, 6)
    ]
    # FEEDS replaces the FEED_FORMAT/FEED_URI pair, deprecated since Scrapy 2.1.
    custom_settings = {
        'FEEDS': {'cards_info.json': {'format': 'json'}},
    }

    # start_requests() is omitted: the default implementation already yields
    # a scrapy.Request for every entry in start_urls.

    def parse(self, response):
        """Extract name and price from every card wrapper on the page."""
        for card in response.xpath('//div[@class="s-item__wrapper clearfix"]'):
            yield {
                'card': card.xpath('.//h3/text()').get(),
                'price': card.xpath('.//span[@class="s-item__price"]//text()').get(),
            }
输出:
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon 1st Edition Shadowless Base Set 11 Blister Booster Pack Lot - DM To Buy!', 'price': '£93,805.84'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon Team Rocket Complete Complete 83/82, German, 1. Edition', 'price': '£102,026.04'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Yugioh E Hero Pit Boss 2013 World Championship Prize Card BGS 9.5 Gem Mint', 'price': '£100,000.00'}
...
...
cards_info.json:
[
{"card": "1999 Pokemon Base Set Booster Box GREEN WING", "price": "\u00a340,000.00"},
{"card": "1996 MEDIA FACTORY POKEMON NO RARITY BASE SET CHARIZARD 006 BECKETT BGS MINT 9 ", "price": "\u00a339,999.99"},
{"card": "Yugioh - BGS8.5 Jump Festa Blue Eyes White Dragon -1999 - Limited - PSA", "price": "\u00a340,000.00"},
{"card": "PSA 8 CHARIZARD 1999 POKEMON 1ST EDITION THICK STAMP SHADOWLESS #4 HOLO NM-MINT", "price": "\u00a337,224.53"},
{"card": "PSA 9 MINT Pok\u00e9mon Play Promo 50000 PTS Gold Star Japanese Pokemon", "price": "\u00a338,261.06"},
...
...
]