Iterate over all links/sub-links with Scrapy run from script
I want to run a Scrapy spider from my script, but it only works for one request. I can't get self.parse_product executed from scrapy.http.Request(product_url, callback=self.parse_product).
I guess this is caused by the line crawler.signals.connect(callback, signal=signals.spider_closed). Please advise how to iterate over all links and sub-links correctly.
The whole script is shown below.
import json

import scrapy
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor

# https://gist.github.com/alecxe/fc1527d6d9492b59c610

# define an item class
class WebStoreItem(Item):
    name = Field()
    price = Field()
    developer = Field()
    date_added = Field()
    date_modified = Field()
    votes = Field()
    views = Field()
    sales = Field()
    avg_rating = Field()
    comments = Field()

# define an item loader with input and output processors
class WebStoreItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()
    desc_out = Join()

# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')

    def __del__(self):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

# define a spider
class WebStoreSpider(Spider):
    name = "WebStore"
    allowed_domains = ["http://www.WebStore.com"]
    start_urls = [
        "http://www.WebStore.com/index.php"
    ]

    def parse(self, response):
        for meta in response.xpath('//div[@class="extension-grid"]'):
            for product_block in meta.xpath('//div[@class="image-holder image"]'):
                item = WebStoreItem()

                avg_rating = meta.xpath('//div[@class="rating"]/text()').extract()[0]
                item['avg_rating'] = avg_rating[avg_rating.find(': ') + 1:].strip()

                comment = meta.xpath('//div[@class="comment"]/text()').extract()[0]
                item['comments'] = comment[comment.find(': ') + 1:].strip()

                print 'product_block: ', product_block
                product_url = product_block.xpath('a[1]/@href').extract()[0]
                print 'product_url: ', product_url

                request = scrapy.http.Request(product_url, callback=self.parse_product)
                request.meta['item'] = item
                yield request

    def parse_product(self, response):
        item = response.meta['item']

        product_meta_block = response.xpath('//div[@class="name"]')
        print 'product_meta_block: ', product_meta_block

        product_rows = product_meta_block.xpath('//tr)')
        print 'product_rows: ', product_rows

        i = 0
        for row in product_rows:
            if i == 1:
                item['name'] = row.select('td/text()').extract()
            elif i == 3:
                item['votes'] = row.select('td/text()').extract()
            i += 1
        return item

# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?
    # stop the reactor
    reactor.stop()

def stop_reactor():
    reactor.stop()

if __name__ == '__main__':
    # instantiate settings and provide a custom configuration
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })

    # instantiate a crawler passing in settings
    crawler = Crawler(settings)

    # instantiate a spider
    spider = WebStoreSpider()

    # configure signals
    crawler.signals.connect(callback, signal=signals.spider_closed)

    # configure and start the crawler
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    # start logging
    log.start()

    # start the reactor (blocks execution)
    reactor.run()
With your current allowed_domains specification, your spider is blocked from visiting any page beyond the start page. The value should contain only the domain, not the protocol. Try:

allowed_domains = ["www.WebStore.com"]
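For reference, a minimal sketch of what the corrected spider head could look like (the WebStore.com URLs are just the placeholders from your question, not a real site):

from scrapy import Spider

class WebStoreSpider(Spider):
    name = "WebStore"
    # allowed_domains holds bare domain names only; if a scheme such as
    # "http://" is included, the offsite filter rejects every follow-up request,
    # which is why only the start request succeeded
    allowed_domains = ["www.WebStore.com"]
    # full URLs (with scheme) belong in start_urls
    start_urls = ["http://www.WebStore.com/index.php"]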
Also, the line desc_out = Join() in your WebStoreItemLoader definition may raise an error, because your item has no desc field.
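If a joined description is actually wanted, one option is to declare the matching field on the item; otherwise the desc_out line can simply be dropped. A minimal sketch of the first option, keeping the same imports as your script (the desc field itself is hypothetical, since nothing in your spider currently populates it):

from scrapy import Item, Field
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst

class WebStoreItem(Item):
    # ... existing fields ...
    desc = Field()  # field backing the desc_out processor

class WebStoreItemLoader(ItemLoader):
    default_item_class = WebStoreItem
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()
    desc_out = Join()  # joins all collected "desc" values into one string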