Scrapy Request - process one group of urls after another - can I use priority?
How can I make Scrapy process one group/list of URLs at a time? I have two lists of URLs. The first list has to be processed completely, including the item pipelines, before I can start processing the second list.
Both should be handled by a single spider.
I am not sure whether priority will help me here:
priority (int) – the priority of this request (defaults to 0). The priority is used by the scheduler to define the order used to process requests. Requests with a higher priority value will execute earlier. Negative values are allowed in order to indicate relatively low-priority.
because I don't know whether it merely reorders the Requests according to priority; it could still end up pipelining the first URL from the second list before the last URL from the first list.
Can I be sure that the items from the first list will be exported to XML (I am using XMLItemExporter) before the items from the second list?
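For illustration, this is roughly how I picture the priority approach; it is only a sketch, and first_list / second_list are placeholder names for my two URL lists:
import scrapy

class TwoListSpider(scrapy.Spider):
    # Sketch only: first_list / second_list stand in for my real URL lists.
    name = 'two_list_spider'
    first_list = ['http://example.com/a']
    second_list = ['http://example.com/b']

    def start_requests(self):
        for url in self.first_list:
            # Higher priority: the scheduler should dequeue these earlier ...
            yield scrapy.Request(url, callback=self.parse, priority=1)
        for url in self.second_list:
            # ... but does that also guarantee their items reach the pipelines first?
            yield scrapy.Request(url, callback=self.parse, priority=0)

    def parse(self, response):
        yield {'url': response.url}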
EDIT:
The error (@Wilfredo):
2017-11-23 20:12:16 [scrapy.utils.signal] ERROR: Error caught on signal handler:
Traceback (most recent call last):
  File "/home/milano/.virtualenvs/eoilenv/local/lib/python2.7/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
    *arguments, **named)
  File "/home/milano/.virtualenvs/eoilenv/local/lib/python2.7/site-packages/pydispatch/robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
TypeError: spider_idle() takes exactly 2 arguments (1 given)
2017-11-23 20:12:16 [scrapy.core.engine] INFO: Closing spider (finished)
EDIT II:
# coding=utf-8
import scrapy
from bot.items import TestItem
from scrapy import Spider, Request, signals
from scrapy.exceptions import DontCloseSpider


class IndexSpider(Spider):
    name = 'index_spider'
    allowed_domains = ['www.scrape.com']

    def start_requests(self):
        for url in ["https://www.scrape.com/eshop"]:
            # for url in ["https://www.scrape.com/search/getAjaxResult?categoryId=1&"]:
            yield Request(url, callback=self.parse_main_page)

    def parse_main_page(self, response):
        # get subcategories and categories
        self.categories = []
        self.subcategories = []
        parts = response.selector.xpath("//div[contains(@class,'side-nav') and not(contains(@class,'side-nav-'))]")
        for part in parts:
            part_name = part.xpath('.//h4/text()').extract_first().strip()
            category_list = [part_name]
            categories_ul = part.xpath('./ul')
            categories_lis = categories_ul.xpath('./li')
            for category_li in categories_lis:
                category_list = category_list[:1]
                category_name = category_li.xpath('./a/text()').extract_first().strip()
                category_href = category_li.xpath('./a/@href').extract_first().strip()
                categoryId = self._extract_categoryId_from_url(category_href)
                category_list.append(category_name)
                self.categories.append((categoryId, category_list))
                subcategories_lis = category_li.xpath('.//li')
                for subcategory_li in subcategories_lis:
                    category_list = category_list[:2]
                    subcategory_href = subcategory_li.xpath('./a/@href').extract_first()
                    subcategory_name = subcategory_li.xpath('./a/text()').extract_first().strip()
                    subcategoryId = self._extract_categoryId_from_url(subcategory_href)
                    category_list.append(subcategory_name)
                    self.subcategories.append((subcategoryId, category_list))
        # Scrape all subcategories (then categories)
        # for sub in self.subcategories:
        #     url = "https://www.scrape.com/search/getAjaxResult?categoryId={}".format(sub[0])
        #     yield Request(url,meta={'tup':sub,'priority':1,'type':'subcategory'},priority=1,callback=self.parse_category)

    def parse_category(self, response):
        tup = response.meta['tup']
        type = response.meta['type']
        priority = response.meta['priority']
        current_page = response.meta.get('page', 1)
        categoryId = tup[0]
        categories_list = tup[1]
        number_of_pages_href = response.selector.xpath(u'//a[text()="Last"]/@href').extract_first()
        try:
            number_of_pages = int(number_of_pages_href.split('p=')[1].split('&')[0])
        except:
            number_of_pages = current_page
        if current_page < number_of_pages:
            url = "https://www.scrape.com/search/getAjaxResult/?categoryId={}&p={}".format(categoryId, current_page + 1)
            yield Request(url, self.parse_category, meta={'tup': tup, 'page': current_page + 1,'priority':priority,'type':type}, priority=priority)
        hrefs = self._extract_all_product_urls(response)
        for href in hrefs:
            yield Request(href, self.parse_product, meta={"categories_list": categories_list,'type':type}, priority=2 if priority==1 else -1)

    def parse_product(self, response):
        yield TestItem(url=response.url, type=response.meta['type'], category_text='|'.join(response.meta['categories_list']))

    def _extract_categoryId_from_url(self, url):
        categoryId = url.split('/')[-2]
        return categoryId

    def _extract_all_product_urls(self, response):
        hrefs = response.selector.xpath("//a[contains(@class, 'shop-item-image')]/@href").extract()
        return [u"https://www.scrape.com{}".format(x) for x in hrefs]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(IndexSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle,
                                signal=scrapy.signals.spider_idle)
        return spider

    def spider_idle(self):
        self.crawler.signals.disconnect(self.spider_idle,
                                        signal=scrapy.signals.spider_idle)
        # yield a new group of urls
        if self.categories:
            for cat in self.categories:
                url = "https://www.scrape.com/search/getAjaxResult?categoryId={}".format(cat[0])
                yield Request(url, meta={'tup': cat, 'priority': 0, 'type': 'category'}, priority=0,
                              callback=self.parse_category)
            self.categories = []
            raise DontCloseSpider()
To make sure one request comes right after another, I would do something like this:
def start_requests(self):
    urls = ['url1', 'url2']
    yield Request(
        url=urls[0],
        callback=self.process_request,
        meta={'urls': urls, 'current_index': 0})

def process_request(self, response):
    # do my thing
    yield {}  # yield item
    current_index = response.meta['current_index'] + 1
    if current_index < len(response.meta['urls']):
        yield Request(
            url=response.meta['urls'][current_index],
            callback=self.process_request,
            meta={'urls': response.meta['urls'], 'current_index': current_index})
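If the two lists live on the spider, they could be fed into that chain by simply concatenating them; a rough sketch, where first_group_of_urls / second_group_of_urls are placeholder attribute names:
def start_requests(self):
    # Walk the first group, then the second, strictly one request at a time.
    urls = list(self.first_group_of_urls) + list(self.second_group_of_urls)
    yield Request(
        url=urls[0],
        callback=self.process_request,
        meta={'urls': urls, 'current_index': 0})
Because each response yields its item before the next request is scheduled, the items reach the exporter in list order; the trade-off is that the crawl runs with effectively no concurrency.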
Yes, prioritized URLs (requests with a higher priority) are processed first by the scheduler. To make sure of that you can set a low concurrency, CONCURRENT_REQUESTS = 1.
The reason is that with a higher concurrency, some lower-priority URLs may already have been downloaded by the time you enqueue new requests, which could give the impression that the order is not being respected.
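Concretely, that just means capping concurrency, for example in the spider's custom_settings (value shown for illustration):
custom_settings = {
    # One request in flight at a time, so the download order matches the
    # scheduler's priority order.
    'CONCURRENT_REQUESTS': 1,
}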
Another alternative (if you need higher concurrency) is to define a spider_idle method (my bad, thanks to @eLRuLL for pointing this out), schedule the requests from the second group there, and raise a DontCloseSpider exception, like this:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.exceptions import DontCloseSpider


class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.com']
    first_group_of_urls = [
        'http://quotes.toscrape.com/page/1/'
    ]
    second_group_of_urls = [
        'http://quotes.toscrape.com/page/2/'
    ]

    def start_requests(self):
        for url in self.first_group_of_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        self.logger.debug('In response from %s', response.url)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle,
                                signal=scrapy.signals.spider_idle)
        return spider

    def spider_idle(self):
        self.crawler.signals.disconnect(self.spider_idle,
                                        signal=scrapy.signals.spider_idle)
        for url in self.second_group_of_urls:
            self.crawler.engine.crawl(scrapy.Request(url), self)
        raise DontCloseSpider
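Applied to the IndexSpider from EDIT II, the idle handler would need to hand the second group to the engine instead of yielding it (requests yielded from a signal handler are never consumed, since the returned generator is never iterated). An untested sketch along the lines of the code above:
def spider_idle(self):
    # Schedule the category pages through the engine and keep the spider
    # alive; once self.categories is empty the spider is allowed to close.
    if self.categories:
        for cat in self.categories:
            url = "https://www.scrape.com/search/getAjaxResult?categoryId={}".format(cat[0])
            self.crawler.engine.crawl(
                Request(url, meta={'tup': cat, 'priority': 0, 'type': 'category'},
                        priority=0, callback=self.parse_category),
                self)
        self.categories = []
        raise DontCloseSpider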