Avoid redundant code in python/scrapy
I am new to Python and Scrapy. I have written a working script with Scrapy that needs a few improvements to avoid redundancy.

In the parse_article_page function I run into two possibilities: the article either has variants (more pages to scrape) or it does not. Can you help me avoid repeating the code that currently appears both in the else branch and in the parse_data function?

I tried issuing a second request instead, but that does not seem to work: the log shows "DEBUG: Filtered duplicate request" or nothing at all.
def parse_article_page(self, response):
    # Check for variants
    variants = response.xpath('//div[@class="variants"]/select/option[not(@disabled)]/@variant_href').extract()
    if len(variants) > 1:
        for variant in variants:
            variant_url = response.urljoin(variant)
            # Request article variants:
            yield scrapy.Request(variant_url, callback=self.parse_data)
    else:
        # yield scrapy.Request(response.url, callback=self.parse_data)  # Does not work
        item = ShopItem()
        item['desc'] = response.css(description_selector).extract()
        item['price'] = response.css(price_selector).extract()
        item['itno'] = response.css(no_selector).extract()
        item['url'] = response.url
        yield item

def parse_data(self, response):
    item = ShopItem()
    item['desc'] = response.css(description_selector).extract()
    item['price'] = response.css(price_selector).extract()
    item['itno'] = response.css(no_selector).extract()
    item['url'] = response.url
    yield item
Calling self.parse_data(response) in the else branch will not work on its own, because parse_data is a generator: the items it produces still have to be yielded from this method for Scrapy to pick them up. You have to do something like this:
def parse_article_page(self, response):
    # Check for variants
    variants = response.xpath('//div[@class="variants"]/select/option[not(@disabled)]/@variant_href').extract()
    if len(variants) > 1:
        for variant in variants:
            variant_url = response.urljoin(variant)
            # Request article variants:
            yield scrapy.Request(variant_url, callback=self.parse_data)
    else:
        for item in self.parse_data(response):
            yield item

def parse_data(self, response):
    item = ShopItem()
    item['desc'] = response.css(description_selector).extract()
    item['price'] = response.css(price_selector).extract()
    item['itno'] = response.css(no_selector).extract()
    item['url'] = response.url
    yield item
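On Python 3.3+ the else branch can also delegate to the generator with yield from instead of the explicit loop; the effect is the same. A minimal sketch of the function with that change, keeping your selectors and ShopItem exactly as they are:

def parse_article_page(self, response):
    # Check for variants
    variants = response.xpath('//div[@class="variants"]/select/option[not(@disabled)]/@variant_href').extract()
    if len(variants) > 1:
        for variant in variants:
            # Request each article variant; parse_data builds the item
            yield scrapy.Request(response.urljoin(variant), callback=self.parse_data)
    else:
        # Delegate to the parse_data generator (equivalent to the for-loop above)
        yield from self.parse_data(response)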
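As for the commented-out request in the question: the "DEBUG: Filtered duplicate request" message comes from Scrapy's duplicate-request filter, which drops a new request for a URL that has already been crawled in the same run, and response.url has of course already been fetched. That approach can be forced through with dont_filter=True (a standard scrapy.Request argument), but it downloads the same page a second time, so delegating to parse_data as shown above is the cheaper solution. For completeness, the else branch would then look like this:

    else:
        # Bypass the duplicate filter and re-download the page we already have
        yield scrapy.Request(response.url, callback=self.parse_data, dont_filter=True)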