KeyError: image_link key not found in Scrapy but present in items.py
KeyError: image_link key not found in Scrapy but present in items.py
我的items.py看起来像:
import scrapy
class NewsItem(scrapy.Item):
    """Scrapy item describing a single news entry.

    Every attribute below is declared as a ``scrapy.Field``; declaring a
    field does not give it a value, it only makes the key assignable.
    """

    # Fields are kept in their original declaration order.
    title = scrapy.Field()
    link = scrapy.Field()
    pubDate = scrapy.Field()
    description = scrapy.Field()
    image_link = scrapy.Field()
我使用的爬虫(spider)如下:
class Spider(BaseSpider):
    """Spider that builds a NewsItem per ``//item`` node of the response.

    ``parse`` creates the items and schedules one follow-up request per
    item; ``parse_detail`` handles that follow-up response and stores an
    image URL (or the NO_IMAGE placeholder) on the item.
    """

    # Placeholder stored in 'image_link' when both XPath lookups fail.
    NO_IMAGE = "NoImageFound"
    name = '****'
    allowed_domains = ['****', '****']
    start_urls = [
        'http://*****',
    ]

    def parse(self, response):
        """Yield, for every ``//item`` node, a detail Request and the item.

        The item is attached to the request through ``meta['_item']`` so
        that ``parse_detail`` can retrieve it.
        """
        self.log('A response from %s just arrived!' % response.url)
        sel = Selector(response)
        _items = sel.xpath('//item')
        for item in _items:
            _item = NewsItem()
            _title = item.xpath('title/text()').extract()
            # Default to an empty title; overwritten with the first match.
            _item['title'] = ""
            if _title:
                _item['title'] = _title[0]
            # NOTE(review): '_item["link"]' is read below but never set in
            # the visible code; presumably the elided section assigns it.
            #other stuffs here
            yield Request(_item['link'], callback=self.parse_detail, meta={'_item': _item})
            # NOTE(review): the same item is yielded right after the request
            # is scheduled, so presumably it is emitted before parse_detail
            # has set 'image_link' on it. That would match the KeyError
            # raised in the pipeline -- confirm whether this yield is wanted.
            yield _item

    def parse_detail(self, response):
        """Set ``image_link`` on the item carried in ``response.meta``.

        Tries two XPath expressions in turn and falls back to NO_IMAGE
        when both raise. Returns whatever ``_item`` is bound to at the end.
        """
        _item = response.meta.get('_item')
        sel = Selector(response)
        try:
            _item['image_link'] = sel.select("//div[@class='article_content']/*/img/@src").extract()[0]
        # NOTE(review): bare except -- catches every exception, not only the
        # IndexError raised by [0] when the XPath matches nothing.
        except:
            try:
                _item['image_link'] = sel.select("//div[@class='entry']/descendant::node()/img/@src").extract()[0]
            except:
                _item['image_link'] = self.NO_IMAGE
        # NOTE(review): 'image_link' looks like a single string here
        # (extract()[0] or NO_IMAGE), so [0] would be its first character
        # only -- presumably the whole value was meant to be tested.
        if _item['image_link'][0].endswith('gif'):
            # This rebinds the local name to the NO_IMAGE string, so the
            # method then returns that string instead of the item.
            _item = self.NO_IMAGE
        # _item['image_link'] = "TESTING"
        return _item
pipelines.py 中的代码如下:
class NewsUploadPipeline(object):
    """Item pipeline that reads four fields of each item as ASCII bytes."""

    def process_item(self, item, spider):
        """Encode title, description, link and image_link of ``item``.

        Each value is encoded with ``'ascii'`` and the ``'ignore'`` error
        handler, so characters that cannot be encoded are dropped.
        Indexing a field that was never assigned raises ``KeyError``
        (see the traceback quoted after this excerpt).
        """
        # NOTE(review): the excerpt stops after these four lookups; no
        # return statement is visible. Scrapy presumably expects the item
        # to be returned -- confirm against the full method.
        title = item['title'].encode('ascii', 'ignore')
        description = item['description'].encode('ascii', 'ignore')
        link = item['link'].encode('ascii', 'ignore')
        image_link = item['image_link'].encode('ascii', 'ignore')
当我运行项目时,得到如下错误:
File "/home/khadka/rkbnb/my_app/crawler/rkbnbcrawler/rkbnbcrawler/pipelines.py", line 16, in process_item
image_link = item['image_link'].encode('ascii', 'ignore')
File "/home/khadka/rkbnb/my_app/lib/python2.7/site-packages/scrapy/item.py", line 56, in __getitem__
return self._values[key]
KeyError: 'image_link'
输出报告
'log_count/DEBUG': 57,
'log_count/ERROR': 27,
'log_count/INFO': 7,
'log_count/WARNING': 2,
怎么了?显然 image_link 存在于 items.py 中。衷心感谢任何帮助或提示。
我认为错误出在这部分代码中:
if _item['image_link'][0].endswith('gif'):
_item = self.NO_IMAGE
这样写会用 self.NO_IMAGE 覆盖整个 _item(而不只是 image_link 字段)。
应该是:
if _item['image_link'][0].endswith('gif'):
_item['image_link'] = self.NO_IMAGE
我找到问题了:问题在于 parse_detail 中对 item['image_link'] 的赋值没有被传回 parse 函数。
已解决。我用下面的代码解决了这个问题:
_item = Request(_item['link'], callback=self.parse_detail, meta={'_item': _item})
yield _item
我猜想这样一来,parse_detail 函数对 _item 所做的更改就会被返回。
我的items.py看起来像:
import scrapy
class NewsItem(scrapy.Item):
    """Scrapy item describing a single news entry.

    Every attribute below is declared as a ``scrapy.Field``; declaring a
    field does not give it a value, it only makes the key assignable.
    """

    # Fields are kept in their original declaration order.
    title = scrapy.Field()
    link = scrapy.Field()
    pubDate = scrapy.Field()
    description = scrapy.Field()
    image_link = scrapy.Field()
我使用的爬虫(spider)如下:
class Spider(BaseSpider):
    """Spider that builds a NewsItem per ``//item`` node of the response.

    ``parse`` creates the items and schedules one follow-up request per
    item; ``parse_detail`` handles that follow-up response and stores an
    image URL (or the NO_IMAGE placeholder) on the item.
    """

    # Placeholder stored in 'image_link' when both XPath lookups fail.
    NO_IMAGE = "NoImageFound"
    name = '****'
    allowed_domains = ['****', '****']
    start_urls = [
        'http://*****',
    ]

    def parse(self, response):
        """Yield, for every ``//item`` node, a detail Request and the item.

        The item is attached to the request through ``meta['_item']`` so
        that ``parse_detail`` can retrieve it.
        """
        self.log('A response from %s just arrived!' % response.url)
        sel = Selector(response)
        _items = sel.xpath('//item')
        for item in _items:
            _item = NewsItem()
            _title = item.xpath('title/text()').extract()
            # Default to an empty title; overwritten with the first match.
            _item['title'] = ""
            if _title:
                _item['title'] = _title[0]
            # NOTE(review): '_item["link"]' is read below but never set in
            # the visible code; presumably the elided section assigns it.
            #other stuffs here
            yield Request(_item['link'], callback=self.parse_detail, meta={'_item': _item})
            # NOTE(review): the same item is yielded right after the request
            # is scheduled, so presumably it is emitted before parse_detail
            # has set 'image_link' on it. That would match the KeyError
            # raised in the pipeline -- confirm whether this yield is wanted.
            yield _item

    def parse_detail(self, response):
        """Set ``image_link`` on the item carried in ``response.meta``.

        Tries two XPath expressions in turn and falls back to NO_IMAGE
        when both raise. Returns whatever ``_item`` is bound to at the end.
        """
        _item = response.meta.get('_item')
        sel = Selector(response)
        try:
            _item['image_link'] = sel.select("//div[@class='article_content']/*/img/@src").extract()[0]
        # NOTE(review): bare except -- catches every exception, not only the
        # IndexError raised by [0] when the XPath matches nothing.
        except:
            try:
                _item['image_link'] = sel.select("//div[@class='entry']/descendant::node()/img/@src").extract()[0]
            except:
                _item['image_link'] = self.NO_IMAGE
        # NOTE(review): 'image_link' looks like a single string here
        # (extract()[0] or NO_IMAGE), so [0] would be its first character
        # only -- presumably the whole value was meant to be tested.
        if _item['image_link'][0].endswith('gif'):
            # This rebinds the local name to the NO_IMAGE string, so the
            # method then returns that string instead of the item.
            _item = self.NO_IMAGE
        # _item['image_link'] = "TESTING"
        return _item
pipelines.py 中的代码如下:
class NewsUploadPipeline(object):
    """Item pipeline that reads four fields of each item as ASCII bytes."""

    def process_item(self, item, spider):
        """Encode title, description, link and image_link of ``item``.

        Each value is encoded with ``'ascii'`` and the ``'ignore'`` error
        handler, so characters that cannot be encoded are dropped.
        Indexing a field that was never assigned raises ``KeyError``
        (see the traceback quoted after this excerpt).
        """
        # NOTE(review): the excerpt stops after these four lookups; no
        # return statement is visible. Scrapy presumably expects the item
        # to be returned -- confirm against the full method.
        title = item['title'].encode('ascii', 'ignore')
        description = item['description'].encode('ascii', 'ignore')
        link = item['link'].encode('ascii', 'ignore')
        image_link = item['image_link'].encode('ascii', 'ignore')
当我运行项目时,得到如下错误:
File "/home/khadka/rkbnb/my_app/crawler/rkbnbcrawler/rkbnbcrawler/pipelines.py", line 16, in process_item
image_link = item['image_link'].encode('ascii', 'ignore')
File "/home/khadka/rkbnb/my_app/lib/python2.7/site-packages/scrapy/item.py", line 56, in __getitem__
return self._values[key]
KeyError: 'image_link'
输出报告
'log_count/DEBUG': 57,
'log_count/ERROR': 27,
'log_count/INFO': 7,
'log_count/WARNING': 2,
怎么了?显然 image_link 存在于 items.py 中。衷心感谢任何帮助或提示。
我认为错误出在这部分代码中:
if _item['image_link'][0].endswith('gif'):
_item = self.NO_IMAGE
这样写会用 self.NO_IMAGE 覆盖整个 _item(而不只是 image_link 字段)。
应该是:
if _item['image_link'][0].endswith('gif'):
_item['image_link'] = self.NO_IMAGE
我找到问题了:问题在于 parse_detail 中对 item['image_link'] 的赋值没有被传回 parse 函数。
已解决。我用下面的代码解决了这个问题:
_item = Request(_item['link'], callback=self.parse_detail, meta={'_item': _item})
yield _item
我猜想这样一来,parse_detail 函数对 _item 所做的更改就会被返回。