Pipeline.py 删除值而不是字段
Pipeline.py to drop Value rather than Field
我目前正在使用 Scrapy 脚本从亚马逊页面中提取产品信息。我遇到的问题出在异常处理上:我希望它只删除出错的字段,而不是删除输出中的整个 item/row。
当前蜘蛛:
from scrapy.spider import Spider
from scrapy.selector import Selector
from dirbot.items import Website
class DmozSpider(Spider):
    """Spider that scrapes product detail fields from two Amazon UK product pages.

    NOTE(review): every `.extract()[0]` below raises IndexError when its XPath
    matches nothing, which aborts this callback for the whole page — this is
    the failure mode described in the surrounding text.
    """

    name = "dmoz"
    allowed_domains = ["amazon.co.uk"]
    start_urls = [
        "http://www.amazon.co.uk/dp/B004YVOU9S",
        "http://www.amazon.co.uk/dp/B009NFE2QQ"
    ]

    def parse(self, response):
        """Parse one product page into a list of Website items.

        Returns a list of Website items, one per matching container div.
        Raises IndexError when any of the `[0]`-indexed fields is absent.
        """
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "a-container")]')
        items = []
        for site in sites:
            item = Website()
            # Last URL path segment — presumably the ASIN; TODO confirm for
            # URLs carrying query strings or trailing slashes.
            item['asin'] = response.url.split('/')[-1]
            # These fields keep the full extracted list ([] when missing), so
            # they never raise.
            item['title'] = site.xpath('div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()').extract()
            # Fragile: `[0]` raises IndexError when the node is missing.
            item['description'] = site.xpath('//*[@id="productDescription"]/div/div[1]/text()').extract()[0].strip()
            item['price'] = site.xpath('//*[@id="priceblock_ourprice"]/text()').extract()
            item['image'] = site.xpath('//*[@id="landingImage"]/@data-a-dynamic-image').extract()
            item['brand'] = site.xpath('//*[@id="brand"]/text()').extract()
            # Fragile: same `[0]` pattern on the next three fields.
            item['bullets'] = site.xpath('//*[@id="feature-bullets"]/span/ul').extract()[0].strip()
            item['category'] = site.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul').extract()[0].strip()
            item['details'] = site.xpath('//*[@id="prodDetails"]/div/div[1]/div/div/div[2]/div/div/table').extract()[0].strip()
            items.append(item)
        return items
当抓取结果缺少任何字段时,我目前收到错误:
exceptions.IndexError: list index out of range
为了解决这个问题,我以 IgnoreRequest 的形式添加了一些异常处理。
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.exceptions import IgnoreRequest
from dirbot.items import Website
class DmozSpider(Spider):
    """Same spider with a try/except added around the whole parse loop.

    NOTE(review): because the `try` wraps the entire loop, one missing field
    on one product discards every item from the page — the opposite of the
    stated goal of dropping only the missing field.
    """

    name = "dmoz"
    allowed_domains = ["amazon.co.uk"]
    start_urls = [
        "http://www.amazon.co.uk/dp/B004YVOU9S",
        "http://www.amazon.co.uk/dp/B009NFE2QQ"
    ]

    def parse(self, response):
        """Parse one product page; on any missing field, abort the page.

        Returns the list of Website items, or raises IgnoreRequest when any
        `[0]`-indexed field is absent.
        """
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "a-container")]')
        items = []
        try:
            for site in sites:
                item = Website()
                item['asin'] = response.url.split('/')[-1]
                item['title'] = site.xpath('div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()').extract()
                # The `[0]` on this and the later `.strip()` fields is what
                # raises the IndexError being caught below.
                item['description'] = site.xpath('//*[@id="productDescription"]/div/div[1]/text()').extract()[0].strip()
                item['price'] = site.xpath('//*[@id="priceblock_ourprice"]/text()').extract()
                item['image'] = site.xpath('//*[@id="landingImage"]/@data-a-dynamic-image').extract()
                item['brand'] = site.xpath('//*[@id="brand"]/text()').extract()
                item['bullets'] = site.xpath('//*[@id="feature-bullets"]/span/ul').extract()[0].strip()
                item['category'] = site.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul').extract()[0].strip()
                item['details'] = site.xpath('//*[@id="prodDetails"]/div/div[1]/div/div/div[2]/div/div/table').extract()[0].strip()
                items.append(item)
            return items
        except IndexError:
            # NOTE(review): IgnoreRequest is normally interpreted by downloader
            # middleware, not spider callbacks — confirm this has the intended
            # effect here.
            raise IgnoreRequest("Data type not found.")
我想做的是以一种继续输出其余蜘蛛结果的方式处理这个错误,只删除没有值的字段,而不是忽略整个项目。
如有任何帮助,我们将不胜感激。
Item Loaders
和 输入或输出处理器 是您需要的。
用 TakeFirst
processor 定义 ItemLoader
:
Returns the first non-null/non-empty value from the values received,
so it’s typically used as an output processor to single-valued fields.
It doesn’t receive any constructor arguments, nor accept Loader
contexts.
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst
class ProductLoader(ItemLoader):
    """Item loader whose every field defaults to the first non-empty extracted value.

    With TakeFirst as the default output processor, a field whose XPath matches
    nothing is simply left out of the loaded item instead of raising IndexError.
    """
    default_output_processor = TakeFirst()
    # specific field loaders
然后,使用加载程序加载项目:
# Build one item per container via the loader; missing fields are silently
# omitted by the TakeFirst output processor rather than raising.
# (Relies on `sites`, `response` and `Website` from the enclosing parse().)
for site in sites:
    l = ProductLoader(Website(), site)
    l.add_value('asin', response.url.split('/')[-1]) # (4)
    l.add_xpath('title', 'div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()')
    # ...
    yield l.load_item()
您也可以采用不同的解决方案:如果您只想用 try/except 在出错时删除单个字段,则必须对每个字段分别这样做,
try:
    # extract field
except IndexError:
    raise IgnoreRequest("Data type not found.")
如果你想要一个空值而不是删除,那么你必须检查值是否存在,你可以定义一个单独的提取方法
def get_value_from_node(self, node):
    """Extract *node* and return its first value, falling back to ''.

    Avoids the IndexError that `extract()[0]` raises on an empty selector.
    """
    for first in node.extract():
        return first
    return ''
并为所有字段调用此方法
# Populate 'title' through the safe helper: first matched value or ''.
item['title'] = self.get_value_from_node(site.xpath('div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()'))
它将返回该值或空字符串,并且不需要异常处理。
我目前正在使用 Scrapy 脚本从亚马逊页面中提取产品信息。我遇到的问题出在异常处理上:我希望它只删除出错的字段,而不是删除输出中的整个 item/row。
当前蜘蛛:
from scrapy.spider import Spider
from scrapy.selector import Selector
from dirbot.items import Website
class DmozSpider(Spider):
    """Spider that scrapes product detail fields from two Amazon UK product pages.

    NOTE(review): every `.extract()[0]` below raises IndexError when its XPath
    matches nothing, which aborts this callback for the whole page — this is
    the failure mode described in the surrounding text.
    """

    name = "dmoz"
    allowed_domains = ["amazon.co.uk"]
    start_urls = [
        "http://www.amazon.co.uk/dp/B004YVOU9S",
        "http://www.amazon.co.uk/dp/B009NFE2QQ"
    ]

    def parse(self, response):
        """Parse one product page into a list of Website items.

        Returns a list of Website items, one per matching container div.
        Raises IndexError when any of the `[0]`-indexed fields is absent.
        """
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "a-container")]')
        items = []
        for site in sites:
            item = Website()
            # Last URL path segment — presumably the ASIN; TODO confirm for
            # URLs carrying query strings or trailing slashes.
            item['asin'] = response.url.split('/')[-1]
            # These fields keep the full extracted list ([] when missing), so
            # they never raise.
            item['title'] = site.xpath('div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()').extract()
            # Fragile: `[0]` raises IndexError when the node is missing.
            item['description'] = site.xpath('//*[@id="productDescription"]/div/div[1]/text()').extract()[0].strip()
            item['price'] = site.xpath('//*[@id="priceblock_ourprice"]/text()').extract()
            item['image'] = site.xpath('//*[@id="landingImage"]/@data-a-dynamic-image').extract()
            item['brand'] = site.xpath('//*[@id="brand"]/text()').extract()
            # Fragile: same `[0]` pattern on the next three fields.
            item['bullets'] = site.xpath('//*[@id="feature-bullets"]/span/ul').extract()[0].strip()
            item['category'] = site.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul').extract()[0].strip()
            item['details'] = site.xpath('//*[@id="prodDetails"]/div/div[1]/div/div/div[2]/div/div/table').extract()[0].strip()
            items.append(item)
        return items
当抓取结果缺少任何字段时,我目前收到错误:
exceptions.IndexError: list index out of range
为了解决这个问题,我以 IgnoreRequest 的形式添加了一些异常处理。
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.exceptions import IgnoreRequest
from dirbot.items import Website
class DmozSpider(Spider):
    """Same spider with a try/except added around the whole parse loop.

    NOTE(review): because the `try` wraps the entire loop, one missing field
    on one product discards every item from the page — the opposite of the
    stated goal of dropping only the missing field.
    """

    name = "dmoz"
    allowed_domains = ["amazon.co.uk"]
    start_urls = [
        "http://www.amazon.co.uk/dp/B004YVOU9S",
        "http://www.amazon.co.uk/dp/B009NFE2QQ"
    ]

    def parse(self, response):
        """Parse one product page; on any missing field, abort the page.

        Returns the list of Website items, or raises IgnoreRequest when any
        `[0]`-indexed field is absent.
        """
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "a-container")]')
        items = []
        try:
            for site in sites:
                item = Website()
                item['asin'] = response.url.split('/')[-1]
                item['title'] = site.xpath('div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()').extract()
                # The `[0]` on this and the later `.strip()` fields is what
                # raises the IndexError being caught below.
                item['description'] = site.xpath('//*[@id="productDescription"]/div/div[1]/text()').extract()[0].strip()
                item['price'] = site.xpath('//*[@id="priceblock_ourprice"]/text()').extract()
                item['image'] = site.xpath('//*[@id="landingImage"]/@data-a-dynamic-image').extract()
                item['brand'] = site.xpath('//*[@id="brand"]/text()').extract()
                item['bullets'] = site.xpath('//*[@id="feature-bullets"]/span/ul').extract()[0].strip()
                item['category'] = site.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul').extract()[0].strip()
                item['details'] = site.xpath('//*[@id="prodDetails"]/div/div[1]/div/div/div[2]/div/div/table').extract()[0].strip()
                items.append(item)
            return items
        except IndexError:
            # NOTE(review): IgnoreRequest is normally interpreted by downloader
            # middleware, not spider callbacks — confirm this has the intended
            # effect here.
            raise IgnoreRequest("Data type not found.")
我想做的是以一种继续输出其余蜘蛛结果的方式处理这个错误,只删除没有值的字段,而不是忽略整个项目。
如有任何帮助,我们将不胜感激。
Item Loaders
和 输入或输出处理器 是您需要的。
用 TakeFirst
processor 定义 ItemLoader
:
Returns the first non-null/non-empty value from the values received, so it’s typically used as an output processor to single-valued fields. It doesn’t receive any constructor arguments, nor accept Loader contexts.
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst
class ProductLoader(ItemLoader):
    """Item loader whose every field defaults to the first non-empty extracted value.

    With TakeFirst as the default output processor, a field whose XPath matches
    nothing is simply left out of the loaded item instead of raising IndexError.
    """
    default_output_processor = TakeFirst()
    # specific field loaders
然后,使用加载程序加载项目:
# Build one item per container via the loader; missing fields are silently
# omitted by the TakeFirst output processor rather than raising.
# (Relies on `sites`, `response` and `Website` from the enclosing parse().)
for site in sites:
    l = ProductLoader(Website(), site)
    l.add_value('asin', response.url.split('/')[-1]) # (4)
    l.add_xpath('title', 'div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()')
    # ...
    yield l.load_item()
您也可以采用不同的解决方案:如果您只想用 try/except 在出错时删除单个字段,则必须对每个字段分别这样做,
try:
    # extract field
except IndexError:
    raise IgnoreRequest("Data type not found.")
如果你想要一个空值而不是删除,那么你必须检查值是否存在,你可以定义一个单独的提取方法
def get_value_from_node(self, node):
    """Extract *node* and return its first value, falling back to ''.

    Avoids the IndexError that `extract()[0]` raises on an empty selector.
    """
    for first in node.extract():
        return first
    return ''
并为所有字段调用此方法
# Populate 'title' through the safe helper: first matched value or ''.
item['title'] = self.get_value_from_node(site.xpath('div[@id="centerCol"]/div[@id="title_feature_div"]/div[@id="titleSection"]/h1[@id="title"]/span[@id="productTitle"]/text()'))
它将返回该值或空字符串,并且不需要异常处理。