Scrapy 管道以正确的格式导出 csv 文件
Scrapy pipeline to export csv file in the right format
我根据下面alexce的建议进行了改进。我需要的是如下图。然而,每个 row/line 应该是一篇评论:包含日期、评分、评论文本和 link.
我需要让项目处理器处理每个页面的每个评论。
目前 TakeFirst() 只取每个页面的第一条评论。所以抓取 10 页,我只得到 10 行(lines/rows),如下图所示。
蜘蛛代码如下:
import scrapy
from amazon.items import AmazonItem
class AmazonSpider(scrapy.Spider):
    """Scrape customer reviews for one Amazon UK product.

    The first cell of every row inside the ``productReviews`` element is
    treated as one review and produces one ``AmazonItem``.
    """

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    # The URL needs a replacement field: without one, ``.format(page)``
    # returns the same string for every page and the same URL is requested
    # 113 times.
    # NOTE(review): ``pageNumber`` is assumed to be the site's pagination
    # query parameter -- confirm against the live pages.
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/?pageNumber={}'.format(page)
        for page in range(1, 114)
    ]

    def parse(self, response):
        """Yield one ``AmazonItem`` per review cell found in *response*.

        Every field is assigned the list returned by ``extract()``, so a
        field may hold zero, one or several strings.
        """
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
如果使用 -t csv
(正如 Frank 在评论中提出的那样)由于某种原因对您不起作用,您始终可以使用内置 CsvItemExporter
直接在自定义管道中使用,例如:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class AmazonPipeline(object):
    """Item pipeline that hands every scraped item to a CSV exporter.

    The output file ``output.csv`` and the exporter are created when the
    spider opens and released when it closes; both hooks are wired up
    through the crawler's signals in ``from_crawler``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the spider signals.

        Returns the new pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if the exporter fails, so the handle
            # is never leaked.
            self.file.close()

    def process_item(self, item, spider):
        """Pass *item* to the exporter and return it unchanged."""
        self.exporter.export_item(item)
        return item
您需要将该管道添加到 ITEM_PIPELINES 设置中:
# Enable the CSV pipeline. The integer is the pipeline's order value
# (Scrapy runs pipelines with lower values first).
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300,
}
此外,我会使用带有输入和输出处理器的 Item Loader 来拼接评论文本,并用空格替换换行符。创建一个 ItemLoader 类:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose
class AmazonItemLoader(ItemLoader):
    """Item loader that reduces each review field to a single value."""

    # Fields without a dedicated output processor use TakeFirst().
    default_output_processor = TakeFirst()

    # Review text: newlines in each extracted fragment are replaced with
    # spaces on input, and the fragments are combined by Join() on output.
    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
然后,用它来构造 Item:
def parse(self, response):
    """Yield one loaded ``AmazonItem`` per review cell in *response*.

    A separate ``AmazonItemLoader`` is built for the first cell of every
    row under the ``productReviews`` element, so each review becomes its
    own item.
    """
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        # The loader is bound to this cell's selector, not to the whole
        # response, so the XPaths below are evaluated against ``sel``.
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')
        yield loader.load_item()
我从头重写了一遍,下面的蜘蛛应该可以用以下命令运行:
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
这样我就可以用电子表格打开 CSV 文件
希望这对您有所帮助:-)
import scrapy
class AmazonItem(scrapy.Item):
    """Container for the four values scraped from one review."""

    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()
class AmazonSpider(scrapy.Spider):
    """Crawl the review pages of one Amazon UK product.

    Starts from the first review page and keeps following the "Next"
    paging link until no such link is found.
    """

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        """Yield one ``AmazonItem`` per review, then a request for the next page.

        Every field is assigned the list returned by ``extract()``.
        """
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        # Pagination runs once per page, after all reviews were yielded.
        xpath_next_page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        # Evaluate the XPath a single time and reuse the extracted hrefs.
        next_page_urls = response.xpath(xpath_next_page).extract()
        if next_page_urls:
            # NOTE(review): assumes the href is an absolute URL; a relative
            # link would have to be joined with ``response.url`` first.
            yield scrapy.Request(next_page_urls[0], callback=self.parse)
我根据下面alexce的建议进行了改进。我需要的是如下图。然而,每个 row/line 应该是一篇评论:包含日期、评分、评论文本和 link.
我需要让项目处理器处理每个页面的每个评论。
目前 TakeFirst() 只取每个页面的第一条评论。所以抓取 10 页,我只得到 10 行(lines/rows),如下图所示。
蜘蛛代码如下:
import scrapy
from amazon.items import AmazonItem
class AmazonSpider(scrapy.Spider):
    """Scrape customer reviews for one Amazon UK product.

    The first cell of every row inside the ``productReviews`` element is
    treated as one review and produces one ``AmazonItem``.
    """

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    # The URL needs a replacement field: without one, ``.format(page)``
    # returns the same string for every page and the same URL is requested
    # 113 times.
    # NOTE(review): ``pageNumber`` is assumed to be the site's pagination
    # query parameter -- confirm against the live pages.
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/?pageNumber={}'.format(page)
        for page in range(1, 114)
    ]

    def parse(self, response):
        """Yield one ``AmazonItem`` per review cell found in *response*.

        Every field is assigned the list returned by ``extract()``, so a
        field may hold zero, one or several strings.
        """
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
如果使用 -t csv
(正如 Frank 在评论中提出的那样)由于某种原因对您不起作用,您始终可以使用内置 CsvItemExporter
直接在自定义管道中使用,例如:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class AmazonPipeline(object):
    """Item pipeline that hands every scraped item to a CSV exporter.

    The output file ``output.csv`` and the exporter are created when the
    spider opens and released when it closes; both hooks are wired up
    through the crawler's signals in ``from_crawler``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the spider signals.

        Returns the new pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if the exporter fails, so the handle
            # is never leaked.
            self.file.close()

    def process_item(self, item, spider):
        """Pass *item* to the exporter and return it unchanged."""
        self.exporter.export_item(item)
        return item
您需要将该管道添加到 ITEM_PIPELINES 设置中:
# Enable the CSV pipeline. The integer is the pipeline's order value
# (Scrapy runs pipelines with lower values first).
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300,
}
此外,我会使用带有输入和输出处理器的 Item Loader 来拼接评论文本,并用空格替换换行符。创建一个 ItemLoader 类:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose
class AmazonItemLoader(ItemLoader):
    """Item loader that reduces each review field to a single value."""

    # Fields without a dedicated output processor use TakeFirst().
    default_output_processor = TakeFirst()

    # Review text: newlines in each extracted fragment are replaced with
    # spaces on input, and the fragments are combined by Join() on output.
    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
然后,用它来构造 Item:
def parse(self, response):
    """Yield one loaded ``AmazonItem`` per review cell in *response*.

    A separate ``AmazonItemLoader`` is built for the first cell of every
    row under the ``productReviews`` element, so each review becomes its
    own item.
    """
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        # The loader is bound to this cell's selector, not to the whole
        # response, so the XPaths below are evaluated against ``sel``.
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')
        yield loader.load_item()
我从头重写了一遍,下面的蜘蛛应该可以用以下命令运行:
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
这样我就可以用电子表格打开 CSV 文件
希望这对您有所帮助:-)
import scrapy
class AmazonItem(scrapy.Item):
    """Container for the four values scraped from one review."""

    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()
class AmazonSpider(scrapy.Spider):
    """Crawl the review pages of one Amazon UK product.

    Starts from the first review page and keeps following the "Next"
    paging link until no such link is found.
    """

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        """Yield one ``AmazonItem`` per review, then a request for the next page.

        Every field is assigned the list returned by ``extract()``.
        """
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        # Pagination runs once per page, after all reviews were yielded.
        xpath_next_page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        # Evaluate the XPath a single time and reuse the extracted hrefs.
        next_page_urls = response.xpath(xpath_next_page).extract()
        if next_page_urls:
            # NOTE(review): assumes the href is an absolute URL; a relative
            # link would have to be joined with ``response.url`` first.
            yield scrapy.Request(next_page_urls[0], callback=self.parse)