将 scrapy 导出到 csv

export scrapy to csv

我要抓取 'healthunlocked.com' 网站;我不知道为什么在 CSV 文件中看不到提取的数据。

class HealthSpider(scrapy.Spider):
    """Scrape post titles and bodies from healthunlocked.com and write
    them to outputfile.csv."""
    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    # Collected items; class-level so results accumulate across parse() calls.
    itemlist = []

    def parse(self, response):
        # BUG FIX: select the individual post <div>s ("results-post"), not the
        # single wrapper container ("results-posts"), so the loop actually
        # iterates once per post.
        all_div_posts = response.xpath('//div[@class="results-post"]')

        for post in all_div_posts:
            items = {}
            # BUG FIX: './/' restricts each query to the current post node.
            # The original '//h3[...]' searched the whole document on every
            # iteration, returning ALL titles each time.
            items['title'] = post.xpath('.//h3[@class="results-post__title"]/text()').extract()
            items['post'] = post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').extract()
            self.itemlist.append(items)

        # 'w' mode recreates the file on each parse() call; fine for a
        # single start URL.
        with open("outputfile.csv", "w", newline="") as f:
            writer = csv.DictWriter(f, ['title', 'post'])
            writer.writeheader()
            for data in self.itemlist:
                writer.writerow(data)

编辑:我运行了你的代码,它生成了结果文件。


Scrapy 有内置 (built-in) 函数可以将结果保存为 CSV,你不必自己写。

您只需 yield

def parse(self, response):
    """Yield one {'title', 'post'} dict per post; Scrapy's feed export
    (run with -o outputfile.csv) writes the yielded items to CSV."""
    # BUG FIX: iterate the individual post nodes ("results-post"), not the
    # single wrapper container ("results-posts").
    all_div_posts = response.xpath('//div[@class="results-post"]')

    for post in all_div_posts:
        items = {}
        # './/' keeps each query relative to the current post node; a bare
        # '//' would re-select every title in the document each iteration.
        items['title'] = post.xpath('.//h3[@class="results-post__title"]/text()').extract()
        items['post'] = post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').extract()

        yield items

然后使用选项 -o outputfile.csv 运行

scrapy runspider your_spider.py -o outputfile.csv

编辑:

我做了一些更改,现在两个版本都给出了相同的结果 - 我使用程序 diff 检查了它以比较两个 csv.

因为我以不同的方式组织项目,所以我可以直接使用 writer.writerows(self.itemlist),而无需 for-loop(也不需要 zip())。

我还使用 .get() 而不是 extract()(或 extract_first())来获得单个标题和单个 post 来创建配对。我可以使用 strip() 来清除空格。

版本 1

import scrapy
import csv

class HealthSpider(scrapy.Spider):
    """Version 1: collect items in a class-level list and write the CSV
    manually with csv.DictWriter."""
    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']

    # Shared across parse() calls so the final write holds everything.
    itemlist = []

    def parse(self, response):
        all_div_posts = response.xpath('//div[@class="results-post"]')
        print('len(all_div_posts):', len(all_div_posts))

        for one_post in all_div_posts:
            # ROBUSTNESS FIX: .get() returns None when a post has no title
            # or body, and None.strip() raises AttributeError.  get(default='')
            # keeps the pipeline alive and yields an empty field instead.
            one_item = {
                'title': one_post.xpath('.//h3[@class="results-post__title"]/text()').get(default='').strip(),
                'post': one_post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').get(default='').strip(),
            }
            self.itemlist.append(one_item)

        # 'w' mode recreates the file on every parse() call; fine for a
        # single start URL.
        with open("outputfile.csv", "w", newline="") as f:
            writer = csv.DictWriter(f, ['title', 'post'])
            writer.writeheader()
            writer.writerows(self.itemlist)

版本 2

import scrapy

class HealthSpider(scrapy.Spider):
    """Version 2: yield items and let Scrapy's built-in feed exporter
    produce the CSV (run with -o outputfile.csv)."""
    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']

    def parse(self, response):
        post_nodes = response.xpath('//div[@class="results-post"]')
        print('len(all_div_posts):', len(post_nodes))

        for node in post_nodes:
            # Relative './/' queries scope each lookup to the current post;
            # strip() removes the surrounding whitespace from the raw text.
            title_text = node.xpath('.//h3[@class="results-post__title"]/text()').get().strip()
            body_text = node.xpath('.//div[@class="results-post__body hidden-xs"]/text()').get().strip()
            yield {'title': title_text, 'post': body_text}

尝试以下操作以获得您在该网页中看到的准确结果。页面内容是动态加载的,您需要解析 json 内容才能获取所需的结果。我使用自定义方法将数据写入 csv 文件。如果您按照下面的方式进行操作,csv 文件只会被打开一次,并在写入数据后关闭。

import csv
import json
import scrapy

class HealthSpider(scrapy.Spider):
    """Fetch the JSON endpoint that backs the posts page (the HTML page is
    populated dynamically) and stream title/body pairs into output.csv."""
    name = "health"
    start_urls = ['https://solaris.healthunlocked.com/posts/positivewellbeing/popular']

    def __init__(self, *args, **kwargs):
        # FIX: accept and forward Scrapy's standard spider arguments so the
        # spider still works with `scrapy crawl health -a name=value`.
        super().__init__(*args, **kwargs)
        # Open the file once; it is closed in close() when the spider stops.
        self.outfile = open("output.csv", "w", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.outfile)
        self.writer.writerow(['title', 'post content'])

    def close(self, reason):
        # Called by Scrapy when the spider finishes, whatever the reason.
        self.outfile.close()

    def parse(self, response):
        # BUG FIX: response.body_as_unicode() was deprecated and removed in
        # Scrapy 2.x; response.text is the supported replacement.
        for posts in json.loads(response.text):
            # Collapse internal whitespace/newlines to single spaces.
            title = ' '.join(posts['title'].split())
            post = ' '.join(posts['bodySnippet'].split())
            self.writer.writerow([title, post])
            yield {'title': title, 'post': post}