将 scrapy 导出到 csv
export scrapy to csv
我要抓取 'healthunlocked.com';我不知道为什么我在CSV文件中看不到提取的数据。
# NOTE(review): indentation was lost when this question was pasted; the lines
# below are the asker's original (buggy) spider, shown flat and annotated.
class HealthSpider(scrapy.Spider):
name = 'health'
#allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']
# NOTE(review): class-level mutable list — shared by every instance of the spider.
itemlist=[]
def parse(self, response):
# NOTE(review): "results-posts" looks like the outer container; per-post divs
# presumably use class "results-post" (see answers below) — TODO confirm.
all_div_posts = response.xpath('//div[@class="results-posts"]')
for posts in all_div_posts:
items={}
# NOTE(review): '//h3[...]' is an absolute XPath that searches the whole
# document rather than the current `posts` node; a relative './/h3[...]'
# is needed to get one title/body per iteration.
items['title']= posts.xpath('//h3[@class="results-post__title"]/text()').extract()
items['post']= posts.xpath('//div[@class="results-post__body hidden-xs"]/text()').extract()
self.itemlist.append(items)
# NOTE(review): mode "w" truncates the file on every parse() call, and `csv`
# must be imported somewhere for DictWriter to resolve.
with open("outputfile.csv","w", newline="") as f:
writer = csv.DictWriter(f,['title','post'])
writer.writeheader()
for data in self.itemlist:
writer.writerow(data)
编辑: 我 运行 你的代码,它给了我结果文件。
Scrapy 有 built-in 函数可以将结果保存在 CSV
中,你不必自己写。
您只需 yield
项
def parse(self, response):
    """Yield one {'title', 'post'} dict per results container on the page."""
    for posts in response.xpath('//div[@class="results-posts"]'):
        yield {
            'title': posts.xpath('//h3[@class="results-post__title"]/text()').extract(),
            'post': posts.xpath('//div[@class="results-post__body hidden-xs"]/text()').extract(),
        }
和 运行 选项 -o outputfile.csv
scrapy runspider your_spider.py -o outputfile.csv
编辑:
我做了一些更改,现在两个版本都给出了相同的结果 - 我使用程序 diff
检查了它以比较两个 csv
.
因为我以不同的方式组织项目,所以我可以直接使用 writer.writerows(self.itemlist)
而无需 for
-loop(和 zip()
)
我还使用 .get()
而不是 extract()
(或 extract_first()
)来获得单个标题和单个 post 来创建配对。我可以使用 strip()
来清除空格。
版本 1
import scrapy
import csv
class HealthSpider(scrapy.Spider):
    """Version 1: collect post titles/bodies and write outputfile.csv manually.

    The CSV is rewritten inside parse(); Scrapy's feed export is not used here.
    """

    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']

    # Accumulates one {'title': ..., 'post': ...} dict per post across parse()
    # calls.  NOTE: class-level list, shared by every instance of this class.
    itemlist = []

    def parse(self, response):
        """Scrape every post on the page, then (re)write the whole CSV file.

        Mode "w" truncates outputfile.csv on each response; itemlist keeps
        rows collected from earlier responses so they are written again.
        """
        all_div_posts = response.xpath('//div[@class="results-post"]')
        print('len(all_div_posts):', len(all_div_posts))
        for one_post in all_div_posts:
            one_item = {
                # .get(default='') instead of .get(): a post with a missing
                # title/body would make .get() return None and .strip() raise
                # AttributeError, aborting the whole parse.
                'title': one_post.xpath('.//h3[@class="results-post__title"]/text()').get(default='').strip(),
                'post': one_post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').get(default='').strip(),
            }
            self.itemlist.append(one_item)

        with open("outputfile.csv", "w", newline="") as f:
            writer = csv.DictWriter(f, ['title', 'post'])
            writer.writeheader()
            # writerows replaces the per-row loop; itemlist is already a list
            # of dicts keyed by the header fields.
            writer.writerows(self.itemlist)
版本 2
import scrapy
class HealthSpider(scrapy.Spider):
    """Version 2: yield items and let Scrapy's feed export write the CSV.

    Run with:  scrapy runspider your_spider.py -o outputfile.csv
    """

    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']

    def parse(self, response):
        """Yield one {'title', 'post'} dict per post div on the page."""
        all_div_posts = response.xpath('//div[@class="results-post"]')
        print('len(all_div_posts):', len(all_div_posts))
        for one_post in all_div_posts:
            yield {
                # .get(default='') instead of .get(): a post missing a field
                # would make .get() return None and .strip() raise
                # AttributeError.
                'title': one_post.xpath('.//h3[@class="results-post__title"]/text()').get(default='').strip(),
                'post': one_post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').get(default='').strip(),
            }
尝试以下操作以获得您在该网页中看到的准确结果。内容是动态的,您需要填充 json 内容才能获取所需的结果。我使用自定义方法将数据写入 csv 文件。如果您按照下面的方式进行操作,csv 文件将被打开一次。但是,写入数据后文件将被关闭。
import csv
import json
import scrapy
class HealthSpider(scrapy.Spider):
    """Fetch the site's JSON API directly and mirror each item into output.csv.

    The file is opened once in __init__ and closed in close(), which Scrapy
    calls when the spider finishes.
    """

    name = "health"
    start_urls = ['https://solaris.healthunlocked.com/posts/positivewellbeing/popular']

    def __init__(self):
        super().__init__()  # keep scrapy.Spider's own initialisation
        # utf-8-sig writes a BOM so Excel opens non-ASCII text correctly.
        self.outfile = open("output.csv", "w", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.outfile)
        self.writer.writerow(['title', 'post content'])

    def close(self, reason):
        """Release the CSV file handle when the crawl ends."""
        self.outfile.close()

    def parse(self, response):
        """Parse the JSON payload; write each post to CSV and also yield it."""
        # response.text replaces body_as_unicode(), which is deprecated and
        # removed in Scrapy 2.x.
        for posts in json.loads(response.text):
            # ' '.join(s.split()) collapses runs of whitespace/newlines.
            title = ' '.join(posts['title'].split())
            post = ' '.join(posts['bodySnippet'].split())
            self.writer.writerow([title, post])
            yield {'title': title, 'post': post}
我要抓取 'healthunlocked.com';我不知道为什么我在CSV文件中看不到提取的数据。
# NOTE(review): indentation was lost when this question was pasted; the lines
# below are the asker's original (buggy) spider, shown flat and annotated.
class HealthSpider(scrapy.Spider):
name = 'health'
#allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']
# NOTE(review): class-level mutable list — shared by every instance of the spider.
itemlist=[]
def parse(self, response):
# NOTE(review): "results-posts" looks like the outer container; per-post divs
# presumably use class "results-post" (see answers below) — TODO confirm.
all_div_posts = response.xpath('//div[@class="results-posts"]')
for posts in all_div_posts:
items={}
# NOTE(review): '//h3[...]' is an absolute XPath that searches the whole
# document rather than the current `posts` node; a relative './/h3[...]'
# is needed to get one title/body per iteration.
items['title']= posts.xpath('//h3[@class="results-post__title"]/text()').extract()
items['post']= posts.xpath('//div[@class="results-post__body hidden-xs"]/text()').extract()
self.itemlist.append(items)
# NOTE(review): mode "w" truncates the file on every parse() call, and `csv`
# must be imported somewhere for DictWriter to resolve.
with open("outputfile.csv","w", newline="") as f:
writer = csv.DictWriter(f,['title','post'])
writer.writeheader()
for data in self.itemlist:
writer.writerow(data)
编辑: 我 运行 你的代码,它给了我结果文件。
Scrapy 有 built-in 函数可以将结果保存在 CSV
中,你不必自己写。
您只需 yield
项
def parse(self, response):
    """Yield one {'title', 'post'} dict per results container on the page."""
    for posts in response.xpath('//div[@class="results-posts"]'):
        yield {
            'title': posts.xpath('//h3[@class="results-post__title"]/text()').extract(),
            'post': posts.xpath('//div[@class="results-post__body hidden-xs"]/text()').extract(),
        }
和 运行 选项 -o outputfile.csv
scrapy runspider your_spider.py -o outputfile.csv
编辑:
我做了一些更改,现在两个版本都给出了相同的结果 - 我使用程序 diff
检查了它以比较两个 csv
.
因为我以不同的方式组织项目,所以我可以直接使用 writer.writerows(self.itemlist)
而无需 for
-loop(和 zip()
)
我还使用 .get()
而不是 extract()
(或 extract_first()
)来获得单个标题和单个 post 来创建配对。我可以使用 strip()
来清除空格。
版本 1
import scrapy
import csv
class HealthSpider(scrapy.Spider):
    """Version 1: collect post titles/bodies and write outputfile.csv manually.

    The CSV is rewritten inside parse(); Scrapy's feed export is not used here.
    """

    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']

    # Accumulates one {'title': ..., 'post': ...} dict per post across parse()
    # calls.  NOTE: class-level list, shared by every instance of this class.
    itemlist = []

    def parse(self, response):
        """Scrape every post on the page, then (re)write the whole CSV file.

        Mode "w" truncates outputfile.csv on each response; itemlist keeps
        rows collected from earlier responses so they are written again.
        """
        all_div_posts = response.xpath('//div[@class="results-post"]')
        print('len(all_div_posts):', len(all_div_posts))
        for one_post in all_div_posts:
            one_item = {
                # .get(default='') instead of .get(): a post with a missing
                # title/body would make .get() return None and .strip() raise
                # AttributeError, aborting the whole parse.
                'title': one_post.xpath('.//h3[@class="results-post__title"]/text()').get(default='').strip(),
                'post': one_post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').get(default='').strip(),
            }
            self.itemlist.append(one_item)

        with open("outputfile.csv", "w", newline="") as f:
            writer = csv.DictWriter(f, ['title', 'post'])
            writer.writeheader()
            # writerows replaces the per-row loop; itemlist is already a list
            # of dicts keyed by the header fields.
            writer.writerows(self.itemlist)
版本 2
import scrapy
class HealthSpider(scrapy.Spider):
    """Version 2: yield items and let Scrapy's feed export write the CSV.

    Run with:  scrapy runspider your_spider.py -o outputfile.csv
    """

    name = 'health'
    #allowed_domains = ['https://healthunlocked.com/positivewellbeing/posts#popular']
    start_urls = ['https://healthunlocked.com/positivewellbeing/posts#popular']

    def parse(self, response):
        """Yield one {'title', 'post'} dict per post div on the page."""
        all_div_posts = response.xpath('//div[@class="results-post"]')
        print('len(all_div_posts):', len(all_div_posts))
        for one_post in all_div_posts:
            yield {
                # .get(default='') instead of .get(): a post missing a field
                # would make .get() return None and .strip() raise
                # AttributeError.
                'title': one_post.xpath('.//h3[@class="results-post__title"]/text()').get(default='').strip(),
                'post': one_post.xpath('.//div[@class="results-post__body hidden-xs"]/text()').get(default='').strip(),
            }
尝试以下操作以获得您在该网页中看到的准确结果。内容是动态的,您需要填充 json 内容才能获取所需的结果。我使用自定义方法将数据写入 csv 文件。如果您按照下面的方式进行操作,csv 文件将被打开一次。但是,写入数据后文件将被关闭。
import csv
import json
import scrapy
class HealthSpider(scrapy.Spider):
    """Fetch the site's JSON API directly and mirror each item into output.csv.

    The file is opened once in __init__ and closed in close(), which Scrapy
    calls when the spider finishes.
    """

    name = "health"
    start_urls = ['https://solaris.healthunlocked.com/posts/positivewellbeing/popular']

    def __init__(self):
        super().__init__()  # keep scrapy.Spider's own initialisation
        # utf-8-sig writes a BOM so Excel opens non-ASCII text correctly.
        self.outfile = open("output.csv", "w", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.outfile)
        self.writer.writerow(['title', 'post content'])

    def close(self, reason):
        """Release the CSV file handle when the crawl ends."""
        self.outfile.close()

    def parse(self, response):
        """Parse the JSON payload; write each post to CSV and also yield it."""
        # response.text replaces body_as_unicode(), which is deprecated and
        # removed in Scrapy 2.x.
        for posts in json.loads(response.text):
            # ' '.join(s.split()) collapses runs of whitespace/newlines.
            title = ' '.join(posts['title'].split())
            post = ' '.join(posts['bodySnippet'].split())
            self.writer.writerow([title, post])
            yield {'title': title, 'post': post}