How to export data automatically to CSV with Scrapy Feed
I am trying to generate a CSV automatically every time I run scrapy crawl myspider. I tried using Scrapy feeds, but I get the following error:
twisted.internet.error.ReactorNotRestartable
Here is my code:
import scrapy
import json
from ..items import inmobiliarias
from scrapy import Spider
from scrapy.selector import Selector


class IdealistaSpider(scrapy.Spider):
    name = 'idealista'
    custom_setting = {'FEEDS': {'/Users/aleja/Documentos OneDrive/items.csv': {'format': 'csv'}}}

    def start_requests(self):
        url = 'https://www.idealista.com/areas/venta-viviendas/pagina-{page}?shape=%28%28osyuFrkkUqPo%60%40rEaGxm%40UA%7ERRnInApEgUlBkG%7DCiGtE%29%29'
        headers = {
            "authority": "www.idealista.com",
            "cache-control": "max-age=0",
            "sec-ch-ua": "\"Chromium\";v=\"94\", \"Google Chrome\";v=\"94\", \";Not A Brand\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "sec-fetch-site": "none",
            "sec-fetch-mode": "navigate",
            "sec-fetch-user": "?1",
            "sec-fetch-dest": "document",
            "accept-language": "es-ES,es;q=0.9"
        }
        for page in range(1, 5):
            yield scrapy.Request(url=url.format(page=page), headers=headers, callback=self.parse_json)

    def parse_json(self, response):
        #from scrapy.shell import inspect_response
        #inspect_response(response, self)
        sel = Selector(text=response.text)
        item = sel.xpath('//div[@class="item-info-container"]').extract()
        items = inmobiliarias()
        for element in item:
            sel = Selector(text=element)
            items['title'] = sel.xpath('//a[@role="heading"]/@title').extract()
            items['price'] = sel.xpath('//div/span[@class="item-price h2-simulated"]/text()').extract()
            items['phone'] = sel.xpath('//span[@class="icon-phone item-not-clickable-phone"]/text()').extract()
            items['url'] = 'https://www.idealista.com' + sel.xpath('//a[@role="heading"]/@href')[0].extract()
            items['meters'] = sel.xpath('//span[small/text()="m²"]/text()').extract()
            items['rooms'] = sel.xpath('//span[small/text()="hab."]/text()').extract()
            items['real_state'] = sel.xpath('//picture[@class="logo-branding"]/a//@alt').extract()
            items['garage'] = sel.xpath('//div/span[@class="item-parking"]/text()').extract()
            last_item_detail = sel.xpath('//span[@class="item-detail"]/text()')[-1].extract()
            if last_item_detail != items['meters'] and last_item_detail != items['rooms']:
                items['floor'] = last_item_detail
            else:
                items['floor'] = ''
            yield items
You have incorrectly defined custom_settings as custom_setting, i.e. without the trailing s. The sample run below shows no errors, and the items.csv file is created.
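Scrapy only reads the class attribute named custom_settings; any other attribute name is just an ordinary class attribute that the settings machinery never consults, so the FEEDS export is silently never registered. The one-line fix, in isolation:

class IdealistaSpider(scrapy.Spider):
    name = 'idealista'
    # 'custom_setting' (no s) is silently ignored by Scrapy; the attribute must be 'custom_settings'
    custom_settings = {'FEEDS': {'/Users/aleja/Documentos OneDrive/items.csv': {'format': 'csv'}}}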
For completeness, here is the full spider with that fix applied:
import scrapy
import json
from ..items import inmobiliarias
from scrapy import Spider
from scrapy.selector import Selector


class IdealistaSpider(scrapy.Spider):
    name = 'idealista'
    # note the trailing "s" in custom_settings
    custom_settings = {'FEEDS': {'/Users/aleja/Documentos OneDrive/items.csv': {'format': 'csv'}}}

    def start_requests(self):
        url = 'https://www.idealista.com/areas/venta-viviendas/pagina-{page}?shape=%28%28osyuFrkkUqPo%60%40rEaGxm%40UA%7ERRnInApEgUlBkG%7DCiGtE%29%29'
        headers = {
            "authority": "www.idealista.com",
            "cache-control": "max-age=0",
            "sec-ch-ua": "\"Chromium\";v=\"94\", \"Google Chrome\";v=\"94\", \";Not A Brand\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "sec-fetch-site": "none",
            "sec-fetch-mode": "navigate",
            "sec-fetch-user": "?1",
            "sec-fetch-dest": "document",
            "accept-language": "es-ES,es;q=0.9"
        }
        for page in range(1, 5):
            yield scrapy.Request(url=url.format(page=page), headers=headers, callback=self.parse_json)

    def parse_json(self, response):
        sel = Selector(text=response.text)
        item = sel.xpath('//div[@class="item-info-container"]').extract()
        items = inmobiliarias()
        for element in item:
            sel = Selector(text=element)
            items['title'] = sel.xpath('//a[@role="heading"]/@title').extract()
            items['price'] = sel.xpath('//div/span[@class="item-price h2-simulated"]/text()').extract()
            items['phone'] = sel.xpath('//span[@class="icon-phone item-not-clickable-phone"]/text()').extract()
            items['url'] = 'https://www.idealista.com' + sel.xpath('//a[@role="heading"]/@href')[0].extract()
            items['meters'] = sel.xpath('//span[small/text()="m²"]/text()').extract()
            items['rooms'] = sel.xpath('//span[small/text()="hab."]/text()').extract()
            items['real_state'] = sel.xpath('//picture[@class="logo-branding"]/a//@alt').extract()
            items['garage'] = sel.xpath('//div/span[@class="item-parking"]/text()').extract()
            last_item_detail = sel.xpath('//span[@class="item-detail"]/text()')[-1].extract()
            if last_item_detail != items['meters'] and last_item_detail != items['rooms']:
                items['floor'] = last_item_detail
            else:
                items['floor'] = ''
            yield items
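As a side note, the same feed export can be configured project-wide in settings.py instead of per spider. This is a minimal sketch, assuming Scrapy 2.4+ (needed for the overwrite feed option); the output path here is illustrative, not the asker's:

# settings.py — project-wide feed export (illustrative output path)
FEEDS = {
    'items.csv': {
        'format': 'csv',
        'overwrite': True,  # rewrite the file on every crawl instead of appending
    },
}

Alternatively, a one-off export can be requested on the command line with scrapy crawl idealista -O items.csv (capital -O overwrites the file, lowercase -o appends to it).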