MongoDB 的 Scrapy 管道不工作

Scrapy Pipeline with MongoDB is not working

我正在尝试将我正在抓取的所有数据放入 MongoDB 中以监控房产价格。 我已经做了很多测试,但没有用。 如果有人可以帮助我,我会把代码放在这里。请。

init.py

import scrapy

from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)

        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2   =>    Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):

            item = RealstatedataItem()

            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()

            yield item

settings.py

BOT_NAME = 'realstatedata'

SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'

ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"

items.py

导入 scrapy

class RealstatedataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    prop_area = scrapy.Field()
    prop_rooms = scrapy.Field()
    prop_bath = scrapy.Field()
    prop_parking = scrapy.Field()
    price_rent = scrapy.Field()
    price_cond = scrapy.Field()
    realstate_name = scrapy.Field()

    pass

pipeline.py

在这部分代码中,我尝试了两种不同的形式,但 none 有效。

import pymongo
import logging

class MongoPipeline(object):

    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    
    def process_item(self, item, spider):
        ## how to handle each post
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Properties added to MongoDB")
        return item

很明显你的管道启用设置是错误的。 ITEM_PIPELINES 应该定义为字典而不是列表。在您的代码中,管道根本没有加载。

ITEM_PIPELINES = {
    "realstatedata.pipelines.MongoPipeline": 100,
}

dict中的值表示启用超过1个管道时的优先级。