Problem when scraping with Scrapy and a MongoDB pipeline

I am trying to scrape this website: https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/. It is a real estate site, but for some reason, when the spider moves on to the next page it only gets the same data again, and I really don't know what is going on. Can someone help me?

__init__.py

import scrapy

from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)

        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2   =>    Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):

            item = RealstatedataItem()

            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first().strip().lower()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first().strip().lower()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first().strip()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first().strip()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first().strip()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first().strip()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first().strip().replace('R$', '')
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first().strip().lower()

            yield item

settings.py

BOT_NAME = 'realstatedata'

SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'

ITEM_PIPELINES = {'realstatedata.pipelines.MongoPipeline':100, }

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'realstatedata.middlewares.RealstatedataSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'realstatedata.middlewares.RealstatedataDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'realstatedata.pipelines.RealstatedataPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = [301,302]
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

pipelines.py

import logging
import pymongo

class MongoPipeline(object):

    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        #self.db[self.collection_name].insert(dict(item))
        #self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        self.db[self.collection_name].update({'address': item['address']}, dict(item), upsert=True)
        logging.debug("Properties added to MongoDB")
        return item

items.py

import scrapy

class RealstatedataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    prop_area = scrapy.Field()
    prop_rooms = scrapy.Field()
    prop_bath = scrapy.Field()
    prop_parking = scrapy.Field()
    price_rent = scrapy.Field()
    price_cond = scrapy.Field()
    realstate_name = scrapy.Field()

    pass

The pagination relies on JavaScript. Scrapy behaves like other HTTP clients such as requests and httpx: it does not execute JavaScript. You need to have the requests intercepted and rendered by a browser, such as headless Chrome or Splash. Considering compatibility, the best solution is to use a headless Chrome browser and control it through scrapy-playwright (a configuration sketch follows below, and a spider sketch after the list of alternatives).
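
A minimal configuration sketch, assuming scrapy-playwright and the Playwright Chromium browser are installed (pip install scrapy-playwright, then playwright install chromium). The setting names below are the ones documented by scrapy-playwright; applying them to this particular project is an assumption:

# settings.py -- additions for scrapy-playwright (sketch)
DOWNLOAD_HANDLERS = {
    'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
}
# scrapy-playwright requires the asyncio Twisted reactor
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
# use Chromium (headless Chrome) for the best compatibility
PLAYWRIGHT_BROWSER_TYPE = 'chromium'

With this in place, any request that carries meta={'playwright': True} is downloaded by the headless browser instead of Scrapy's default downloader, so the JavaScript that builds the pagination runs before the response reaches parse().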

Other options, which you should avoid:

  • scrapy-splash. Splash is maintained by the Scrapy organization, but this lightweight browser uses the WebKit engine and behaves differently from popular browsers such as Firefox and Chrome; many sites do not render correctly in Splash.
  • scrapy-selenium or scrapy-headless
    1. These plugins use Selenium, which is synchronous.
    2. These plugins create a custom Request and get its pickling wrong, so the custom Request breaks after being popped from Scrapy's internal queue.
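
Tying the recommendation back to the spider in the question, the sketch below shows what the request side could look like. It assumes the settings above, that scrapy_playwright.page.PageMethod is available in the installed scrapy-playwright version, and that the site's CSS classes (taken from the XPaths in the original spider) have not changed; the spider name is hypothetical and the item-extraction logic from the original scrape() is meant to be reused as-is.

# spider sketch (not the original code): render each page with Playwright before parsing
import scrapy
from scrapy_playwright.page import PageMethod


class RsdataPlaywrightSpider(scrapy.Spider):
    name = 'realstatedata_playwright'  # hypothetical name for this sketch
    allowed_domains = ['vivareal.com.br']

    # wait until the JavaScript-rendered property cards exist in the DOM
    playwright_meta = {
        'playwright': True,
        'playwright_page_methods': [
            PageMethod('wait_for_selector', 'article.property-card__container'),
        ],
    }

    def start_requests(self):
        url = 'https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/'
        yield scrapy.Request(url, meta=dict(self.playwright_meta))

    def parse(self, response):
        # reuse the item extraction from the original scrape() here;
        # the response now contains the browser-rendered HTML
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href').extract_first()
        if nextpageurl:
            # same '#pagina=N' -> '?pagina=N' substitution as the original spider
            nextpage = response.urljoin('?' + nextpageurl[1:])
            yield scrapy.Request(nextpage, meta=dict(self.playwright_meta))

The important part is the meta dictionary: without 'playwright': True the request falls back to Scrapy's plain HTTP downloader and the page comes back without the JavaScript-generated listings, which matches the behaviour described in the question.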