Scrapy: Debug Redirecting (301)

Before this, I was getting the error "HTTP status code is not handled or not allowed", so I modified the USER_AGENT that was set to the default, and now I get this error:

import scrapy
 
 
class OlxSpider(scrapy.Spider):
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )
 
    def parse(self, response):
        items = response.xpath(
            '//div[contains(@class,"section_OLXad-list")]//li[contains'
            '(@class,"item")]'
        )
        for item in items:
            url = item.xpath(
                ".//a[contains(@class,'OLXad-list-link')]/@href"
            ).extract_first()
            yield scrapy.Request(url=url, callback=self.parse_detail)
 
        next_page = response.xpath(
            '//li[contains(@class,"item next")]//a/@href'
        ).extract_first()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)
 
    def parse_detail(self, response):
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath(
            '//div[contains(@class,"photos")]//a/@href'
        ).extract()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath(
            'normalize-space(//h1[contains(@id,"ad_title")]//.)'
        ).extract_first()
        item['price'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-price")]'
            '//span[contains(@class,"actual-price")]//.)'
        ).extract_first()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
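
For reference, a USER_AGENT override like the one mentioned above would typically go in the project's settings.py, roughly like this (the UA string below is only an illustrative placeholder, not the exact value used):

# settings.py -- the UA string is only an illustrative placeholder
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36'
)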


Trying to execute the .py file in the terminal, I get the following messages:

2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>

Does anyone know what could be causing this?

P.S.: I have already tried these solutions.

It is just redirecting from http to https, so there is no problem there.
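
If you want to avoid that initial 301 altogether, you can point start_urls at the https URL directly, e.g. (a minimal sketch):

start_urls = (
    'https://pe.olx.com.br/imoveis/aluguel',
)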

Your xpaths are completely wrong. I fixed them in parse, and I fixed 3 of the xpaths in parse_detail as an example, but you need to fix the rest.

import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        items = response.xpath('//ul[@id="ad-list"]/li')

        for item in items:
            url = item.xpath('.//a/@href').get()
            if url:
                yield scrapy.Request(url=url, callback=self.parse_detail)

        next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath('//img[@class="image "]/@src').get()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath('//h1/text()').get()
        item['price'] = response.xpath('//h2/text()').get()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
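
To run the spider from a plain .py file and inspect the scraped items, here is a minimal sketch using CrawlerProcess (assuming Scrapy >= 2.1 for the FEEDS setting; the UA string and the items.json file name are just illustrations):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # illustrative browser-like user agent
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    # export the scraped items to a JSON file (Scrapy >= 2.1)
    'FEEDS': {'items.json': {'format': 'json'}},
})
process.crawl(OlxSpider)
process.start()

Inside a Scrapy project you can get the same export with scrapy crawl olx -o items.json.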