Scrapy 无法抓取数据

Question

我正在尝试从房地产网站 https://www.spitogatos.gr/ 抓取数据。我在该站的 robots.txt 中看到了 "The Ultimate robots.txt Bot and User-Agent Blocker"。我只想每天抓取一次这个网站，有没有办法使用 Scrapy 来抓取？提前谢谢你。

import scrapy


class MainprojectSpider(scrapy.Spider):
    """Spider that requests the spitogatos.gr home page with a browser User-Agent."""

    name = 'mainProject'
    allowed_domains = ['www.spitogatos.gr']
    # Adjacent string literals are concatenated, so the value stays a single
    # line without an embedded newline.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
    # start_urls = ['https://www.spitogatos.gr/']

    def start_requests(self):
        """Yield the single initial request for the site's home page."""
        yield scrapy.Request(
            url='https://www.spitogatos.gr',
            callback=self.parse,
            # The header name is hyphenated; 'User Agent' (with a space) is a
            # different, meaningless header.
            headers={'User-Agent': self.user_agent},
        )

    def parse(self, response):
        """Print the text of the matching <h2> elements (placeholder extraction)."""
        print(response.xpath('//h2[@class="text thin h1"]/text()').extract())  # just dummy

    def set_user_agent(self, request):
        """Set the User-Agent header on ``request`` and return the same request.

        Nothing in this snippet calls this method.
        """
        request.headers['User-Agent'] = self.user_agent
        return request

Answer 1

仅在 header 中传递 User-Agent 不足以获得成功响应。您必须向此网站发送完整的请求 header 和 cookie。

代码

import scrapy

class MainprojectSpider(scrapy.Spider):
    """Spider that requests the spitogatos.gr home page with explicit headers and cookies."""

    name = 'mainProject'
    allowed_domains = ['spitogatos.gr']

    # Request headers sent with the initial request.
    # NOTE(review): these look like they were copied from a Chrome 91 browser
    # session -- confirm they are still accepted by the site.
    headers = {
        "authority": "www.spitogatos.gr",
        "pragma": "no-cache",
        "cache-control": "no-cache",
        "sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        "sec-ch-ua-mobile": "?0",
        "upgrade-insecure-requests": "1",
        "user-agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.114 Safari/537.36"
        ),
        "accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.9"
        ),
        "sec-fetch-site": "none",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en-US,en;q=0.9",
    }

    # Cookies sent with the initial request.
    # NOTE(review): presumably captured from a live browser session, so the
    # values may expire and need refreshing -- verify before relying on them.
    cookies = {
        "PHPSESSID": "po638herud8fh5bcd5faj6e2fn",
        "spitogatosHomepageMap": "0",
        "currentCurrency": "EUR",
        "_ga": "GA1.2.995321575.1625941672",
        "_gid": "GA1.2.2083505143.1625941672",
        "_hjTLDTest": "1",
        "_hjid": "81d5a4f5-1d68-4f26-86c9-be4f6f3c4072",
        "_fbp": "fb.1.1625941672273.1445941947",
        "__qca": "P0-96032413-1625941956393",
        "_hjAbsoluteSessionInProgress": "1",
        "openedTabs": "1",
        "_gat_UA-3455846-10": "1",
        "_gat_UA-3455846-2": "1",
        "_hjIncludedInSessionSample": "1",
        "reese84": "3:FGNSnDE4wRMItXmgo8P+Aw==:sGtH84yEJkKj63PngFcdU3iQbhkp11cYkDw3X06dMlyaUb7wTkc2Wah9Qovgk4eW/Gg34paBwJIFH5ywVR4iJmb+542uPLVNXHnd4LXLKtVOTdeLrew41lAeyvKyjAcHlsIW+El8j8715RwI9TirIOa50wILShhQbubz89vw4m4rSnNrNbI73GMNWQZBIaSG3Lct5PuBfdJjdl3rT4Fp1kR7dV/yggGv1e6T33RgojXdT23MRb9uG7TojFqiIlI75yqZ8XdxqsSDClwWq8b/vFbUagC19NlptsyY1OsG2v7jguFXIdcHeLnKTAx8UJ+cHz6heewN6SAGrdqw8b2GMVuvpbBhatIdmiP7d0+J5RUU+g5DO9eFGGiWV3ToR47VwCI48X4jxnhkslnXSnCesudsSh3mVIFWGRBIYL723SI=:K71/ib2SBQ6VfoGMhHDTkyPcg91uLZr+BqNFsVE0WFs=",
    }

    def start_requests(self):
        """Yield one GET request for the home page carrying the class headers and cookies."""
        home_request = scrapy.Request(
            url='https://www.spitogatos.gr/',
            method='GET',
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse,
        )
        yield home_request

    def parse(self, response):
        """Print the text of the matching <h2> elements (placeholder extraction)."""
        heading_nodes = response.xpath('//h2[@class="text thin h1"]/text()')
        print(heading_nodes.extract())  # your dummy xpath

Scrapy 无法抓取数据

Scrapy cannot scrape data

python

scrapy