即使我用 xpath 找到数据，Scrapy 也没有抓取整个网站

Question

我通过从 https://www.tapology.com 抓取创建了我的第一个 scrapy 项目。我有一份 link 世界上所有由他们的国家订购的战斗机的清单。抓取器打开 links，抓取 link 上的所有战士，然后移动到该国家/地区的下一页，如果完成则移动到下一个国家/地区。

当我抓取每个战士的信息时，我毫无问题地获得了所有数据，但是当我尝试抓取他们的业余比赛的所有行时 or/and 职业比赛我一无所获。我已经尝试了多种解决方案，但即使我可以使用 XPath 找到数据，当我抓取它时它也不起作用。

Link 示例：https://www.tapology.com/search/mma-fighters-by-nationality/country-no

来自 link 的战斗机示例：https://www.tapology.com/fightcenter/fighters/126571-eric-mambo

我可以从战斗机上刮下的部分： information

我无法从战斗机上刮下的部分： amateur_record

刚刚发现打开页面源码而不是检查元素时，我看不到他业余记录的数据

import scrapy
import os
import logging

# , FighterscraperDetailsItem, FighterscraperProRecordItem, FighterscraperAmateurRecordItem
class FighterSpider(scrapy.Spider):
    name = "fighter"
    allowed_domains = ["tapology.com"]
    count = 0
    
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
            "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
            "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
            "scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
        },
        FAKEUSERAGENT_PROVIDERS=[
            "scrapy_fake_useragent.providers.FakerProvider",
            "scrapy_fake_useragent.providers.FakeUserAgentProvider",
            "scrapy_fake_useragent.providers.FixedUserAgentProvider",
        ],
    )
    
 
    def start_requests(self):
        with open(os.path.abspath("linkListTest.txt"), "rt") as f:
            urls = [url.strip() for url in f.readlines()]
        
        for url in urls:
            
            yield scrapy.Request(url=url, callback=self.parse_country)


        
    
    def parse_country(self, response):
        print(response.request.headers.get("User-Agent"))
        # hrefs = response.xpath("(//td/a[contains(@href,'/fightcenter/fighters/')])/@href").getall()
        # fixedNames = [s.replace('\', '').replace('"', '') for s in names]
        
        for href in response.xpath("(//td/a[contains(@href,'/fightcenter/fighters/')])/@href"):
            
            recievedURL = response.urljoin(href.extract())
            yield scrapy.Request(recievedURL, callback = self.parse_fighter_page)
        
        nextPage = response.xpath("(//a[@rel='next'][contains(text(),'Next ›')])[2]/@href").get()
        if nextPage is not None:
            print("***** NEXTPAGE *****")
            newNextPage = response.urljoin(nextPage)
            # print(newNextPage)
            yield scrapy.Request(newNextPage, callback = self.parse_country)
            
        
    def parse_fighter_page(self, response):
       
        lista = []
        item = {}
        # details = ItemLoader(item = FighterscraperDetailsItem(), response = response)

        item['name'] = response.xpath("(//div[@id='stats'])[1]//li[position()=1]//strong[position()=1]/following-sibling::span/text()").get()
        item['fighter_url'] = f"{response.request.url}"
        # fighter.add_xpath('name', "(//div[@id='stats'])[1]//li[position()=1]//strong[position()=1]/following-sibling::span/text()")
        stats = "//div[@id='stats'][1]"
        
        item['details'] = {}
        item['details']['nickname'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Nickname')]/following-sibling::span[position()=1]/text()").get()
        item['details']['record'] = response.xpath(f"{stats}[1]//li[position()=2]//strong[position()=1]/following-sibling::span/text()").get()
        item['details']['age'] = response.xpath("//span[@class='dateToAge'][1]/text()").get()
        item['details']['date_of_birth'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Age')]/following-sibling::span[position()=2]/text()").get()
        item['details']['weight_class'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Weight Class')]/following-sibling::span[position()=1]/text()").get()
        item['details']['born'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Born')]/following-sibling::span[position()=1]/text()").get()
        item['details']['gym'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Affiliation')]/following-sibling::span/a/text()").get()
        
        item['details']['country_code'] = response.xpath("(//div[@class='fighterUpcomingHeader'][1]/h2[@id='flag']/a[contains(@href,'/search/mma-fighters-by-nationality/country-')])/@href").get()
        
        
        # details.add_value('fighter_url', f"{response.request.url}")
        
        # details.add_xpath('nickname',"[1]//li/strong[contains(text(), 'Nickname')]/following-sibling::span[position()=1]/text()")
        # details.add_xpath('record',"[1]//li[position()=2]//strong[position()=1]/following-sibling::span/text()")
        # details.add_xpath('age',"//span[@class='dateToAge'][1]/text()")
        # details.add_xpath('date_of_birth',"//div[@id='stats'][1]//li/strong[contains(text(), 'Age')]/following-sibling::span[position()=2]/text()")
        # details.add_xpath('weight_class',"//div[@id='stats'][1]//li/strong[contains(text(), 'Weight Class')]/following-sibling::span[position()=1]/text()")
        # details.add_xpath('born',"//div[@id='stats'][1]//li/strong[contains(text(), 'Born')]/following-sibling::span[position()=1]/text()")
        # details.add_xpath('country_code',"(//div[@class='fighterUpcomingHeader'][1]/h2[@id='flag']/a[contains(@href,'/search/mma-fighters-by-nationality/country-')])/@href")
        # details.add_xpath('gym',"//div[@id='stats'][1]//li/strong[contains(text(), 'Affiliation')]/following-sibling::span[position()=1]/text()")
        
        item['pro_record'] = {}
        item['amateur_record'] = {}
        
        
        
        for match in response.xpath("//div[@id='react-container'][1]/div[@id='fighterRecord']/section[2]/ul//li/div"): 
            opponent = match.xpath("/div[1]/div[1]/a/text()").extract_first()
            print(opponent)

            # Nothing is printed, this is some of the data that is supposed to be yielded in amateur_record. (Also tried with get() and so on.'
        
        
        yield item

Answer 1

您没有得到结果，因为它们是创建的 dynamically。

了解如何使用scrapy shell如果您不知道如何使用，它将对您有很大帮助。

如果我们在 'xhr' 选项卡下的浏览器中使用 devtools 检查页面，我们可以看到它从 https://api.tapology.com/v1/internal_fighters/1389126571 获取数据。所以我们需要用相同的号码重新创建请求。

scrapy shell "https://www.tapology.com/fightcenter/fighters/126571-eric-mambo"

# Let's first get the correct number so we can create the correct request to the API
In [1]: fid = response.xpath('//meta[@name="fid"]/@content').get()

In [2]: salt = response.xpath('//meta[@name="salt"]/@content').get()

In [3]: file_number = salt + fid

# Now we need to create the headers (I just copied them from devtools).
In [4]: headers = {
   ...: "Accept": "*/*",
   ...: "Accept-Encoding": "gzip, deflate, br",
   ...: "Accept-Language": "en-US,en;q=0.5",
   ...: "authorization": "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyIjoiaW50ZXJuYWxfYXBpIiwiZXhwIjoyNTM3NjU0N
   ...: DAwfQ.C1E9hhkQOH7XrfZ5c7aTYS4CKN3ACkJ1nvgvx2v10YY",
   ...: "Cache-Control": "no-cache",
   ...: "Connection": "keep-alive",
   ...: "content-type": "application/vnd.api+json",
   ...: "DNT": "1",
   ...: "Host": "api.tapology.com",
   ...: "Origin": "https://www.tapology.com",
   ...: "Pragma": "no-cache",
   ...: "Referer": "https://www.tapology.com/",
   ...: "Sec-Fetch-Dest": "empty",
   ...: "Sec-Fetch-Mode": "cors",
   ...: "Sec-Fetch-Site": "same-site",
   ...: "Sec-GPC": "1",
   ...: "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
   ...: }

# create the request and fetch it
In [5]: req = scrapy.Request(url=f"https://api.tapology.com/v1/internal_fighters/{file_number}", headers=headers)

In [6]: fetch(req)
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://api.tapology.com/v1/internal_fighters/138931242> (referer: https://www.tapology.com/)

# we got a json file, let's parse it (you can check it out by printing it or type view(response) to open it)

In [7]: json_data = response.json()
In [8]: for fighter in json_data['included']:
   ...:     print(fighter['id'])
   ...:     print(fighter['attributes']['opponent_fighter_name'])
   ...:
1170431
Greg Fischer
931909
Mohammad Alavi
879994
Kenta Takizawa
846160
Rodney Mondala
764333
Scott MacGregor
674056
Hiroyuki Oshiro
650257
Luke Catubig
600477
Josh Branham
534279
Won Jun Jang
458178
Rilley Dutro
427351
Jerome Cruz
356615
Kwan Ho Kwak
306251
Mark Abelardo
356662
Vince Masga
256096
Trevin Jones
175233
Ian Dela Cuesta
129057
Jung Hoon Ko
129499
Vince Pua
656653
John Paul Mendiola
129503
Carlos Tiongson

编辑：

要获得正确的结果，您需要在请求战士页面时添加 headers，例如：

In [1]: req = scrapy.Request(url = 'https://www.tapology.com/fightcenter/fighters/126571-eric-mambo')

In [2]: fetch(req)
[scrapy.core.engine] INFO: Spider opened
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.tapology.com/fightcenter/fighters/126571-eric-mambo> (referer: None)

In [3]: view(response)
Out[3]: True

我们得到页面：即使我们想要另一页。

与headers:

In [4]: headers = {
   ...: "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
   ...: "Accept-Encoding": "gzip, deflate, br",
   ...: "Accept-Language": "en-US,en;q=0.5",
   ...: "Cache-Control": "no-cache",
   ...: "Connection": "keep-alive",
   ...: "DNT": "1",
   ...: "Host": "www.tapology.com",
   ...: "Pragma": "no-cache",
   ...: "Referer": "https://www.tapology.com/search?term=eric+mambo",
   ...: "Sec-Fetch-Dest": "document",
   ...: "Sec-Fetch-Mode": "navigate",
   ...: "Sec-Fetch-Site": "same-origin",
   ...: "Sec-Fetch-User": "?1",
   ...: "Sec-GPC": "1",
   ...: "Upgrade-Insecure-Requests": "1",
   ...: "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.372
   ...: 9.169 Safari/537.36"
   ...: }

In [5]: req = scrapy.Request(url = 'https://www.tapology.com/fightcenter/fighters/126571-eric-mambo', headers=headers)

In [6]: fetch(req)
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.tapology.com/fightcenter/fighters/126571-eric-mambo> (referer: https://www.tapology.com/search?term=eric+mambo)

In [7]: view(response)
Out[7]: True

我们得到了我们想要的页面：

复制您的 headers，并将它们添加到页面请求中，例如 yield scrapy.Request(recievedURL, callback=self.parse_fighter_page, headers=headers)，对于 api 使用我使用的 headers。

编辑 2：

import scrapy
import os
import logging

# , FighterscraperDetailsItem, FighterscraperProRecordItem, FighterscraperAmateurRecordItem
class FighterSpider(scrapy.Spider):
    name = "fighter"
    allowed_domains = ["tapology.com"]
    count = 0

    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
            "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
            "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
            "scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
        },
        FAKEUSERAGENT_PROVIDERS=[
            "scrapy_fake_useragent.providers.FakerProvider",
            "scrapy_fake_useragent.providers.FakeUserAgentProvider",
            "scrapy_fake_useragent.providers.FixedUserAgentProvider",
        ],
    )

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.tapology.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Sec-GPC": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }

    api_headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "authorization": "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyIjoiaW50ZXJuYWxfYXBpIiwiZXhwIjoyNTM3NjU0NDAwfQ.C1E9hhkQOH7XrfZ5c7aTYS4CKN3ACkJ1nvgvx2v10YY",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "content-type": "application/vnd.api+json",
        "DNT": "1",
        "Host": "api.tapology.com",
        "Origin": "https://www.tapology.com",
        "Pragma": "no-cache",
        "Referer": "https://www.tapology.com/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-site",
        "Sec-GPC": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }

    def start_requests(self):
        with open(os.path.abspath("linkListTest.txt"), "rt") as f:
            urls = [url.strip() for url in f.readlines()]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_country)

    def parse_country(self, response):
        print(response.request.headers.get("User-Agent"))
        # hrefs = response.xpath("(//td/a[contains(@href,'/fightcenter/fighters/')])/@href").getall()
        # fixedNames = [s.replace('\', '').replace('"', '') for s in names]

        for href in response.xpath("(//td/a[contains(@href,'/fightcenter/fighters/')])/@href"):
            recievedURL = response.urljoin(href.extract())
            yield scrapy.Request(recievedURL, callback=self.parse_fighter_page, headers=self.headers)

        nextPage = response.xpath("(//a[@rel='next'][contains(text(),'Next ›')])[2]/@href").get()
        if nextPage is not None:
            print("***** NEXTPAGE *****")
            newNextPage = response.urljoin(nextPage)
            # print(newNextPage)
            yield scrapy.Request(newNextPage, callback=self.parse_country, headers=self.headers)

    def parse_fighter_page(self, response):
        lista = []
        item = {}
        # details = ItemLoader(item = FighterscraperDetailsItem(), response = response)

        item['name'] = response.xpath("(//div[@id='stats'])[1]//li[position()=1]//strong[position()=1]/following-sibling::span/text()").get()
        item['fighter_url'] = f"{response.request.url}"
        # fighter.add_xpath('name', "(//div[@id='stats'])[1]//li[position()=1]//strong[position()=1]/following-sibling::span/text()")
        stats = "//div[@id='stats'][1]"

        item['details'] = {}
        item['details']['nickname'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Nickname')]/following-sibling::span[position()=1]/text()").get()
        item['details']['record'] = response.xpath(f"{stats}[1]//li[position()=2]//strong[position()=1]/following-sibling::span/text()").get()
        item['details']['age'] = response.xpath("//span[@class='dateToAge'][1]/text()").get()
        item['details']['date_of_birth'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Age')]/following-sibling::span[position()=2]/text()").get()
        item['details']['weight_class'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Weight Class')]/following-sibling::span[position()=1]/text()").get()
        item['details']['born'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Born')]/following-sibling::span[position()=1]/text()").get()
        item['details']['gym'] = response.xpath(f"{stats}//li/strong[contains(text(), 'Affiliation')]/following-sibling::span/a/text()").get()

        item['details']['country_code'] = response.xpath("(//div[@class='fighterUpcomingHeader'][1]/h2[@id='flag']/a[contains(@href,'/search/mma-fighters-by-nationality/country-')])/@href").get()


        # details.add_value('fighter_url', f"{response.request.url}")

        # details.add_xpath('nickname',"[1]//li/strong[contains(text(), 'Nickname')]/following-sibling::span[position()=1]/text()")
        # details.add_xpath('record',"[1]//li[position()=2]//strong[position()=1]/following-sibling::span/text()")
        # details.add_xpath('age',"//span[@class='dateToAge'][1]/text()")
        # details.add_xpath('date_of_birth',"//div[@id='stats'][1]//li/strong[contains(text(), 'Age')]/following-sibling::span[position()=2]/text()")
        # details.add_xpath('weight_class',"//div[@id='stats'][1]//li/strong[contains(text(), 'Weight Class')]/following-sibling::span[position()=1]/text()")
        # details.add_xpath('born',"//div[@id='stats'][1]//li/strong[contains(text(), 'Born')]/following-sibling::span[position()=1]/text()")
        # details.add_xpath('country_code',"(//div[@class='fighterUpcomingHeader'][1]/h2[@id='flag']/a[contains(@href,'/search/mma-fighters-by-nationality/country-')])/@href")
        # details.add_xpath('gym',"//div[@id='stats'][1]//li/strong[contains(text(), 'Affiliation')]/following-sibling::span[position()=1]/text()")

        item['pro_record'] = {}
        item['amateur_record'] = {}

        fid = response.xpath('//meta[@name="fid"]/@content').get()
        salt = response.xpath('//meta[@name="salt"]/@content').get()
        file_number = salt + fid

        yield scrapy.Request(url=f"https://api.tapology.com/v1/internal_fighters/{file_number}",
                             headers=self.api_headers,
                             callback=self.parse_api, meta={'item': item})

    def parse_api(self, response):
        json_data = response.json()
        api_data = []
        for fighter in json_data['included']:
            opponent_id = fighter['id']
            opponent_name = fighter['attributes']['opponent_fighter_name']
            api_data.append({'id': opponent_id, 'opponent_name': opponent_name})

        item = response.meta['item']
        item['opponent'] = api_data
        yield item

Answer 2

您也可以使用此代码提取战斗机信息，请检查：

import scrapy
from scrapy.utils.response import open_in_browser
import urllib.parse
class FighterinfoSpider(scrapy.Spider):
    name = 'fighter'
    allowed_domains = ['tapology.com']
    start_urls = ['https://www.tapology.com/search']

    def parse(self, response):
        # extract countrywise figher list
        fighters_by_nationality = response.xpath('//div[@class="siteSearchFightersByNationality"]/dd/a/@href').getall()
        for link in fighters_by_nationality:
            yield response.follow(link, callback = self.parse_fighter_list_by_country)
    
    

    def parse_fighter_list_by_country(self,response):
        # extract fighter list
        all_fighter_links = response.xpath('//table[@class="siteSearchResults"]/tr/td/a/@href').getall()

        for individual_fighter in all_fighter_links:
            url = urllib.parse.urljoin('https://www.tapology.com',individual_fighter)
            yield scrapy.Request(url,callback=self.parse_fighter_page)
        next_page = response.xpath('//span[@class="next"]/a/@href').get()
        if next_page:
            yield response.follow(next_page, callback = self.parse_fighter_list_by_country)

    def parse_fighter_page(self, response):
        # extract fighter information

即使我用 xpath 找到数据，Scrapy 也没有抓取整个网站

Scrapy not scraping the whole website even though I find the data with xpath

python

scrapy

web-scraping