Python Scrapy Spider 没有跟随正确的链接

Question

我正在尝试从这个帖子中抓取数据，但在抓取评论时遇到了问题。评论的分页由 URL 末尾的“page=1”参数决定。我注意到如果使用“page=0”，所有评论都会加载到同一页上，这非常方便。然而，无论怎样，我的 Scrapy 脚本都只会抓取第一页的评论；即使我把链接改成“page=2”，它抓到的仍然只是第一页的评论。我不明白为什么会出现这个问题。

import scrapy

from scrapy.crawler import CrawlerProcess


class IdeaSpider(scrapy.Spider):
    """Scrape a single games2gether idea page: post, status and comments.

    Every extracted value is appended, in page order, to the module-level
    ``temp_list``.

    NOTE(review): the comment list may be rendered client-side by
    JavaScript; if so, the ``page`` query parameter has no effect on the
    HTML Scrapy receives and only the first page of comments is visible
    here -- confirm, and render the page (e.g. with Splash) if needed.
    """

    name = "IdeaSpider"

    # XPath for the text nodes of one comment body. ``{position}`` is the
    # 1-based index of the ``g2g-comments-item`` element. A format template
    # is used instead of ``str.replace("1", ...)`` so that only the index is
    # ever substituted.
    _COMMENT_BODY_XPATH = (
        '//div[@class = "post-list-comments"]/g2g-comments-item[{position}]/article[@class = '
        '"post-list-item clearfix two-columns ng-star-inserted"]/div/div//div[@class '
        '="post-list-item-message-content post-content ng-star-inserted"]//text() '
    )

    def start_requests(self):
        """Yield the single request for the idea page, handled by ``parse_idea``."""
        yield scrapy.Request(
            url="https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/1850-force-infinite-actions-to"
                "-the-bottom-of-the-queue?page=0", callback=self.parse_idea)

    @staticmethod
    def _join_texts(texts, separator=" "):
        """Return *texts* joined by *separator*, or ``None`` when empty."""
        return separator.join(texts) if texts else None

    def _comment_body_texts(self, response, position):
        """Return the text nodes of the comment item at 1-based *position*."""
        xpath = self._COMMENT_BODY_XPATH.format(position=position)
        return response.xpath(xpath).extract()

    def parse_idea(self, response):
        """Parse author, categories, date, title, body, status and comments.

        Values are appended to ``temp_list`` in that order, followed by the
        developer name and comment, then an author/body pair per comment.
        A missing element contributes ``None`` (or ``"no status"`` for the
        status) instead of raising ``IndexError``.
        """
        post_author = response.xpath('//span[@class = "username-content"]/text()')
        temp_list.append(post_author.extract_first())

        # Join without the trailing ", " the hand-rolled loop used to leave.
        post_categories = response.xpath('//a[@class = "list-tags-item ng-star-inserted"]/text()')
        temp_list.append(self._join_texts(post_categories.extract(), ", "))

        post_date = response.xpath('//div[@class = "time-date"]/text()')
        temp_list.append(post_date.extract_first())

        post_title = response.xpath('//h1[@class = "title"]/text()')
        temp_list.append(post_title.extract_first())

        post_body = response.xpath('//article[@class = "post-list-item clearfix ng-star-inserted"]//div[@class = '
                                   '"post-list-item-message-content post-content ng-star-inserted"]//text()')
        temp_list.append(self._join_texts(post_body.extract()))

        post_status = response.xpath('//p[@class = "status-title"][1]/text()')
        temp_list.append(post_status.extract_first(default="no status"))

        dev_name = response.xpath('//div[@class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]//p[@class '
                                  '= "username user-role-username"]/text()')
        temp_list.append(dev_name.extract_first())
        dev_comment = response.xpath('//div[@class = "message post-content ng-star-inserted"]/p/text()')
        temp_list.append(dev_comment.extract_first())

        comment_authors = response.xpath('//article[@class = "post-list-item clearfix two-columns '
                                         'ng-star-inserted"]//span[@class = "username-content"]/text()').extract()
        position = 1
        for comment_author in comment_authors:
            temp_list.append(comment_author)

            body_texts = self._comment_body_texts(response, position)
            if not body_texts:
                # An item without body text is skipped once and the next
                # item is tried (presumably such items are not real
                # comments -- confirm against the page markup).
                position += 1
                body_texts = self._comment_body_texts(response, position)
            if body_texts:
                temp_list.append(self._join_texts(body_texts))
            position += 1


# Flat accumulator that IdeaSpider.parse_idea appends every scraped value to.
temp_list = []
# Second list declared next to temp_list; unused in the code shown here.
all_post_data = []

# Run the spider in-process, then dump everything that was collected.
process = CrawlerProcess()
process.crawl(IdeaSpider)
process.start()

print(temp_list)

Answer 1

这是因为评论区是通过 JavaScript 动态加载的，而 Scrapy 本身不会执行 JavaScript，所以无论 page 参数是多少，脚本拿到的都只是初始 HTML 中的第一页评论。你可以使用 Splash（例如配合 scrapy-splash）先渲染页面，再进行抓取。

Python Scrapy Spider 没有跟随正确的链接

Python Scrapy Spider Not Following Correct Link

python

scrapy