Scrapy-无法导航到下一页

Question

我正在运行一个 Scrapy 项目，想从社交论坛中提取用户帖子。目前我无法让我的蜘蛛导航到论坛的下一页。我认为问题在于我没有写对 xpath 表达式。这是我的蜘蛛程序：

import scrapy
from datetime import datetime


class PeripartumSpider(scrapy.Spider):
    """Crawl a BabyCenter community group and emit one item per post and reply."""

    name = 'babyctr'

    start_urls = ['https://community.babycenter.com/groups/a15325/postpartum_depression_anxiety_and_related_topics']

    def parse(self, response):
        """Queue every thread linked from a listing page, then the next listing page.

        The XPath expressions are site-specific and may need to be edited for
        other websites.
        """
        thread_hrefs = response.xpath('//*[@class="titleText"]/a/@href').extract()
        for href in thread_hrefs:
            thread_url = response.urljoin(href)
            yield scrapy.Request(thread_url, callback=self.parse_thread)

        # Follow the "next page" link only when the listing markup exposes one.
        next_href = response.xpath('//*[@class= "__next"]/@href').extract_first()
        if next_href is None:
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_thread(self, response):
        """Yield a dict for the opening post, then one dict per reply.

        Every yielded dict has the keys ``title``, ``date``, ``author_name``,
        ``post`` and ``comments``.
        """
        body_paragraphs = response.xpath("//*[@class='content']/p/text()").extract()
        title = response.xpath("//*[@id='post_title']/text()").extract()
        first_author = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract_first()
        first_date = response.xpath("//*[@class='finePrint']/text()").extract_first()
        comment_heading = response.xpath("//*[@id='post_comments']/h2/text()").extract()

        opening_post = {
            "title": title,
            "date": first_date,
            "author_name": first_author,
            "post": body_paragraphs,
            "comments": comment_heading,
        }
        yield opening_post

        # Replies: the first nickname and the first date are dropped
        # (presumably they belong to the opening post -- confirm against the page).
        nicknames = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract()
        reply_texts = response.xpath(".//*[@class='post_content bodyText clear']/p//text()").extract()
        fine_print = response.xpath("//*[@class='finePrint']/text()").extract()

        # zip() stops at the shortest of the three lists.
        for reply_author, reply_text, reply_date in zip(nicknames[1:], reply_texts, fine_print[1:]):
            yield {
                "title": title,
                "date": reply_date,
                "author_name": reply_author,
                "post": reply_text,
                "comments": 'Comments (0)',
            }

谁能帮我看看为什么我无法进入下一页？我尝试了很多替代方法，例如：response.xpath('//*[@label= "pagination"]/@href') 但我得到了错误的节点或错误的属性名称。我要抓取的网站包含在蜘蛛程序中。

提前致谢！

Answer 1

问题出在 JavaScript 上，而不是你的 xpath。下面的代码不再依赖“下一页”链接，而是直接递增 URL 中的 ?page= 参数来翻页：

import scrapy
from datetime import datetime


class PeripartumSpider(scrapy.Spider):
    """Crawl a BabyCenter community group, paginating with ``?page=N`` URLs.

    Listing pages are requested by building the ``?page=N`` query string
    directly instead of following a "next" link from the markup.
    """

    name = 'babyctr'

    start_urls = ['https://community.babycenter.com/groups/a15325/postpartum_depression_anxiety_and_related_topics']
    # Number assigned to the start URL's listing page; later pages count up from it.
    page = 1

    def parse(self, response):
        """Queue every thread on a listing page, then the following listing page.

        The page number travels with each request in ``meta`` rather than being
        kept as a counter on the spider, so the next URL depends only on the
        response being handled and not on the order in which callbacks run.

        Note: XPaths may need to be edited according to each website.
        """
        post_links = response.xpath('//*[@class="titleText"]/a/@href').extract()
        for post_link in post_links:
            link = response.urljoin(post_link)
            yield scrapy.Request(link, callback=self.parse_thread)

        # A listing page with no post links means we are done.
        if not post_links:
            return

        # The start request carries no 'page' entry, so fall back to self.page.
        next_page_number = response.meta.get('page', self.page) + 1
        next_page = response.urljoin(f'?page={next_page_number}')
        yield scrapy.Request(
            next_page,
            callback=self.parse,
            meta={'page': next_page_number},
        )

    def parse_thread(self, response):
        """Yield a dict for the opening post, then one dict per reply.

        Every yielded dict has the keys ``title``, ``date``, ``author_name``,
        ``post`` and ``comments``.
        """
        # NOTE(review): these XPaths have not been verified against the live
        # page; adjust them if the extracted fields come back empty or misaligned.
        original_post = response.xpath("//*[@class='content']/p/text()").extract()
        title = response.xpath("//*[@id='post_title']/text()").extract()
        author_name = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract_first()
        timestamp = response.xpath("//*[@class='finePrint']/text()").extract_first()
        number_comments = response.xpath("//*[@id='post_comments']/h2/text()").extract()
        yield {
            "title": title,
            "date": timestamp,
            "author_name": author_name,
            "post": original_post,
            "comments": number_comments,
        }

        # Getting the comments and their information for each post.
        # The first nickname and the first date are dropped (presumably they
        # belong to the opening post -- confirm against the page).
        author_list = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract()
        response_author_list = author_list[1:]
        comment_response_list = response.xpath(".//*[@class='post_content bodyText clear']/p//text()").extract()
        post_dates = response.xpath("//*[@class='finePrint']/text()").extract()
        reply_dates = post_dates[1:]

        # zip() stops at the shortest list, so unmatched entries are dropped silently.
        for author, comment, date in zip(response_author_list, comment_response_list, reply_dates):
            yield {
                "title": title,
                "date": date,
                "author_name": author,
                "post": comment,
                "comments": 'Comments (0)',
            }

这修复了分页，但请注意 parse_thread 里面的 xpath 是错误的。

Scrapy-无法导航到下一页

Scrapy- not able to navigate to next page

python

xpath

scrapy