Python Scrapy spider writes only the first result of each page instead of all

I am working on a web scraping job that writes all scraped data to a JSON file. If I export from the terminal with scrapy crawl ufcspider -o quotes.json, I get all the expected results; but when I just run scrapy crawl ufcspider and let the script write the files itself, every per-page file is created as expected, yet each one contains only the first result of that page instead of all ten.

I also noticed that all the results are printed to the terminal as expected.

The script:

import scrapy
import json

class ExampleSpider(scrapy.Spider):
    name = "ufcspider"

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):

            result = {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }

            yield result

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            # next_page = response.urljoin(next_page)
            yield response.follow(next_page, callback=self.parse)

        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.json'
        with open(filename, 'w') as f:
            f.write(json.dumps(result))
            f.close()
        self.log(f'Saved file {filename}')

This happens because you overwrite result on every iteration of the for loop, so by the time the file is written only a single quote is left in it.
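A minimal sketch of what goes wrong (the list below is made-up illustration data): result is rebound on every pass through the loop, so after the loop only the last assignment survives, and that single dict is all json.dumps(result) can serialize.

quotes = ["quote-1", "quote-2", "quote-3"]  # hypothetical stand-in data

for q in quotes:
    result = {"text": q}  # rebinds `result` on each iteration

print(result)  # {'text': 'quote-3'} -- only one quote survives the loop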

Replacing the spider with this should work:

import scrapy
import json

class ExampleSpider(scrapy.Spider):
    name = "ufcspider"

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        results = []  # collect every quote on this page, not just the last one
        for quote in response.css('div.quote'):
            result = {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
            results.append(result)
            yield result

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            # next_page = response.urljoin(next_page)
            yield response.follow(next_page, callback=self.parse)

        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.json'
        with open(filename, 'w') as f:
            # the with-block closes the file automatically; no close() needed
            json.dump(results, f)
        self.log(f'Saved file {filename}')
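
As a side note, instead of writing files by hand inside parse, you can let Scrapy's feed exports do it; that is the same machinery the -o flag uses. A minimal sketch, assuming Scrapy 2.1 or later where the FEEDS setting is available (the file name and field choices here are illustrative):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "ufcspider"

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    # Feed exports gather every item the spider yields and write them
    # out to the configured file -- no manual file handling in parse.
    custom_settings = {
        "FEEDS": {
            "quotes.json": {"format": "json"},
        },
    }

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)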