During recursive scraping in Scrapy, how do I extract info from multiple nodes of a parent URL together with its associated child URLs?

The parent URL has multiple nodes (quotes), and each of those nodes has a child URL (author info). Because of Scrapy's asynchronous nature, I am having trouble linking each quote to its author info.

How can I solve this? Here is the code so far; I have added # <---- comments to make the relevant lines easy to find.

import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    var = None # <----

    def start_requests(self):
        start_urls = ['http://quotes.toscrape.com/']
        yield scrapy.Request(url=start_urls[0], callback=self.parse)

    def parse(self, response):

        for quote in response.css('div.quote'):
            AuthorSpider.var = quote.css('div span.text::text').get() # <----

            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            yield scrapy.Request(url=authFullLink, callback=self.parse_author)

        # # looping through next pages
        # nextPage = response.css('li.next a::attr(href)').get()
        # if nextPage is not None:
        #     nextPage = response.urljoin(nextPage)
        #     yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'quote' : AuthorSpider.var
        }

Note that, to allow duplicate requests, DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter' was added to settings.py.
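(A per-request alternative would be Scrapy's dont_filter flag, which bypasses the dupefilter for that single request instead of disabling it globally:)

yield scrapy.Request(url=authFullLink, callback=self.parse_author,
                     dont_filter=True)  # skip the dupefilter for this request only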

The output I currently get (note that every item ends up with the same quote, since AuthorSpider.var has already been overwritten by the time the author callbacks run):

[
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Andr\u00e9 Gide", "birthdate": "November 22, 1869", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"}
]

Thanks in advance!

Here is a minimal working solution. Both kinds of pagination work (requesting the page URLs directly and following the next link), and I use the meta keyword to carry the quote text from one response to the other.

import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    # pagination option 1: request pages 1-10 directly
    start_urls = [f'https://quotes.toscrape.com/page/{x}/' for x in range(1, 11)]

    def parse(self, response):
        for quote in response.css('div.quote'):
            author_quote = quote.css('span.text::text').get()  # <----

            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            # carry the quote text along with the request so parse_author
            # can pair it with the author details
            yield scrapy.Request(url=authFullLink, callback=self.parse_author,
                                 meta={'Author': author_quote})

        # # pagination option 2: follow the "next" link instead
        # nextPage = response.css('li.next a::attr(href)').get()
        # if nextPage is not None:
        #     yield scrapy.Request(url=response.urljoin(nextPage), callback=self.parse)

    def parse_author(self, response):
        quote = response.meta.get('Author')
        yield {
            'Name': response.css('h3.author-title::text').get().strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': quote,
            'url': response.url,
        }
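As a side note, Scrapy 1.7+ recommends cb_kwargs over meta for passing user data to callbacks; the extra values arrive as keyword arguments of the callback. Below is a minimal sketch of the same quote-to-author pairing using cb_kwargs (the spider name quotes2 is arbitrary, and dont_filter is used instead of the settings.py change):

import scrapy

class AuthorSpiderKwargs(scrapy.Spider):
    name = 'quotes2'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            text = quote.css('span.text::text').get()
            authLink = response.urljoin(
                quote.css('small.author + a::attr(href)').get())
            # cb_kwargs entries are delivered as keyword arguments
            # to parse_author (recommended over meta since Scrapy 1.7)
            yield scrapy.Request(url=authLink, callback=self.parse_author,
                                 cb_kwargs={'author_quote': text},
                                 dont_filter=True)  # keep duplicate author pages

    def parse_author(self, response, author_quote):
        yield {
            'Name': response.css('h3.author-title::text').get().strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': author_quote,
        }

The design point is the same either way: the quote text travels inside the Request object itself, so each author response arrives already paired with the quote that produced it, regardless of the order in which the asynchronous callbacks fire.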