在scrapy递归抓取过程中,如何从父url和关联子url的多个节点中一起提取信息?
During recursive scraping in scrapy, how extract info from multiple nodes of parent url and associated children url together?
父节点url有多个节点(引用),每个父节点都有子节点url(作者信息)。由于 scrapy 的异步特性,我在将引用链接到作者信息时遇到问题?
我该如何解决这个问题,这是目前为止的代码。添加了 # <---
评论以便于查找。
import scrapy
class AuthorSpider(scrapy.Spider):
name = 'quotes1'
var = None # <----
def start_requests(self):
start_urls = ['http://quotes.toscrape.com/']
yield scrapy.Request(url=start_urls[0], callback=self.parse)
def parse(self, response):
for quote in response.css('div.quote'):
AuthorSpider.var = quote.css('div span.text::text').get() # <----
authShortLink = quote.css('small.author + a::attr(href)').get()
authFullLink = response.urljoin(authShortLink)
yield scrapy.Request(url=authFullLink, callback=self.parse_author)
# # looping through next pages
# nextPage = response.css('li.next a::attr(href)').get()
# if nextPage is not None:
# nextPage = response.urljoin(nextPage)
# yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_author(self, response):
def extract_with_css(query):
return response.css(query).get(default='').strip()
yield {
'name': extract_with_css('h3.author-title::text'),
'birthdate': extract_with_css('.author-born-date::text'),
'quote' : AuthorSpider.var
}
请注意,为了允许重复,在settings.py
中添加了DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
我目前得到的输出-
[
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Andr\u00e9 Gide", "birthdate": "November 22, 1869", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"}
]
提前致谢!
这是最小的工作解决方案。两种类型的分页都有效,我使用 meta 关键字将引用项从一个响应转移到另一个响应。
import scrapy
class AuthorSpider(scrapy.Spider):
name = 'quotes1'
start_urls = [f'https://quotes.toscrape.com/page/{x}/' .format(x) for x in range(1,11)]
def parse(self, response):
for quote in response.css('div.quote'):
Author = quote.css('span.text::text').get() # <----
authShortLink = quote.css('small.author + a::attr(href)').get()
authFullLink = response.urljoin(authShortLink)
yield scrapy.Request(url=authFullLink, callback=self.parse_author, meta={'Author': Author})
# # looping through next pages
# nextPage = response.css('li.next a::attr(href)').get()
# abs_url = f'http://quotes.toscrape.com/{nextPage}'
#yield scrapy.Request(url=abs_url, callback=self.parse)
def parse_author(self, response):
quote=response.meta.get('Author')
yield {
'Name': response.css('h3.author-title::text').get().strip(),
'Date of birth': response.css('span.author-born-date::text').get(),
'Quote':quote,
'url':response.url}
父节点url有多个节点(引用),每个父节点都有子节点url(作者信息)。由于 scrapy 的异步特性,我在将引用链接到作者信息时遇到问题?
我该如何解决这个问题,这是目前为止的代码。添加了 # <---
评论以便于查找。
import scrapy
class AuthorSpider(scrapy.Spider):
name = 'quotes1'
var = None # <----
def start_requests(self):
start_urls = ['http://quotes.toscrape.com/']
yield scrapy.Request(url=start_urls[0], callback=self.parse)
def parse(self, response):
for quote in response.css('div.quote'):
AuthorSpider.var = quote.css('div span.text::text').get() # <----
authShortLink = quote.css('small.author + a::attr(href)').get()
authFullLink = response.urljoin(authShortLink)
yield scrapy.Request(url=authFullLink, callback=self.parse_author)
# # looping through next pages
# nextPage = response.css('li.next a::attr(href)').get()
# if nextPage is not None:
# nextPage = response.urljoin(nextPage)
# yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_author(self, response):
def extract_with_css(query):
return response.css(query).get(default='').strip()
yield {
'name': extract_with_css('h3.author-title::text'),
'birthdate': extract_with_css('.author-born-date::text'),
'quote' : AuthorSpider.var
}
请注意,为了允许重复,在settings.py
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
我目前得到的输出-
[
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
{"name": "Andr\u00e9 Gide", "birthdate": "November 22, 1869", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"}
]
提前致谢!
这是最小的工作解决方案。两种类型的分页都有效,我使用 meta 关键字将引用项从一个响应转移到另一个响应。
import scrapy
class AuthorSpider(scrapy.Spider):
name = 'quotes1'
start_urls = [f'https://quotes.toscrape.com/page/{x}/' .format(x) for x in range(1,11)]
def parse(self, response):
for quote in response.css('div.quote'):
Author = quote.css('span.text::text').get() # <----
authShortLink = quote.css('small.author + a::attr(href)').get()
authFullLink = response.urljoin(authShortLink)
yield scrapy.Request(url=authFullLink, callback=self.parse_author, meta={'Author': Author})
# # looping through next pages
# nextPage = response.css('li.next a::attr(href)').get()
# abs_url = f'http://quotes.toscrape.com/{nextPage}'
#yield scrapy.Request(url=abs_url, callback=self.parse)
def parse_author(self, response):
quote=response.meta.get('Author')
yield {
'Name': response.css('h3.author-title::text').get().strip(),
'Date of birth': response.css('span.author-born-date::text').get(),
'Quote':quote,
'url':response.url}