Scrapy: run one crawl after another
I'm quite new to web scraping. I'm trying to crawl a novel reader site to get the novel info and the chapter content, so my approach is to create two spiders: one to fetch the novel info and another to fetch the chapter content.
import scrapy

class BookSpider(scrapy.Spider):
    name = "book"

    def __init__(self, books=[], **kwargs):
        if isinstance(books, str):
            books = [books]
        self.start_urls = [f'https://daonovel.com/novel/{book}/' for book in sorted(books)]
        super().__init__(**kwargs)

    def parse(self, response):
        # self.remove_content(response.css("div.post-title h1 span"))
        fullurl = response.url
        url = fullurl.split("/")[-2]
        title = response.css("div.post-title h1::text").extract()
        title = title[len(title)-1].strip()
        authors = response.css('div.author-content a::text').getall()
        genres = response.css('div.genres-content a::text').getall()
        release = response.css('div.post-status div.post-content_item:nth-child(1) div.summary-content::text').get().strip()
        status = response.css('div.post-status div.post-content_item:nth-child(2) div.summary-content::text').get().strip()
        summary = response.css('div.summary__content p').getall()
        chapters = response.css('ul.version-chap li a::attr(href)').extract()
        chapters.reverse()
        return {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': chapters
        }

class ChapterSpider(scrapy.Spider):
    name = "chapter"

    def __init__(self, book="", chapters=[], **kwargs):
        if isinstance(chapters, str):
            chapters = [chapters]
        self.book = book
        self.start_urls = [f'https://daonovel.com/novel/{book}/{chapter}/' for chapter in chapters]
        super().__init__(**kwargs)

    def parse(self, response):
        title = response.css("ol.breadcrumb li.active::text").get().strip()
        container = response.css("div.cha-words p").getall() or response.css("div.text-left p").getall()
        content = [str(p) for p in container]
        return {
            'title': title,
            'content': content,
            'book_url': self.book,
            'url': response.url.split("/")[-2]
        }
After that I created a collector to collect and process all the data coming from the spiders:
from scrapy import signals

class Collector():
    def __init__(self, process, books=[]):
        self.process = process
        if isinstance(books, str):
            books = [books]
        self.books = books
        self.books_data = []

    def create_crawler(self, spider, function, **kwargs):
        # we need the Crawler instance to access its signals
        crawler = self.process.create_crawler(spider)
        crawler.signals.connect(function, signal=signals.item_scraped)
        return self.process.crawl(crawler, **kwargs)

    def process_book_data(self, item, response, spider):
        item['authors'] = [author.strip() for author in item['authors']]
        item['genres'] = [genre.strip() for genre in item['genres']]
        summary = [line for line in item['summary'] if not any(word in line.lower() for word in ("wuxiaworld", "disclaimer"))]
        item['summary'] = "\n".join(summary)
        item['chapters'] = [chapter.replace(item['fullurl'], '').replace('/', '') for chapter in item['chapters']]
        self.books_data.append(item)

    def process_chapter_data(self, item, response, spider):
        item['content'] = "\n".join(item['content'])
        for book in self.books_data:
            if book['url'] == item['book_url']:
                book['chapters'][book['chapters'].index(item['url'])] = item

    def crawl_books(self):
        return self.create_crawler(BookSpider, self.process_book_data, books=self.books)

    def crawl_chapters(self, book, chapters):
        return self.create_crawler(ChapterSpider, self.process_chapter_data, book=book, chapters=chapters)
If I put the chapters in manually before process.start():
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
collector.crawl_chapters("a-stay-at-home-dads-restaurant-in-an-alternate-world", ['chapter-1', 'chapter-2', 'chapter-3', 'chapter-4', 'chapter-5']) # put chapter manually
process.start()
for book in collector.books_data:
    for k, v in book.items():
        print(k, v)
it works, but that's not what this script is meant to do.
My question now is: how do I make the chapter spider run after the book spider has finished collecting its data? Here is my attempt, but it doesn't work:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
process.start()
print(collector.books_data)  # this works
for book in collector.books_data:
    collector.crawl_chapters(book['url'], book['chapters'])  # this didn't work
print("Chapters ==>", collector.books_data)
If I add process.start() before print("Chapters ==>", collector.books_data), it raises twisted.internet.error.ReactorNotRestartable.
I've read this SO question, but I don't know how to implement it in my code.
I would suggest changing the spider architecture, because Scrapy is not meant to chain spiders (it is possible, of course, but it is generally bad practice); it is meant to chain requests within the same spider.
Your problem comes from the fact that Scrapy is designed to scrape flat lists of items, while you need a nested one like book = {'title': ..., 'chapters': [{some chapter data}, ...]}.
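If you do want to keep the two separate spiders, the usual way around ReactorNotRestartable is the sequential-crawling pattern from the Scrapy docs: drive the reactor yourself with CrawlerRunner and chain the crawls as Twisted deferreds. A rough, untested sketch of how that could plug into your Collector:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
# your Collector works with CrawlerRunner too, since CrawlerProcess subclasses it
collector = Collector(runner, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")

@defer.inlineCallbacks
def crawl():
    # wait for the book spider to finish before starting the chapter spider
    yield collector.crawl_books()
    for book in collector.books_data:
        yield collector.crawl_chapters(book['url'], book['chapters'])
    reactor.stop()

crawl()
reactor.run()  # blocks here until crawl() stops the reactor

Still, chaining requests inside a single spider is the cleaner option.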
I would suggest the following architecture for your spider:
def parse(self, response):
    # parse book data here
    book_item = {
        'fullurl': fullurl,
        'url': url,
        'title': title,
        'authors': authors,
        'genres': genres,
        'status': status,
        'release': release,
        'summary': summary,
        'chapters': []
    }
    chapter_urls = [...]  # list of the book's chapter URLs here
    chapter_url = chapter_urls.pop()
    yield Request(
        url=chapter_url,
        callback=self.parse_chapter,
        meta={'book': book_item, 'chapter_urls': chapter_urls}
    )

def parse_chapter(self, response):
    book = response.meta['book']
    chapter_urls = response.meta['chapter_urls']
    # parse chapter data here
    chapter = {
        'title': title,
        'content': content,
        'book_url': book['url'],
        'url': response.url.split("/")[-2]
    }
    book['chapters'].append(chapter)
    if not chapter_urls:
        yield book
    else:
        chapter_url = chapter_urls.pop()
        yield Request(
            url=chapter_url,
            callback=self.parse_chapter,
            meta={'book': book, 'chapter_urls': chapter_urls}
        )
This will yield books entities with the chapters nested inside.
I hope it helps, even though it is not an exact answer to your question. Good luck (:
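To run it, you could wrap those two methods in a spider class and collect the finished books through the item_scraped signal, the same way your Collector already does. A rough sketch, assuming a hypothetical NovelSpider class that holds the methods above and builds its start_urls from a books argument like your BookSpider:

from scrapy import signals
from scrapy.crawler import CrawlerProcess

books_data = []

def collect_book(item, response, spider):
    # each book is yielded exactly once, after its last chapter was parsed
    books_data.append(item)

process = CrawlerProcess()
crawler = process.create_crawler(NovelSpider)  # hypothetical class holding parse/parse_chapter above
crawler.signals.connect(collect_book, signal=signals.item_scraped)
process.crawl(crawler, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
process.start()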
Second version:
class YourSpider(Spider):
    books = {}
    ...

    def parse(self, response):
        # Get book info here.
        book_item = {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': []
        }
        self.books[book_item['title']] = book_item
        chapter_urls = [...]  # list of chapter URLs
        # this will trigger multiple requests asynchronously
        for chapter_url in chapter_urls:
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_chapter,
                meta={'book_title': book_item['title']}
            )

    def parse_chapter(self, response):
        book_title = response.meta['book_title']
        # parse chapter data here
        chapter = {
            'title': title,
            'content': content,
            'book_url': self.books[book_title]['url'],
            'url': response.url.split("/")[-2]
        }
        self.books[book_title]['chapters'].append(chapter)
        yield self.books[book_title]
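Note that this version yields the book again after every chapter response, so the same (partially filled) item comes out several times. If you collect items through the item_scraped signal like your Collector does, one rough way to keep only the final state of each book is to key the results by title and let later copies overwrite earlier ones (standard Scrapy signals API, nothing here is specific to this spider):

from scrapy import signals

final_books = {}

def keep_latest(item, response, spider):
    # later yields of the same book overwrite earlier, partial ones,
    # so after the crawl the dict holds the most complete version
    final_books[item['title']] = item

# connect it exactly like in your Collector:
# crawler.signals.connect(keep_latest, signal=signals.item_scraped)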