Why does scrapy crawler only work once in flask app?
I am currently working on a Flask application. The app takes a url from the user, then crawls that website and returns the links it finds there. This is what my code looks like:
from flask import Flask, render_template, request, redirect, url_for, session, make_response
from flask_executor import Executor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse
from uuid import uuid4
import smtplib, urllib3, requests, urllib.parse, datetime, sys, os
app = Flask(__name__)
executor = Executor(app)
http = urllib3.PoolManager()
process = CrawlerProcess()
list = set([])
list_validate = set([])
list_final = set([])
@app.route('/', methods=["POST", "GET"])
def index():
    if request.method == "POST":
        url_input = request.form["usr_input"]

        # Modifying URL: normalize to the form "https://<host>/"
        if 'https://' in url_input and url_input[-1] == '/':
            url = str(url_input)
        elif 'https://' in url_input and url_input[-1] != '/':
            url = str(url_input) + '/'
        elif 'https://' not in url_input and url_input[-1] != '/':
            url = 'https://' + str(url_input) + '/'
        elif 'https://' not in url_input and url_input[-1] == '/':
            url = 'https://' + str(url_input)

        # Validating URL
        try:
            response = requests.get(url)
            error = http.request("GET", url)
            if error.status == 200:
                parse = urlparse(url).netloc.split('.')
                base_url = parse[-2] + '.' + parse[-1]
                start_url = [str(url)]
                allowed_url = [str(base_url)]

                # Crawling links
                class Crawler(CrawlSpider):
                    name = "crawler"
                    start_urls = start_url
                    allowed_domains = allowed_url
                    rules = [Rule(LinkExtractor(), callback='parse_links', follow=True)]

                    def parse_links(self, response):
                        base_url = url
                        href = response.xpath('//a/@href').getall()
                        list.add(urllib.parse.quote(response.url, safe=':/'))
                        for link in href:
                            if base_url not in link:
                                list.add(urllib.parse.quote(response.urljoin(link), safe=':/'))
                        for link in list:
                            if base_url in link:
                                list_validate.add(link)

                # Runs in a background thread via flask_executor
                def start():
                    process.crawl(Crawler)
                    process.start()
                    # Re-check every collected link and keep only the ones that respond with 200
                    for link in list_validate:
                        error = http.request("GET", link)
                        if error.status == 200:
                            list_final.add(link)
                    original_stdout = sys.stdout
                    with open('templates/file.txt', 'w') as f:
                        sys.stdout = f
                        for link in list_final:
                            print(link)
                    sys.stdout = original_stdout

                unique_id = uuid4().__str__()
                executor.submit_stored(unique_id, start)
                return redirect(url_for('crawling', id=unique_id))
        except Exception:
            # The except clause is not shown in the original post; this placeholder
            # only makes the snippet parse.
            pass
    else:
        return render_template('index.html')
@app.route('/crawling-<string:id>')
def crawling(id):
    if not executor.futures.done(id):
        return render_template('start-crawl.html', refresh=True)
    else:
        executor.futures.pop(id)
        return render_template('finish-crawl.html')
In my start.html I have this:
{% if refresh %}
<meta http-equiv="refresh" content="5">
{% endif %}
This code takes the url from the user, validates it, and if it is a valid url it starts crawling and takes the user to the start-crawl.html page. That page refreshes every 5 seconds until the crawl is finished, and once the crawl is done it renders finish-crawl.html, where the user can download a file with the output (not included here because it isn't necessary).
Everything works fine. My problem is that once I have crawled a website, the crawl finishes, and I am at finish-crawl.html, I can't crawl another website. If I go back to the home page and enter another url, it validates the url and then goes straight to finish-crawl.html. I think this happens because scrapy can only run once and the reactor is not restartable, which is exactly what I am trying to do here. So does anyone know what I can do to fix this? Please ignore the complexity of the code and anything that isn't considered a "programming convention".
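The suspected cause can be reproduced outside Flask. The following is a minimal sketch (not part of the original app; DemoSpider and its fields are made up for the illustration) showing that a CrawlerProcess cannot be started twice in the same Python process: the second process.start() call tries to run the Twisted reactor again and is expected to fail with twisted.internet.error.ReactorNotRestartable, which matches the behaviour described above.

import scrapy
from scrapy.crawler import CrawlerProcess

class DemoSpider(scrapy.Spider):
    # Hypothetical minimal spider, only used to demonstrate the reactor problem
    name = "demo"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        yield {"url": response.url}

process = CrawlerProcess()
process.crawl(DemoSpider)
process.start()              # first crawl: the reactor runs and then stops

process.crawl(DemoSpider)
process.start()              # second crawl: expected to raise ReactorNotRestartable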
Scrapy recommends using CrawlerRunner instead of CrawlerProcess.
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

class MySpider(scrapy.Spider):
    # Spider definition
    ...

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(MySpider)        # returns a Deferred that fires when the crawl ends

def finished(e):
    print("finished")

def spider_error(e):
    print("spider error :/")

d.addCallback(finished)
d.addErrback(spider_error)
reactor.run()                     # blocks until reactor.stop() is called
More information about the reactor is available here: ReactorBasic
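To reuse this inside the Flask app so that every form submission can trigger a new crawl, the reactor has to be started once and kept running instead of being restarted per request. One common way to do that is the third-party crochet package, which runs the Twisted reactor in a background thread. The sketch below only illustrates that idea under those assumptions; it is not code from the question or the answer above, and LinkSpider, run_crawl, and the /crawl route are made up for the example.

from flask import Flask
from crochet import setup, wait_for          # assumes the crochet package is installed
from scrapy.crawler import CrawlerRunner
import scrapy

setup()                                      # start the Twisted reactor once, in a daemon thread
app = Flask(__name__)
runner = CrawlerRunner()

class LinkSpider(scrapy.Spider):             # hypothetical spider, stands in for the Crawler above
    name = "link_spider"

    def parse(self, response):
        for href in response.xpath('//a/@href').getall():
            yield {"link": response.urljoin(href)}

@wait_for(timeout=600.0)                     # block the calling Flask thread until the crawl's Deferred fires
def run_crawl(start_url):
    return runner.crawl(LinkSpider, start_urls=[start_url])

@app.route('/crawl/<path:target>')
def crawl(target):
    run_crawl('https://' + target)           # can be called again for every new request
    return "crawl finished"

Because the reactor keeps running in its own thread, runner.crawl() can be scheduled as many times as needed, which avoids the ReactorNotRestartable problem described in the question.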