How to use proxy for specific url in Scrapy spider?
I want to use a proxy only for a few specific domains. I checked this and this. If I understand correctly, setting a proxy with a middleware will set the proxy for all requests.
How can I set a proxy for specific urls before the spider sends the request?
Currently my spider works fine with the following implementation:
CoreSpider.py
import glob
import os
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# (imports for Extractor and ContentHandler_copy omitted; they come from my own project)


class CoreSpider(scrapy.Spider):
    name = "final"

    def __init__(self):
        self.start_urls = self.read_url()
        self.rules = (
            Rule(
                LinkExtractor(
                    unique=True,
                ),
                callback='parse',
                follow=True
            ),
        )

    def read_url(self):
        urlList = []
        for filename in glob.glob(os.path.join("/root/Public/company_profiler/seed_list", '*.list')):
            with open(filename, "r") as f:
                for line in f.readlines():
                    url = re.sub('\n', '', line)
                    if "http" not in url:
                        url = "http://" + url
                    # print(url)
                    urlList.append(url)
        return urlList

    def parse(self, response):
        print("URL is: ", response.url)
        print("User agent is : ", response.request.headers['User-Agent'])
        filename = '/root/Public/company_profiler/crawled_page/%s.html' % response.url
        article = Extractor(extractor='LargestContentExtractor', html=response.body).getText()
        print("Article is :", article)
        if len(article.split("\n")) < 5:
            print("Skipping to next url : ", article.split("\n"))
        else:
            print("Continue parsing: ", article.split("\n"))
        ContentHandler_copy.ContentHandler_copy.start(article, response.url)
and settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'random_useragent.RandomUserAgentMiddleware': 320
}
I am running the spider by calling it from a script:
RunSpider.py
from CoreSpider import CoreSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
process.crawl(CoreSpider)
process.start()
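For what it's worth, the only middleware-based idea I can think of is a custom downloader middleware that checks the domain before assigning the proxy, roughly like the sketch below (the class name, module path, domain list and proxy address are placeholders), but I am not sure this is the idiomatic way, which is why I am asking:
# middlewares.py (sketch only, not what I currently use)
class SelectiveProxyMiddleware(object):
    """Assign a proxy only to requests for certain domains."""

    PROXIED_DOMAINS = ('example-a.com', 'example-b.com')  # placeholder domains
    PROXY = 'https://159.8.18.178:8080'                   # placeholder proxy

    def process_request(self, request, spider):
        if any(domain in request.url for domain in self.PROXIED_DOMAINS):
            request.meta['proxy'] = self.PROXY
        # returning None lets Scrapy continue processing the request normally

# and registered in settings.py alongside the middlewares shown above:
# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.SelectiveProxyMiddleware': 350,  # placeholder module path
# }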
Update:
CoreSpider.py
class CoreSpider(scrapy.Spider):
    name = "final"

    def __init__(self):
        self.start_urls = self.read_url()
        self.rules = (
            Rule(LinkExtractor(unique=True), callback='parse', follow=True, process_request='process_request'),
        )

    def process_request(self, request, spider):
        print("Request is : ", request)  ### Not printing anything
        if 'xxx' in request.url:  # <-- set proxy for this URL?
            meta = request.get('meta', {})
            meta.update({'proxy': 'https://159.8.18.178:8080'})
            return request.replace(meta=meta)
        return request
.......
I also tried setting the proxy in the process_request method like this, but it failed:
request.meta['proxy'] = "https://159.8.18.178:8080"
Thanks in advance.
To use a proxy on a per-request basis, specify the proxy attribute in the Request's meta, as described in the documentation. In the case of a CrawlSpider, you'll want to supply the process_request argument to the Rule. In that method, selectively apply the above (i.e. set meta['proxy']) based on the request URL and return the modified request with meta filled.
Edit:
Replace the rule definition
self.rules = (
    Rule(LinkExtractor(unique=True), callback='parse', follow=True),
)
with
self.rules = (
    Rule(LinkExtractor(unique=True), callback='parse', follow=True, process_request='process_request'),
)
and define a new method process_request in the CoreSpider class:
def process_request(self, request):
    if 'xxx' in request.url:  # <-- set proxy for this URL?
        meta = dict(request.meta)  # copy the existing meta (Request has no .get() method)
        meta.update({'proxy': 'your_proxy'})
        return request.replace(meta=meta)
    return request
Edit 2:
I think the problem may be caused by the start_urls and rules definitions being buried in the constructor:
...
    def __init__(self):
        self.start_urls = self.read_url()
        self.rules = (
            Rule(LinkExtractor(unique=True), callback='parse', follow=True, process_request='process_request'),
        )
...
The correct way is to make these class attributes, i.e.
class CoreSpider(scrapy.Spider):
    name = "final"
    start_urls = self.read_url()  # note: self is not available at class level; see the remark on start_requests below
    rules = (
        Rule(LinkExtractor(unique=True), callback='parse', follow=True, process_request='process_request'),
    )
As for start_urls, if you need something more complex (e.g. reading the URLs from external files), define start_requests so that it yields Requests, as in the sketch below.
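A minimal sketch of how these pieces could fit together is below. It assumes a CrawlSpider (so that the rules actually take effect), renames the rule callback to parse_item because CrawlSpider uses parse internally, and uses an illustrative set_proxy method and domain list; the seed-list directory and proxy address are the ones from the question:
import glob
import os

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

PROXY = "https://159.8.18.178:8080"   # placeholder proxy from the question
PROXIED_DOMAINS = ("xxx",)            # illustrative domain substrings


class CoreSpider(CrawlSpider):
    name = "final"

    # class-level rules; the callback deliberately is not called "parse"
    rules = (
        Rule(LinkExtractor(unique=True), callback='parse_item',
             follow=True, process_request='set_proxy'),
    )

    def start_requests(self):
        # read the seed URLs from the external .list files
        for filename in glob.glob(os.path.join("/root/Public/company_profiler/seed_list", "*.list")):
            with open(filename, "r") as f:
                for line in f:
                    url = line.strip()
                    if url and "http" not in url:
                        url = "http://" + url
                    if url:
                        yield self.set_proxy(scrapy.Request(url))

    def set_proxy(self, request, response=None):
        # newer Scrapy versions also pass the originating response to this callback
        if any(domain in request.url for domain in PROXIED_DOMAINS):
            meta = dict(request.meta)
            meta["proxy"] = PROXY
            return request.replace(meta=meta)
        return request

    def parse_item(self, response):
        yield {"url": response.url}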
I'll give you an example of using a proxy for a url:
link = 'https://www.example.com/'
request = Request(link, callback=self.parse_url)
request.meta['proxy'] = "http://PROXYIP:PROXYPORT"
yield request
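For completeness, this fragment needs to live inside a spider method that yields the request; a minimal, hypothetical wrapper (spider name, URL and proxy are placeholders) could look like:
import scrapy
from scrapy import Request


class ProxyExampleSpider(scrapy.Spider):
    name = "proxy_example"  # placeholder name

    def start_requests(self):
        link = 'https://www.example.com/'
        request = Request(link, callback=self.parse_url)
        # the 'proxy' meta key is picked up by Scrapy's built-in proxy handling
        request.meta['proxy'] = "http://PROXYIP:PROXYPORT"
        yield request

    def parse_url(self, response):
        self.logger.info("Fetched %s", response.url)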
A self-contained approach. No middleware.
urls = [url, url, ..., url]


class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['test.com']
    # start_urls = urls  # invalid, override by start_requests

    def start_requests(self):
        for url in urls:
            # handle each individual url with or without proxy
            # if url in ['no1.com', 'no2.com', 'no3.com']:
            if url == 'www.no_proxy.com':
                meta_proxy = ''  # do not use proxy for this url
            else:
                meta_proxy = "http://127.0.0.1:8888"
            yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': meta_proxy})

    def parse(self, response):
        title = response.xpath('.//title/text()').extract_first()
        yield {'title': title}
Usage:
$ scrapy runspider test.py -o test.json -s CONCURRENT_REQUESTS_PER_DOMAIN=100 -s CONCURRENT_REQUESTS=100
Disclaimer:
I don't know whether it slows down the crawl, since it iterates over the urls one by one. I don't have a large set of test sites at the moment. I hope people who use this code will leave a comment about the results they get.