How to Import URLs From Spider to Spider?
I am building a Scrapy spider, WuzzufLinks, that scrapes all the links to specific jobs on a job-search site, starting from this link:
https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt
After scraping the links, I want to send them to another spider, WuzzufSpider, which scrapes the data from inside each link. start_urls would be the first link in the scraped list, next_page would be the link after it, and so on.
I thought about importing WuzzufLinks into WuzzufSpider and then accessing its data:
import scrapy
from ..items import WuzzufscraperItem

class WuzzuflinksSpider(scrapy.Spider):
    name = 'WuzzufLinks'
    page_number = 1
    start_urls = ['https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt']

    def parse(self, response):
        items = WuzzufscraperItem()
        jobURL = response.css('h2[class=css-m604qf] a::attr(href)').extract()
        items['jobURL'] = jobURL
        yield items

        next_page = 'https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt&start=' + str(WuzzuflinksSpider.page_number)
        if WuzzuflinksSpider.page_number <= 100:
            yield response.follow(next_page, callback=self.parse)
            WuzzuflinksSpider.page_number += 1
# WuzzufSpider
import scrapy
from ..items import WuzzufscraperItem
from spiders.WuzzufLinks import WuzzuflinksSpider

class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'
    parseClass = WuzzuflinksSpider().parse()
    start_urls = []

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        title = response.css('').extract()
        company = response.css('').extract()
        location = response.css('').extract()
        country = response.css('').extract()
        date = response.css('').extract()
        careerLevel = response.css('').extract()
        experienceNeeded = response.css('').extract()
        jobType = response.css('').extract()
        jobFunction = response.css('').extract()
        salary = response.css('').extract()
        description = response.css('').extract()
        requirements = response.css('').extract()
        skills = response.css('').extract()
        industry = response.css('').extract()
        jobURL = response.css('').extract()
        # next_page and if statement here
Regardless of whether I have written the outlined part correctly, I have realized that accessing jobURL would return an empty value, since it is only a temporary container. I thought about saving the scraped links in another file and then importing them into WuzzufSpider, but I don't know whether the import would work and whether they would still be a list:
# links.xml
<?xml version="1.0" encoding="utf-8"?>
<items>
<item><jobURL><value>/jobs/p/P5A2NWkkWfv6-Sales-Operations-Specialist-Amreyah-Cement---InterCement-Alexandria-Egypt?o=1&l=sp&t=sj&a=search-v3</value><value>/jobs/p/pEmZ96R097N3-Senior-Laravel-Developer-Learnovia-Cairo-Egypt?o=2&l=sp&t=sj&a=search-v3</value><value>/jobs/p/IgHkjP37ymQp-French-Talent-Acquisition-Specialist-Guide-Academy-Giza-Egypt?o=3&l=sp&t=sj&a=search-v3</value><value>/jobs/p/zOLTqLqegEZe-Export-Sales-Representative-packtec-Cairo-Egypt?o=4&l=sp&t=sj&a=search-v3</value><value>/jobs/p/U3Q1TDpxzsJJ-Finishing-Site-Engineer--Assiut-Assiut-Egypt?o=5&l=sp&t=sj&a=search-v3</value><value>/jobs/p/7aQ4QxtYV8N6-Senior-QC-Automation-Engineer-FlairsTech-Cairo-Egypt?o=6&l=sp&t=sj&a=search-v3</value><value>/jobs/p/qHWyGU7ClMG6-Technical-Office-Engineer-Cairo-Egypt?o=7&l=sp&t=sj&a=search-v3</value><value>/jobs/p/ptN7qnERUvPT-B2B-Sales-Representative-Smart-Zone-Cairo-Egypt?o=8&l=sp&t=sj&a=search-v3</value><value>/jobs/p/VUVc0ZAyUNYU-Digital-Marketing-supervisor-National-Trade-Distribution-Cairo-Egypt?o=9&l=sp&t=sj&a=search-v3</value><value>/jobs/p/WzJhyeVpT5jb-Receptionist-Value-Cairo-Egypt?o=10&l=sp&t=sj&a=search-v3</value><value>/jobs/p/PAdZOdzWjqbr-Insurance-Specialist-Bancassuranc---Sohag-Allianz-Sohag-Egypt?o=11&l=sp&t=sj&a=search-v3</value><value>/jobs/p/nJD6YbE4QjNX-Senior-Research-And-Development-Specialist-Cairo-Egypt?o=12&l=sp&t=sj&a=search-v3</value><value>/jobs/p/DVvMG4BFWEeI-Technical-Sales-Engineer-Masria-Group-Cairo-Egypt?o=13&l=sp&t=sj&a=search-v3</value><value>/jobs/p/3RtCveEFjveW-Technical-Office-Engineer-Masria-Group-Cairo-Egypt?o=14&l=sp&t=sj&a=search-v3</value><value>/jobs/p/kswGaw4kXTe8-Administrator-Kreston-Cairo-Egypt?o=15&l=sp&t=sj&a=search-v3</value></jobURL></item>
</items>
# WuzzufSpider
import scrapy
from ..items import WuzzufscraperItem
from links import jobURL

class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'
    start_urls = [jobURL[0]]

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        title = response.css('').extract()
        company = response.css('').extract()
        location = response.css('').extract()
        country = response.css('').extract()
        date = response.css('').extract()
        careerLevel = response.css('').extract()
        experienceNeeded = response.css('').extract()
        jobType = response.css('').extract()
        jobFunction = response.css('').extract()
        salary = response.css('').extract()
        description = response.css('').extract()
        requirements = response.css('').extract()
        skills = response.css('').extract()
        industry = response.css('').extract()
        jobURL = response.css('').extract()
        # next_page and if statement here
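In case it helps to clarify what I mean, here is a minimal sketch of the file idea, assuming the links were exported as JSON instead (for example with scrapy crawl WuzzufLinks -o links.json); the file name and the start_requests approach are assumptions on my part, not something I have tested:

# Hypothetical sketch: read the links exported by the first spider as JSON
# instead of trying to import them as a Python module.
import json
import scrapy

class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'

    def start_requests(self):
        with open('links.json') as f:
            exported = json.load(f)  # a list of items, each holding a 'jobURL' list
        for item in exported:
            for url in item['jobURL']:
                # the scraped hrefs are relative, so prepend the domain
                yield scrapy.Request('https://wuzzuf.net' + url, callback=self.parse)

    def parse(self, response):
        # CSS selectors as above
        ...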
Is there a way to make the second approach work, or should I take an entirely different approach?
I have looked at the threads Scrapy: Pass data between 2 spiders and Pass scraped URL's from one spider to another. I understand that I could do all the work in one spider, and that saving to a database or a temporary file is one way to send the data to another spider. However, I am not very experienced yet and don't understand how to implement such changes, so marking this question as a duplicate would not help me. Thanks for your help.
First of all, you can keep scraping the urls from the same spider, and honestly I don't see any reason not to.
Anyway, if you really want two spiders, where the output of the first is the input of the second, you can do something like this:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.signalmanager import dispatcher
from scrapy import signals
from twisted.internet import reactor, defer

# grab all the products urls
class ExampleSpider(scrapy.Spider):
    name = "exampleSpider"
    start_urls = ['https://scrapingclub.com/exercise/list_basic']

    def parse(self, response):
        all_urls = response.xpath('//div[@class="card"]/a/@href').getall()
        for url in all_urls:
            yield {'url': 'https://scrapingclub.com' + url}

# get the product's details
class ExampleSpider2(scrapy.Spider):
    name = "exampleSpider2"

    def parse(self, response):
        title = response.xpath('//h3/text()').get()
        price = response.xpath('//div[@class="card-body"]//h4//text()').get()
        yield {
            'title': title,
            'price': price
        }

if __name__ == "__main__":
    # this will be the yielded items from the first spider
    output = []

    def get_output(item):
        output.append(item)

    configure_logging()
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    runner = CrawlerRunner(settings)

    # run spiders sequentially
    # (https://docs.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process)
    @defer.inlineCallbacks
    def crawl():
        dispatcher.connect(get_output, signal=signals.item_scraped)
        yield runner.crawl('exampleSpider')
        urls = [url['url'] for url in output]  # create a list of the urls from the first spider
        # crawl the second spider with the urls from the first spider
        yield runner.crawl('exampleSpider2', start_urls=urls)
        reactor.stop()

    crawl()
    reactor.run()
Run it and you will see that you first get the results from the first spider, and that those results are then passed as start_urls to the second spider.
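This works because keyword arguments passed to runner.crawl() are forwarded to the spider's constructor, and scrapy.Spider's default __init__ stores extra keyword arguments as instance attributes. Purely as an illustration (this explicit constructor is not required), the second spider could spell that out:

# Illustration only: scrapy.Spider already stores extra kwargs as attributes,
# so this explicit constructor behaves the same as the default one.
class ExampleSpider2(scrapy.Spider):
    name = "exampleSpider2"

    def __init__(self, start_urls=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls or []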
EDIT:
Doing everything in the same spider. Notice how we iterate over all the urls and scrape each one in the parse_item method. I filled in a few of the values you want to scrape as an example, so just fill in the rest.
import scrapy
# from ..items import WuzzufscraperItem

class WuzzufscraperItem(scrapy.Item):
    title = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    country = scrapy.Field()
    jobURL = scrapy.Field()
    date = scrapy.Field()
    careerLevel = scrapy.Field()
    experienceNeeded = scrapy.Field()
    jobType = scrapy.Field()
    jobFunction = scrapy.Field()
    salary = scrapy.Field()
    description = scrapy.Field()
    requirements = scrapy.Field()
    skills = scrapy.Field()
    industry = scrapy.Field()

class WuzzuflinksSpider(scrapy.Spider):
    name = 'WuzzufLinks'
    page_number = 1
    start_urls = ['https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt']

    def parse(self, response):
        all_urls = response.css('h2[class=css-m604qf] a::attr(href)').getall()
        if all_urls:
            for url in all_urls:
                yield response.follow(url=url, callback=self.parse_item)
        next_page = 'https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt&start=' + str(WuzzuflinksSpider.page_number)
        if WuzzuflinksSpider.page_number <= 100:
            yield response.follow(next_page)
            WuzzuflinksSpider.page_number += 1

    def parse_item(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        # Some values as an example:
        items['title'] = response.xpath('(//h1)[last()]/text()').get(default='')
        items['company'] = response.xpath('(//a[@class="css-p7pghv"])[last()]/text()').get(default='')
        items['location'] = response.xpath('(//strong[@class="css-9geu3q"])[last()]/text()').get(default='')
        items['country'] = response.xpath('//meta[@property="og:country_name"]/@content').get(default='')
        items['jobURL'] = response.url
        # items['date'] = response.css('').get(default='')
        # items['careerLevel'] = response.css('').get(default='')
        # items['experienceNeeded'] = response.css('').get(default='')
        # items['jobType'] = response.css('').get(default='')
        # items['jobFunction'] = response.css('').get(default='')
        # items['salary'] = response.css('').get(default='')
        # items['description'] = response.css('').get(default='')
        # items['requirements'] = response.css('').get(default='')
        # items['skills'] = response.css('').get(default='')
        # items['industry'] = response.css('').get(default='')
        yield items
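To actually run it and save the items, a minimal sketch like the following should work; note the FEEDS setting is an assumption on my part and needs Scrapy >= 2.1 (inside the project, a plain scrapy crawl WuzzufLinks -o jobs.json does the same):

# A sketch for running the combined spider from a script and exporting the items
# (assumes Scrapy >= 2.1 for the FEEDS setting).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    settings = get_project_settings()
    settings['FEEDS'] = {'jobs.json': {'format': 'json'}}  # export all items here
    process = CrawlerProcess(settings)
    process.crawl(WuzzuflinksSpider)
    process.start()  # blocks until the crawl finishes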