How to Import URLs From One Spider Into Another?

I'm building a Scrapy spider, WuzzufLinks, that scrapes all the links to the individual job postings on this job site: https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt

After scraping the links, I want to send them to another spider, WuzzufSpider, which scrapes the data inside each link. start_urls would be the first link in the scraped list, next_page would be the link after it, and so on.

I thought of importing WuzzufLinks into WuzzufSpider and then accessing its data:

import scrapy
from ..items import WuzzufscraperItem


class WuzzuflinksSpider(scrapy.Spider):
    name = 'WuzzufLinks'
    page_number = 1
    start_urls = ['https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt']

    def parse(self, response):
        items = WuzzufscraperItem()

        jobURL = response.css('h2[class=css-m604qf] a::attr(href)').extract()

        items['jobURL'] = jobURL

        yield items

        next_page = 'https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt&start=' + str(WuzzuflinksSpider.page_number)
        if WuzzuflinksSpider.page_number <= 100:
            yield response.follow(next_page, callback=self.parse)
            WuzzuflinksSpider.page_number += 1
# WuzzufSpider

import scrapy
from ..items import WuzzufscraperItem
from spiders.WuzzufLinks import WuzzuflinksSpider


class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'
    parseClass = WuzzuflinksSpider().parse()
    start_urls = []

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        title = response.css('').extract()
        company = response.css('').extract()
        location = response.css('').extract()
        country = response.css('').extract()
        date = response.css('').extract()
        careerLevel = response.css('').extract()
        experienceNeeded = response.css('').extract()
        jobType = response.css('').extract()
        jobFunction = response.css('').extract()
        salary = response.css('').extract()
        description = response.css('').extract()
        requirements = response.css('').extract()
        skills = response.css('').extract()
        industry = response.css('').extract()
        jobURL = response.css('').extract()

        # next_page and if statement here

Whether or not I've written the outlined part correctly, I've realized that accessing jobURL would return an empty value, since it's only a temporary container. I thought of saving the scraped links in a separate file and then importing them into WuzzufSpider, but I don't know whether the import would work or whether the links would still be a list:

# links.xml

<?xml version="1.0" encoding="utf-8"?>
<items>
<item><jobURL><value>/jobs/p/P5A2NWkkWfv6-Sales-Operations-Specialist-Amreyah-Cement---InterCement-Alexandria-Egypt?o=1&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/pEmZ96R097N3-Senior-Laravel-Developer-Learnovia-Cairo-Egypt?o=2&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/IgHkjP37ymQp-French-Talent-Acquisition-Specialist-Guide-Academy-Giza-Egypt?o=3&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/zOLTqLqegEZe-Export-Sales-Representative-packtec-Cairo-Egypt?o=4&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/U3Q1TDpxzsJJ-Finishing-Site-Engineer--Assiut-Assiut-Egypt?o=5&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/7aQ4QxtYV8N6-Senior-QC-Automation-Engineer-FlairsTech-Cairo-Egypt?o=6&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/qHWyGU7ClMG6-Technical-Office-Engineer-Cairo-Egypt?o=7&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/ptN7qnERUvPT-B2B-Sales-Representative-Smart-Zone-Cairo-Egypt?o=8&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/VUVc0ZAyUNYU-Digital-Marketing-supervisor-National-Trade-Distribution-Cairo-Egypt?o=9&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/WzJhyeVpT5jb-Receptionist-Value-Cairo-Egypt?o=10&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/PAdZOdzWjqbr-Insurance-Specialist-Bancassuranc---Sohag-Allianz-Sohag-Egypt?o=11&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/nJD6YbE4QjNX-Senior-Research-And-Development-Specialist-Cairo-Egypt?o=12&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/DVvMG4BFWEeI-Technical-Sales-Engineer-Masria-Group-Cairo-Egypt?o=13&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/3RtCveEFjveW-Technical-Office-Engineer-Masria-Group-Cairo-Egypt?o=14&amp;l=sp&amp;t=sj&amp;a=search-v3</value><value>/jobs/p/kswGaw4kXTe8-Administrator-Kreston-Cairo-Egypt?o=15&amp;l=sp&amp;t=sj&amp;a=search-v3</value></jobURL></item>
</items>
# WuzzufSpider

import scrapy
from ..items import WuzzufscraperItem
from links import jobURL


class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'
    start_urls = [jobURL[0]]

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        title = response.css('').extract()
        company = response.css('').extract()
        location = response.css('').extract()
        country = response.css('').extract()
        date = response.css('').extract()
        careerLevel = response.css('').extract()
        experienceNeeded = response.css('').extract()
        jobType = response.css('').extract()
        jobFunction = response.css('').extract()
        salary = response.css('').extract()
        description = response.css('').extract()
        requirements = response.css('').extract()
        skills = response.css('').extract()
        industry = response.css('').extract()
        jobURL = response.css('').extract()
        
        # next_page and if statement here

Is there a way to make the second approach work, or is there a different approach entirely?

I've looked at the threads Scrapy: Pass data between 2 spiders and Pass scraped URL's from one spider to another. I know I could do all the work in a single spider, and that saving to a database or a temporary file is one way to send data to another spider. However, I'm not very experienced yet and don't understand how to implement such a change, so marking this question as a duplicate wouldn't help me. Thanks for your help.
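(For reference, a minimal sketch of the "temporary file" idea mentioned above, assuming the first spider's output is exported as JSON with scrapy crawl WuzzufLinks -O links.json rather than the XML shown earlier; the -O overwrite flag needs Scrapy 2.1+, older versions only have -o, which appends. A plain "from links import jobURL" cannot work, because links.xml is a data file, not a Python module, but the exported list can be loaded back like this:)

import json
import scrapy
from ..items import WuzzufscraperItem


class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'

    def start_requests(self):
        # links.json is assumed to be the feed exported by the first spider
        with open('links.json') as f:
            scraped = json.load(f)
        # each exported item holds a list of relative urls under 'jobURL'
        for item in scraped:
            for url in item['jobURL']:
                # the scraped hrefs are relative, so prepend the site root
                yield scrapy.Request('https://wuzzuf.net' + url, callback=self.parse)

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors as before
        yield items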

First of all, you can keep scraping the urls from the same spider, and honestly I don't see any reason not to.

Anyway, if you really want two spiders, where the output of the first is the input of the second, you can do something like this:

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.signalmanager import dispatcher
from scrapy import signals
from twisted.internet import reactor, defer


# grab all the products urls
class ExampleSpider(scrapy.Spider):
    name = "exampleSpider"
    start_urls = ['https://scrapingclub.com/exercise/list_basic']

    def parse(self, response):
        all_urls = response.xpath('//div[@class="card"]/a/@href').getall()
        for url in all_urls:
            yield {'url': 'https://scrapingclub.com' + url}


# get the product's details
class ExampleSpider2(scrapy.Spider):
    name = "exampleSpider2"

    def parse(self, response):
        title = response.xpath('//h3/text()').get()
        price = response.xpath('//div[@class="card-body"]//h4//text()').get()
        yield {
            'title': title,
            'price': price
        }


if __name__ == "__main__":
    # this will be the yielded items from the first spider
    output = []

    def get_output(item):
        output.append(item)

    configure_logging()
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    runner = CrawlerRunner(settings)

    # run spiders sequentially
    # (https://docs.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process)
    @defer.inlineCallbacks
    def crawl():
        dispatcher.connect(get_output, signal=signals.item_scraped)
        yield runner.crawl(ExampleSpider)   # pass the class, since these spiders aren't registered in the project's SPIDER_MODULES
        urls = [url['url'] for url in output]   # create a list of the urls from the first spider

        # crawl the second spider with the urls from the first spider
        yield runner.crawl(ExampleSpider2, start_urls=urls)
        reactor.stop()

    crawl()
    reactor.run()

Run it and you'll see that you first get the results from the first spider, and that those results are then passed as start_urls to the second spider.
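To try it, save the script somewhere inside the Scrapy project (so get_project_settings() can find settings.py; the filename run_spiders.py below is just an assumption) and run it directly with Python instead of through scrapy crawl:

python run_spiders.py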

EDIT:

This does all the work in the same spider. Notice how we loop over all the urls and scrape each of them in the parse_item function. I've filled in a few of the values you want to scrape as examples, so just fill in the rest.

import scrapy
# from ..items import WuzzufscraperItem


class WuzzufscraperItem(scrapy.Item):
    title = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    country = scrapy.Field()
    jobURL = scrapy.Field()
    date = scrapy.Field()
    careerLevel = scrapy.Field()
    experienceNeeded = scrapy.Field()
    jobType = scrapy.Field()
    jobFunction = scrapy.Field()
    salary = scrapy.Field()
    description = scrapy.Field()
    requirements = scrapy.Field()
    skills = scrapy.Field()
    industry = scrapy.Field()


class WuzzuflinksSpider(scrapy.Spider):
    name = 'WuzzufLinks'
    page_number = 1
    start_urls = ['https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt']

    def parse(self, response):
        all_urls = response.css('h2[class=css-m604qf] a::attr(href)').getall()

        if all_urls:
            for url in all_urls:
                yield response.follow(url=url, callback=self.parse_item)

        next_page = 'https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt&start=' + str(WuzzuflinksSpider.page_number)

        if WuzzuflinksSpider.page_number <= 100:
            yield response.follow(next_page)
            WuzzuflinksSpider.page_number += 1

    def parse_item(self, response):
        items = WuzzufscraperItem()
        # CSS selectors

        # Some values as an example:
        items['title'] = response.xpath('(//h1)[last()]/text()').get(default='')
        items['company'] = response.xpath('(//a[@class="css-p7pghv"])[last()]/text()').get(default='')
        items['location'] = response.xpath('(//strong[@class="css-9geu3q"])[last()]/text()').get(default='')
        items['country'] = response.xpath('//meta[@property="og:country_name"]/@content').get(default='')
        items['jobURL'] = response.url

        # items['date'] = response.css('').get(default='')
        # items['careerLevel'] = response.css('').get(default='')
        # items['experienceNeeded'] = response.css('').get(default='')
        # items['jobType'] = response.css('').get(default='')
        # items['jobFunction'] = response.css('').get(default='')
        # items['salary'] = response.css('').get(default='')
        # items['description'] = response.css('').get(default='')
        # items['requirements'] = response.css('').get(default='')
        # items['skills'] = response.css('').get(default='')
        # items['industry'] = response.css('').get(default='')

        yield items
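As a usage note, since everything now lives in one spider, crawling and exporting is a single command (again assuming Scrapy 2.1+ for the -O overwrite flag):

scrapy crawl WuzzufLinks -O wuzzuf_jobs.json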