创建一个通用的 scrapy 蜘蛛和多个特定的
Creating one generic scrapy spider and multiple specific
我正在尝试创建一个负责最常见任务的通用蜘蛛和继承通用蜘蛛并声明网站特定变量的特定蜘蛛。
有genericspider.py
:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider, CrawlSpider
class GenericProductSpider(scrapy.Spider):
    """Generic base spider holding the logic shared by site-specific spiders.

    Subclasses supply the site-specific values (start_urls, finditemprop,
    keywords) and inherit the common crawling behaviour.
    """

    def __init__(self, start_urls=None, finditemprop='', keywords='', **kwargs):
        # Chain to the actual base class (scrapy.Spider). The original called
        # CrawlSpider.__init__ even though this class inherits scrapy.Spider,
        # which bypasses the correct MRO chain.
        super(GenericProductSpider, self).__init__(**kwargs)
        # start_urls=None instead of start_urls=[] avoids the shared
        # mutable-default-argument pitfall; the stored value is still a list.
        self.start_urls = list(start_urls) if start_urls else []
        # Store the remaining parameters so subclasses / parse() can use them
        # (the original accepted them but silently discarded them).
        self.finditemprop = finditemprop
        self.keywords = keywords
        print ( "\n\n Init Generic \n" )
然后我在与通用蜘蛛相同的目录中创建了 specificspider.py
。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider, CrawlSpider
from .genericfabric import GenericFabricsSpider
class SpecificSpider(GenericProductSpider):
    """Site-specific spider: declares only its name and start URLs."""

    # `name` must be a class attribute so scrapy can discover and run the
    # spider; assigning it to a local variable inside __init__ (as the
    # original did) has no effect at all.
    name = "specific1"

    def __init__(self, **kwargs):
        print ( "\n init specific \n" )
        start_urls = ['http://www.specificdomian.com',]
        # Pass by keyword: the original passed `name` positionally, so it
        # landed in the parent's `start_urls` parameter and shifted every
        # following argument one slot to the right.
        super(SpecificSpider, self).__init__(start_urls=start_urls, **kwargs)
我似乎不明白如何正确调用超类的初始化程序。我收到各种错误消息,但从未执行过通用蜘蛛的 init 方法。
实际上..它似乎工作正常 - 可能只是参数有问题。
超类(superclass)的工作代码:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from test.items import TestItem
class TestsuperSpider(Spider):
    """Minimal superclass spider used to demonstrate inheritance."""

    name = "testsuper"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]
    # Value read by supermethod(); subclasses inherit it unchanged.
    supervar = "meine super var"

    def __init__(self, *args, **kwargs):
        # Chain to scrapy.Spider.__init__ so scrapy's own initialisation
        # (name/kwargs handling) still runs; the original omitted this,
        # silently skipping the framework's setup.
        super(TestsuperSpider, self).__init__(*args, **kwargs)
        print ( "super init" )

    def parse(self, response):
        print ( "super Parse" )

    def supermethod(self, subvar):
        """Print the inherited supervar next to the value passed by a subclass."""
        print ( "\n\n Supermethod \n\n " )
        print ( self.supervar + " - " + subvar )
和子类(subclass)的代码:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from test.items import TestItem
from test.spiders.testsuper import TestsuperSpider
class TestsubSpider(TestsuperSpider):
    """Subclass spider overriding parse() and reusing supermethod()."""

    name = "testsub"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]
    # Value handed to the inherited supermethod() from parse().
    subvar = "subvar"

    def __init__(self, *args, **kwargs):
        print ( "sub init" )
        # Forward any scrapy-supplied arguments up the chain; the original
        # signature accepted none, so framework kwargs would raise TypeError.
        super(TestsubSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        # supermethod is not overridden here, so plain self.supermethod(...)
        # would be equivalent; super() is kept to show the explicit chain.
        super(TestsubSpider, self).supermethod(self.subvar)
        print ( "sub Parse" )
现在它需要清理并根据其目的进行调整,但至少代码按预期运行。
我正在尝试创建一个负责最常见任务的通用蜘蛛和继承通用蜘蛛并声明网站特定变量的特定蜘蛛。
有genericspider.py
:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider, CrawlSpider
class GenericProductSpider(scrapy.Spider):
    """Generic base spider holding the logic shared by site-specific spiders.

    Subclasses supply the site-specific values (start_urls, finditemprop,
    keywords) and inherit the common crawling behaviour.
    """

    def __init__(self, start_urls=None, finditemprop='', keywords='', **kwargs):
        # Chain to the actual base class (scrapy.Spider). The original called
        # CrawlSpider.__init__ even though this class inherits scrapy.Spider,
        # which bypasses the correct MRO chain.
        super(GenericProductSpider, self).__init__(**kwargs)
        # start_urls=None instead of start_urls=[] avoids the shared
        # mutable-default-argument pitfall; the stored value is still a list.
        self.start_urls = list(start_urls) if start_urls else []
        # Store the remaining parameters so subclasses / parse() can use them
        # (the original accepted them but silently discarded them).
        self.finditemprop = finditemprop
        self.keywords = keywords
        print ( "\n\n Init Generic \n" )
然后我在与通用蜘蛛相同的目录中创建了 specificspider.py
。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider, CrawlSpider
from .genericfabric import GenericFabricsSpider
class SpecificSpider(GenericProductSpider):
    """Site-specific spider: declares only its name and start URLs."""

    # `name` must be a class attribute so scrapy can discover and run the
    # spider; assigning it to a local variable inside __init__ (as the
    # original did) has no effect at all.
    name = "specific1"

    def __init__(self, **kwargs):
        print ( "\n init specific \n" )
        start_urls = ['http://www.specificdomian.com',]
        # Pass by keyword: the original passed `name` positionally, so it
        # landed in the parent's `start_urls` parameter and shifted every
        # following argument one slot to the right.
        super(SpecificSpider, self).__init__(start_urls=start_urls, **kwargs)
我似乎不明白如何正确调用超类的初始化程序。我收到各种错误消息,但从未执行过通用蜘蛛的 init 方法。
实际上..它似乎工作正常 - 可能只是参数有问题。
超类(superclass)的工作代码:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from test.items import TestItem
class TestsuperSpider(Spider):
    """Minimal superclass spider used to demonstrate inheritance."""

    name = "testsuper"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]
    # Value read by supermethod(); subclasses inherit it unchanged.
    supervar = "meine super var"

    def __init__(self, *args, **kwargs):
        # Chain to scrapy.Spider.__init__ so scrapy's own initialisation
        # (name/kwargs handling) still runs; the original omitted this,
        # silently skipping the framework's setup.
        super(TestsuperSpider, self).__init__(*args, **kwargs)
        print ( "super init" )

    def parse(self, response):
        print ( "super Parse" )

    def supermethod(self, subvar):
        """Print the inherited supervar next to the value passed by a subclass."""
        print ( "\n\n Supermethod \n\n " )
        print ( self.supervar + " - " + subvar )
和子类(subclass)的代码:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from test.items import TestItem
from test.spiders.testsuper import TestsuperSpider
class TestsubSpider(TestsuperSpider):
    """Subclass spider overriding parse() and reusing supermethod()."""

    name = "testsub"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]
    # Value handed to the inherited supermethod() from parse().
    subvar = "subvar"

    def __init__(self, *args, **kwargs):
        print ( "sub init" )
        # Forward any scrapy-supplied arguments up the chain; the original
        # signature accepted none, so framework kwargs would raise TypeError.
        super(TestsubSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        # supermethod is not overridden here, so plain self.supermethod(...)
        # would be equivalent; super() is kept to show the explicit chain.
        super(TestsubSpider, self).supermethod(self.subvar)
        print ( "sub Parse" )
现在它需要清理并根据其目的进行调整,但至少代码按预期运行。