Scrapy动态提要导出
Scrapy dynamic feed export
我正在尝试让 Scrapy 根据输入文件设置导出文件。这是我正在尝试的代码:
import scrapy
class VideosSpider(scrapy.Spider):
name = "videos"
start_urls = []
custom_settings = {
"ITEM_PIPELINES": {
"project.pipelines.VideosPipeline": 200,
"project.pipelines.EpisodeImagesPipeline": 300,
},
"FILES_STORE": "episodes",
"IMAGES_STORE": "episodes",
}
def __init__(self, urls_file=None, *args, **kwargs):
# urls_file is a list of links (one per line)
if urls_file:
with open(urls_file, "r") as f:
next(f) # skip first line (header)
self.start_urls = [url.strip() for url in f.readlines()]
self.custom_settings["FEEDS"] = {
Path(urls_file).with_suffix(".txt"): {
"format": "custom",
},
} # ideally would set the feeds with names corresponding to input file
然而,上面的结果导致管道中的空 FEEDS
设置:
class VideosPipeline:
def __init__(self, uri, files):
self.uri = uri
self.files = files
@classmethod
def from_crawler(cls, crawler):
return cls(
uri=crawler.settings.get("FEEDS"), # this is empty
files=crawler.settings.get("FILES_STORE"),
)
def open_spider(self, spider):
feeds = [k for k, v in self.uri.items() if k["format"] == "cistom"]
self.exporter = CustomExporter(feeds[0])
还有来自自定义导出器的一小段代码:
from scrapy.exporters import BaseItemExporter
class CustomExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
我哪里错了?
感谢您的帮助。
您可以在 FEED
设置中使用 URI 参数 来动态设置扩展文件路径。它使您能够使用任何蜘蛛属性来组成 FEED
值。
从文档中查看这一部分
任何其他命名参数都将替换为同名的 spider 属性。例如,在创建提要时,%(site_id)s 将被 spider.site_id 属性替换。
https://docs.scrapy.org/en/latest/topics/feed-exports.html#storage-uri-parameters
因此,在您的示例中,它类似于(请注意,我还更改了文件扩展名,因为 txt
不是标准序列化格式:
class VideosSpider(scrapy.Spider):
name = "videos"
start_urls = []
custom_settings = {
"ITEM_PIPELINES": {
"project.pipelines.VideosPipeline": 200,
"project.pipelines.EpisodeImagesPipeline": 300,
},
"FILES_STORE": "episodes",
"IMAGES_STORE": "episodes",
"FEED": {"file:///somepath/%(exported_file)s" : {"format": "csv"}}
}
def __init__(self, urls_file=None, *args, **kwargs):
self.exported_file = str(Path(urls_file).with_suffix(".csv"))
我正在尝试让 Scrapy 根据输入文件设置导出文件。这是我正在尝试的代码:
import scrapy
class VideosSpider(scrapy.Spider):
name = "videos"
start_urls = []
custom_settings = {
"ITEM_PIPELINES": {
"project.pipelines.VideosPipeline": 200,
"project.pipelines.EpisodeImagesPipeline": 300,
},
"FILES_STORE": "episodes",
"IMAGES_STORE": "episodes",
}
def __init__(self, urls_file=None, *args, **kwargs):
# urls_file is a list of links (one per line)
if urls_file:
with open(urls_file, "r") as f:
next(f) # skip first line (header)
self.start_urls = [url.strip() for url in f.readlines()]
self.custom_settings["FEEDS"] = {
Path(urls_file).with_suffix(".txt"): {
"format": "custom",
},
} # ideally would set the feeds with names corresponding to input file
然而,上面的结果导致管道中的空 FEEDS
设置:
class VideosPipeline:
def __init__(self, uri, files):
self.uri = uri
self.files = files
@classmethod
def from_crawler(cls, crawler):
return cls(
uri=crawler.settings.get("FEEDS"), # this is empty
files=crawler.settings.get("FILES_STORE"),
)
def open_spider(self, spider):
feeds = [k for k, v in self.uri.items() if k["format"] == "cistom"]
self.exporter = CustomExporter(feeds[0])
还有来自自定义导出器的一小段代码:
from scrapy.exporters import BaseItemExporter
class CustomExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
我哪里错了?
感谢您的帮助。
您可以在 FEED
设置中使用 URI 参数 来动态设置扩展文件路径。它使您能够使用任何蜘蛛属性来组成 FEED
值。
从文档中查看这一部分
任何其他命名参数都将替换为同名的 spider 属性。例如,在创建提要时,%(site_id)s 将被 spider.site_id 属性替换。 https://docs.scrapy.org/en/latest/topics/feed-exports.html#storage-uri-parameters
因此,在您的示例中,它类似于(请注意,我还更改了文件扩展名,因为 txt
不是标准序列化格式:
class VideosSpider(scrapy.Spider):
name = "videos"
start_urls = []
custom_settings = {
"ITEM_PIPELINES": {
"project.pipelines.VideosPipeline": 200,
"project.pipelines.EpisodeImagesPipeline": 300,
},
"FILES_STORE": "episodes",
"IMAGES_STORE": "episodes",
"FEED": {"file:///somepath/%(exported_file)s" : {"format": "csv"}}
}
def __init__(self, urls_file=None, *args, **kwargs):
self.exported_file = str(Path(urls_file).with_suffix(".csv"))