Scrapy: downloading JSON files from a site?
I am trying to create a Scrapy spider that downloads some JSON files from a site.
This is my Scrapy spider:
(I first tested the spider so that it only outputs the links to the JSON files, which works fine; see the commented-out code below.)
But I want to download the JSON files into a folder on my PC.
import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l,
                                     callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2,
                                     callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)
        item = DownfilesItem()
        item['file_urls'] = tmpDownloadLink
        yield item
        # yield {
        #     "link": tmpDownloadLink,
        # }
These are the changes I made in settings.py:
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
IMAGES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
But unfortunately the JSON files are not downloaded.
How can I download the JSON files into the defined folder?
You have two problems:

1. item['file_urls'] should be a list.
2. IMAGES_STORE should be FILES_STORE.
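Concretely, the corrected lines look like this (the FILES_STORE line belongs in settings.py):

item['file_urls'] = [tmpDownloadLink]  # a list of URLs, not a bare string
FILES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'  # FILES_STORE, not IMAGES_STORE

The full, corrected spider: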
import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l,
                                     callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2,
                                     callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field()

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)
        item = DownfilesItem()
        item['file_urls'] = [tmpDownloadLink]
        yield item
        # yield {
        #     "link": tmpDownloadLink,
        # }
Edit:
By default, FilesPipeline stores each file under a full/ subdirectory, named with the SHA-1 hash of its URL. To keep the original file names instead, do the following:
settings.py:
ITEM_PIPELINES = {
    'yourprojectname.pipelines.ProcessPipeline': 1,
}
FILES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
pipelines.py:
import os
from urllib.parse import unquote

from scrapy.pipelines.files import FilesPipeline


class ProcessPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Keep the file name from the end of the download URL
        file_name = os.path.basename(unquote(request.url))
        return file_name
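For example, with a hypothetical download URL (just to illustrate the mapping), the overridden file_path keeps whatever name the URL ends with:

import os
from urllib.parse import unquote

url = "https://kaikki.org/dictionary/Spanish/words/abaco.json"  # hypothetical URL
print(os.path.basename(unquote(url)))  # -> abaco.json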
Edit 2:
Writing the additional information to a file:
import json

import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']
    erg = {}

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l, callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2, callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field()

        # Remember which page each download came from (last breadcrumb -> page URL)
        key = response.xpath('//ul[@class="breadcrumb"]/li[last()]/text()').get()
        self.erg[key] = response.url

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)
        item = DownfilesItem()
        item['file_urls'] = [tmpDownloadLink]
        yield item

    def close(self, reason):
        # Runs when the spider finishes; dump the collected mapping to disk
        with open('erg.json', 'w') as f:
            f.write(json.dumps(self.erg, indent=4))
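Assuming this sits in an ordinary Scrapy project, run the spider from the project root; erg.json is written to the working directory when the crawl finishes:

scrapy crawl spiderWords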