Scrapy 不解析数据
Scrapy not parsing data
我是 Scrapy 的新手,正在尝试抓取我最喜欢的球队的比分并保存到 JSON 文件中。但是,生成的 JSON 文件始终是空的。
这是我的代码:
import scrapy
from scrapy.crawler import CrawlerProcess
class SoccerwaySpider(scrapy.Spider):
    """Spider that requests the Soccerway team page listed in ``start_urls``.

    This is the version from the question and is kept as posted: ``parse``
    does not emit the score (see the note inside ``parse``).
    """

    name="Soccerway"
    start_urls = ['https://fr.soccerway.com/teams/france/olympique-de-marseille/890/']

    def start_requests(self):
        """Yield one request per start URL, sent with a Firefox User-Agent header."""
        headers= {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self,response):
        """Callback for the team page response."""
        # NOTE(review): a bare `yield` produces None. The dict literal below
        # starts on its own line, so it is a separate expression statement
        # whose value is discarded -- no item ever carries the score.
        yield
        {
            'score':str.strip(response.css("table.matches").css('td.score-time.score').css('a::text').get()),
        }
# Run the spider from a plain script: items are exported to Soccerway.json
# in JSON format through the FEEDS setting.
process = CrawlerProcess(settings={"FEEDS": {"Soccerway.json": {"format": "json"}}})
process.crawl(SoccerwaySpider)
# start() is called last, after the crawl has been scheduled.
process.start()
提前致谢!
出现这个问题是因为您把 `{` 放在了错误的位置。它必须和 `yield` 位于同一行:
# The opening brace is on the same line as `yield`, so the dict is the yielded value.
yield {
    'score': ...,
}
如果把 `{` 放在下一行,Python 会把它当作两条独立的语句:
# statement 1 - a bare `yield` with no value, so it yields None
yield
# statement 2 - builds a dictionary that is neither assigned nor yielded, so it is discarded
{
    'score': ...,
}
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class SoccerwaySpider(scrapy.Spider):
    """Spider that scrapes a match score from a Soccerway team page.

    Yields a single item ``{'score': <text>}`` taken from the first score
    link found in the ``table.matches`` table of the page.
    """

    name = "Soccerway"
    start_urls = ['https://fr.soccerway.com/teams/france/olympique-de-marseille/890/']
    # Export every yielded item to Soccerway.json in JSON format.
    custom_settings = {"FEEDS": {"Soccerway.json": {"format": "json"}}}

    def start_requests(self):
        """Yield one request per start URL, sent with a Firefox User-Agent header."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'
        }
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        """Yield the first score found on the page, stripped of whitespace.

        If the selector matches nothing, a warning is logged and no item is
        yielded.
        """
        score = (
            response.css("table.matches")
            .css('td.score-time.score')
            .css('a::text')
            .get()
        )
        # .get() returns None when nothing matches; stripping None would
        # raise TypeError, so skip the item instead.
        if score is None:
            self.logger.warning("No score found on %s", response.url)
            return
        # The opening brace must stay on the same line as `yield`, otherwise
        # the dict becomes a separate statement and is never emitted.
        yield {
            'score': score.strip(),
        }
if __name__ == "__main__":
    # Load the project settings; the JSON feed export itself is configured
    # by the spider's custom_settings.
    process = CrawlerProcess(get_project_settings())
    # Pass the spider class instead of its name: lookup by name only works
    # when the project settings make the spider discoverable, whereas the
    # class works both inside a project and in a standalone script.
    process.crawl(SoccerwaySpider)
    process.start()
Soccerway.json:
[
{"score": "2 - 2"}
]
我是 Scrapy 的新手,正在尝试抓取我最喜欢的球队的比分并保存到 JSON 文件中。但是,生成的 JSON 文件始终是空的。
这是我的代码:
import scrapy
from scrapy.crawler import CrawlerProcess
class SoccerwaySpider(scrapy.Spider):
    """Spider that requests the Soccerway team page listed in ``start_urls``.

    This is the version from the question and is kept as posted: ``parse``
    does not emit the score (see the note inside ``parse``).
    """

    name="Soccerway"
    start_urls = ['https://fr.soccerway.com/teams/france/olympique-de-marseille/890/']

    def start_requests(self):
        """Yield one request per start URL, sent with a Firefox User-Agent header."""
        headers= {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self,response):
        """Callback for the team page response."""
        # NOTE(review): a bare `yield` produces None. The dict literal below
        # starts on its own line, so it is a separate expression statement
        # whose value is discarded -- no item ever carries the score.
        yield
        {
            'score':str.strip(response.css("table.matches").css('td.score-time.score').css('a::text').get()),
        }
# Run the spider from a plain script: items are exported to Soccerway.json
# in JSON format through the FEEDS setting.
process = CrawlerProcess(settings={"FEEDS": {"Soccerway.json": {"format": "json"}}})
process.crawl(SoccerwaySpider)
# start() is called last, after the crawl has been scheduled.
process.start()
提前致谢!
出现这个问题是因为您把 `{` 放在了错误的位置。它必须和 `yield` 位于同一行:
# The opening brace is on the same line as `yield`, so the dict is the yielded value.
yield {
    'score': ...,
}
如果把 `{` 放在下一行,Python 会把它当作两条独立的语句:
# statement 1 - a bare `yield` with no value, so it yields None
yield
# statement 2 - builds a dictionary that is neither assigned nor yielded, so it is discarded
{
    'score': ...,
}
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class SoccerwaySpider(scrapy.Spider):
    """Spider that scrapes a match score from a Soccerway team page.

    Yields a single item ``{'score': <text>}`` taken from the first score
    link found in the ``table.matches`` table of the page.
    """

    name = "Soccerway"
    start_urls = ['https://fr.soccerway.com/teams/france/olympique-de-marseille/890/']
    # Export every yielded item to Soccerway.json in JSON format.
    custom_settings = {"FEEDS": {"Soccerway.json": {"format": "json"}}}

    def start_requests(self):
        """Yield one request per start URL, sent with a Firefox User-Agent header."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'
        }
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        """Yield the first score found on the page, stripped of whitespace.

        If the selector matches nothing, a warning is logged and no item is
        yielded.
        """
        score = (
            response.css("table.matches")
            .css('td.score-time.score')
            .css('a::text')
            .get()
        )
        # .get() returns None when nothing matches; stripping None would
        # raise TypeError, so skip the item instead.
        if score is None:
            self.logger.warning("No score found on %s", response.url)
            return
        # The opening brace must stay on the same line as `yield`, otherwise
        # the dict becomes a separate statement and is never emitted.
        yield {
            'score': score.strip(),
        }
if __name__ == "__main__":
    # Load the project settings; the JSON feed export itself is configured
    # by the spider's custom_settings.
    process = CrawlerProcess(get_project_settings())
    # Pass the spider class instead of its name: lookup by name only works
    # when the project settings make the spider discoverable, whereas the
    # class works both inside a project and in a standalone script.
    process.crawl(SoccerwaySpider)
    process.start()
Soccerway.json:
[
{"score": "2 - 2"}
]