Scrapy: problems getting all the information
I want to build a database with Scrapy that contains the dish names, the ingredients and the preparation time. I can already get the dish names across the 7 pages, but I don't know how to go further and tell the script to open each dish and fetch its ingredients and preparation time.
import os
import logging

import scrapy
from scrapy.crawler import CrawlerProcess


class RecipesSpider(scrapy.Spider):
    name = 'recipes'

    start_urls = [
        "https://www.cuisineaz.com/categories/plats/fast-food-cat48849",
    ]

    def parse(self, response):
        recipes = response.css('div.tile_content')
        for recipe in recipes:
            yield {
                'name': recipe.css('a.tile_title::text').get(),
            }

        try:
            next_page = response.css('li.pagination-next a').attrib["href"]
        except KeyError:
            logging.info("No next page. Terminating crawling process.")
        else:
            yield response.follow(next_page, callback=self.parse)


# Name of the file where the results will be saved
filename = "2_cuisineaz.json"

# If the file already exists, delete it before crawling (otherwise Scrapy appends the new results to the old ones)
if filename in os.listdir('/Users/pierreduval/Desktop/Test'):
    os.remove('/Users/pierreduval/Desktop/Test/' + filename)

# Declare a new CrawlerProcess with some settings
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        'results/' + filename: {"format": "json"},
    }
})

# Start the crawling using the spider defined above
process.crawl(RecipesSpider)
process.start()
Thank you very much!
Have a nice day.
My code
The page that I have scraped (names of the dishes)
The information that I want when you click on a dish's image
I have added comments: read them, look through the documentation, and if you still don't understand something, ask me.
import logging

import scrapy
from scrapy.crawler import CrawlerProcess


class RecipesSpider(scrapy.Spider):
    name = 'recipes'
    start_urls = ["https://www.cuisineaz.com/categories/plats/fast-food-cat48849"]

    def parse(self, response):
        # list of all the recipe urls on the page
        all_links = response.xpath('//section[@id="listRecettes"]//article/div/a/@href').getall()
        for link in all_links:
            # request the recipe page
            yield scrapy.Request(url=f'https://www.cuisineaz.com{link}', callback=self.parse_recipe)

        # get the url of the next page
        next_page = response.xpath('//li[@class="pagination-next"]/a/@href').get()
        if next_page:
            # the default callback is 'parse'
            yield scrapy.Request(url=self.start_urls[0] + next_page)

    # parse the recipe page
    def parse_recipe(self, response):
        ingredients = response.xpath('//section[contains(@class, "ingredients")]/ul/li/span/text()').getall()
        preparation_time = response.xpath('//section[contains(@class, "instructions")]//li/span/text()').get()
        yield {
            'ingredients': ingredients,
            'preparation_time': preparation_time,
        }


# Name of the file where the results will be saved
filename = "2_cuisineaz.json"

# Declare a new CrawlerProcess with some settings
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        f'results/{filename}': {
            "format": "json",
            'overwrite': True,  # instead of checking whether the file exists, just overwrite it
            'encoding': 'utf8'
        },
    }
})

# Start the crawling using the spider defined above
process.crawl(RecipesSpider)
process.start()
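
Note that parse_recipe above yields the ingredients and the preparation time but not the dish name, which the question also asked for. Below is a minimal sketch of one way to carry the name from the listing page into the recipe item, using the cb_kwargs argument of response.follow. The div.tile_content / a.tile_title selectors are taken from the question's code, and the assumption that the title link also carries the recipe href is not verified against the live page:

import scrapy


class RecipesWithNamesSpider(scrapy.Spider):
    name = 'recipes_with_names'
    start_urls = ["https://www.cuisineaz.com/categories/plats/fast-food-cat48849"]

    def parse(self, response):
        for tile in response.css('div.tile_content'):
            dish_name = tile.css('a.tile_title::text').get()
            # assumed: the title link points to the recipe page
            link = tile.css('a.tile_title::attr(href)').get()
            if link:
                # forward the name to the recipe callback
                yield response.follow(link, callback=self.parse_recipe,
                                      cb_kwargs={'dish_name': dish_name})

        # response.follow resolves relative urls, so no manual concatenation is needed
        next_page = response.xpath('//li[@class="pagination-next"]/a/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_recipe(self, response, dish_name):
        yield {
            'name': dish_name,
            'ingredients': response.xpath('//section[contains(@class, "ingredients")]/ul/li/span/text()').getall(),
            'preparation_time': response.xpath('//section[contains(@class, "instructions")]//li/span/text()').get(),
        }

Whatever is passed in cb_kwargs arrives as a keyword argument of the callback, so each item keeps the name scraped on the listing page together with the details scraped on the recipe page.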