Scrapy: problem getting all the information

I want to use Scrapy to build a database of dish names, ingredients and preparation times. I can already collect the dish names across the 7 pages, but I don't know how to go on and tell the script to enter each dish and fetch its ingredients and preparation time.

import os
import logging

import scrapy
from scrapy.crawler import CrawlerProcess

class RecipesSpider(scrapy.Spider):
    name = 'recipes'
    
    start_urls = [
        "https://www.cuisineaz.com/categories/plats/fast-food-cat48849",
    ]
    
    def parse(self, response):
        recipes = response.css('div.tile_content')
        for recipe in recipes:
            yield {
                'name': recipe.css('a.tile_title::text').get(),
            }
            
        try:
            next_page = response.css('li.pagination-next a').attrib["href"]
        except KeyError:
            logging.info("No next page. Terminating crawling process.")
        else:
            yield response.follow(next_page, callback=self.parse)


# Name of the file where the results will be saved
filename = "2_cuisineaz.json"

# If file already exists, delete it before crawling (because Scrapy will concatenate the last and new results otherwise)
if filename in os.listdir('/Users/pierreduval/Desktop/Test'):
    os.remove('/Users/pierreduval/Desktop/Test/' + filename)

# Declare a new CrawlerProcess with some settings
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        'results/' + filename : {"format": "json"},
    }
})

# Start the crawling using the spider you defined above
process.crawl(RecipesSpider)
process.start()

Thank you very much! Have a nice day.

My code

The page that I have scraped (names of the dishes)

The information that I want when you click on a dish's image

I've added comments; read them, search the documentation, and if you still don't understand something, ask me.

import logging
import scrapy
from scrapy.crawler import CrawlerProcess


class RecipesSpider(scrapy.Spider):
    name = 'recipes'

    start_urls = ["https://www.cuisineaz.com/categories/plats/fast-food-cat48849"]

    def parse(self, response):
        # list of all the recipes urls on the page
        all_links = response.xpath('//section[@id="listRecettes"]//article/div/a/@href').getall()
        for link in all_links:
            # request recipe page
            yield scrapy.Request(url=f'https://www.cuisineaz.com{link}', callback=self.parse_recipe)

        # get the url of the next page
        next_page = response.xpath('//li[@class="pagination-next"]/a/@href').get()
        if next_page:
            # the default callback is 'parse'
            yield scrapy.Request(url=self.start_urls[0]+next_page)

    # parse the recipe page
    def parse_recipe(self, response):
        ingredients = response.xpath('//section[contains(@class, "ingredients")]/ul/li/span/text()').getall()
        preparation_time = response.xpath('//section[contains(@class, "instructions")]//li/span/text()').get()

        yield {
            'ingredients': ingredients,
            'preparation_time': preparation_time,
        }

# Name of the file where the results will be saved
filename = "2_cuisineaz.json"

# Declare a new CrawlerProcess with some settings
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        f'results/{filename}': {"format": "json",
                                'overwrite': True,  # instead of checking if the file exists just overwrite it
                                'encoding': 'utf8'
                                },
    }
})

# Start the crawling using the spider you defined above
process.crawl(RecipesSpider)
process.start()
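
Since the original goal was to store the name together with the ingredients and the preparation time, one possible extension of the spider above is to read the title from the listing page and forward it to parse_recipe through cb_kwargs. This is only a sketch, not tested against the site: the spider name recipes_with_names is hypothetical, the assumption that the anchor text inside //article/div/a is the recipe title is mine, and cb_kwargs needs Scrapy 1.7 or newer.

import scrapy


class RecipesWithNamesSpider(scrapy.Spider):
    # Variant of RecipesSpider above that also keeps the dish name (hypothetical name).
    name = 'recipes_with_names'

    start_urls = ["https://www.cuisineaz.com/categories/plats/fast-food-cat48849"]

    def parse(self, response):
        # keep the whole <a> node so we can read both its href and its text
        # (assumption: the anchor text is the recipe title)
        for link in response.xpath('//section[@id="listRecettes"]//article/div/a'):
            href = link.xpath('@href').get()
            title = link.xpath('normalize-space(.)').get()
            yield scrapy.Request(
                url=f'https://www.cuisineaz.com{href}',
                callback=self.parse_recipe,
                cb_kwargs={'name': title},  # forwarded to parse_recipe as the 'name' argument
            )

        # pagination, same as in the answer above
        next_page = response.xpath('//li[@class="pagination-next"]/a/@href').get()
        if next_page:
            yield scrapy.Request(url=self.start_urls[0] + next_page)

    def parse_recipe(self, response, name):
        # 'name' arrives here through cb_kwargs
        yield {
            'name': name,
            'ingredients': response.xpath('//section[contains(@class, "ingredients")]/ul/li/span/text()').getall(),
            'preparation_time': response.xpath('//section[contains(@class, "instructions")]//li/span/text()').get(),
        }

You would then pass RecipesWithNamesSpider to process.crawl(...) instead of RecipesSpider; the FEEDS settings stay the same.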