Python - Scrapy 爬取 myrecipes.com 问题
Python - Scrapy crawling myrecipes.com issue
我正在尝试抓取食谱网站:我的 recipes.com 以提取我将存储在 android 应用程序中的 sqlite 数据库中的食谱详细信息。除了食谱持续时间之外,我能够提取所有食谱细节。这里的问题不是所有的食谱都采用相同的格式;有些包含烹饪时间和准备时间,有些包含总时间,有些则没有。下面是我用来抓取网站的代码加上我针对持续时间的 html 代码。
我尝试 运行 代码,但输出不会注册。我怀疑问题出在 if-else 语句中,我需要它来说明不同的配方格式。任何帮助将不胜感激。
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from myrecipes.items import MyrecipesRecipe, Ingredient, Nutrients
class MyrecipesSpider(CrawlSpider):
name = "myrecipes" # name of the spider to be used when crawling
allowed_domains = ["myrecipes.com"] # where the spider is allowed to go
start_urls = ["http://www.myrecipes.com/recipe/indian-chickpea-vegetable-stew"]
def parse(self, response):
sel = Selector(response) # the selector
recipe = MyrecipesRecipe()
# Name
recipe['name'] = sel.xpath("substring-before(//title/text(),' Recipe')").extract()
# Cuisine
recipe['cuisine'] = "Indian"
# Ingredients
ingredients = []
ingredient_nodes = sel.xpath('//*[@class = "panel-pane pane-entity-field pane-node-field-ingredients"]/div/div')
for ingredient_node in ingredient_nodes:
try:
name = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "name"]/text()').extract()
quantity = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "amount"]/text()').extract()
except:
continue
ingredient = Ingredient()
ingredient['name'] = name
ingredient['quantity'] = quantity
ingredients.append(ingredient)
recipe['ingredients'] = ingredients
# Directions
instructions = []
instruction_nodes = sel.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]')
for instruction_node in instruction_nodes:
try:
instruction_step = instruction_node.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]/*/text()').extract()
except:
continue
instructions.append(instruction_step)
recipe['instructions'] = instructions
# Nutritional Info
nutrients = []
nutrient_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-nutrition-data"]/div/div[@itemprop = "nutrition"]')
for nutrient_node in nutrient_nodes:
try:
name = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains (@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/text()').extract()
quantity = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains(@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/span/text()').extract()
except:
continue
nutrient = Nutrients()
nutrient['name'] = name
nutrient['quantity'] = quantity
nutrients.append(nutrient)
nutrient_name = []
x = nutrients[0].get('name')
for i in x:
if i != "\n":
nutrient_name.append(i)
nutrients[0]['name'] = nutrient_name
recipe['nutrients'] = nutrients
# Recipe time
duration_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div[@class = "field-collection-container clearfix"]')
for duration_node in duration_nodes:
try:
path = duration_node.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div/div[@class = "field-recipe-time"]/div/div/span[1]/text()').extract()
if path == 'Prep: ':
recipe['prep_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
elif path == 'Cook: ':
recipe['cook_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
elif path == 'Total: ':
recipe['total_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
except:
continue
# Number of Servings
recipe['servings'] = sel.xpath("substring-after(//div[@class = 'panel-pane pane-entity-field pane-node-field-yield']/div[@class = 'pane-content']/div[@itemprop = 'yield']/div[@class = 'field-yield']/text(), ': ')").extract()
return recipe
HTML 片段:
<div class="panel-pane pane-entity-field pane-node-field-recipe-time recipe-time">
<h2 class="pane-title">Recipe Time</h2>
<div class="pane-content">
<div class="field-collection-container clearfix">
<div class="field-recipe-time">
<div class="field-collection-view clearfix view-mode-recipe-time">
<div class="recipe-time-info">
<span class="recipe-time-text">Prep: </span>
<span class="recipe-time-duration">25 Minutes</span>
</div>
</div> </div>
<div class="field-recipe-time">
<div class="field-collection-view clearfix view-mode-recipe-time field-collection-view-final">
<div class="recipe-time-info">
<span class="recipe-time-text">Cook: </span>
<span class="recipe-time-duration">45 Minutes</span>
</div>
</div> </div>
</div> </div>
</div>
问题出在您的 xpath 构造上。更多时候,当有很多 html 元素时,丢失任何元素的机会都会增加。如果您觉得构造相对 xpath 有困难,我建议您使用浏览器的 xpath select 或者。您可以右键单击该元素和 select 相对的 xpath。试试这个:
duration_nodes = sel.xpath('//*[@id="block-system-main"]/div/div[4]/div[1]/div/div[3]/div/div/div')
for duration_node in duration_nodes:
try:
path = ''.join(duration_node.xpath('div/div/span[1]/text()').extract())
if path == 'Prep: ':
recipe['prep_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
elif path == 'Cook: ':
recipe['cook_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
elif path == 'Total: ':
recipe['total_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
except:
continue
我正在尝试抓取食谱网站:我的 recipes.com 以提取我将存储在 android 应用程序中的 sqlite 数据库中的食谱详细信息。除了食谱持续时间之外,我能够提取所有食谱细节。这里的问题不是所有的食谱都采用相同的格式;有些包含烹饪时间和准备时间,有些包含总时间,有些则没有。下面是我用来抓取网站的代码加上我针对持续时间的 html 代码。
我尝试 运行 代码,但输出不会注册。我怀疑问题出在 if-else 语句中,我需要它来说明不同的配方格式。任何帮助将不胜感激。
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from myrecipes.items import MyrecipesRecipe, Ingredient, Nutrients
class MyrecipesSpider(CrawlSpider):
name = "myrecipes" # name of the spider to be used when crawling
allowed_domains = ["myrecipes.com"] # where the spider is allowed to go
start_urls = ["http://www.myrecipes.com/recipe/indian-chickpea-vegetable-stew"]
def parse(self, response):
sel = Selector(response) # the selector
recipe = MyrecipesRecipe()
# Name
recipe['name'] = sel.xpath("substring-before(//title/text(),' Recipe')").extract()
# Cuisine
recipe['cuisine'] = "Indian"
# Ingredients
ingredients = []
ingredient_nodes = sel.xpath('//*[@class = "panel-pane pane-entity-field pane-node-field-ingredients"]/div/div')
for ingredient_node in ingredient_nodes:
try:
name = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "name"]/text()').extract()
quantity = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "amount"]/text()').extract()
except:
continue
ingredient = Ingredient()
ingredient['name'] = name
ingredient['quantity'] = quantity
ingredients.append(ingredient)
recipe['ingredients'] = ingredients
# Directions
instructions = []
instruction_nodes = sel.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]')
for instruction_node in instruction_nodes:
try:
instruction_step = instruction_node.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]/*/text()').extract()
except:
continue
instructions.append(instruction_step)
recipe['instructions'] = instructions
# Nutritional Info
nutrients = []
nutrient_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-nutrition-data"]/div/div[@itemprop = "nutrition"]')
for nutrient_node in nutrient_nodes:
try:
name = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains (@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/text()').extract()
quantity = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains(@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/span/text()').extract()
except:
continue
nutrient = Nutrients()
nutrient['name'] = name
nutrient['quantity'] = quantity
nutrients.append(nutrient)
nutrient_name = []
x = nutrients[0].get('name')
for i in x:
if i != "\n":
nutrient_name.append(i)
nutrients[0]['name'] = nutrient_name
recipe['nutrients'] = nutrients
# Recipe time
duration_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div[@class = "field-collection-container clearfix"]')
for duration_node in duration_nodes:
try:
path = duration_node.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div/div[@class = "field-recipe-time"]/div/div/span[1]/text()').extract()
if path == 'Prep: ':
recipe['prep_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
elif path == 'Cook: ':
recipe['cook_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
elif path == 'Total: ':
recipe['total_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
except:
continue
# Number of Servings
recipe['servings'] = sel.xpath("substring-after(//div[@class = 'panel-pane pane-entity-field pane-node-field-yield']/div[@class = 'pane-content']/div[@itemprop = 'yield']/div[@class = 'field-yield']/text(), ': ')").extract()
return recipe
HTML 片段:
<div class="panel-pane pane-entity-field pane-node-field-recipe-time recipe-time">
<h2 class="pane-title">Recipe Time</h2>
<div class="pane-content">
<div class="field-collection-container clearfix">
<div class="field-recipe-time">
<div class="field-collection-view clearfix view-mode-recipe-time">
<div class="recipe-time-info">
<span class="recipe-time-text">Prep: </span>
<span class="recipe-time-duration">25 Minutes</span>
</div>
</div> </div>
<div class="field-recipe-time">
<div class="field-collection-view clearfix view-mode-recipe-time field-collection-view-final">
<div class="recipe-time-info">
<span class="recipe-time-text">Cook: </span>
<span class="recipe-time-duration">45 Minutes</span>
</div>
</div> </div>
</div> </div>
</div>
问题出在您的 xpath 构造上。更多时候,当有很多 html 元素时,丢失任何元素的机会都会增加。如果您觉得构造相对 xpath 有困难,我建议您使用浏览器的 xpath select 或者。您可以右键单击该元素和 select 相对的 xpath。试试这个:
duration_nodes = sel.xpath('//*[@id="block-system-main"]/div/div[4]/div[1]/div/div[3]/div/div/div')
for duration_node in duration_nodes:
try:
path = ''.join(duration_node.xpath('div/div/span[1]/text()').extract())
if path == 'Prep: ':
recipe['prep_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
elif path == 'Cook: ':
recipe['cook_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
elif path == 'Total: ':
recipe['total_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
except:
continue