Python Scrapy - saving a 'category' for each entry based on first webpage
I am scraping BBC Food recipes. The logic is as follows:
The main page has about 20 cuisines
-> in each cuisine, there are usually ~20 recipes per letter, spread over 1-3 pages,
-> and each recipe has about 6 things I want (ingredients, rating, etc.).
So my logic is: go to the main page, create a request, extract all the cuisine links, then follow each of them, from there extract each page of recipes, follow each recipe link, and finally get all the data from each recipe. Note that this is not finished yet, since I still need to implement the spider to go through all the letters.
I would like to have a 'category' column, i.e. for each recipe under the "African cuisine" link a column saying "african", for each recipe under the "Italian cuisine" link an "italian" entry, and so on.
Desired output:
cook_time prep_time name cuisine
10 30 A italian
20 10 B italian
30 20 C indian
20 10 D indian
30 20 E indian
Here is the spider I have so far:
import scrapy
from recipes_cuisines.items import RecipeItem

class ItalianSpider(scrapy.Spider):

    name = "italian_spider"

    def start_requests(self):
        start_urls = ['https://www.bbc.co.uk/food/cuisines']
        for url in start_urls:
            yield scrapy.Request(url = url, callback = self.parse_cuisines)

    def parse_cuisines(self, response):
        cuisine_cards = response.xpath('//a[contains(@class,"promo__cuisine")]/@href').extract()
        for url in cuisine_cards:
            yield response.follow(url = url, callback = self.parse_main)

    def parse_main(self, response):
        recipe_cards = response.xpath('//a[contains(@class,"main_course")]/@href').extract()
        for url in recipe_cards:
            yield response.follow(url = url, callback = self.parse_card)
        next_page = response.xpath('//div[@class="pagination gel-wrap"]/ul[@class="pagination__list"]/li[@class="pagination__list-item pagination__priority--0"]/a[@class="pagination__link gel-pica-bold"]/@href').get()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            print(next_page_url)
            yield scrapy.Request(url = next_page_url, callback = self.parse_main)

    def parse_card(self, response):
        item = RecipeItem()
        item['name'] = response.xpath('//h1[contains(@class,"title__text")]/text()').extract()
        item['prep_time'] = response.xpath('//div[contains(@class,"recipe-metadata-wrap")]/p[@class="recipe-metadata__prep-time"]/text()').extract_first()
        item['cook_time'] = response.xpath('//p[contains(@class,"cook-time")]/text()').extract_first()
        item['servings'] = response.xpath('//p[contains(@class,"serving")]/text()').extract_first()
        item['ratings_amount'] = response.xpath('//div[contains(@class,"aggregate-rating")]/span[contains(@class,"aggregate-rating__total")]/text()').extract()
        #item['ratings_amount'] = response.xpath('//*[@id="main-content"]/div[1]/div[4]/div/div[1]/div/div[1]/div[2]/div[1]/span[2]/text()').extract()
        item['ingredients'] = response.css('li.recipe-ingredients__list-item > a::text').extract()
        return item
And the items:
import scrapy

class RecipeItem(scrapy.Item):
    name = scrapy.Field()
    prep_time = scrapy.Field()
    cook_time = scrapy.Field()
    servings = scrapy.Field()
    ratings_amount = scrapy.Field()
    rating = scrapy.Field()
    ingredients = scrapy.Field()
    cuisine = scrapy.Field()
Note that I am saving the output via

scrapy crawl italian_spider -o test.csv
I have read the documentation and tried several things, such as adding the extracted cuisine into the parse_cuisine or parse_main methods, but they all produced errors.
There are two ways here. The most common way to pass some information from one page to another is to use cb_kwargs in your scrapy.Request:
def parse_cousine(self, response):
    cousine = response.xpath('//h1/text()').get()
    for recipe_url in response.xpath('//div[@id="az-recipes--recipes"]//a[.//h3]/@href').getall():
        yield scrapy.Request(
            url=response.urljoin(recipe_url),
            callback=self.parse_recipe,
            cb_kwargs={'cousine': cousine},
        )

def parse_recipe(self, response, cousine):
    print(cousine)
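Applied to the spider from the question, the same idea means extracting the cuisine name on the listing page and handing it down to parse_card. A minimal sketch, assuming the cuisine listing page's h1 text carries the cuisine name (the selectors are the ones from the question); the pagination request simply lets parse_main re-extract the h1 on the next page:

def parse_main(self, response):
    # Assumption: the cuisine listing page's <h1> holds the cuisine name.
    cuisine = response.xpath('//h1/text()').get()
    for url in response.xpath('//a[contains(@class,"main_course")]/@href').extract():
        # Hand the cuisine down to the per-recipe callback.
        yield response.follow(url=url, callback=self.parse_card,
                              cb_kwargs={'cuisine': cuisine})
    next_page = response.xpath('//div[@class="pagination gel-wrap"]/ul[@class="pagination__list"]/li[@class="pagination__list-item pagination__priority--0"]/a[@class="pagination__link gel-pica-bold"]/@href').get()
    if next_page is not None:
        # The next listing page has the same <h1>, so parse_main re-extracts it there.
        yield response.follow(url=next_page, callback=self.parse_main)

def parse_card(self, response, cuisine):
    item = RecipeItem()
    item['cuisine'] = cuisine  # value carried over from the listing page
    # ... fill in the remaining fields exactly as in the question ...
    return item

Note that cb_kwargs requires Scrapy 1.7+; on older versions the same value can be passed through request.meta instead.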
But on this site you can also find it on the recipe page itself, in the same JSON that holds the ingredients, once parsed:
import json

def parse_recipe(self, response):
    recipe_raw = response.xpath('//script[@type="application/ld+json"][contains(., \'"@type":"Recipe"\')]/text()').get()
    recipe = json.loads(recipe_raw)
    cousine = recipe['recipeCuisine']
UPDATE: This XPath '//script[@type="application/ld+json"][contains(., \'"@type":"Recipe"\')]/text()' finds a script node that has a type attribute with the value application/ld+json and that also contains the string "@type":"Recipe" in that node's text.
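Wiring this second approach into the spider from the question could look like the sketch below. It assumes json has been imported at the top of the spider module, and it guards against pages where the JSON-LD block is missing or lacks a recipeCuisine key:

def parse_card(self, response):
    item = RecipeItem()
    # ... extract the other fields as in the question ...
    recipe_raw = response.xpath(
        '//script[@type="application/ld+json"][contains(., \'"@type":"Recipe"\')]/text()').get()
    if recipe_raw is not None:
        recipe = json.loads(recipe_raw)
        # recipeCuisine may be absent on some pages, hence dict.get().
        item['cuisine'] = recipe.get('recipeCuisine')
    return item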