Scrapy: passing an item to multiple methods
I am trying to pass my item through the two different methods below. The reason I pass it through two methods is that on each page I first scrape some data, then collect the relevant links on that page and follow them to gather more data.
The problem seems to come from the fact that I pass the item through 2 loops. The number of items jumps from 23 after the first loop (which is what it should be) to 248 after yield scrapy.Request(url=final_url, callback=self.parse_sports, meta={'event_item': item, 'discipline': event_name}).
I still need to follow final_url, but how can I stop it from creating the unnecessary items? Right now it creates a new item every time the for loop runs.
import scrapy
from scrapy import Selector
from eventSpider.items import EventspiderItem
import urllib.parse

class EventsSpider(scrapy.Spider):
    name = 'eventSpider'

    # base url to join with the relative urls we receive
    baseUrl = "http://www.olympedia.org"

    def start_requests(self):
        start_urls = [
            'http://www.olympedia.org/editions'
        ]
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse_urls)

    def parse_urls(self, response):
        """
        Go through the table of winter olympics
        Get all the urls to those olympic events
        Send the urls down to parse_items to get the items of interest
        """
        # skip the last 2 rows as those events haven't happened yet
        for tr in response.xpath("//table[2]//tr")[:-2]:
            url = tr.xpath('td[1]//a//@href').extract_first()
            # check for None. In this case, we eliminate the 2 events that were cancelled
            if url is None:
                continue
            else:
                url_to_check = urllib.parse.urljoin(self.baseUrl, url)
                yield scrapy.Request(url=url_to_check, callback=self.parse_items)

    def parse_items(self, response):
        """
        Get the items of interest
        Extract the list of disciplines and their urls
        Pass the urls on
        """
        item = EventspiderItem()
        selector = Selector(response)
        table1_rows = selector.xpath("//table[1]//tr")
        item['event_title'] = table1_rows[1].xpath('td//text()').extract_first()
        item['event_place'] = table1_rows[2].xpath('td//text()').extract_first()
        table2 = selector.xpath("//table[3]//tr")
        discipline_list = []
        url_list = []
        for tr in table2:
            urls = tr.xpath('td//a//@href').extract()
            disciplines = tr.xpath('td//a//text()').extract()
            for url in urls:
                url_list.append(url)
            for discipline in disciplines:
                discipline_list.append(discipline)
        for i, url in enumerate(url_list):
            final_url = urllib.parse.urljoin(self.baseUrl, url)
            event_name = item['event_title'] + " " + discipline_list[i]
            yield scrapy.Request(url=final_url, callback=self.parse_sports, meta={'event_item': item, 'discipline': event_name})

    def parse_sports(self, response):
        selector = Selector(response)
        item = response.meta.get('event_item')
        return item
You can work your way through the list and only yield the item once you have finished it.
import scrapy
from scrapy import Selector
# from eventSpider.items import EventspiderItem
import urllib.parse

class EventspiderItem(scrapy.Item):
    event_title = scrapy.Field()
    event_place = scrapy.Field()

class EventsSpider(scrapy.Spider):
    name = 'eventSpider'

    # base url to join with the relative urls we receive
    baseUrl = "http://www.olympedia.org"

    def start_requests(self):
        start_urls = [
            'http://www.olympedia.org/editions'
        ]
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse_urls)

    def parse_urls(self, response):
        """
        Go through the table of winter olympics
        Get all the urls to those olympic events
        Send the urls down to parse_items to get the items of interest
        """
        # skip the last 2 rows as those events haven't happened yet
        for tr in response.xpath("//table[2]//tr")[:-2]:
            url = tr.xpath('td[1]//a//@href').extract_first()
            # check for None. In this case, we eliminate the 2 events that were cancelled
            if url is None:
                continue
            else:
                url_to_check = urllib.parse.urljoin(self.baseUrl, url)
                yield scrapy.Request(url=url_to_check, callback=self.parse_items)

    def parse_items(self, response):
        """
        Get the items of interest
        Extract the list of disciplines and their urls
        Pass the urls on
        """
        item = EventspiderItem()
        selector = Selector(response)
        table1_rows = selector.xpath("//table[1]//tr")
        item['event_title'] = table1_rows[1].xpath('td//text()').extract_first()
        item['event_place'] = table1_rows[2].xpath('td//text()').extract_first()
        table2 = selector.xpath("//table[3]//tr")
        discipline_list = []
        url_list = []
        for tr in table2:
            urls = tr.xpath('td//a//@href').extract()
            disciplines = tr.xpath('td//a//text()').extract()
            for url in urls:
                url_list.append(url)
            for discipline in disciplines:
                discipline_list.append(discipline)
        # only start the chain if there is at least one discipline url;
        # the item is passed along the chain and yielded once at the end
        if url_list:
            final_url = urllib.parse.urljoin(self.baseUrl, url_list[0])
            yield scrapy.Request(url=final_url, callback=self.parse_sports, meta={'event_item': item, 'url_list': url_list[1:], 'discipline_list': discipline_list[1:]})

    def parse_sports(self, response):
        event_item = response.meta['event_item']
        url_list = response.meta['url_list']
        discipline_list = response.meta['discipline_list']
        # extract what you want here first, for example:
        # title = response.xpath('//h1/text()').get()
        # and add it to the item if that's what you want...
        # event_item['title'] = title
        if url_list:
            # more discipline pages left: follow the next one and keep passing
            # the same item (both lists are sliced together so they stay aligned)
            final_url = urllib.parse.urljoin(self.baseUrl, url_list[0])
            event_name = event_item['event_title'] + " " + discipline_list[0]
            yield scrapy.Request(url=final_url, callback=self.parse_sports, meta={'event_item': event_item, 'url_list': url_list[1:], 'discipline_list': discipline_list[1:]})
        else:
            # all discipline pages have been visited: yield the finished item once
            yield event_item
With this approach the crawl stats show the expected count again:
'item_scraped_count': 23,
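For reference, on Scrapy 1.7+ the same chain-the-requests idea can also be written with cb_kwargs instead of meta, so the data arrives in the callback as named parameters. The sketch below is only an illustration of that variant; it reuses the selectors from the question, the spider name EventsKwargsSpider is made up, and it is not part of the accepted answer:

import scrapy

class EventspiderItem(scrapy.Item):
    event_title = scrapy.Field()
    event_place = scrapy.Field()

class EventsKwargsSpider(scrapy.Spider):
    # hypothetical name, only used for this illustration
    name = 'eventSpiderKwargs'
    start_urls = ['http://www.olympedia.org/editions']

    def parse(self, response):
        # same editions table as in the question; response.follow joins relative urls for us
        for tr in response.xpath("//table[2]//tr")[:-2]:
            url = tr.xpath('td[1]//a//@href').extract_first()
            if url:
                yield response.follow(url, callback=self.parse_items)

    def parse_items(self, response):
        item = EventspiderItem()
        rows = response.xpath("//table[1]//tr")
        item['event_title'] = rows[1].xpath('td//text()').extract_first()
        item['event_place'] = rows[2].xpath('td//text()').extract_first()
        urls = response.xpath("//table[3]//tr/td//a//@href").extract()
        if not urls:
            # no discipline pages to visit, the item is already complete
            yield item
            return
        # follow the first discipline page; the rest travel as keyword arguments
        yield response.follow(urls[0], callback=self.parse_sports,
                              cb_kwargs={'item': item, 'remaining': urls[1:]})

    def parse_sports(self, response, item, remaining):
        # collect whatever you need from the discipline page here, then either
        # follow the next url or yield the finished item exactly once
        if remaining:
            yield response.follow(remaining[0], callback=self.parse_sports,
                                  cb_kwargs={'item': item, 'remaining': remaining[1:]})
        else:
            yield item

Because cb_kwargs become real function parameters, a missing key fails loudly with a TypeError instead of silently returning None the way response.meta.get() does.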