如何将此信息解析为单个项目?
How to parse this info into individual items?
我使用下面的 scrapy 蜘蛛从网页中抓取了一些信息。如何将这些信息转换为单独的项目?也就是说,每个项目应包含名称、大小、链接、扩展名、月份和年份。
蜘蛛代码如下:
import scrapy
from scrapy.crawler import CrawlerProcess
class MapSpider(scrapy.Spider):
    """Spider for the page listed in ``start_requests``.

    ``parse`` yields three kinds of dict, in this order:

    * ``{'year', 'id'}`` for every ``li`` matched by the tab-list XPath;
    * ``{'yearId', 'months', 'monthsIds'}`` for every matched table node;
    * ``{'monthId', 'itemsNames', 'itemsSizes', 'itemsLinks', 'itemsExt'}``
      for every month href found in that table node (lists of values).
    """

    name = 'map'
    allowed_domains = ['map.gob.do']

    def start_requests(self):
        """Yield one request per start URL, with ``parse`` as the callback."""
        start_urls = [
            'https://map.gob.do/transparencia/recursos-humanos/nominas-de-empleados/',
        ]
        for page_url in start_urls:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Yield the year, month-index and per-month file dicts from ``response``."""
        # Both node sets are addressed through the same absolute container path.
        container = '/html/body/div[8]/div/section/div/div/div[2]/div/div/div[3]'
        year_tabs = response.xpath(container + '/ul/li')
        year_panes = response.xpath(container + '/div/div')

        for tab in year_tabs:
            tab_href = tab.css('::attr(href)').get(default='')
            tab_label = tab.css('::text').get(default='')
            yield {'year': tab_label, 'id': tab_href}

        for year_no, pane in enumerate(year_panes, 1):
            pane_id = pane.css('.tab-pane ::attr(id)').get(default='')
            # Shared selector prefix for the month navigation links of this pane.
            nav_link = '#' + pane_id + '.tab-pane .vr-tabs-nav-link '
            month_labels = pane.css(nav_link + '::text').getall()
            month_hrefs = pane.css(nav_link + '::attr(href)').getall()
            print(f"|||YEAR ' {year_no} ' INFO |||")
            yield {
                'yearId': pane_id,
                'months': month_labels,
                'monthsIds': month_hrefs,
            }

            # Each month href is used directly as a CSS selector prefix
            # (presumably a fragment such as '#some-id' -- confirm against the page).
            for month_no, month_href in enumerate(month_hrefs, 1):
                file_rows = month_href + ' tr.file '
                download_link = file_rows + '.wpfd_downloadlink '
                # Note: names are selected under any ``tr``, the rest under ``tr.file``.
                names = pane.css(month_href + ' tr .wpfd_downloadlink ::attr(title)').getall()
                links = pane.css(download_link + '::attr(href)').getall()
                sizes = pane.css(file_rows + '.file_size::text').getall()
                exts = pane.css(download_link + '> span > span ::attr(class)').getall()
                print(f"|||MONTH ' {month_no} ' INFO |||")
                yield {
                    'monthId': month_href,
                    'itemsNames': names,
                    'itemsSizes': sizes,
                    'itemsLinks': links,
                    'itemsExt': exts,
                }
# Run the spider from this script: create a CrawlerProcess, register
# MapSpider with it, then start the crawl.
process = CrawlerProcess()
process.crawl(MapSpider)
process.start()
目前,您的 table.css(...).getall() 调用会返回多个值,而您把它们全部打包进了同一个 yield 中。
yield 相对于 return 的优势在于,您还可以自行决定每次产出的数据块大小。
请将这个笼统的 yield 替换为您想要的、更具体的 yield。例如:
for i in range(min(map(len, [itemNames, itemsLinks, itemsSizes, itemsExt]))):
yield {
'monthId': monthId,
'itemsNames': itemNames[i],
'itemsSizes': itemsSizes[i],
'itemsLinks': itemsLinks[i],
'itemsExt': itemsExt[i]
}
我使用下面的 scrapy 蜘蛛从网页中抓取了一些信息。如何将这些信息转换为单独的项目?也就是说,每个项目应包含名称、大小、链接、扩展名、月份和年份。
蜘蛛代码如下:
import scrapy
from scrapy.crawler import CrawlerProcess
class MapSpider(scrapy.Spider):
    """Spider for the page listed in ``start_requests``.

    ``parse`` yields three kinds of dict, in this order:

    * ``{'year', 'id'}`` for every ``li`` matched by the tab-list XPath;
    * ``{'yearId', 'months', 'monthsIds'}`` for every matched table node;
    * ``{'monthId', 'itemsNames', 'itemsSizes', 'itemsLinks', 'itemsExt'}``
      for every month href found in that table node (lists of values).
    """

    name = 'map'
    allowed_domains = ['map.gob.do']

    def start_requests(self):
        """Yield one request per start URL, with ``parse`` as the callback."""
        start_urls = [
            'https://map.gob.do/transparencia/recursos-humanos/nominas-de-empleados/',
        ]
        for page_url in start_urls:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Yield the year, month-index and per-month file dicts from ``response``."""
        # Both node sets are addressed through the same absolute container path.
        container = '/html/body/div[8]/div/section/div/div/div[2]/div/div/div[3]'
        year_tabs = response.xpath(container + '/ul/li')
        year_panes = response.xpath(container + '/div/div')

        for tab in year_tabs:
            tab_href = tab.css('::attr(href)').get(default='')
            tab_label = tab.css('::text').get(default='')
            yield {'year': tab_label, 'id': tab_href}

        for year_no, pane in enumerate(year_panes, 1):
            pane_id = pane.css('.tab-pane ::attr(id)').get(default='')
            # Shared selector prefix for the month navigation links of this pane.
            nav_link = '#' + pane_id + '.tab-pane .vr-tabs-nav-link '
            month_labels = pane.css(nav_link + '::text').getall()
            month_hrefs = pane.css(nav_link + '::attr(href)').getall()
            print(f"|||YEAR ' {year_no} ' INFO |||")
            yield {
                'yearId': pane_id,
                'months': month_labels,
                'monthsIds': month_hrefs,
            }

            # Each month href is used directly as a CSS selector prefix
            # (presumably a fragment such as '#some-id' -- confirm against the page).
            for month_no, month_href in enumerate(month_hrefs, 1):
                file_rows = month_href + ' tr.file '
                download_link = file_rows + '.wpfd_downloadlink '
                # Note: names are selected under any ``tr``, the rest under ``tr.file``.
                names = pane.css(month_href + ' tr .wpfd_downloadlink ::attr(title)').getall()
                links = pane.css(download_link + '::attr(href)').getall()
                sizes = pane.css(file_rows + '.file_size::text').getall()
                exts = pane.css(download_link + '> span > span ::attr(class)').getall()
                print(f"|||MONTH ' {month_no} ' INFO |||")
                yield {
                    'monthId': month_href,
                    'itemsNames': names,
                    'itemsSizes': sizes,
                    'itemsLinks': links,
                    'itemsExt': exts,
                }
# Run the spider from this script: create a CrawlerProcess, register
# MapSpider with it, then start the crawl.
process = CrawlerProcess()
process.crawl(MapSpider)
process.start()
目前,您的 table.css(...).getall() 调用会返回多个值,而您把它们全部打包进了同一个 yield 中。
yield 相对于 return 的优势在于,您还可以自行决定每次产出的数据块大小。
请将这个笼统的 yield 替换为您想要的、更具体的 yield。例如:
for i in range(min(map(len, [itemNames, itemsLinks, itemsSizes, itemsExt]))):
yield {
'monthId': monthId,
'itemsNames': itemNames[i],
'itemsSizes': itemsSizes[i],
'itemsLinks': itemsLinks[i],
'itemsExt': itemsExt[i]
}