How can I scrape subpages and merge them with the page info?
I use scrapy to parse a page. The page has subpages (categories) that I also need to pull information from, combining everything into a single item (possibly storing the subpage info as JSON) that I then add to a CSV. I have tried different options, such as:
requests = scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
or
yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
but neither of them does what I want.
For example, I start from https://www.webscorer.com/findraces?pg=results, open a series page (example: https://www.webscorer.com/seriesresult?seriesid=211565) and get information from that page. After that, I need to get additional information from each category page (example: https://www.webscorer.com/seriesresult?seriesid=211565&gender=F) and put it all into the CSV. My current code:
import re

import scrapy
from scrapy.http import Response


class WebscorerSpider(scrapy.Spider):
    name = 'webscorer'
    allowed_domains = ['webscorer.com']

    def start_requests(self):
        url = 'https://www.webscorer.com/findraces?pg=results'
        yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response, **kwargs):
        for href in response.css('table.results-table tbody tr a::attr("href")').extract():
            url = response.urljoin(href)
            url = 'https://www.webscorer.com/seriesresult?seriesid=211565'
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response: Response, **kwargs):
        latlong_match = re.search('lat=(.*)&lng=(.*)', response.css('span#FSrc::text').get())
        item = dict()
        for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
            url = response.urljoin(href)
            # requests = scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
            yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
        yield WebscorerEvent(name=response.css('h1.race-name::text').get(),
                             source_url=response.request.url,
                             sport_discipline=response.css('td.spec+td').css('strong::text').get(),
                             description=response.css('span.regnotes span::text').get(),
                             hero_image=response.css('p.associated-race-pic img::attr(src)').get(),
                             start_date=parse_webscorer_date(response.css('p.race-date::text').get()),
                             location={
                                 "link": f"https://www.google.com/maps/search/?api=1&query={latlong_match.group(1)},{latlong_match.group(2)}",
                                 "description": response.css('td.spec:contains("Location:")+td strong::text').get()})

    def parse_category(self, response, **kwargs):
        item = response.meta['meta_item']
        # print(item)
        item['winner'] = response.css('table.results-table .r-racername span::text').get()
        return item
You yield WebscorerEvent right away, so you have already "given away" the item before fetching the data you need from the next page. You could do it like this instead:
    def parse(self, response: Response, **kwargs):
        latlong_match = re.search('lat=(.*)&lng=(.*)', response.css('span#FSrc::text').get())
        item = {
            "name": response.css('h1.race-name::text').get(),
            "source_url": response.request.url,
            "sport_discipline": response.css('td.spec+td').css('strong::text').get(),
            "description": response.css('span.regnotes span::text').get(),
            "hero_image": response.css('p.associated-race-pic img::attr(src)').get(),
            "start_date": parse_webscorer_date(response.css('p.race-date::text').get()),
            "location": {
                "link": f"https://www.google.com/maps/search/?api=1&query={latlong_match.group(1)},{latlong_match.group(2)}",
                "description": response.css('td.spec:contains("Location:")+td strong::text').get()
            }
        }
        for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)

    def parse_category(self, response, **kwargs):
        item = response.meta['meta_item']
        item['winner'] = response.css('table.results-table .r-racername span::text').get()
        yield WebscorerEvent(item)
That way you only yield the item at the end, once it has all the data it needs.
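As a side note: on Scrapy 1.7+ you can pass data to the callback with cb_kwargs instead of meta, and handing each category request its own copy of the dict avoids all callbacks mutating one shared object. A minimal sketch of the same loop, assuming the WebscorerEvent item class and the selectors from the code above (the item dict is abbreviated here):

    def parse(self, response: Response, **kwargs):
        item = {"name": response.css('h1.race-name::text').get(),
                "source_url": response.request.url}
        for href in response.css('table.category-table .category-name a::attr("href")').extract():
            # dict(item) hands every category request its own copy, so one
            # callback's 'winner' can never leak into another category's item
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_category,
                                 cb_kwargs={'item': dict(item)})

    def parse_category(self, response, item, **kwargs):
        # entries in cb_kwargs arrive as named arguments of the callback
        item['winner'] = response.css('table.results-table .r-racername span::text').get()
        yield WebscorerEvent(item)

The yielded items can then go straight to the CSV through Scrapy's feed exports, e.g. scrapy crawl webscorer -o events.csv.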