Scrapy ignoring response 404, but I want to skip those URLs or fill them with null values
I am scraping data from a newspaper site by walking its article URLs, but some of the pages have no content, so my crawler stops. My goal is to keep crawling, either skipping those pages or filling their fields with NA values. Here is my spider:
import scrapy
from ..items import CollectDataItem
import logging


class JagonewsSpider(scrapy.Spider):
    name = 'jagonews'
    page_number = 2
    source_url = 1
    start_urls = ['https://www.jagonews24.com/special-reports/news/1']

    def __init__(self, *args, **kwargs):
        logger = logging.getLogger('scrapy.spidermiddlewares.httperror')
        logger.setLevel(logging.WARNING)
        super().__init__(*args, **kwargs)

    def parse(self, response):
        items = CollectDataItem()
        try:
            all_div_quotes = response.css('.marginBottom20 > .col-sm-12')
        except AttributeError:
            next_page = 'https://www.jagonews24.com/special-reports/news/' + str(JagonewsSpider.page_number)
            JagonewsSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)

        for quote in all_div_quotes:
            try:
                label = quote.css(".breadcrumb a::text").extract()
                title = quote.css('h1::text').extract()
                body = quote.css('p::text').extract()
            except AttributeError:
                label = 'n/a'
                title = 'n/a'
                body = 'n/a'
            print(type(body))
            items['label'] = label
            items['title'] = title
            items['body'] = body
            items['source'] = 'https://www.jagonews24.com/special-reports/news/' + str(JagonewsSpider.source_url)
            yield items
            JagonewsSpider.source_url += 1

        next_page = 'https://www.jagonews24.com/special-reports/news/' + str(JagonewsSpider.page_number)
        print(next_page)
        if JagonewsSpider.page_number < 642548:
            JagonewsSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
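For context, the crawl stalls because Scrapy's HttpErrorMiddleware filters non-2xx responses before they reach the callback (that is the "Ignoring response <404 ...>" log line), and every next-page request here is yielded from inside parse, so a single missing page breaks the whole chain. If you want to keep the chained pagination, one option is to attach an errback to each request so that failures also advance the chain. A minimal sketch, untested against the site; next_page and skip_and_continue are illustrative names, not Scrapy API:

import scrapy


class ChainedPaginationSpider(scrapy.Spider):
    name = 'jagonews_chained'
    page_number = 2
    start_urls = ['https://www.jagonews24.com/special-reports/news/1']

    def parse(self, response):
        # ... extract the item here ...
        yield self.next_page()

    def next_page(self):
        if self.page_number >= 642548:
            return None  # past the last known page id; yielding None is a no-op
        url = 'https://www.jagonews24.com/special-reports/news/' + str(self.page_number)
        self.page_number += 1
        # The errback fires when HttpErrorMiddleware raises HttpError on a 404,
        # so the chain keeps moving past missing pages.
        return scrapy.Request(url, callback=self.parse, errback=self.skip_and_continue)

    def skip_and_continue(self, failure):
        # Skip the broken page and request the next one.
        yield self.next_page()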
Alternatively, you can use this handy approach: generate every page request up front instead of chaining them, so one missing page cannot stop the others, and add 404 to handle_httpstatus_list so those responses actually reach parse() and can be skipped there.
import scrapy


class JagonewsSpider(scrapy.Spider):
    name = 'jagonews'
    # Let HttpErrorMiddleware pass 404 responses through to parse()
    # instead of dropping them with an "Ignoring response" message.
    handle_httpstatus_list = [404]
    start_urls = ['https://www.jagonews24.com/special-reports/news/{}'.format(x)
                  for x in range(1, 10)]

    def start_requests(self):
        # All page requests are independent, so a missing page
        # cannot break a pagination chain.
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response, **kwargs):
        if response.status != 404:
            print(response.status)
            print(response.url)
            # you can extract your item here.
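If you would rather fill the missing pages with NA values instead of skipping them, the same idea extends naturally: let the 404s through and emit a placeholder item. A minimal sketch reusing the question's CollectDataItem fields and selectors (assumptions taken from the question, not verified against the live site):

import scrapy

from ..items import CollectDataItem


class JagonewsNaSpider(scrapy.Spider):
    name = 'jagonews_na'
    # Allow 404 responses to reach parse() instead of being dropped.
    handle_httpstatus_list = [404]
    # Raise the upper bound (e.g. towards 642548) once this works.
    start_urls = ['https://www.jagonews24.com/special-reports/news/{}'.format(x)
                  for x in range(1, 10)]

    def parse(self, response, **kwargs):
        items = CollectDataItem()
        if response.status == 404:
            # Missing page: emit a placeholder item instead of losing the URL.
            items['label'] = 'n/a'
            items['title'] = 'n/a'
            items['body'] = 'n/a'
        else:
            quote = response.css('.marginBottom20 > .col-sm-12')
            items['label'] = quote.css('.breadcrumb a::text').extract()
            items['title'] = quote.css('h1::text').extract()
            items['body'] = quote.css('p::text').extract()
        items['source'] = response.url
        yield items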