Scrapy crawler does not follow links even when I collect and parse them
I have been stuck on this for hours now. I couldn't get the Rule syntax to follow the links on this site, so I manually collected all the links I need to request. Even though I've verified that the extracted links are valid URLs, my crawler still won't scrape any of the other pages. I haven't found the Scrapy documentation much help either, since it reads like flash cards. Can anyone help?
# -*- coding: utf-8 -*-
import scrapy
import logging
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import Request
from banker.items import BarclaysOfferItem


class BarclaySpider(CrawlSpider):
    name = "barclay"
    allowed_domains = ['partners.barclaycardrewardsboost.com/']
    start_urls = [
        'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=1&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers'
        # 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=2&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers'
        # 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=3&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers',
        # 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=4&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers',
        # 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=5&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers',
        # 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=6&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers',
        # 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=7&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers'
    ]

    def parse(self, response):
        base = 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm'
        links = response.xpath('//p[contains(@class, "mn_pageLinks")]/a')
        for sel in links:
            url = base + str(sel.xpath('@href').extract()[0])
            logging.info(url)
            yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, reponse):
        for sel in response.xpath('//table/tr'):
            item = BarclaysOfferItem()
            item['merchant'] = sel.xpath('td/div/a[last()]/text()').extract()
            item['rate'] = sel.xpath('td/span/a/text()').extract()
            item['offer'] = sel.xpath('td/a[last()]/text()').extract()
            item['coupon_code'] = sel.xpath('td[@class="mn_cpCode"]/text()').extract()
            item['expiration_date'] = sel.xpath('td[@class="mn_expiry"]/text()').extract()
            yield item
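For reference, the Rule-based approach I couldn't get working would have needed to look roughly like the sketch below. Two caveats: the restrict_xpaths value is an assumption based on the pagination XPath above, and CrawlSpider uses parse internally to drive its rules, so a spider that overrides parse (as mine does) silently disables them; the callback has to use a different name.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


# A hedged sketch of a Rule-based CrawlSpider; restrict_xpaths is assumed
# from the pagination XPath used above.
class BarclayRuleSpider(CrawlSpider):
    name = "barclay_rules"
    allowed_domains = ['partners.barclaycardrewardsboost.com']  # bare domain, no trailing slash
    start_urls = [
        'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=1&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers'
    ]
    rules = (
        # Follow each pagination link and pass the fetched page to parse_item;
        # the callback must not be named 'parse' on a CrawlSpider.
        Rule(LinkExtractor(restrict_xpaths='//p[contains(@class, "mn_pageLinks")]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Same row-by-row extraction as above (one field shown for brevity).
        for sel in response.xpath('//table/tr'):
            yield {'merchant': sel.xpath('td/div/a[last()]/text()').extract()}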
Update #1
Removing the allowed_domains list made my requests go through, but now I keep getting NameError: global name 'response' is not defined.
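In hindsight the NameError was self-inflicted: the parameter in parse_item's signature is misspelled, so the name the body actually uses is never bound:

def parse_item(self, reponse):                   # parameter misspelled as 'reponse'
    for sel in response.xpath('//table/tr'):     # 'response' is unbound here -> NameError
        ...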
I finally got it working!
According to the Scrapy documentation, when OffsiteMiddleware is enabled, requests for URLs whose domain is not in the spider's allowed_domains list are filtered out. I knew my URLs were on the listed domain, but I suppose the way the site queries its data made the URLs look as if they were offsite. (The NameError, meanwhile, was just the reponse typo in parse_item's signature, fixed below.)
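My best guess at the concrete cause is the trailing slash in my allowed_domains entry: OffsiteMiddleware matches each entry against the request's hostname, and a hostname never contains a slash, so every request looked offsite and was filtered. If that's right (an assumption; in the end I simply removed the list), a one-line fix would have been:

allowed_domains = ['partners.barclaycardrewardsboost.com/']  # URL-like entry: never matches a hostname
allowed_domains = ['partners.barclaycardrewardsboost.com']   # bare domain: matches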
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider, Rule
from scrapy.linkextractors import LinkExtractor
import logging
from banker.items import BarclaysOfferItem


class BarclaySpider(Spider):
    name = "barclay"
    start_urls = [
        'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm?rows=25&page=1&ref_page_id=2167&ref_section_id=9720&ref_section_title=All%Online%Offers'
    ]

    # Parse for the links of interest
    def parse(self, response):
        base = 'https://partners.barclaycardrewardsboost.com/shopping/sp____.htm'
        links = response.xpath('//p[contains(@class, "mn_pageLinks")]/a')
        for sel in links:
            url = base + str(sel.xpath('@href').extract()[0])
            logging.info(url)
            yield scrapy.Request(url, callback=self.parse_item)

    # Parse for the items of interest
    def parse_item(self, response):
        for sel in response.xpath('//table/tr'):
            item = BarclaysOfferItem()
            item['merchant'] = sel.xpath('td/div/a[last()]/text()').extract()
            item['rate'] = sel.xpath('td/span/a/text()').extract()
            item['offer'] = sel.xpath('td/a[last()]/text()').extract()
            item['coupon_code'] = sel.xpath('td[@class="mn_cpCode"]/text()').extract()
            item['expiration_date'] = sel.xpath('td[@class="mn_expiry"]/text()').extract()
            yield item
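With the banker project in place (BarclaysOfferItem lives in banker.items), the spider runs with the standard Scrapy CLI, e.g. exporting the scraped items to JSON:

scrapy crawl barclay -o offers.json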