Scrapy CrawlSpider 无法抓取第一页
Scrapy CrawlSpider unable to crawl first page
我的问题是抓取工具抓取了除第一页以外的所有页面,我不明白为什么?
我确定我的 items.py 或其他任何东西都没有问题。
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item
class QuotesSpider(CrawlSpider):
name = 'quotes'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
rules = (
Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
)
def parse_item(self, response):
for items in response.xpath('//div[@class="quote"]'):
l = ItemLoader(item=Tutorial2Item(),selector=items)
l.add_xpath('text','span[@class="text"]/text()')
l.add_xpath('author','span/small[@class="author"]/text()')
l.add_xpath('author_link','span/a/@href')
l.add_xpath('tags','div[@class="tags"]/a[@class="tag"]/text()')
l.add_xpath('tags_link','div[@class="tags"]/a[@class="tag"]/@href')
yield l.load_item()
这可能是由于js渲染问题,尝试使用所说的splash here
只需执行 parse_start_url
到 return 的响应。详情 url
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
# from tutorial2.items import Tutorial2Item
class QuotesSpider(CrawlSpider):
name = 'quotes'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
rules = (
Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
)
def parse_start_url(self, response):
return self.parse_item(response)
def parse_item(self, response):
for items in response.xpath('//div[@class="quote"]'):
l = ItemLoader(item=Tutorial2Item(),selector=items)
l.add_xpath('text','span[@class="text"]/text()')
l.add_xpath('author','span/small[@class="author"]/text()')
l.add_xpath('author_link','span/a/@href')
l.add_xpath('tags','div[@class="tags"]/a[@class="tag"]/text()')
l.add_xpath('tags_link','div[@class="tags"]/a[@class="tag"]/@href')
yield l.load_item()
我的问题是抓取工具抓取了除第一页以外的所有页面,我不明白为什么? 我确定我的 items.py 或其他任何东西都没有问题。
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item
class QuotesSpider(CrawlSpider):
name = 'quotes'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
rules = (
Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
)
def parse_item(self, response):
for items in response.xpath('//div[@class="quote"]'):
l = ItemLoader(item=Tutorial2Item(),selector=items)
l.add_xpath('text','span[@class="text"]/text()')
l.add_xpath('author','span/small[@class="author"]/text()')
l.add_xpath('author_link','span/a/@href')
l.add_xpath('tags','div[@class="tags"]/a[@class="tag"]/text()')
l.add_xpath('tags_link','div[@class="tags"]/a[@class="tag"]/@href')
yield l.load_item()
这可能是由于js渲染问题,尝试使用所说的splash here
只需执行 parse_start_url
到 return 的响应。详情 url
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
# from tutorial2.items import Tutorial2Item
class QuotesSpider(CrawlSpider):
name = 'quotes'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
rules = (
Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
)
def parse_start_url(self, response):
return self.parse_item(response)
def parse_item(self, response):
for items in response.xpath('//div[@class="quote"]'):
l = ItemLoader(item=Tutorial2Item(),selector=items)
l.add_xpath('text','span[@class="text"]/text()')
l.add_xpath('author','span/small[@class="author"]/text()')
l.add_xpath('author_link','span/a/@href')
l.add_xpath('tags','div[@class="tags"]/a[@class="tag"]/text()')
l.add_xpath('tags_link','div[@class="tags"]/a[@class="tag"]/@href')
yield l.load_item()