Scrapy LinkExtractor 不工作
Scrapy LinkExtractor not working
我的 Scrapy 爬虫在跟踪链接时遇到了问题,代码如下。我希望它先访问 YouTube 页面,提取其中的 Twitter 链接,然后调用 parse_page3(即下面代码中的 parse_twitter)抓取信息;但目前只有 parse_page2 的提取部分能正常工作。
谢谢!
埃里克
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
# from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from tutorial.items import YTItem
class YTSpider(scrapy.Spider):
    """Spider from the question: scrape YouTube "about" pages and their Twitter links.

    The code is reproduced as asked (with its indentation restored); it is
    the non-working version that the answer below refers to.

    NOTE(review): per the accompanying answer, ``rules`` only takes effect
    on a ``CrawlSpider`` subclass, so with ``scrapy.Spider`` as the base
    class ``parse_twitter`` is never scheduled.
    """

    name = "youtube"
    allowed_domains = ["youtube.com", "twitter.com"]
    start_urls = [
        "https://www.youtube.com/jackcontemusic/about",
        "https://www.youtube.com/user/natalydawn/about",
    ]
    # Intended to route every extracted twitter.com link to parse_twitter.
    rules = [
        Rule(LinkExtractor(allow=('twitter.com',)), callback='parse_twitter'),
    ]

    def parse(self, response):
        """Start an item for the page and re-request the same URL for ``parse_page2``.

        The new item carries the page URL in ``main_url`` and travels to the
        next callback through the request's ``meta`` dict.
        """
        item = YTItem()
        item['main_url'] = response.url
        yield scrapy.Request(
            response.url,
            callback=self.parse_page2,
            meta={'item': item},
        )

    def parse_page2(self, response):
        """Fill the item handed over via ``response.meta`` with channel details.

        Each field stores the list returned by ``.extract()`` for its XPath.
        """
        record = response.meta['item']
        xpath = response.selector.xpath

        # Channel statistics.
        record['joindate'] = xpath('normalize-space(//li[contains(text(),"Joined")]/text())').extract()
        record['subscribers'] = xpath('//li[@class="about-stat " and contains(.,"subscribers")]/node()/node()').extract()
        record['views'] = xpath('//li[@class="about-stat " and contains(.,"views")]/node()/node()').extract()
        record['url'] = xpath('//div[@class="cmt_iframe_holder"]/@data-href').extract()

        # First matching external link for each network.
        record['fb'] = xpath('(//li[@class="channel-links-item"]/a[@title="Facebook"]/@href)[1]').extract()
        record['twitter'] = xpath('(//li[@class="channel-links-item"]/a[@title="Twitter"]/@href)[1]').extract()
        record['googleplus'] = xpath('(//li[@class="channel-links-item"]/a[@title="Google+"]/@href)[1]').extract()
        record['itunes'] = xpath('(//li[@class="channel-links-item"]/a[@title="iTunes"]/@href)[1]').extract()
        record['instagram'] = xpath('(//li[@class="channel-links-item"]/a[@title="Instagram"]/@href)[1]').extract()
        return record

    def parse_twitter(self, response):
        """Callback named in ``rules``: store the first ``ProfileNav-value`` span as ``tweets``.

        NOTE(review): the freshly built item and the request created here
        are both discarded, and ``response.meta['item']`` is read although
        nothing visible in this class sets it for a Twitter response.
        """
        fresh_item = YTItem()
        fresh_item['twitter_url'] = response.url
        _unused_request = scrapy.Request(response.url, callback=self.parse_twitter)

        record = response.meta['item']
        record['tweets'] = response.selector.xpath('(//span[@class="ProfileNav-value"])[1]').extract()
        return record
如果要使用 Rule 和 LinkExtractor,就需要让爬虫继承 CrawlSpider 类,而不是 scrapy.Spider。另外请注意:CrawlSpider 内部用 parse 方法来实现规则逻辑,所以改用它之后不要再重写 parse,应把现有的 parse 回调改成别的名字。
替换:
class YTSpider(scrapy.Spider):
替换为:
from scrapy.contrib.spiders import CrawlSpider
class YTSpider(CrawlSpider):
我的 Scrapy 爬虫在跟踪链接时遇到了问题,代码如下。我希望它先访问 YouTube 页面,提取其中的 Twitter 链接,然后调用 parse_page3(即下面代码中的 parse_twitter)抓取信息;但目前只有 parse_page2 的提取部分能正常工作。
谢谢! 埃里克
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
# from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from tutorial.items import YTItem
class YTSpider(scrapy.Spider):
    """Spider from the question: scrape YouTube "about" pages and their Twitter links.

    The code is reproduced as asked (with its indentation restored); it is
    the non-working version that the answer below refers to.

    NOTE(review): per the accompanying answer, ``rules`` only takes effect
    on a ``CrawlSpider`` subclass, so with ``scrapy.Spider`` as the base
    class ``parse_twitter`` is never scheduled.
    """

    name = "youtube"
    allowed_domains = ["youtube.com", "twitter.com"]
    start_urls = [
        "https://www.youtube.com/jackcontemusic/about",
        "https://www.youtube.com/user/natalydawn/about",
    ]
    # Intended to route every extracted twitter.com link to parse_twitter.
    rules = [
        Rule(LinkExtractor(allow=('twitter.com',)), callback='parse_twitter'),
    ]

    def parse(self, response):
        """Start an item for the page and re-request the same URL for ``parse_page2``.

        The new item carries the page URL in ``main_url`` and travels to the
        next callback through the request's ``meta`` dict.
        """
        item = YTItem()
        item['main_url'] = response.url
        yield scrapy.Request(
            response.url,
            callback=self.parse_page2,
            meta={'item': item},
        )

    def parse_page2(self, response):
        """Fill the item handed over via ``response.meta`` with channel details.

        Each field stores the list returned by ``.extract()`` for its XPath.
        """
        record = response.meta['item']
        xpath = response.selector.xpath

        # Channel statistics.
        record['joindate'] = xpath('normalize-space(//li[contains(text(),"Joined")]/text())').extract()
        record['subscribers'] = xpath('//li[@class="about-stat " and contains(.,"subscribers")]/node()/node()').extract()
        record['views'] = xpath('//li[@class="about-stat " and contains(.,"views")]/node()/node()').extract()
        record['url'] = xpath('//div[@class="cmt_iframe_holder"]/@data-href').extract()

        # First matching external link for each network.
        record['fb'] = xpath('(//li[@class="channel-links-item"]/a[@title="Facebook"]/@href)[1]').extract()
        record['twitter'] = xpath('(//li[@class="channel-links-item"]/a[@title="Twitter"]/@href)[1]').extract()
        record['googleplus'] = xpath('(//li[@class="channel-links-item"]/a[@title="Google+"]/@href)[1]').extract()
        record['itunes'] = xpath('(//li[@class="channel-links-item"]/a[@title="iTunes"]/@href)[1]').extract()
        record['instagram'] = xpath('(//li[@class="channel-links-item"]/a[@title="Instagram"]/@href)[1]').extract()
        return record

    def parse_twitter(self, response):
        """Callback named in ``rules``: store the first ``ProfileNav-value`` span as ``tweets``.

        NOTE(review): the freshly built item and the request created here
        are both discarded, and ``response.meta['item']`` is read although
        nothing visible in this class sets it for a Twitter response.
        """
        fresh_item = YTItem()
        fresh_item['twitter_url'] = response.url
        _unused_request = scrapy.Request(response.url, callback=self.parse_twitter)

        record = response.meta['item']
        record['tweets'] = response.selector.xpath('(//span[@class="ProfileNav-value"])[1]').extract()
        return record
如果要使用 Rule 和 LinkExtractor,就需要让爬虫继承 CrawlSpider 类,而不是 scrapy.Spider。另外请注意:CrawlSpider 内部用 parse 方法来实现规则逻辑,所以改用它之后不要再重写 parse,应把现有的 parse 回调改成别的名字。
替换:
class YTSpider(scrapy.Spider):
替换为:
from scrapy.contrib.spiders import CrawlSpider
class YTSpider(CrawlSpider):