Python/Scrapy: Callback function is never called
I am using Scrapy to crawl Google Play app profiles, but the callback function is never executed. I can't find the problem in my code (there are no errors). Can you suggest a solution?
# -*- coding: utf-8 -*-
import scrapy
import time

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from playcrawl.items import PlaycrawlItem
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractor import LinkExtractor


class GoogleplaySpider(CrawlSpider):
    name = 'googleplay'
    allowed_domains = ['play.google.com']
    start_urls = ['https://play.google.com/store/apps/category/GAME']
    rules = (
        Rule(LinkExtractor(allow=('/store/apps'))),
        Rule(LinkExtractor(allow=('/store/apps/details\?')), callback="parse_item")
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = PlaycrawlItem()
        item["pub"] = hxs.select('//a[@class = "document-subtitle primary"]/span[1]').select("text()").extract()
        item["email"] = hxs.select('//a[contains(@class, "dev-link") and starts-with(@href, "mailto")]').select("@href").extract()[0][7:]
        f = open("D:\_scrapy\playcrawl\data_emails.txt", "a")
        f.write(item["email"] + "\n")
        f.close()
        print("\n\n\n\n" + item["email"] + "\n\n\n\n")
        time.sleep(0)
        return item  # yield item
I tested your code and the reason is simple: the spider never matches the second Rule. The first Rule's pattern '/store/apps' also matches the detail URLs, so those links are claimed by a Rule that has no callback, and parse_item is never reached.
Try this:
rules = (
    Rule(LinkExtractor(allow=('/store/apps')), callback="parse_item"),
    Rule(LinkExtractor(allow=('/store/apps/details\?')), callback="parse_item")
)
Then it works. So there is nothing wrong with your code, but there is something wrong with your logic.
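If you would rather keep the first Rule callback-free and only use it for following listing pages, a rough sketch of an alternative (untested against the current Play Store markup) is to exclude the detail URLs from the first Rule with LinkExtractor's standard deny parameter, so those links fall through to the second Rule:

# Sketch only: deny detail pages in the first Rule so the second Rule,
# which has the callback, gets to match them.
rules = (
    # Follow category/listing pages, but skip detail pages here...
    Rule(LinkExtractor(allow=(r'/store/apps',), deny=(r'/store/apps/details\?',))),
    # ...so detail pages are matched by this Rule and reach parse_item.
    # follow=True keeps the crawl going from detail pages as well
    # (with a callback set, follow defaults to False).
    Rule(LinkExtractor(allow=(r'/store/apps/details\?',)), callback='parse_item', follow=True),
)

Either way, the key point is the same: whichever Rule matches a link first decides whether your callback runs.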