scrapy 蜘蛛代码检查
scrapy spider code check
所以我正在尝试使用 scrapy 在网站下方的 SgmlLinkExtractor 参数中抓取网站,这就是我的蜘蛛的样子:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string
class DesidimeSpider(CrawlSpider):
    """The question's original (buggy) spider for desidime.com hot deals.

    Kept verbatim to illustrate the problem; see review notes below.
    """
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    # NOTE(review): restrict_xpaths should select <a> elements, not @href
    # attributes; also the newline inside the triple-quoted literal is part
    # of the XPath string itself.
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(@*)]/div
[not(@*)]/a[not(@class)]/@href''')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        """Extract deal text and link from each matched comment block."""
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        # BUG(review): `for deals in deals` rebinds the loop variable over
        # the iterable of the same name — rename the loop variable.
        for deals in deals:
            item = DesidimeItem()
            # BUG(review): these XPaths start with // so they search the whole
            # document instead of the current node — every item gets the same
            # first match. Use relative paths starting with "." instead.
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
我正在尝试做什么应该是很明显的,但是出于某种原因,当我告诉蜘蛛抓取并将文本和链接导出到 CSV 文件时,我最终得到:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
所以,谁能告诉我问题出在哪里?如果你运行我上面的每个 XPath 作为 response.xpath("xpath").extract()
在 scrapy shell "//correspondingcrawlruleurl"
之后,你会得到正确的结果。
问题出在 parse_items
回调中。当您遍历交易时,特定于交易上下文的定位器必须是相对的。换句话说,在循环中用点开始你的 XPath 表达式:
def parse_items(self, response):
    """Yield one item per first user-comment block, using node-relative XPaths."""
    for comment in response.xpath("//div[@class='user-comment-text'][1]"):
        extracted = DesidimeItem()
        # Relative paths (leading ".") stay scoped to the current comment node.
        extracted["deal"] = comment.xpath(".//p/text()").extract()
        extracted["link"] = comment.xpath(".//p[1]/a[1]/@href").extract()
        yield extracted
(请注意,我还简化了代码)。
这是我正在执行的完整蜘蛛程序(它确实抓取了文本和链接,但我不知道您想要的输出是什么):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DesidimeItem(scrapy.Item):
    """Container for one scraped deal: its text and its first link."""
    deal = scrapy.Field()  # list of text strings from the comment's <p> tags
    link = scrapy.Field()  # list holding the first href inside the comment
class DesidimeSpider(CrawlSpider):
    """Crawl desidime.com hot-deal forum pages and scrape deal text/links."""

    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    # Follow only class-less anchors nested in attribute-less td/div pairs.
    rules = [
        Rule(
            LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
            callback="parse_items",
            follow=True,
        ),
    ]

    def parse_items(self, response):
        """Yield a DesidimeItem for the first user-comment block on the page."""
        comments = response.xpath("//div[@class='user-comment-text'][1]")
        for comment in comments:
            item = DesidimeItem()
            # Dot-prefixed XPaths are evaluated relative to `comment`.
            item["deal"] = comment.xpath(".//p/text()").extract()
            item["link"] = comment.xpath(".//p[1]/a[1]/@href").extract()
            yield item
所以我正在尝试使用 scrapy 在网站下方的 SgmlLinkExtractor 参数中抓取网站,这就是我的蜘蛛的样子:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string
class DesidimeSpider(CrawlSpider):
    """The question's original (buggy) spider for desidime.com hot deals.

    Kept verbatim to illustrate the problem; see review notes below.
    """
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    # NOTE(review): restrict_xpaths should select <a> elements, not @href
    # attributes; also the newline inside the triple-quoted literal is part
    # of the XPath string itself.
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(@*)]/div
[not(@*)]/a[not(@class)]/@href''')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        """Extract deal text and link from each matched comment block."""
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        # BUG(review): `for deals in deals` rebinds the loop variable over
        # the iterable of the same name — rename the loop variable.
        for deals in deals:
            item = DesidimeItem()
            # BUG(review): these XPaths start with // so they search the whole
            # document instead of the current node — every item gets the same
            # first match. Use relative paths starting with "." instead.
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
我正在尝试做什么应该是很明显的,但是出于某种原因,当我告诉蜘蛛抓取并将文本和链接导出到 CSV 文件时,我最终得到:
link,deal http://wwww.facebook.com/desidime, http://wwww.facebook.com/desidime, (same thing for many more lines, then:) ",," , " same url" , (same thing for many more lines, then:) "link,deals"
所以,谁能告诉我问题出在哪里?如果你运行我上面的每个 XPath 作为 response.xpath("xpath").extract()
在 scrapy shell "//correspondingcrawlruleurl"
之后,你会得到正确的结果。
问题出在 parse_items
回调中。当您遍历交易时,特定于交易上下文的定位器必须是相对的。换句话说,在循环中用点开始你的 XPath 表达式:
def parse_items(self, response):
    """Yield one item per first user-comment block, using node-relative XPaths."""
    for comment in response.xpath("//div[@class='user-comment-text'][1]"):
        extracted = DesidimeItem()
        # Relative paths (leading ".") stay scoped to the current comment node.
        extracted["deal"] = comment.xpath(".//p/text()").extract()
        extracted["link"] = comment.xpath(".//p[1]/a[1]/@href").extract()
        yield extracted
(请注意,我还简化了代码)。
这是我正在执行的完整蜘蛛程序(它确实抓取了文本和链接,但我不知道您想要的输出是什么):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DesidimeItem(scrapy.Item):
    """Container for one scraped deal: its text and its first link."""
    deal = scrapy.Field()  # list of text strings from the comment's <p> tags
    link = scrapy.Field()  # list holding the first href inside the comment
class DesidimeSpider(CrawlSpider):
    """Crawl desidime.com hot-deal forum pages and scrape deal text/links."""

    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    # Follow only class-less anchors nested in attribute-less td/div pairs.
    rules = [
        Rule(
            LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
            callback="parse_items",
            follow=True,
        ),
    ]

    def parse_items(self, response):
        """Yield a DesidimeItem for the first user-comment block on the page."""
        comments = response.xpath("//div[@class='user-comment-text'][1]")
        for comment in comments:
            item = DesidimeItem()
            # Dot-prefixed XPaths are evaluated relative to `comment`.
            item["deal"] = comment.xpath(".//p/text()").extract()
            item["link"] = comment.xpath(".//p[1]/a[1]/@href").extract()
            yield item