我的刮板怎么了?
What's wrong with my scraper?
我想通过进入每个条目的详情页面来抓取 agent_name 及其联系方式。这个脚本有时返回一个条目,有时返回不同的条目,我无法找出原因。
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
    """One scraped agent record (name, detail-page URL, description text)."""
    title = scrapy.Field()  # agent name taken from the result-page anchor text
    link = scrapy.Field()   # agent detail-page URL taken from the anchor @href
    data = scrapy.Field()   # text extracted from the detail page in anchor_page()
class criticspider(CrawlSpider):
    """Scrape agent title + detail link from iproperty.com.my search results,
    then follow each link to collect the agent's description text.

    NOTE(review): overriding parse() on a CrawlSpider bypasses its Rule
    machinery entirely; since no rules are defined here, scrapy.Spider is
    the appropriate base class (see the corrected spider later in this file).
    """

    name = "comp"
    allowed_domains = ["iproperty.com.my"]
    start_urls = ["http://www.iproperty.com.my/property/searchresult.aspx?t=S&gpt=AR&st=&ct=&k=&pt=&mp=&xp=&mbr=&xbr=&mbu=&xbu=&lo=&wp=&wv=&wa=&ht=&au=&sby=&ns=1"]

    def parse(self, response):
        """Yield a follow-up Request per result with the partial item in meta."""
        # Each <ul> under the save-listing form holds one search result.
        sites = response.xpath('.//*[@id="frmSaveListing"]/ul')
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//li[2]/div[3]/div[1]/div[2]/p[1]/a/text()').extract()[0]
            item['link'] = site.xpath('.//li[2]/div[3]/div[1]/div[2]/p[1]/a/@href').extract()[0]
            if item['link']:
                # urljoin() leaves absolute URLs unchanged, so it is safe to
                # apply unconditionally. The original substring test
                # ("'http://' not in link") misclassified relative links that
                # merely contained 'http://' somewhere in a query string.
                item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
        # Removed: a local `items` list that was appended to but never
        # returned or yielded (dead code).

    def anchor_page(self, response):
        """Callback for an agent detail page: attach the description text."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//*[@id="main-content3"]/div[1]/div/table/tbody/tr/td[1]/table/tbody/tr[3]/td/text()').extract()
        yield old_item
即使您在浏览器中打开开始 URL 并多次刷新页面 - 您也会得到不同的搜索结果。
无论如何,您的蜘蛛需要调整,因为它没有从页面中提取所有代理:
import scrapy
from urlparse import urljoin
class CompItem(scrapy.Item):
    """One scraped agent record (name, detail-page URL, description text)."""
    title = scrapy.Field()  # agent name taken from the result-page anchor text
    link = scrapy.Field()   # agent detail-page URL taken from the anchor @href
    data = scrapy.Field()   # text extracted from the detail page in anchor_page()
class criticspider(scrapy.Spider):
    """Walk every agent block on the iproperty.com.my search-result page and
    fetch each agent's detail page for its description/promotion text."""

    name = "comp"
    allowed_domains = ["iproperty.com.my"]
    start_urls = ["http://www.iproperty.com.my/property/searchresult.aspx?t=S&gpt=AR&st=&ct=&k=&pt=&mp=&xp=&mbr=&xbr=&mbu=&xbu=&lo=&wp=&wv=&wa=&ht=&au=&sby=&ns=1"]

    def parse(self, response):
        # One "article-right" container per agent in the result listing.
        for block in response.xpath('//li[@class="search-listing"]//div[@class="article-right"]'):
            item = CompItem()
            item['title'] = block.xpath('.//a/text()').extract()[0]
            item['link'] = block.xpath('.//a/@href').extract()[0]
            detail_url = urljoin("http://www.iproperty.com.my", item['link'])
            yield scrapy.Request(detail_url, meta={'item': item}, callback=self.anchor_page)

    def anchor_page(self, response):
        # The partially-filled item built in parse() rides along in meta.
        item = response.request.meta['item']
        item['data'] = response.xpath('.//*[@id="main-content3"]//table//table//p/text()').extract()
        yield item
我修复的内容:
- 使用 scrapy.Spider 而不是 CrawlSpider
- 修复了使其通过页面上所有代理的 XPath 表达式,点击链接并获取代理的 self-description/promotion
我想通过进入每个条目的详情页面来抓取 agent_name 及其联系方式。这个脚本有时返回一个条目,有时返回不同的条目,我无法找出原因。
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
    """One scraped agent record (name, detail-page URL, description text)."""
    title = scrapy.Field()  # agent name taken from the result-page anchor text
    link = scrapy.Field()   # agent detail-page URL taken from the anchor @href
    data = scrapy.Field()   # text extracted from the detail page in anchor_page()
class criticspider(CrawlSpider):
    """Scrape agent title + detail link from iproperty.com.my search results,
    then follow each link to collect the agent's description text.

    NOTE(review): overriding parse() on a CrawlSpider bypasses its Rule
    machinery entirely; since no rules are defined here, scrapy.Spider is
    the appropriate base class (see the corrected spider later in this file).
    """

    name = "comp"
    allowed_domains = ["iproperty.com.my"]
    start_urls = ["http://www.iproperty.com.my/property/searchresult.aspx?t=S&gpt=AR&st=&ct=&k=&pt=&mp=&xp=&mbr=&xbr=&mbu=&xbu=&lo=&wp=&wv=&wa=&ht=&au=&sby=&ns=1"]

    def parse(self, response):
        """Yield a follow-up Request per result with the partial item in meta."""
        # Each <ul> under the save-listing form holds one search result.
        sites = response.xpath('.//*[@id="frmSaveListing"]/ul')
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//li[2]/div[3]/div[1]/div[2]/p[1]/a/text()').extract()[0]
            item['link'] = site.xpath('.//li[2]/div[3]/div[1]/div[2]/p[1]/a/@href').extract()[0]
            if item['link']:
                # urljoin() leaves absolute URLs unchanged, so it is safe to
                # apply unconditionally. The original substring test
                # ("'http://' not in link") misclassified relative links that
                # merely contained 'http://' somewhere in a query string.
                item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
        # Removed: a local `items` list that was appended to but never
        # returned or yielded (dead code).

    def anchor_page(self, response):
        """Callback for an agent detail page: attach the description text."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//*[@id="main-content3"]/div[1]/div/table/tbody/tr/td[1]/table/tbody/tr[3]/td/text()').extract()
        yield old_item
即使您在浏览器中打开开始 URL 并多次刷新页面 - 您也会得到不同的搜索结果。
无论如何,您的蜘蛛需要调整,因为它没有从页面中提取所有代理:
import scrapy
from urlparse import urljoin
class CompItem(scrapy.Item):
    """One scraped agent record (name, detail-page URL, description text)."""
    title = scrapy.Field()  # agent name taken from the result-page anchor text
    link = scrapy.Field()   # agent detail-page URL taken from the anchor @href
    data = scrapy.Field()   # text extracted from the detail page in anchor_page()
class criticspider(scrapy.Spider):
    """Walk every agent block on the iproperty.com.my search-result page and
    fetch each agent's detail page for its description/promotion text."""

    name = "comp"
    allowed_domains = ["iproperty.com.my"]
    start_urls = ["http://www.iproperty.com.my/property/searchresult.aspx?t=S&gpt=AR&st=&ct=&k=&pt=&mp=&xp=&mbr=&xbr=&mbu=&xbu=&lo=&wp=&wv=&wa=&ht=&au=&sby=&ns=1"]

    def parse(self, response):
        # One "article-right" container per agent in the result listing.
        for block in response.xpath('//li[@class="search-listing"]//div[@class="article-right"]'):
            item = CompItem()
            item['title'] = block.xpath('.//a/text()').extract()[0]
            item['link'] = block.xpath('.//a/@href').extract()[0]
            detail_url = urljoin("http://www.iproperty.com.my", item['link'])
            yield scrapy.Request(detail_url, meta={'item': item}, callback=self.anchor_page)

    def anchor_page(self, response):
        # The partially-filled item built in parse() rides along in meta.
        item = response.request.meta['item']
        item['data'] = response.xpath('.//*[@id="main-content3"]//table//table//p/text()').extract()
        yield item
我修复的内容:
- 使用 scrapy.Spider 而不是 CrawlSpider
- 修复了使其通过页面上所有代理的 XPath 表达式,点击链接并获取代理的 self-description/promotion