Get the links from a website using scrapy
I'm trying to extract links from one class and store them using scrapy, but I'm not sure what the problem is.
Here is the code:
import scrapy
from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://losangeles.craigslist.org/search/jjj"
    ]

    def parse(self, response):
        for sel in response.xpath('//a[@class="hdrlnk"]'):
            item = DmozItem()
            item['link'] = sel.xpath('//a/@href').extract()
            yield item
Command line:
scrapy crawl dmoz -o items.csv -t csv
Any help is greatly appreciated, thanks in advance!
I've updated your code; a few things were missing. Take a look:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector


class CompItem(scrapy.Item):
    link = scrapy.Field()


class criticspider(CrawlSpider):
    name = "craig"
    allowed_domains = ["losangeles.craigslist.org"]
    start_urls = ["http://losangeles.craigslist.org/search/jjj"]

    def parse_start_url(self, response):
        sites = response.xpath('//div[@class="content"]')
        items = []
        for site in sites:
            item = CompItem()
            item['link'] = site.xpath('.//p[@class="row"]/span/span[@class="pl"]/a/@href').extract()
            items.append(item)
        return items
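You can run it the same way as before, just using this spider's name:

scrapy crawl craig -o items.csv -t csv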
If you run into an error like exceptions.NotImplementedError, your parse() method is the culprit: CrawlSpider implements parse() internally to drive its rules, so a CrawlSpider subclass has to put its logic in parse_start_url() (as above) or in a Rule callback rather than overriding parse().
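For reference, here is a minimal sketch of the usual CrawlSpider pattern, where a Rule drives the crawl and the callback has a name other than parse(). The LinkExtractor allow pattern is an assumption; adjust it to the pages you actually want to follow.

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class CompItem(scrapy.Item):
    link = scrapy.Field()


class CraigRuleSpider(CrawlSpider):
    name = "craig_rules"
    allowed_domains = ["losangeles.craigslist.org"]
    start_urls = ["http://losangeles.craigslist.org/search/jjj"]

    # CrawlSpider reserves parse() for itself, so the callback
    # must use a different name ("parse_page" here).
    rules = (
        Rule(LinkExtractor(allow=r"/search/jjj"),  # assumed pattern
             callback="parse_page", follow=True),
    )

    def parse_page(self, response):
        for href in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            yield CompItem(link=href)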
I've modified your code a bit. The original had two problems: sel.xpath('//a/@href') starts with //, so each iteration selects every href in the whole document instead of the current link's, and the extracted hrefs are relative, so the base URL has to be prepended:
# -*- coding: utf-8 -*-
import scrapy


# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://losangeles.craigslist.org/search/jjj"
    ]
    BASE_URL = 'http://losangeles.craigslist.org'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            item = DmozItem(link=absolute_url)
            yield item
Note that this will only give you the first 100 results, i.e. the first page of listings.
A sample item would be:
{'link': u'http://losangeles.craigslist.org/wst/web/5011899759.html'}
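If you need more than that first page, one option is to follow the pagination link as well. A sketch, assuming the "next page" anchor carries a class containing "next" (verify that against the actual craigslist markup); urljoin from the standard library replaces the hard-coded BASE_URL:

# -*- coding: utf-8 -*-
from urlparse import urljoin

import scrapy


class DmozItem(scrapy.Item):
    link = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz_paged"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://losangeles.craigslist.org/search/jjj"]

    def parse(self, response):
        for link in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            # urljoin() resolves relative hrefs against the page URL,
            # so no hard-coded base URL is needed
            yield DmozItem(link=urljoin(response.url, link))

        # queue the next results page, if there is one
        # (the selector below is an assumption)
        next_page = response.xpath('//a[contains(@class, "next")]/@href').extract()
        if next_page:
            yield scrapy.Request(urljoin(response.url, next_page[0]),
                                 callback=self.parse)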