使用 scrapy 获取下一页
Get next page using scrapy
我有兴趣从此页面获取亚特兰大的承包商数据:
http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658
所以我可以打开类别的链接
'Additions & Remodeling'
'Architects & Engineers'
'Fountains & Ponds'
......
.....
.....
但是我只能打开第一页:
我正在尝试通过 'Next' 按钮的链接打开下一页:
# Snippet from the question: follow the 'Next' pagination link.
# NOTE(review): extract_first() may return None when the xpath matches
# nothing, and this Request has no explicit callback — see answers below.
next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
request = scrapy.Request(absolute_next_page_url)
yield request
不过它并没有起作用。
这是我的蜘蛛的代码:
import scrapy


class Spider_1800(scrapy.Spider):
    """Spider that collects contractor listings for Atlanta from 1800contractor.com.

    ``parse`` follows each category link from the start page, and
    ``parse_contractors`` emits one item per contractor page.
    """

    name = '1800contractor'
    allowed_domains = ['1800contractor.com']
    start_urls = (
        'http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658',
    )

    def parse(self, response):
        """Queue a request for every category link on the city page."""
        urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
        for url in urls:
            absolute_url = response.urljoin(url)
            yield scrapy.Request(absolute_url, callback=self.parse_contractors)

        # Process next page. Guard against a missing 'Next' link: on the last
        # page extract_first() returns None and scrapy.Request(None) raises.
        next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url, callback=self.parse)

    def parse_contractors(self, response):
        """Yield one item (name list + page url) per contractor page."""
        name = response.xpath(
            '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
        contractor = {
            'name': name,
            'url': response.url,
        }
        yield contractor
您没有对正确的请求进行分页,parse
处理使用 start_urls
中的 url 生成的请求,这意味着您需要先在 http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658 中输入每个类别。
def parse(self, response):
    """Follow every category link found on the city landing page."""
    category_links = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
    for link in category_links:
        yield scrapy.Request(
            response.urljoin(link),
            callback=self.parse_contractors,
        )
def parse_contractors(self, response):
    """Emit a contractor item, then follow the pagination link if one exists."""
    name = response.xpath(
        '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    yield {'name': name, 'url': response.url}

    # Pagination lives on the contractor listing pages, so it is handled here.
    next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
    if next_page_url:
        yield scrapy.Request(
            response.urljoin(next_page_url),
            callback=self.parse_contractors,
        )
进入 start_url 后,您用来挑选承包商 url 的 xpath 无法正常工作。'Next' 链接出现在承包商列表页面上,因此分页应放在处理承包商 url 的 parse_contractors 中。下面的代码对你有用:
def parse(self, response):
    """Queue a contractor-page request for every category element on the page."""
    hrefs = response.xpath('//table//*[@class="hiCatNaked"]').extract()
    for href in hrefs:
        absolute = response.urljoin(href)
        yield scrapy.Request(absolute, callback=self.parse_contractors)
def parse_contractors(self, response):
    """Yield a contractor item, then follow the 'Next' pagination link."""
    name = response.xpath('/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    contrator = {
        'name': name,
        'url': response.url}
    yield contrator

    # Bug fix: the original line nested unescaped single quotes inside a
    # single-quoted string ('//a[b[contains(.,'Next')]]/@href'), which is a
    # Python SyntaxError. A double-quoted literal keeps the XPath unchanged.
    next_page_url = response.xpath("//a[b[contains(.,'Next')]]/@href").extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)
我有兴趣从此页面获取亚特兰大的承包商数据:
http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658
所以我可以打开类别的链接
'Additions & Remodeling'
'Architects & Engineers'
'Fountains & Ponds'
......
.....
.....
但是我只能打开第一页:
我正在尝试通过 'Next' 按钮的链接打开下一页:
# Snippet from the question: follow the 'Next' pagination link.
# NOTE(review): extract_first() may return None when the xpath matches
# nothing, and this Request has no explicit callback — see answers below.
next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
request = scrapy.Request(absolute_next_page_url)
yield request
不过它并没有起作用。
这是我的蜘蛛的代码:
import scrapy


class Spider_1800(scrapy.Spider):
    """Spider that collects contractor listings for Atlanta from 1800contractor.com.

    ``parse`` follows each category link from the start page, and
    ``parse_contractors`` emits one item per contractor page.
    """

    name = '1800contractor'
    allowed_domains = ['1800contractor.com']
    start_urls = (
        'http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658',
    )

    def parse(self, response):
        """Queue a request for every category link on the city page."""
        urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
        for url in urls:
            absolute_url = response.urljoin(url)
            yield scrapy.Request(absolute_url, callback=self.parse_contractors)

        # Process next page. Guard against a missing 'Next' link: on the last
        # page extract_first() returns None and scrapy.Request(None) raises.
        next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url, callback=self.parse)

    def parse_contractors(self, response):
        """Yield one item (name list + page url) per contractor page."""
        name = response.xpath(
            '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
        contractor = {
            'name': name,
            'url': response.url,
        }
        yield contractor
您没有对正确的请求进行分页,parse
处理使用 start_urls
中的 url 生成的请求,这意味着您需要先在 http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658 中输入每个类别。
def parse(self, response):
    """Follow every category link found on the city landing page."""
    category_links = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
    for link in category_links:
        yield scrapy.Request(
            response.urljoin(link),
            callback=self.parse_contractors,
        )
def parse_contractors(self, response):
    """Emit a contractor item, then follow the pagination link if one exists."""
    name = response.xpath(
        '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    yield {'name': name, 'url': response.url}

    # Pagination lives on the contractor listing pages, so it is handled here.
    next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
    if next_page_url:
        yield scrapy.Request(
            response.urljoin(next_page_url),
            callback=self.parse_contractors,
        )
进入 start_url 后,您用来挑选承包商 url 的 xpath 无法正常工作。'Next' 链接出现在承包商列表页面上,因此分页应放在处理承包商 url 的 parse_contractors 中。下面的代码对你有用:
def parse(self, response):
    """Queue a contractor-page request for every category element on the page."""
    hrefs = response.xpath('//table//*[@class="hiCatNaked"]').extract()
    for href in hrefs:
        absolute = response.urljoin(href)
        yield scrapy.Request(absolute, callback=self.parse_contractors)
def parse_contractors(self, response):
    """Yield a contractor item, then follow the 'Next' pagination link."""
    name = response.xpath('/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    contrator = {
        'name': name,
        'url': response.url}
    yield contrator

    # Bug fix: the original line nested unescaped single quotes inside a
    # single-quoted string ('//a[b[contains(.,'Next')]]/@href'), which is a
    # Python SyntaxError. A double-quoted literal keeps the XPath unchanged.
    next_page_url = response.xpath("//a[b[contains(.,'Next')]]/@href").extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)