使用 scrapy 获取下一页

Get next page using scrapy

我有兴趣从此页面获取亚特兰大的承包商数据:

http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658

所以我可以打开类别的链接

'Additions & Remodeling'
'Architects & Engineers'
'Fountains & Ponds'
......
.....
.....

但是我只能打开第一页:

http://www.1800contractor.com/d.Additions-Remodeling.Atlanta.GA.-12001.html?startingIndex=0&showDirectory=true

我正在尝试使用 'Next' 按钮的链接获取下一页:

next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
request = scrapy.Request(absolute_next_page_url)
yield request

但是这不起作用。

这是我的蜘蛛的代码:

import scrapy


class Spider_1800(scrapy.Spider):
    """Spider for 1800contractor.com: starts at the Atlanta, GA landing
    page, follows each category link, and yields one item per page."""
    name = '1800contractor'
    allowed_domains = ['1800contractor.com']
    start_urls = (
        'http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658',
    )

    def parse(self, response):
        # Category links on the city landing page.
        urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()

        for url in urls:
            absolute_url = response.urljoin(url)
            request = scrapy.Request(
                absolute_url, callback=self.parse_contractors)
            yield request

        # process next page
        # NOTE(review): the 'Next' link appears on the category pages that
        # parse_contractors handles, not on this landing page, so this xpath
        # presumably matches nothing here -- confirm (this is the bug the
        # answer below addresses). The request also has no callback, so it
        # would be routed back to parse.
        next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        request = scrapy.Request(absolute_next_page_url)
        yield request

    def parse_contractors(self, response):
        """Yield one item per category page: the matched hrefs and the
        page URL."""
        name = response.xpath(
            '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
        contrator = {
           'name': name,

            'url': response.url}
        yield contrator

您没有在正确的请求上做分页。parse 处理的是由 start_urls 中的 url 生成的请求,也就是说您需要先从 http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658 进入每个类别页面。

def parse(self, response):
    """Follow every category link on the city landing page and hand the
    resulting pages to parse_contractors."""
    category_links = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()

    for link in category_links:
        yield scrapy.Request(
            response.urljoin(link), callback=self.parse_contractors)

def parse_contractors(self, response):
    """Yield an item for the current category page, then follow the
    'Next' link (when present) back into this same callback."""
    matched_names = response.xpath(
        '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    yield {'name': matched_names, 'url': response.url}

    # Pagination happens here, on the category pages, where the link exists.
    next_href = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
    if next_href:
        yield scrapy.Request(
            response.urljoin(next_href), callback=self.parse_contractors)

进入 start_url 之后,您用来挑选承包商页面 url 的 xpath 无法正常工作。'Next' 链接出现在承包商(类别)页面上,因此应该在处理承包商 url 的回调中提取它。下面的代码对你有用:

def parse(self, response):
    """Follow every category link on the city landing page.

    The category links carry the class ``hiCatNaked``; we must extract
    their ``href`` attribute. The original xpath selected the elements
    themselves, so ``extract()`` returned their HTML serialization and
    ``urljoin`` could not build a usable URL from it.
    """
    urls = response.xpath('//table//*[@class="hiCatNaked"]/@href').extract()

    for url in urls:
        absolute_url = response.urljoin(url)
        request = scrapy.Request(
            absolute_url, callback=self.parse_contractors)
        yield request

def parse_contractors(self, response):
    """Yield an item for the current page, then follow the 'Next' link
    back into this same callback to paginate."""
    name = response.xpath('/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()

    contrator = {
        'name': name,
        'url': response.url}
    yield contrator

    # The xpath literal must be double-quoted: the embedded 'Next' single
    # quotes terminated the original single-quoted string, which was a
    # Python SyntaxError.
    next_page_url = response.xpath("//a[b[contains(.,'Next')]]/@href").extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)