Scrapy 蜘蛛抓取主页但不抓取同一类别的下一页

Question

这里需要一些帮助。当我通过 (scrapy.Spider) 抓取一个类别页面时，我的代码正在运行。但是，当我尝试抓取同一类别的下一页时，它似乎不会转到下一页并且根本不会抓取。

这是代码

import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
    name = 'scrapp'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
        # for category in  categ:
        Category_Name=categ.xpath('.//a[contains(text(),"Historical Fiction")]/text()').get().replace('\n',"").strip()
        Kategorylink=categ.xpath('.//a[contains(text(),"Historical Fiction")]/@href').get().replace('\n',"").strip()
        yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
    
    def info_parse(self,response,category_name,Category_link):
        
        bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
        
        for books in bookurl:
            BOOK=books.get()
            Boooks_info_url=response.urljoin(BOOK)        
            respons = Boooks_info_url            
            ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8') 
            bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()                  
            yield {
                'Category_Name':category_name,                
                'Category_link':Category_link,
                'Bookurl':Boooks_info_url,
                'Bookprize':bookprize
            }

        next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
        if next_page:        
         word=Category_link          
         listst=word.split('/')
         length=len(listst)
         final_length=length-1
         lam=listst[-1]
         dellast=listst.pop()
         listst.insert(final_length,next_page)
         del listst[:-1]
         newst="/".join(listst)
         final_url=newst  
         print('\n',final_url,'\n')          
         yield response.follow(url=final_url,callback=self.info_parse)

这是命令提示符输出

{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/the-guernsey-literary-and-potato-peel-pie-society_253/index.html', 'Bookprize': 'Â£49.53'}
2021-09-29 04:30:25 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): books.toscrape.com:80
2021-09-29 04:30:26 [urllib3.connectionpool] DEBUG: http://books.toscrape.com:80 "GET /catalogue/girl-in-the-blue-coat_160/index.html HTTP/1.1" 200 None
2021-09-29 04:30:26 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html>
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/girl-in-the-blue-coat_160/index.html', 'Bookprize': 'Â£46.83'}

 page-2.html

2021-09-29 04:30:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
2021-09-29 04:30:26 [scrapy.core.scraper] ERROR: Spider error processing <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
Traceback (most recent call last):
  File "C:\Users\Abu Bakar Siddique\AppData\Local\Programs\Python\Python39\lib\site-packages\twisted\internet\defer.py", line 858, in _runCallbacks
    current.result = callback(  # type: ignore[misc]
TypeError: info_parse() missing 2 required positional arguments: 'category_name' and 'Category_link'
2021-09-29 04:30:26 [scrapy.core.engine] INFO: Closing spider (finished)

在此先感谢您的大力支持。

Answer 1

查看您遇到的错误。这是因为您的 info_parse 函数需要您不发送的参数。

   def info_parse(self,response,category_name,Category_link):

        bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')

        for books in bookurl:
            BOOK=books.get()
            Boooks_info_url=response.urljoin(BOOK)
            respons = Boooks_info_url
            ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
            bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
            yield {
                'Category_Name':category_name,
                'Category_link':Category_link,
                'Bookurl':Boooks_info_url,
                'Bookprize':bookprize
            }

        next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
        if next_page:
            word=Category_link
            listst=word.split('/')
            length=len(listst)
            final_length=length-1
            lam=listst[-1]
            dellast=listst.pop()
            listst.insert(final_length,next_page)
            del listst[:-1]
            newst="/".join(listst)
            final_url=newst
            print('\n',final_url,'\n')
            yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})

应该可以。

编辑：（您的代码有一些更改）

import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem


class ScrapSpider(scrapy.Spider):
    name = 'scrapp'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
        for category in categ:
            Category_Name=category.xpath('./a/text()').get().strip()
            Kategorylink=category.xpath('./a/@href').get()
            yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})

    def info_parse(self,response,category_name,Category_link):
        bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')

        for books in bookurl:
            BOOK=books.get()
            Boooks_info_url=response.urljoin(BOOK)
            bookprize=response.xpath('//*/p[@class="price_color"]/text()').get()
            yield {
                'Category_Name':category_name,
                'Category_link':Category_link,
                'Bookurl':Boooks_info_url,
                'Bookprize':bookprize
            }

        next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
        if next_page:
            word=Category_link
            listst=word.split('/')
            length=len(listst)
            final_length=length-1
            lam=listst[-1]
            dellast=listst.pop()
            listst.insert(final_length,next_page)
            del listst[:-1]
            newst="/".join(listst)
            final_url=newst
            print('\n',final_url,'\n')
            yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})

Scrapy 蜘蛛抓取主页但不抓取同一类别的下一页

Scrapy spider crawls the main page but not scrape next pages of same category

python

scrapy

web-scraping

python-3.x