Scrapy 蜘蛛抓取主页但不抓取同一类别的下一页
Scrapy spider crawls the main page but not scrape next pages of same category
这里需要一些帮助。当我通过 (scrapy.Spider) 抓取一个类别页面时,我的代码正在运行。但是,当我尝试抓取同一类别的下一页时,它似乎不会转到下一页并且根本不会抓取。
这是代码
import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
name = 'scrapp'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
# for category in categ:
Category_Name=categ.xpath('.//a[contains(text(),"Historical Fiction")]/text()').get().replace('\n',"").strip()
Kategorylink=categ.xpath('.//a[contains(text(),"Historical Fiction")]/@href').get().replace('\n',"").strip()
yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
respons = Boooks_info_url
ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse)
这是命令提示符输出
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/the-guernsey-literary-and-potato-peel-pie-society_253/index.html', 'Bookprize': '£49.53'}
2021-09-29 04:30:25 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): books.toscrape.com:80
2021-09-29 04:30:26 [urllib3.connectionpool] DEBUG: http://books.toscrape.com:80 "GET /catalogue/girl-in-the-blue-coat_160/index.html HTTP/1.1" 200 None
2021-09-29 04:30:26 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html>
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/girl-in-the-blue-coat_160/index.html', 'Bookprize': '£46.83'}
page-2.html
2021-09-29 04:30:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
2021-09-29 04:30:26 [scrapy.core.scraper] ERROR: Spider error processing <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
Traceback (most recent call last):
File "C:\Users\Abu Bakar Siddique\AppData\Local\Programs\Python\Python39\lib\site-packages\twisted\internet\defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
TypeError: info_parse() missing 2 required positional arguments: 'category_name' and 'Category_link'
2021-09-29 04:30:26 [scrapy.core.engine] INFO: Closing spider (finished)
在此先感谢您的大力支持。
查看您遇到的错误。这是因为您的 info_parse 函数需要您不发送的参数。
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
respons = Boooks_info_url
ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})
应该可以。
编辑:(您的代码有一些更改)
import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
name = 'scrapp'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
for category in categ:
Category_Name=category.xpath('./a/text()').get().strip()
Kategorylink=category.xpath('./a/@href').get()
yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
bookprize=response.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})
这里需要一些帮助。当我通过 (scrapy.Spider) 抓取一个类别页面时,我的代码正在运行。但是,当我尝试抓取同一类别的下一页时,它似乎不会转到下一页并且根本不会抓取。
这是代码
import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
name = 'scrapp'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
# for category in categ:
Category_Name=categ.xpath('.//a[contains(text(),"Historical Fiction")]/text()').get().replace('\n',"").strip()
Kategorylink=categ.xpath('.//a[contains(text(),"Historical Fiction")]/@href').get().replace('\n',"").strip()
yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
respons = Boooks_info_url
ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse)
这是命令提示符输出
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/the-guernsey-literary-and-potato-peel-pie-society_253/index.html', 'Bookprize': '£49.53'}
2021-09-29 04:30:25 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): books.toscrape.com:80
2021-09-29 04:30:26 [urllib3.connectionpool] DEBUG: http://books.toscrape.com:80 "GET /catalogue/girl-in-the-blue-coat_160/index.html HTTP/1.1" 200 None
2021-09-29 04:30:26 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html>
{'Category_Name': 'Historical Fiction', 'Category_link': 'catalogue/category/books/historical-fiction_4/index.html', 'Bookurl': 'http://books.toscrape.com/catalogue/girl-in-the-blue-coat_160/index.html', 'Bookprize': '£46.83'}
page-2.html
2021-09-29 04:30:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
2021-09-29 04:30:26 [scrapy.core.scraper] ERROR: Spider error processing <GET http://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html> (referer: http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html)
Traceback (most recent call last):
File "C:\Users\Abu Bakar Siddique\AppData\Local\Programs\Python\Python39\lib\site-packages\twisted\internet\defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
TypeError: info_parse() missing 2 required positional arguments: 'category_name' and 'Category_link'
2021-09-29 04:30:26 [scrapy.core.engine] INFO: Closing spider (finished)
在此先感谢您的大力支持。
查看您遇到的错误。这是因为您的 info_parse 函数需要您不发送的参数。
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
respons = Boooks_info_url
ponse = HtmlResponse(url=respons, body=requests.get(respons).text, encoding='utf-8')
bookprize=ponse.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})
应该可以。
编辑:(您的代码有一些更改)
import scrapy
from scrapy import item
from scrapy.http import HtmlResponse,Response
import requests
from bs4 import BeautifulSoup
from scrapy.selector.unified import _response_from_text
from ..items import ScrapybooksspiderItem
class ScrapSpider(scrapy.Spider):
name = 'scrapp'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
categ=response.xpath('//div[@class="side_categories"]/ul[@class="nav nav-list"]/li/ul/li')
for category in categ:
Category_Name=category.xpath('./a/text()').get().strip()
Kategorylink=category.xpath('./a/@href').get()
yield response.follow(Kategorylink,callback=self.info_parse,cb_kwargs={'category_name':Category_Name,'Category_link':Kategorylink})
def info_parse(self,response,category_name,Category_link):
bookurl=response.xpath('//section/div/ol/li/article[@class="product_pod"]/h3/a/@href')
for books in bookurl:
BOOK=books.get()
Boooks_info_url=response.urljoin(BOOK)
bookprize=response.xpath('//*/p[@class="price_color"]/text()').get()
yield {
'Category_Name':category_name,
'Category_link':Category_link,
'Bookurl':Boooks_info_url,
'Bookprize':bookprize
}
next_page=response.xpath('//*[@class="next"]/a[contains(text(),"next")]/@href').get()
if next_page:
word=Category_link
listst=word.split('/')
length=len(listst)
final_length=length-1
lam=listst[-1]
dellast=listst.pop()
listst.insert(final_length,next_page)
del listst[:-1]
newst="/".join(listst)
final_url=newst
print('\n',final_url,'\n')
yield response.follow(url=final_url,callback=self.info_parse,cb_kwargs={'category_name':category_name,'Category_link':Category_link})